Interesting. We can see here that when weights are ignored, the use of ordinal data significantly increases the f1 score. Note that we also get a fairly good f1 score even though we are using a single feature of signal. So let's proceed from now on using ordinal data approach but we'll get some real features to play with (windows etc).

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

plt.rcParams['agg.path.chunksize'] = 10000

df = pd.read_csv('train.csv',index_col=0);

df_terminus = df.index.values[-1]
batches = int(df_terminus/50)
seconds_in_batch = 50
freq = 10000

base = np.array([])
for batch in range(batches):
    mini_array = np.ones(seconds_in_batch*freq)*batch
    base = np.append(base,mini_array)

df['batch'] = base
df['batch'] = df['batch'].astype('int')

10
5000000


Time to generate some x values which are a little bit more engineered than usual.

In [96]:
X = []
y = []

#Set the sample_factor to an integer n which will sample every nth datapoint
#This significantly speeds up dataset creation.

sample_factor = 100

for batch_number in range(batches):
    print('Starting new batch: ', batch_number)
    window_size = 0.001
    indices = df[df.batch == batch_number].index.values
    padd_no = 0
    for index in indices[::sample_factor]:
        
        if index % 25 == 0:
            print(index)
        
        open_channels = df.loc[index].open_channels
        features = df.loc[index-window_size/2:index+window_size/2].signal.values

        if len(features) <= int(window_size*10000):

            padding = np.ones((int(window_size*10000)-len(features))+1)*features.mean()
            features = np.append(features,padding)
            padd_no += 1
            
        X.append(features)
        y.append(open_channels)
    print('End batch: ', batch_number, ' with padded fraction = ', padd_no/len(indices))
     
X = np.array(X)
y = np.array(y).astype('int')

Starting new batch:  0
End batch:  0  with padded fraction =  0.002858
Starting new batch:  1
End batch:  1  with padded fraction =  0.0
Starting new batch:  2
End batch:  2  with padded fraction =  0.003518
Starting new batch:  3
End batch:  3  with padded fraction =  0.008
Starting new batch:  4
End batch:  4  with padded fraction =  0.008
Starting new batch:  5
End batch:  5  with padded fraction =  0.00448
Starting new batch:  6
End batch:  6  with padded fraction =  0.004
Starting new batch:  7
End batch:  7  with padded fraction =  0.004
Starting new batch:  8
End batch:  8  with padded fraction =  0.004
Starting new batch:  9
End batch:  9  with padded fraction =  0.004


In [97]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=42)

In [98]:
set(y_val)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

Now let's try implementing the ordinal classifier
https://towardsdatascience.com/simple-trick-to-train-an-ordinal-regression-with-any-classifier-6911183d2a3c

In [99]:
from sklearn.base import clone


class OrdinalClassifier():
    
    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}
    
    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
    
    def predict_proba(self, X):
        clfs_predict = {k:self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i,y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[y][:,1])
            elif y in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                 predicted.append(clfs_predict[y-1][:,1] - clfs_predict[y][:,1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[y-1][:,1])
        return np.vstack(predicted).T
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

Let's start with a multiclass logistic regression

In [100]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)
clf_ordinal = OrdinalClassifier(clf)
clf_ordinal.fit(X_train,y_train)



In [101]:
clf_ordinal.unique_class

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [103]:
preds_val_ord = clf_ordinal.predict(X_val)

print('f1 score for vals_ords w/o weighting is ', f1_score(y_val, preds_val_ord, average='macro'))

f1 score for vals_ords w/o weighting is  0.20280004283037076


f1 score for vals_ords w/o weighting and just one feature is  0.19297993133718946
f1 score for vals_ords w/o weighting and 11 features is  0.20280004283037076

That's...not much of an improvement. Quite disappointing really! Maybe the sampling_factor is creating issues.

In [104]:
X = []
y = []

#Set the sample_factor to an integer n which will sample every nth datapoint
#This significantly speeds up dataset creation.

sample_factor = 10

for batch_number in range(batches):
    print('Starting new batch: ', batch_number)
    window_size = 0.001
    indices = df[df.batch == batch_number].index.values
    padd_no = 0
    for index in indices[::sample_factor]:
        
        if index % 25 == 0:
            print(index)
        
        open_channels = df.loc[index].open_channels
        features = df.loc[index-window_size/2:index+window_size/2].signal.values

        if len(features) <= int(window_size*10000):

            padding = np.ones((int(window_size*10000)-len(features))+1)*features.mean()
            features = np.append(features,padding)
            padd_no += 1
            
        X.append(features)
        y.append(open_channels)
    print('End batch: ', batch_number, ' with padded fraction = ', padd_no/len(indices))
     
X = np.array(X)
y = np.array(y).astype('int')

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=42)

clf = LogisticRegression(random_state=0)
clf_ordinal = OrdinalClassifier(clf)
clf_ordinal.fit(X_train,y_train)

preds_val_ord = clf_ordinal.predict(X_val)

print('f1 score for vals_ords w/o weighting is ', f1_score(y_val, preds_val_ord, average='macro'))

Starting new batch:  0
End batch:  0  with padded fraction =  0.027982
Starting new batch:  1
End batch:  1  with padded fraction =  0.0
Starting new batch:  2
End batch:  2  with padded fraction =  0.036606
Starting new batch:  3
End batch:  3  with padded fraction =  0.0832
Starting new batch:  4
End batch:  4  with padded fraction =  0.0832
Starting new batch:  5
End batch:  5  with padded fraction =  0.046592
Starting new batch:  6
End batch:  6  with padded fraction =  0.0416
Starting new batch:  7
End batch:  7  with padded fraction =  0.0416
Starting new batch:  8
End batch:  8  with padded fraction =  0.0416
Starting new batch:  9
End batch:  9  with padded fraction =  0.0416




f1 score for vals_ords w/o weighting is  0.20227509048786188


Sampling at a higher rate did NOT increase our f1 score significantly. At this point, I think I'll make a submission..

In [112]:
df_test = pd.read_csv('test.csv',index_col=0);

df_terminus = df_test.index.values[-1]
print(df_terminus)
df_init = df_test.index.values[0]
print(df_init)
seconds_in_batch = 50
batches = int(round((df_terminus-df_init)/seconds_in_batch))
print(batches)

freq = 10000

base = np.array([])
for batch in range(batches):
    mini_array = np.ones(seconds_in_batch*freq)*batch
    base = np.append(base,mini_array)

print(len(base))
print(len(df_test))
df_test['batch'] = base
df_test['batch'] = df_test['batch'].astype('int')

700.0
500.0001
4
2000000
2000000


In [119]:
X = []


#Set the sample_factor to an integer n which will sample every nth datapoint
#This significantly speeds up dataset creation.

sample_factor = 1

for batch_number in range(batches):
    print('Starting new batch: ', batch_number)
    window_size = 0.001
    indices = df_test[df_test.batch == batch_number].index.values
    padd_no = 0
    for index in indices[::sample_factor]:
        
        if index % 25 == 0:
            print(index)
        
        features = df_test.loc[index-window_size/2:index+window_size/2].signal.values

        if len(features) <= int(window_size*10000):

            padding = np.ones((int(window_size*10000)-len(features))+1)*features.mean()
            features = np.append(features,padding)
            padd_no += 1
            
        X.append(features)
    print('End batch: ', batch_number, ' with padded fraction = ', padd_no/len(indices))
     
X = np.array(X)

Starting new batch:  0
525.0
550.0
End batch:  0  with padded fraction =  0.257928
Starting new batch:  1
575.0
600.0
End batch:  1  with padded fraction =  0.208
Starting new batch:  2
625.0
650.0
End batch:  2  with padded fraction =  0.208
Starting new batch:  3
675.0
700.0
End batch:  3  with padded fraction =  0.208008


In [120]:
preds_test_ord = clf_ordinal.predict(X)

In [121]:
len(preds_test_ord)

2000000

In [122]:
df_test['open_channels'] = preds_test_ord

In [123]:
df_sample = pd.read_csv('sample_submission.csv',index_col=0);

  mask |= (ar1 == a)


In [124]:
df_sample.head()

Unnamed: 0_level_0,open_channels
time,Unnamed: 1_level_1
500.0001,0
500.0002,0
500.0003,0
500.0004,0
500.0005,0


In [147]:
df_test.head()
df_test['time'] = df_test.index.values

In [150]:
df_submit = df_test[['time','open_channels']]

In [157]:
df_submit.to_csv('submissionV5.csv',index=False,float_format='%.4f')

In [158]:
df_submit_read = pd.read_csv('submissionV5.csv')

In [159]:
df_submit_read.head()

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0


In [160]:
!kaggle competitions submit -c liverpool-ion-switching -f submissionV5.csv -m "index false, 3fig, logreg ordinal"

100%|██████████████████████████████████████| 21.0M/21.0M [00:00<00:00, 33.7MB/s]
Successfully submitted to University of Liverpool - Ion Switching