In [1]:
%matplotlib inline

In [2]:
from helper.common_imports import *
from helper import processing_new as ps

# 1. load the dataset

In [3]:
# Location of the dataset folder; the table's first column is used as the row index.
datafolder = './dataset/'
dataset_path = datafolder + 'dataset.pd'
df = pd.read_csv(dataset_path, index_col=0)

# 2. extract the ROI
The previous notebook has shown how the ROI could be identified. Here, the ROI is computed for all samples in the dataset. 

In [4]:
%%time
from multiprocessing import Pool
# Run `ps.extract_feature` on every `(index, row)` pair of the dataframe using
# six worker processes. `pool.map` blocks until all results are available, so
# leaving the `with` block (which terminates the workers) cannot lose results,
# and re-running this cell no longer accumulates idle worker processes.
with Pool(processes=6) as pool:
    Xc = pool.map(ps.extract_feature, df.iterrows())

classification 0:01:12.504578 for 900 frames
classification 0:01:12.914846 for 900 frames
classification 0:01:13.220621 for 900 frames
classification 0:01:13.275517 for 900 frames
classification 0:01:13.343742 for 900 frames
classification 0:01:13.475782 for 900 frames
classification 0:01:13.414863 for 900 frames
classification 0:01:13.398241 for 900 frames
classification 0:01:13.440922 for 900 frames
classification 0:01:13.414812 for 900 frames
classification 0:01:13.904753 for 900 frames
classification 0:01:13.663842 for 900 frames
CPU times: user 150 ms, sys: 76.6 ms, total: 227 ms
Wall time: 2min 27s


In [5]:
# Stack the per-sample results returned by the pool into a single ndarray.
Xc = np.asarray(Xc)

In [6]:
# Display the dimensions of the stacked feature array.
np.shape(Xc)

(12, 900, 5, 5)

Now each frame is represented by a `5x5` feature array (see the shape above: 12 samples, 900 frames per sample).

# 3. test sklearn classifiers
Before going into more feature extraction or more complex models, it is interesting to test out the default classifiers offered by a package such as scikit-learn and assess their performance out of the box.

Cross-validation folds: the splits that make sense here are across views (the `surface` column, S3 and S4) and across fingers, holding one group out for testing each time.

In [7]:
# Boolean row masks: rows whose `surface` column equals each surface label ...
S3 = df['surface'].eq('S3')
S4 = df['surface'].eq('S4')

# ... and rows whose `finger` column equals each finger name.
index = df['finger'].eq('index')
thumb = df['finger'].eq('thumb')
pinky = df['finger'].eq('pinky')

def df_id(condition):
    """Return the positions of the rows selected by a boolean mask.

    The result is used to index the feature array and the label array along
    their first axis, which are ordered like the dataframe rows. Positions
    (rather than index labels) are therefore returned, so the folds stay
    correct even when the dataframe index is not ``0..n-1``.

    Parameters
    ----------
    condition : array-like of bool
        One flag per dataframe row (e.g. ``df['surface'] == 'S3'``).

    Returns
    -------
    numpy.ndarray
        Sorted integer positions where ``condition`` is true.
    """
    return np.flatnonzero(np.asarray(condition, dtype=bool))

# Each fold is a (train ids, test ids) pair.
# First the two surface splits (train on one surface, test on the other) ...
kfolds = [
    (df_id(S3), df_id(S4)),
    (df_id(S4), df_id(S3)),
] + [
    # ... then one split per finger: train on two fingers, test on the third.
    (np.hstack([df_id(first), df_id(second)]), df_id(held_out))
    for first, second, held_out in (
        (index, thumb, pinky),
        (index, pinky, thumb),
        (pinky, thumb, index),
    )
]

In [8]:
# Print the train and test sample ids of every fold as a sanity check.
for train, test in kfolds:
    fold_ids = (train, test)
    print("%s %s" % fold_ids)

[0 1 2 3 4 5] [ 6  7  8  9 10 11]
[ 6  7  8  9 10 11] [0 1 2 3 4 5]
[0 1 6 7 2 3 8 9] [ 4  5 10 11]
[ 0  1  6  7  4  5 10 11] [2 3 8 9]
[ 4  5 10 11  2  3  8  9] [0 1 6 7]


Create a shareable version of the dataset, stored in a shared-memory array that other processes can read.

In [9]:
# Flatten the per-frame feature array into one feature vector per frame while
# keeping the two leading axes. The leading dimensions are taken from the array
# itself instead of being hard-coded, so the cell works for any dataset size.
Xc = Xc.reshape(Xc.shape[:2] + (-1,))

In [10]:
from multiprocessing import Array

In [11]:
import ctypes

In [12]:
# Copy the flattened dataset into a shared array of C doubles.
flat_values = Xc.ravel()
Xc_shared = Array(ctypes.c_double, flat_values)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
# Keep a reference to the estimator class itself (not an instance); it is
# called later as `clf()` to build a new classifier.
clf = KNeighborsClassifier

In [22]:
# Fit a freshly constructed `clf` on every train/test fold and print its score.

# Loop-invariant setup, hoisted out of the fold loop.
# View the shared buffer as a (12, 900, 25) array without copying it.
Xc = np.frombuffer(Xc_shared.get_obj()).reshape((12,900,25))
# Labels: 900 ones followed by 900 zeros, repeated 6 times, i.e. rows 0, 2, 4, ...
# are labelled 1 and rows 1, 3, 5, ... are labelled 0.
y = np.tile(np.r_[np.ones(900), np.zeros(900)], 6).reshape(12, 900)

for train_id, test_id in kfolds:

    print(train_id, test_id)

    # Stack the frames of the selected samples along the first axis.
    X_train = np.concatenate([Xc[i] for i in train_id])
    X_test  = np.concatenate([Xc[i] for i in test_id])

    y_train = y[train_id].reshape(-1)
    y_test  = y[test_id].reshape(-1)

    # New estimator for every fold, so no fitted state carries over between folds.
    clf_inst = clf()

    clf_inst.fit(X_train, y_train)
    score = clf_inst.score(X_test, y_test)
    print(score)

[0 1 2 3 4 5] [ 6  7  8  9 10 11]
0.7446296296296296
[ 6  7  8  9 10 11] [0 1 2 3 4 5]
0.7609259259259259
[0 1 6 7 2 3 8 9] [ 4  5 10 11]
0.7758333333333334
[ 0  1  6  7  4  5 10 11] [2 3 8 9]
0.8208333333333333
[ 4  5 10 11  2  3  8  9] [0 1 6 7]
0.8163888888888889


In [None]:
import multiprocessing as mp

In [None]:
class ScoreClassifier(mp.Process):
    """Process that fits and scores a classifier on each train/test fold.

    Parameters
    ----------
    clf : callable
        Zero-argument factory (e.g. an estimator class). It is called once per
        fold and the result must provide ``fit(X, y)`` and ``score(X, y)``.
    kfolds_ids : iterable of (train_id, test_id)
        Sample positions (first axis of the dataset) used for training and
        for testing in each fold.
    Xc_shared : multiprocessing.Array
        Flat shared array of doubles holding the dataset; it is read through
        ``get_obj()``.
    shape : tuple of int, optional
        ``(n_samples, n_frames, n_features)`` layout of ``Xc_shared``.
        Defaults to the previously hard-coded ``(12, 900, 25)``.
    result_queue : object with a ``put`` method, optional
        When given (e.g. a ``multiprocessing.Queue``), one
        ``(train_id, test_id, score)`` tuple is put on it per fold. This is the
        way to get the scores back to the parent, because the return value of
        ``run()`` is not propagated when the process is launched with
        ``start()``.
    """

    def __init__(self, clf, kfolds_ids, Xc_shared, shape=(12, 900, 25),
                 result_queue=None):
        super(ScoreClassifier, self).__init__()
        self.clf = clf
        self.kfolds_ids = kfolds_ids
        self.Xc_shared = Xc_shared
        self.shape = tuple(shape)
        self.result_queue = result_queue

    def run(self):
        """Score the classifier on every fold, printing each fold and score.

        Returns
        -------
        float or None
            Score of the last fold, or ``None`` when there are no folds.
        """
        print('running')

        n_samples, n_frames = self.shape[:2]
        # View the shared buffer as an ndarray without copying it.
        Xc = np.frombuffer(self.Xc_shared.get_obj()).reshape(self.shape)
        # Samples alternate between the two classes: even rows are labelled 1,
        # odd rows 0, and every frame of a sample shares the sample's label.
        sample_labels = (np.arange(n_samples) % 2 == 0).astype(float)
        y = np.repeat(sample_labels[:, np.newaxis], n_frames, axis=1)

        # Defined up front so an empty fold list returns None instead of raising.
        score = None
        for train_id, test_id in self.kfolds_ids:

            print(train_id, test_id)

            # Stack the frames of the selected samples along the first axis.
            X_train = np.concatenate([Xc[i] for i in train_id])
            X_test = np.concatenate([Xc[i] for i in test_id])

            y_train = y[train_id].reshape(-1)
            y_test = y[test_id].reshape(-1)

            # New estimator per fold so fitted state never leaks between folds.
            clf = self.clf()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            print(score)

            if self.result_queue is not None:
                self.result_queue.put((train_id, test_id, score))

        return score

In [13]:
# Sanity check: read the shared buffer back and display it in the dataset's shape.
shared_view = np.frombuffer(Xc_shared.get_obj())
shared_view.reshape(Xc.shape)

array([[[ 1.45480198,  1.46780645,  0.95341042, ..., -1.60935571,
         -1.49075375, -1.40978487],
        [ 1.45260533,  1.40307893,  0.92138561, ..., -1.6476474 ,
         -1.51474007, -1.40377799],
        [ 1.46537587,  1.4153678 ,  1.03190732, ..., -1.5942429 ,
         -1.48762982, -1.3865475 ],
        ...,
        [ 1.44551325,  1.42087359,  1.31944732, ..., -1.44454238,
         -1.38381748, -1.38501066],
        [ 0.03709646,  0.00346619, -1.58867387, ...,  0.8840535 ,
         -1.45865097, -1.37945944],
        [ 1.45593412,  1.44655805,  0.02102594, ..., -1.07301587,
         -0.03060029, -1.37566796]],

       [[ 1.41365544,  1.43468173,  0.26455169, ..., -1.54820554,
         -1.53223902, -1.40167258],
        [ 1.43259633,  1.44485926,  0.05720058, ..., -1.52857548,
         -1.55663196, -1.4140837 ],
        [ 1.44470852,  1.44376069, -0.20317459, ..., -1.44562841,
         -1.48770089, -1.39017511],
        ...,
        [ 1.44883849,  1.46079217,  1.23283655, ..., -

In [8]:
# Per-frame labels, one row of 900 per sample: rows 0, 2, 4, ... are all ones
# and rows 1, 3, 5, ... are all zeros.
y = np.tile(np.repeat([1.0, 0.0], 900), 6).reshape(12, 900)

In [9]:
# Hold out samples 0-5 for testing and train on samples 6-11.
test_id = list(range(6))
train_id = list(range(6, 12))

# Stack the frames of the selected samples along the first axis.
X_train = np.concatenate([Xc[k] for k in train_id])
X_test = np.concatenate([Xc[k] for k in test_id])

# Matching labels, flattened to one label per frame.
y_train = y[train_id].ravel()
y_test = y[test_id].ravel()

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [11]:
# Display names and the matching estimator instances, in the same order.
# The estimators that draw random numbers get a fixed seed so the scores are
# the same on every run of the notebook.
RANDOM_STATE = 0

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

# The two lists are consumed pairwise with zip(), which would silently drop
# entries if their lengths ever diverged.
assert len(names) == len(classifiers), "names and classifiers must stay in sync"

In [12]:
%%time
# Fit every classifier on the training split and report its score on the test split.
for name, clf in zip(names, classifiers):
    fitted = clf.fit(X_train, y_train)  # scikit-learn's fit() returns the estimator itself
    score = fitted.score(X_test, y_test)
    print(name, score)

Nearest Neighbors 0.7605555555555555
Linear SVM 0.6409259259259259
RBF SVM 0.7811111111111111
Gaussian Process 0.7498148148148148
Decision Tree 0.64
Random Forest 0.7374074074074074
Neural Net 0.7566666666666667
AdaBoost 0.5564814814814815
Naive Bayes 0.6861111111111111
CPU times: user 18.6 s, sys: 323 ms, total: 19 s
Wall time: 8.86 s




In [55]:
import multiprocessing as mp

In [76]:
class ScoreClassifier(mp.Process):
    """Process that fits and scores a classifier on each train/test fold.

    Parameters
    ----------
    clf : callable
        Zero-argument factory (e.g. an estimator class). It is called once per
        fold and the result must provide ``fit(X, y)`` and ``score(X, y)``.
    kfolds_ids : iterable of (train_id, test_id)
        Sample positions (first axis of the dataset) used for training and
        for testing in each fold.
    Xc_shared : multiprocessing.Array
        Flat shared array of doubles holding the dataset; it is read through
        ``get_obj()``.
    shape : tuple of int, optional
        ``(n_samples, n_frames, n_features)`` layout of ``Xc_shared``.
        Defaults to the previously hard-coded ``(12, 900, 25)``.
    result_queue : object with a ``put`` method, optional
        When given (e.g. a ``multiprocessing.Queue``), one
        ``(train_id, test_id, score)`` tuple is put on it per fold. This is the
        way to get the scores back to the parent, because the return value of
        ``run()`` is not propagated when the process is launched with
        ``start()``.
    """

    def __init__(self, clf, kfolds_ids, Xc_shared, shape=(12, 900, 25),
                 result_queue=None):
        super(ScoreClassifier, self).__init__()
        self.clf = clf
        self.kfolds_ids = kfolds_ids
        self.Xc_shared = Xc_shared
        self.shape = tuple(shape)
        self.result_queue = result_queue

    def run(self):
        """Score the classifier on every fold, printing each fold and score.

        Returns
        -------
        float or None
            Score of the last fold, or ``None`` when there are no folds.
        """
        print('running')

        n_samples, n_frames = self.shape[:2]
        # View the shared buffer as an ndarray without copying it.
        Xc = np.frombuffer(self.Xc_shared.get_obj()).reshape(self.shape)
        # Samples alternate between the two classes: even rows are labelled 1,
        # odd rows 0, and every frame of a sample shares the sample's label.
        sample_labels = (np.arange(n_samples) % 2 == 0).astype(float)
        y = np.repeat(sample_labels[:, np.newaxis], n_frames, axis=1)

        # Defined up front so an empty fold list returns None instead of raising.
        score = None
        for train_id, test_id in self.kfolds_ids:

            print(train_id, test_id)

            # Stack the frames of the selected samples along the first axis.
            X_train = np.concatenate([Xc[i] for i in train_id])
            X_test = np.concatenate([Xc[i] for i in test_id])

            y_train = y[train_id].reshape(-1)
            y_test = y[test_id].reshape(-1)

            # New estimator per fold so fitted state never leaks between folds.
            clf = self.clf()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            print(score)

            if self.result_queue is not None:
                self.result_queue.put((train_id, test_id, score))

        return score

In [78]:
# Build the scoring process: k-nearest-neighbours evaluated on every fold of the shared dataset.
SC = ScoreClassifier(
    clf=KNeighborsClassifier,
    kfolds_ids=kfolds,
    Xc_shared=Xc_shared,
)

In [79]:
# Launch the scoring in a child process: `start()` arranges for `run()` to be
# invoked there and returns without waiting for it to finish.
# NOTE(review): the process is never joined in the visible cells — consider
# calling `SC.join()` once the scores are needed.
SC.start()