In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np
from timeit import default_timer as timer
import scipy.io as sio
from scipy.special import comb

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from scipy.stats import expon
from sklearn.model_selection import RandomizedSearchCV


import ensemble_learning as el

# Functions

In [2]:
def _best_onegram_bins(group, classe, dim, kf, minb, maxbh):
    """Return the bin count in ``range(minb, maxbh)`` with the best median CV score.

    For every candidate bin count, each series in ``group`` is summarised by the
    density-normalised 1-D histogram of its column at position ``dim``; those
    histograms are then scored with a cross-validated 1-nearest-neighbour
    classifier. Ties are resolved in favour of the smallest bin count because
    the comparison is strict.
    """
    best_bins = minb
    best_score = -1.0  # below any accuracy, so the first candidate is always recorded
    for nb_bins in range(minb, maxbh):
        hist = group.apply(lambda x: np.histogram(
            x.iloc[:, dim].values, bins=nb_bins, density=True)[0].flatten().tolist())
        hist_tr = pd.DataFrame(hist.values.tolist())

        # No explicit fit beforehand: cross_val_score fits its own copies per fold.
        scores = cross_val_score(KNeighborsClassifier(1), hist_tr, classe, cv=kf)

        median_score = np.median(scores)
        if best_score < median_score:
            best_score = median_score
            best_bins = nb_bins
    return best_bins


def onegramsTransform(data, dim1, dim2, minb=2, maxbh=30, cv=5):
    """Select the 1-D histogram bin counts for two columns of a set of series.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Series' column (series identifier used for grouping)
        and a 'classe' column (the label; the first value of each series is
        used). The frame passed in is not modified.
    dim1, dim2 : int
        Positional (``iloc``) indices of the two columns to histogram.
    minb, maxbh : int
        Candidate bin counts are ``range(minb, maxbh)`` (``maxbh`` excluded).
    cv : int
        Number of shuffled K-fold splits used for scoring.

    Returns
    -------
    (int, int)
        Best bin count for ``dim1`` and for ``dim2``, each chosen independently
        as the one maximising the median cross-validated 1-NN accuracy.

    Raises
    ------
    ValueError
        If ``range(minb, maxbh)`` is empty.
    """
    if minb >= maxbh:
        raise ValueError("onegramsTransform needs minb < maxbh, got minb=%r, maxbh=%r"
                         % (minb, maxbh))

    # Work on a NaN-free copy instead of mutating the caller's frame.
    data = data.dropna()
    group = data.groupby('Series')
    # One label per series: the first 'classe' value of each group.
    classe = group['classe'].apply(lambda x: x.iloc[0]).reset_index(drop=True)
    # NOTE(review): no random_state is set, so the shuffled folds presumably differ
    # between candidate bin counts and between runs — confirm this is intended.
    kf = KFold(n_splits=cv, shuffle=True)

    ubh1 = _best_onegram_bins(group, classe, dim1, kf, minb, maxbh)
    ubh2 = _best_onegram_bins(group, classe, dim2, kf, minb, maxbh)
    return ubh1, ubh2

In [3]:
def bigramsTransform(data, dim1, dim2, minb=2, maxb=11, cv=5):
    """Select the 2-D histogram bin counts for a pair of columns of a set of series.

    Every ``(b1, b2)`` pair with both values in ``range(minb, maxb)`` is tried:
    each series is summarised by the flattened, density-normalised 2-D histogram
    of its columns at positions ``dim1`` and ``dim2``, and the histograms are
    scored with a cross-validated 1-nearest-neighbour classifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Series' column (series identifier used for grouping)
        and a 'classe' column (the label; the first value of each series is
        used). The frame passed in is not modified.
    dim1, dim2 : int
        Positional (``iloc``) indices of the two columns to histogram jointly.
    minb, maxb : int
        Candidate bin counts per axis are ``range(minb, maxb)`` (``maxb`` excluded).
    cv : int
        Number of shuffled K-fold splits used for scoring.

    Returns
    -------
    (int, int)
        The ``(b1, b2)`` pair with the highest median cross-validated accuracy.
        Ties keep the first pair encountered because the comparison is strict.

    Raises
    ------
    ValueError
        If ``range(minb, maxb)`` is empty.
    """
    if minb >= maxb:
        raise ValueError("bigramsTransform needs minb < maxb, got minb=%r, maxb=%r"
                         % (minb, maxb))

    # Work on a NaN-free copy instead of mutating the caller's frame.
    data = data.dropna()
    group = data.groupby('Series')
    # One label per series: the first 'classe' value of each group.
    classe = group['classe'].apply(lambda x: x.iloc[0]).reset_index(drop=True)
    # NOTE(review): no random_state is set, so the shuffled folds presumably differ
    # between candidate bin counts and between runs — confirm this is intended.
    kf = KFold(n_splits=cv, shuffle=True)

    best_score = -1.0  # below any accuracy, so the first candidate is always recorded
    ub1 = ub2 = minb
    for b1 in range(minb, maxb):
        for b2 in range(minb, maxb):
            hist = group.apply(lambda x: np.histogram2d(
                x.iloc[:, dim1].values, x.iloc[:, dim2].values,
                bins=[b1, b2], density=True)[0].flatten().tolist())
            hist_tr = pd.DataFrame(hist.values.tolist())

            # No explicit fit beforehand: cross_val_score fits its own copies per fold.
            scores = cross_val_score(KNeighborsClassifier(1), hist_tr, classe, cv=kf)

            median_score = np.median(scores)
            if best_score < median_score:
                best_score = median_score
                ub1 = b1
                ub2 = b2
    return ub1, ub2

# Datasets

### Text

In [4]:
# Catalogue of the available datasets. Each entry is
# [path prefix relative to "MTS_Datasets/", column separator, ...]; the loading
# cell reads elements 0 and 1 of the selected entry. The trailing comment is the
# entry's index in the list.
dataset = [
    ["ArabicDigits/ArabicDigits", " "],  # 0
    ["AUSLAN/AUSLAN", " ", "\t"],  # 1
    ["CharacterTrajectories/CharacterTrajectories", "\t"],  # 2
    ["CMUsubject16/CMUsubject16", "\t"],  # 3
    ["ECG/ECG", " "],  # 4
    ["Libras/Libras", " "],  # 5
    ["PenDigits/PenDigits", "\t"],  # 6
    ["UWave/uWave", " "],  # 7
    ["RobotFailure/LP1", " "],  # 8
    ["RobotFailure/LP2", " "],  # 9
    ["RobotFailure/LP3", " "],  # 10
    ["RobotFailure/LP4", " "],  # 11
    ["RobotFailure/LP5", " "],  # 12
    ["Wafer/Wafer", " "],  # 13
    ["JapaneseVowels/JapaneseVowels", " "],  # 14
    ["ArticularyWordRecognition/ArticularyWordRecognition", " "],  # 15
    ["BasicMotions/BasicMotions", " "],  # 16
    ["Cricket/Cricket", " "],  # 17
    ["DuckDuckGeese/DuckDuckGeese", " "],  # 18
    ["EigenWorms/EigenWorms", " "],  # 19
    ["FingerMovements/FingerMovements", " "],  # 20
    ["HandMovementDirection/HandMovementDirection", " "],  # 21
    ["Heartbeat/Heartbeat", " "],  # 22
    ["LSST/LSST", " "],  # 23
    ["MotorImagery/MotorImagery", " "],  # 24
    ["NATOPS/NATOPS", " "],  # 25
    ["RacketSports/RacketSports", " "],  # 26
    ["SelfRegulationSCP1/SelfRegulationSCP1", " "],  # 27
    ["SelfRegulationSCP2/SelfRegulationSCP2", " "],  # 28
]

In [5]:
# Index of the dataset to load (see the `dataset` list).
i = 18


def _load_split(path, sep, show_head=False):
    """Read one split of a text dataset and normalise its column layout.

    The first three columns of the file are renamed to 'Series' (series
    number), 'index' (observation index within the series) and 'classe'
    (class label). The 'index' column is dropped, 'classe' and 'Series' are
    moved to the end of the frame, and all column labels are converted to
    strings. The row index is converted to strings as well.

    When ``show_head`` is true, the head of the frame is printed right after
    reading and again once the layout has been normalised.
    """
    frame = pd.read_table(path, sep=sep, header=None)
    if show_head:
        print(frame.head())

    frame = frame.rename(index=str, columns={0: "Series", 1: "index", 2: "classe"})
    # The per-series observation index is not needed by the later processing.
    frame = frame.drop(['index'], axis=1)

    # Move the class and series columns to the end of the frame.
    cl = frame.pop('classe')
    se = frame.pop('Series')
    frame['classe'] = cl
    frame['Series'] = se

    frame.columns = frame.columns.astype(str)
    if show_head:
        print(frame.head())
    return frame


train = _load_split("MTS_Datasets/" + dataset[i][0] + "_TRAIN", dataset[i][1], show_head=True)
test = _load_split("MTS_Datasets/" + dataset[i][0] + "_TEST", dataset[i][1])

   0     1     2         3         4         5         6         7     \
0     1     1     1  0.002521  0.002454  0.002432  0.002354  0.002492   
1     1     2     1  0.000277  0.000307  0.000495  0.000382  0.000322   
2     1     3     1  0.002852  0.002602  0.003119  0.002557  0.002850   
3     1     4     1  0.001380  0.001546  0.001219  0.001839  0.001307   
4     1     5     1  0.000831  0.000852  0.000850  0.001005  0.000606   

       8         9     ...     1338     1339     1340      1341     1342  \
0  0.002612  0.002352  ...  1.04540  1.55870  1.13950  0.072909  0.33849   
1  0.000446  0.000219  ...  0.31535  0.70458  0.73141  0.724850  0.89081   
2  0.002831  0.002515  ...  0.21581  0.28944  0.10059  0.265610  1.14800   
3  0.001430  0.001426  ...  0.53559  0.53612  0.12246  0.490880  0.89691   
4  0.000983  0.000567  ...  0.46779  0.58451  0.35527  0.728630  0.40299   

     1343     1344    1345     1346     1347  
0  1.5538  3.16440  4.2111   7.4078  14.6260  
1  1.7591 

### Matlab

In [315]:
# Alternative loader for a dataset stored as a Matlab file. Running this cell
# replaces the `train` and `test` frames built by the text loader.
mat = sio.loadmat('MTS_Datasets/WalkvsRun/WalkvsRun.mat')
mdata = mat['mts']
mdtype = mdata.dtype
# Unpack the structured array into a plain dict keyed by field name.
ndata = {n: mdata[n][0, 0] for n in mdtype.names}
columns = list(ndata)


def _stack_series(ndata, split):
    """Build one long frame from the series stored under ``ndata[split]``.

    Each entry of ``ndata[split][0]`` is transposed and wrapped in a DataFrame,
    tagged with its label from ``ndata[split + 'labels']`` in a 'classe'
    column and with its position in a 'Series' column. All pieces are
    concatenated once and re-indexed from 0. An empty split yields an empty
    DataFrame.
    """
    labels = ndata[split + 'labels'][0]
    pieces = []
    for idx, values in enumerate(ndata[split][0]):
        series = pd.DataFrame(np.transpose(values))
        series['classe'] = labels[idx]
        series['Series'] = idx
        pieces.append(series)
    if not pieces:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame at every iteration.
    return pd.concat(pieces).reset_index(drop=True)


train = _stack_series(ndata, 'train')
test = _stack_series(ndata, 'test')

In [6]:
# Print the current `train` frame for a visual check.
print(train)

              3         4         5         6         7         8         9  \
0      0.002521  0.002454  0.002432  0.002354  0.002492  0.002612  0.002352   
1      0.000277  0.000307  0.000495  0.000382  0.000322  0.000446  0.000219   
2      0.002852  0.002602  0.003119  0.002557  0.002850  0.002831  0.002515   
3      0.001380  0.001546  0.001219  0.001839  0.001307  0.001430  0.001426   
4      0.000831  0.000852  0.000850  0.001005  0.000606  0.000983  0.000567   
...         ...       ...       ...       ...       ...       ...       ...   
13495  0.000342  0.000082  0.000181  0.000261  0.000176  0.000246  0.000436   
13496  0.000188  0.000185  0.000212  0.000142  0.000195  0.000389  0.000488   
13497  0.000208  0.000223  0.000062  0.000230  0.000119  0.000325  0.000398   
13498  0.000196  0.000325  0.000297  0.000312  0.000135  0.000195  0.000174   
13499  0.000609  0.000350  0.000108  0.000443  0.000163  0.000256  0.000509   

             10        11        12  ...      1340 

# Preprocessing

In [7]:
# Détermination du nbre d'instances ou observations composant (les dimensions des) les STM
nb_col = len(train.drop(['classe','Series'],axis=1).columns)
print(nb_col)
print(train.head())

1345
          3         4         5         6         7         8         9  \
0  0.002521  0.002454  0.002432  0.002354  0.002492  0.002612  0.002352   
1  0.000277  0.000307  0.000495  0.000382  0.000322  0.000446  0.000219   
2  0.002852  0.002602  0.003119  0.002557  0.002850  0.002831  0.002515   
3  0.001380  0.001546  0.001219  0.001839  0.001307  0.001430  0.001426   
4  0.000831  0.000852  0.000850  0.001005  0.000606  0.000983  0.000567   

         10        11        12  ...     1340      1341     1342    1343  \
0  0.002613  0.002020  0.002725  ...  1.13950  0.072909  0.33849  1.5538   
1  0.000524  0.000574  0.000393  ...  0.73141  0.724850  0.89081  1.7591   
2  0.003014  0.002938  0.002510  ...  0.10059  0.265610  1.14800  1.6512   
3  0.001719  0.001501  0.001540  ...  0.12246  0.490880  0.89691  1.9294   
4  0.001123  0.000669  0.001115  ...  0.35527  0.728630  0.40299  1.1850   

      1344    1345     1346     1347  classe  Series  
0  3.16440  4.2111   7.4078  14.

## Normalisation

In [8]:
def _min_max_scale(col):
    """Rescale one column of one series with (x - min) / (max - min)."""
    lowest = col.min()
    highest = col.max()
    return (col - lowest) / (highest - lowest)


# Scale every feature column independently within each series, for both splits.
for _frame in (train, test):
    _scaled = _frame.groupby('Series').transform(_min_max_scale)
    _frame.iloc[:, 0:nb_col] = _scaled.iloc[:, 0:nb_col]

In [9]:
# Show the first rows of `train` to check the per-series scaling.
print(train.head())

          3         4         5         6         7         8         9  \
0  0.717889  0.660827  0.703812  0.655920  0.659297  0.717589  0.681193   
1  0.067266  0.073091  0.135777  0.093295  0.081247  0.121387  0.057327   
2  0.813859  0.701341  0.905279  0.713837  0.754662  0.777870  0.728868   
3  0.387069  0.412264  0.348094  0.508987  0.343633  0.392238  0.410354   
4  0.227892  0.222283  0.239883  0.271041  0.156899  0.269199  0.159111   

         10        11        12  ...      1340      1341      1342      1343  \
0  0.757726  0.521977  0.788794  ...  0.627273  0.000000  0.125797  0.424176   
1  0.148688  0.145904  0.104723  ...  0.392069  0.340441  0.339461  0.481302   
2  0.874636  0.760728  0.725726  ...  0.028494  0.100628  0.438954  0.451278   
3  0.497085  0.386996  0.441185  ...  0.041099  0.218263  0.341821  0.528689   
4  0.323324  0.170611  0.316515  ...  0.175279  0.342415  0.150749  0.321555   

       1344      1345      1346      1347  classe  Series  
0  0.558

In [10]:
# Flag the columns of `train` that contain only missing values; the flags are
# kept in `mask` because the test split is filtered with them afterwards.
mask = train.isna().all()

# Remove the all-NaN columns, fill the remaining gaps by interpolation and
# finally drop any row that still holds a missing value.
train = (
    train
    .dropna(axis=1, how='all')
    .interpolate()
    .dropna()
)

In [11]:
# Keep in `test` only the columns that were not entirely NaN in `train`.
# The explicit copy makes the in-place calls below act on an independent frame.
test = test[test.columns[~mask]].copy()
# NOTE(review): this may remove further columns that are all-NaN in `test` only,
# which would shift positional column indices relative to `train` — confirm.
test.dropna(axis=1,how='all',inplace=True)
test.interpolate(inplace=True)
# Drop the rows of `test` that still hold missing values (this line previously
# called train.dropna, which left `test` uncleaned).
test.dropna(inplace=True)

In [12]:
# Show the first 271 rows of `train` to check the missing-value handling.
print(train.head(271))

            3         4         5         6         7         8         9  \
0    0.717889  0.660827  0.703812  0.655920  0.659297  0.717589  0.681193   
1    0.067266  0.073091  0.135777  0.093295  0.081247  0.121387  0.057327   
2    0.813859  0.701341  0.905279  0.713837  0.754662  0.777870  0.728868   
3    0.387069  0.412264  0.348094  0.508987  0.343633  0.392238  0.410354   
4    0.227892  0.222283  0.239883  0.271041  0.156899  0.269199  0.159111   
..        ...       ...       ...       ...       ...       ...       ...   
266  0.491157  0.568848  0.528739  0.555207  0.537826  0.559593  0.585551   
267  0.257466  0.145360  0.196774  0.247361  0.152371  0.175888  0.228722   
268  0.547405  0.599507  0.741935  0.603709  0.615077  0.615469  0.605148   
269  0.084372  0.128661  0.300293  0.126961  0.202717  0.202037  0.130155   
270  0.049421  0.095298  0.086979  0.069651  0.089910  0.079462  0.054711   

           10        11        12  ...      1340      1341      1342  \
0  

## Correlation-based dimension selection and M-histogram learning

In [13]:
# Start the timer; the elapsed time is printed at the end of this cell.
start = timer()

# Removal of the correlated dimensions

# Positional indices (used with .iloc below) of the feature columns to keep.
# NOTE(review): presumably the result of a correlation analysis run elsewhere — confirm.
to_keep = np.array([7, 9, 12, 33, 35, 39, 40, 43, 47, 50, 51, 52, 55, 84, 86, 88, 114, 115, 116, 117, 118, 119, 144, 145, 146, 147, 148, 149, 177, 178, 179, 205, 206, 207, 208, 209, 235, 236, 237, 238, 239, 263, 264, 265, 266, 267, 268, 269, 292, 293, 294, 295, 296, 297, 298, 299, 326, 327, 328, 329, 358, 359, 388, 389, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 476, 477, 478, 479, 507, 508, 509, 535, 536, 537, 538, 539, 564, 565, 566, 567, 568, 569, 595, 596, 597, 598, 599, 622, 623, 624, 625, 626, 627, 628, 629, 651, 652, 653, 654, 655, 656, 657, 658, 659, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 803, 804, 805, 806, 807, 808, 809, 833, 834, 835, 836, 837, 838, 839, 866, 867, 868, 869, 894, 895, 896, 897, 898, 899, 926, 927, 928, 929, 953, 954, 955, 956, 957, 958, 959, 988, 989, 1017, 1018, 1019, 1047, 1048, 1049, 1078, 1079, 1107, 1108, 1109, 1288, 1289, 1319, 1338, 1339, 1340, 1341, 1342, 1343, 1344])

# Keep the selected feature columns, followed by the 'classe' and 'Series' columns.
train = pd.concat([train.iloc[:, to_keep], train[["classe", "Series"]]], axis=1)
test = pd.concat([test.iloc[:, to_keep], test[["classe","Series"]]], axis=1)

print(train)

# Recount the feature columns of the reduced frame.
nb_col = len(train.drop(['classe','Series'],axis=1).columns)
print(nb_col)

def _append_derived_views(frame, nb_col):
    """Return ``frame`` extended with per-series derivative and cumulative-sum views.

    The first ``nb_col`` columns of ``frame`` are the features. The result holds
    three consecutive groups of ``nb_col`` columns — the features, their
    within-series first difference, and their within-series cumulative sum —
    labelled 0 .. 3*nb_col - 1, followed by the 'classe' and 'Series' columns.
    Rows containing NaN (for example the first row of each series, whose
    difference is undefined) are dropped.
    """
    by_series = frame.groupby('Series')
    deriv = by_series.diff().iloc[:, 0:nb_col]
    cumsum = by_series.cumsum().iloc[:, 0:nb_col]

    views = pd.concat([frame.iloc[:, 0:nb_col], deriv, cumsum], axis=1)
    # Consecutive integer labels; the three groups are addressed by position later on.
    views.columns = np.arange(3 * nb_col)

    extended = pd.concat([views, frame[['classe', 'Series']]], axis=1)
    return extended.dropna()


# Derivative and cumulative sum, built the same way for both splits so that
# train and test share the same column layout and labels.
train = _append_derived_views(train, nb_col)
test = _append_derived_views(test, nb_col)

#print(test)


####### M-histogram #########

# Settings
# Upper bound on the number of distinct draws: every unordered pair of feature
# columns, times 3 views (raw, derivative, cumulative sum), times 2 histogram kinds.
max_comb = comb(nb_col, 2) * 3 * 2
# Fraction of those combinations that is actually sampled (0.25 was also tried).
percent = 1
nb_features = int(np.round(percent * max_comb))

print("Nombre de M-histogrammes : ",nb_features)

def _hist1d_per_series(grouped, dim, bins):
    """One flattened, density-normalised 1-D histogram per series (one row each)."""
    hists = grouped.apply(lambda x: np.histogram(
        x.iloc[:, dim].values, bins=bins, density=True)[0].flatten().tolist())
    return pd.DataFrame(hists.values.tolist())


def _hist2d_per_series(grouped, dim1, dim2, bins):
    """One flattened, density-normalised 2-D histogram per series (one row each)."""
    hists = grouped.apply(lambda x: np.histogram2d(
        x.iloc[:, dim1].values, x.iloc[:, dim2].values,
        bins=bins, density=True)[0].flatten().tolist())
    return pd.DataFrame(hists.values.tolist())


# `train` and `test` are not modified below, so the groupings and the per-series
# class labels (first 'classe' value of each series, in group order) are built once.
train_group = train.groupby('Series')
test_group = test.groupby('Series')
train_classe = train_group['classe'].apply(lambda x : x.iloc[0]).reset_index(drop=True)
test_classe = test_group['classe'].apply(lambda x : x.iloc[0]).reset_index(drop=True)

# Best validation score so far and the test score of the view that achieved it.
# Starting below any accuracy guarantees eval_fi is set by the first view, even
# when its validation score is 0.
eval_int = -1.0
eval_fi = None

# Learn several independent views and keep the one with the best validation score.
for app in range(0,4):
    print("*****************************")
    print("Apprentissage: ",app)
    print("*****************************")
    train_nngrams = list()  # M-histograms of the training set, one frame per learner
    test_nngrams = list()   # M-histograms of the test set, same order
    learners = list()
    combin = list()         # drawn combinations [dim1, dim2, histogram kind]
    seen = set()            # same combinations, order-insensitive, for fast lookup

    # Draw random combinations until enough distinct ones have been processed.
    nb_drawn = 0
    while nb_drawn < nb_features and nb_drawn < max_comb:
        # Two distinct feature dimensions.
        dim1rand = np.random.randint(0,nb_col)
        dim2rand = np.random.randint(0,nb_col)
        while dim2rand == dim1rand:
            dim2rand = np.random.randint(0,nb_col)

        # View: 0 = raw, 1 = derivative, 2 = cumulative sum (column offset).
        transform = np.random.randint(0,3)
        dim1 = dim1rand + transform*nb_col
        dim2 = dim2rand + transform*nb_col

        # Histogram kind: 0 = one 2-D histogram, 1 = two 1-D histograms.
        nngrams = np.random.randint(0,2)

        # Skip a combination that was already drawn, in either order.
        key = (min(dim1, dim2), max(dim1, dim2), nngrams)
        if key in seen:
            continue
        seen.add(key)
        combin.append([dim1,dim2,nngrams])
        nb_drawn += 1

        if nngrams == 0:
            ub1,ub2 = bigramsTransform(train.copy(),dim1,dim2)

            train_nngrams.append(_hist2d_per_series(train_group, dim1, dim2, [ub1,ub2]))
            test_nngrams.append(_hist2d_per_series(test_group, dim1, dim2, [ub1,ub2]))

            learners.append(('clf',KNeighborsClassifier(1)))
        else:
            ub1h,ub2h = onegramsTransform(train.copy(),dim1,dim2)

            train_nngrams.append(_hist1d_per_series(train_group, dim1, ub1h))
            train_nngrams.append(_hist1d_per_series(train_group, dim2, ub2h))

            test_nngrams.append(_hist1d_per_series(test_group, dim1, ub1h))
            test_nngrams.append(_hist1d_per_series(test_group, dim2, ub2h))

            learners.append(('clf',KNeighborsClassifier(1)))
            learners.append(('clf',KNeighborsClassifier(1)))

    # Validation score: hold out 20% of the training series, stratified by class.
    cl = train_classe
    se = np.arange(len(train_nngrams[0]))
    # train_index / test_index are series positions; y_train / y_test their classes.
    train_index, test_index, y_train, y_test = train_test_split(se, cl,stratify=cl,test_size=0.2)

    # For every learner, the histograms of the fitting and of the held-out series.
    x_train = [h.iloc[train_index,:].reset_index(drop=True) for h in train_nngrams]
    x_test = [h.iloc[test_index,:].reset_index(drop=True) for h in train_nngrams]

    fitted_estimators, label_encoder = el.fit_multiple_estimators(learners, x_train, y_train.reset_index(drop=True))
    y_pred = el.predict_from_multiple_estimator(fitted_estimators, label_encoder, x_test)
    score_val = np.round(accuracy_score(y_test.reset_index(drop=True), y_pred),4)
    # The label says "Training" but this is the held-out validation score.
    print("SCORE Training VUE N°",app," : ",score_val )

    # Test score: refit on all training series, evaluate on the test series.
    fitted_estimators, label_encoder = el.fit_multiple_estimators(learners, train_nngrams, train_classe)
    y_pred = el.predict_from_multiple_estimator(fitted_estimators, label_encoder, test_nngrams)

    score_test = np.round(accuracy_score(test_classe, y_pred),4)
    print("SCORE TEST: ", score_test)

    # Keep the test score of the view with the best validation score so far.
    if eval_int < score_val :
        eval_int = score_val
        eval_fi = score_test

print("SCORE FINAL: ", eval_fi)

end = timer()
print(end - start)

             10        12        15        36        38        42        43  \
0      0.757726  0.788794  0.710935  0.712296  0.704155  0.614065  0.794956   
1      0.148688  0.104723  0.105541  0.121442  0.003436  0.045169  0.067982   
2      0.874636  0.725726  0.761360  0.944579  0.892221  0.803888  0.712719   
3      0.497085  0.441185  0.379947  0.460024  0.407998  0.378788  0.415022   
4      0.323324  0.316515  0.392554  0.288007  0.278975  0.338765  0.144737   
...         ...       ...       ...       ...       ...       ...       ...   
13495  0.016534  0.005645  0.042290  0.014264  0.016466  0.010814  0.012917   
13496  0.027651  0.022370  0.038753  0.024139  0.016103  0.016686  0.008661   
13497  0.028791  0.011996  0.019377  0.010826  0.029015  0.019335  0.011743   
13498  0.039054  0.008539  0.011641  0.015434  0.019803  0.020123  0.027009   
13499  0.035633  0.025616  0.008546  0.022237  0.019077  0.007018  0.011450   

             46        50        53  ...      1322 

KeyboardInterrupt: 