In [1]:
import logging
logging.basicConfig(level=logging.DEBUG)
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import utilities
import os

In [2]:
# Names of the UCR datasets used in this notebook; they are passed to
# utilities.get_ucr_dataset() in the evaluation cells further down.
# NOTE(review): "binary" presumably means two-class problems -- confirm.
datasets_binary = [
    'BeetleFly', 'BirdChicken', 'Coffee', 'Computers',
    'DistalPhalanxOutlineCorrect', 'Earthquakes', 'ECG200', 'ECGFiveDays',
    'FordA', 'FordB', 'GunPoint', 'Ham', 'HandOutlines', 'Herring',
    'ItalyPowerDemand', 'Lightning2', 'MiddlePhalanxOutlineCorrect',
    'MoteStrain', 'PhalangesOutlinesCorrect', 'ProximalPhalanxOutlineCorrect',
    'ShapeletSim', 'SonyAIBORobotSurface1', 'SonyAIBORobotSurface2',
    'Strawberry', 'ToeSegmentation1', 'ToeSegmentation2', 'TwoLeadECG',
    'Wafer', 'Wine', 'WormsTwoClass', 'Yoga', 'Chinatown', 'DodgerLoopGame',
    'DodgerLoopWeekend', 'FreezerRegularTrain', 'FreezerSmallTrain',
    'GunPointAgeSpan', 'GunPointMaleVersusFemale', 'GunPointOldVersusYoung',
    'HouseTwenty', 'PowerCons', 'SemgHandGenderCh2',
]

# Subset kept apart from the "small" list below.
# NOTE(review): presumably split off because of dataset size -- confirm.
dataset_large = [
    'FordA', 'FordB', 'HandOutlines', 'ItalyPowerDemand', 'MoteStrain',
    'PhalangesOutlinesCorrect', 'TwoLeadECG', 'Wafer', 'Yoga',
    'FreezerRegularTrain', 'FreezerSmallTrain',
]

# Every entry of datasets_binary that is not in dataset_large, in the same order.
datasets_small = [name for name in datasets_binary if name not in dataset_large]

### Implement shapelet distance

In [3]:
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics.pairwise import paired_distances
import numpy as np
import itertools
from itertools import permutations

In [4]:
def min_distance(S,T):
    """Smallest Euclidean distance between a series and any window of another.

    The shorter of the two inputs is slid along the longer one; at every
    offset the Euclidean norm of the difference between the shorter series
    and the equally long window is computed, and the minimum is returned.
    The argument order does not matter.

    Parameters
    ----------
    S, T : array-like, one-dimensional
        Numeric series. They are converted to float64, so float32 and
        integer arrays are accepted as well.

    Returns
    -------
    numpy.float64
        The minimum window distance.

    Raises
    ------
    ValueError
        If either input is not one-dimensional or is empty.
    """
    S = np.asarray(S, dtype=float)
    T = np.asarray(T, dtype=float)
    if S.ndim != 1 or T.ndim != 1:
        raise ValueError("min_distance expects one-dimensional series")
    if S.size == 0 or T.size == 0:
        raise ValueError("min_distance expects non-empty series")

    # Always slide the shorter series over the longer one.
    if len(S) > len(T):
        S, T = T, S

    w = len(S)
    return min(np.linalg.norm(T[i:i + w] - S) for i in range(len(T) - w + 1))

In [10]:
#K(T1,T2)=K_s((s11,...,s1U), (s21,...,s2V))=min_{i,j}( ||s1i - s2j|| )
# where s1i / s2j are the length-k sliding windows of T1 / T2.
def pairwise_min_shapelet(T1,T2,k=None):
    """Minimum Euclidean distance between any window of T1 and any window of T2.

    Both series are cut into all sliding windows of length ``k`` with
    ``subsequences1d``; the result is the smallest distance over every
    (window of T1, window of T2) pair. The signature matches what a callable
    ``metric`` for ``KNeighborsClassifier`` receives, with ``k`` supplied
    through ``metric_params``.

    Parameters
    ----------
    T1, T2 : numpy.ndarray, one-dimensional
        The two series to compare.
    k : int or None, optional
        Window length. ``None`` is forwarded to ``subsequences1d``, which
        then picks its own default from the series length.

    Returns
    -------
    numpy.float64
        The smallest pairwise window distance.

    Raises
    ------
    ValueError
        If either series yields no window.
    """
    windows1 = subsequences1d(T1, k)
    windows2 = subsequences1d(T2, k)
    if len(windows1) == 0 or len(windows2) == 0:
        raise ValueError("window length k is larger than one of the series")

    # cdist computes the whole U x V distance matrix in compiled code. This
    # avoids materialising every window pair and looping in Python, which
    # matters because the function runs once per pair of samples.
    return distance.cdist(windows1, windows2, metric='euclidean').min()

def subsequences(T, k=None):
    """Return every length-``k`` sliding window of each row of a 2-D array.

    Parameters
    ----------
    T : array-like of shape (m, n)
        One series per row. Converted to float64, so float32 and integer
        input is accepted.
    k : int or None, optional
        Window length. Defaults to ``int(log(n + 1)) + 1``.

    Returns
    -------
    numpy.ndarray of shape (m, n - k + 1, k)
        ``out[r, i]`` is ``T[r, i:i + k]``. The result is a strided view, so
        windows overlap in memory -- treat it as read-only and copy before
        modifying.

    Raises
    ------
    ValueError
        If ``T`` is not two-dimensional or ``k`` is outside ``1..n``.
    """
    T = np.asarray(T, dtype=float)
    if T.ndim != 2:
        raise ValueError("subsequences expects a two-dimensional array")

    m, n = T.shape
    if k is None:
        k = int(np.log(n + 1)) + 1
    if not 1 <= k <= n:
        raise ValueError("window length k must satisfy 1 <= k <= %d, got %r" % (n, k))

    n_windows = n - k + 1
    # Reuse the array's own strides so non-contiguous input is handled too:
    # stepping to the next window and to the next element within a window
    # both advance by one column.
    row_stride, col_stride = T.strides
    return np.lib.stride_tricks.as_strided(
        T, shape=(m, n_windows, k), strides=(row_stride, col_stride, col_stride))

def subsequences1d(arr, m=None):
    """Return every length-``m`` sliding window of a 1-D series.

    Parameters
    ----------
    arr : array-like, one-dimensional
        The series. Converted to float64, so float32 and integer input is
        accepted.
    m : int or None, optional
        Window length. Defaults to ``int(log(len(arr) + 1)) + 1``.

    Returns
    -------
    numpy.ndarray of shape (len(arr) - m + 1, m)
        ``out[i]`` is ``arr[i:i + m]``. The result is a strided view, so
        windows overlap in memory -- treat it as read-only and copy before
        modifying.

    Raises
    ------
    ValueError
        If ``arr`` is not one-dimensional, is empty, or ``m`` is outside
        ``1..len(arr)``.
    """
    arr = np.asarray(arr, dtype=float)
    if arr.ndim != 1:
        raise ValueError("subsequences1d expects a one-dimensional array")

    length = arr.shape[0]
    if length == 0:
        raise ValueError("subsequences1d expects a non-empty array")
    if m is None:
        m = int(np.log(length + 1)) + 1
    if not 1 <= m <= length:
        raise ValueError("window length m must satisfy 1 <= m <= %d, got %r" % (length, m))

    # Use the array's real stride rather than its itemsize: for a
    # non-contiguous view (e.g. x[::2]) the two differ, and itemsize would
    # read neighbouring memory that is not part of the series.
    step = arr.strides[0]
    return np.lib.stride_tricks.as_strided(
        arr, shape=(length - m + 1, m), strides=(step, step))
    

In [20]:
# Baseline run: fit a 1-nearest-neighbour classifier that uses
# pairwise_min_shapelet as its metric on each dataset in `datasets_small`,
# then record AUROC and AUPRC on the corresponding test split.
dataset_list, auroc_list, auprc_list = [], [], []

for dataset in datasets_small:
    X_train, y_train, X_test, y_test = utilities.get_ucr_dataset('../UCRArchive_2018/', dataset)

    clf = KNeighborsClassifier(metric=pairwise_min_shapelet, n_neighbors=1)
    clf.fit(X_train, y_train)

    # Column 1 of the predicted probabilities is the score fed to both metrics.
    y_pred = clf.predict_proba(X_test)
    auroc = roc_auc_score(y_test, y_pred[:, 1])
    auprc = average_precision_score(y_test, y_pred[:, 1])
    print(dataset, " AUROC is: ", auroc, " AUPRC is: ", auprc)

    # The three result lists stay aligned index-by-index.
    dataset_list += [dataset]
    auroc_list += [auroc]
    auprc_list += [auprc]
    
       
    

BeetleFly  AUROC is:  0.5499999999999999  AUPRC is:  0.47777777777777775


In [40]:
# One CSV row per dataset: name, AUROC, AUPRC (every field written with "%s").
np.savetxt(
    'pairwise_min_shapelet_KNN.csv',
    list(zip(dataset_list, auroc_list, auprc_list)),
    delimiter=',',
    fmt="%s",
)

In [11]:
# Grid search with the subsequence length `k` of pairwise_min_shapelet as the
# tuned hyper-parameter (passed to the metric through `metric_params`).
dataset_list, auroc_list, auprc_list, param_list = [], [], [], []
parameters = {
    'n_neighbors': [1],
    'metric_params': [{'k': k} for k in range(1, 5)],
}

# Only the first entry of `datasets_small` is processed by this loop.
for dataset in datasets_small[:1]:
    X_train, y_train, X_test, y_test = utilities.get_ucr_dataset('../UCRArchive_2018/', dataset)

    clf = GridSearchCV(KNeighborsClassifier(metric=pairwise_min_shapelet), parameters, cv=5, verbose=1)
    clf.fit(X_train, y_train)

    # Score the test split with the best estimator found by the search.
    print(clf.best_params_)
    opt_clf = clf.best_estimator_
    y_pred = opt_clf.predict_proba(X_test)

    auroc = roc_auc_score(y_test, y_pred[:, 1])
    auprc = average_precision_score(y_test, y_pred[:, 1])
    print(dataset, " AUROC is: ", auroc, " AUPRC is: ", auprc)

    # The four result lists stay aligned index-by-index.
    dataset_list += [dataset]
    auroc_list += [auroc]
    auprc_list += [auprc]
    param_list += [clf.best_params_]

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
# One CSV row per dataset: name, AUROC, AUPRC, best parameters.
# The parameter dict's text form contains commas, so the rows go through
# csv.writer, which quotes that field; writing it unquoted with a ','
# delimiter would split it into extra columns.
# NOTE(review): this is the same file name the baseline results cell writes
# to, so running both cells overwrites the baseline CSV -- confirm intended.
import csv  # imported here so the cell runs without editing the import cell

with open('pairwise_min_shapelet_KNN.csv', 'w', newline='') as results_file:
    csv.writer(results_file, lineterminator='\n').writerows(
        zip(dataset_list, auroc_list, auprc_list, param_list))