In [1]:
%matplotlib inline
import os
import time
import numpy as np
import pandas as pd
import pyabf
import matplotlib.pyplot as plt
from datetime import datetime as dt
from eventsSegments import segment2
from utils import centering, rolling_window
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from tqdm import tnrange, tqdm_notebook

from keras.optimizers import Adagrad
from tslearn.preprocessing import TimeSeriesScalerMinMax, TimeSeriesScalerMeanVariance
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

Using TensorFlow backend.


In [2]:
# Read the event table stored under the "abf" key of the HDF5 file and
# preview its first rows.
dataset = pd.read_hdf("data/nosub250kHz.h5", "abf")
dataset.head(n=5)

Unnamed: 0,start,stop,width,filename,analytes,raw,segment,level
0,49275,50541,1246,15o15012.abf,AA3,"[47.885254, 48.495605, 48.19043, 49.105957, 49...","[3.75836443901062, 1.9273097515106201, 2.23248...",23.1045
1,378486,379789,1283,15o15012.abf,AA3,"[45.443848, 45.443848, 46.0542, 45.13867, 44.5...","[1.1405439376831055, 0.5301923751831055, -0.08...",22.111444
2,1217885,1219160,1255,15o15012.abf,AA3,"[45.13867, 46.359375, 46.359375, 46.0542, 45.7...","[7.2821502685546875, 7.2821502685546875, 4.840...",21.785723
3,1519286,1520474,1168,15o15012.abf,AA3,"[46.0542, 46.359375, 45.749023, 45.443848, 46....","[4.288922309875488, 3.678570508956909, 3.06821...",21.858732
4,2045561,2046829,1248,15o15012.abf,AA3,"[49.105957, 48.80078, 49.71631, 49.105957, 49....","[7.265286445617676, 8.180813789367676, 6.04458...",21.747799


In [3]:
# Integer class code for each analyte annotation found in `dataset.analytes`.
# The mixture annotation "AA3+GA3" is mapped to -1.
analytes2label = {
    "AA3": 1,
    "GA3": 0,
    "AA3+GA3": -1
}

# Split by recording file: two files are used for training, two others for testing.
train_files = ["15o15012.abf", "15o18003.abf"]
test_files = ["15o15023.abf", "15o18017.abf"]

train_df = dataset.loc[dataset.filename.isin(train_files), :]
test_df = dataset.loc[dataset.filename.isin(test_files), :]

# Stack the per-event segments and reshape them to (n_samples, 1300, 1).
train_x = np.array(train_df.segment.tolist()).reshape((-1, 1300, 1))
test_x = np.array(test_df.segment.tolist()).reshape((-1, 1300, 1))

# Explicit dictionary lookup so an unknown analyte name raises KeyError
# instead of silently producing a missing label.
train_label = train_df.analytes.map(lambda x : analytes2label[x])
test_label = test_df.analytes.map(lambda x : analytes2label[x])

print("Number of train samples: ", train_x.shape[0])
print("Number of test samples: ", test_x.shape[0])
# Count each class by its label code. Summing the labels would only be a valid
# count if every label were 0 or 1; a mixture sample (-1) would corrupt it.
print("AA3 count: ", (train_label == analytes2label["AA3"]).sum())
print("GA3 count: ", (train_label == analytes2label["GA3"]).sum())

Number of train samples:  278
Number of test samples:  197
AA3 count:  186
GA3 count:  92


In [4]:

# Inputs to the shapelet-size heuristic. Per the tslearn documentation, `l` is
# the base shapelet length as a fraction of the series length and `r` is the
# number of different shapelet lengths to use.
l = 0.1
r = 4
max_iter = 50
lr = 0.1
weight_regularizer = 0.01
# Fixed seed so the shuffled sample order is identical on every run.
shuffle_seed = 0

shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=train_x.shape[0],
                                                    ts_sz=train_x.shape[1],
                                                    n_classes=len(set(train_label)),
                                                    l=l,
                                                    r=r)

# Define the model using parameters provided by the authors (except that we use fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=lr),
                        weight_regularizer=weight_regularizer,
                        max_iter=max_iter,
                        verbose_level=2,
                        batch_size=32)

# Shuffle samples and labels together, reproducibly.
# NOTE(review): the model's own weight initialisation is not seeded here, so
# per-epoch loss values may still differ between runs - confirm if needed.
train_x, train_label = shuffle(train_x, train_label, random_state=shuffle_seed)

# Time the fit so the training cost is visible in the notebook output.
start_time = time.time()
shp_clf.fit(train_x, train_label)
stop_time = time.time()
print("Training time: %.1f s" % (stop_time - start_time))


Epoch 1/50
 - 2s - loss: 0.6233 - binary_accuracy: 0.6367 - binary_crossentropy: 0.6035
Epoch 2/50
 - 1s - loss: 0.4325 - binary_accuracy: 0.6835 - binary_crossentropy: 0.4124
Epoch 3/50
 - 1s - loss: 0.3866 - binary_accuracy: 0.8165 - binary_crossentropy: 0.3652
Epoch 4/50
 - 1s - loss: 0.3427 - binary_accuracy: 0.9029 - binary_crossentropy: 0.3196
Epoch 5/50
 - 1s - loss: 0.3124 - binary_accuracy: 0.9604 - binary_crossentropy: 0.2873
Epoch 6/50
 - 1s - loss: 0.2848 - binary_accuracy: 0.9496 - binary_crossentropy: 0.2576
Epoch 7/50
 - 1s - loss: 0.2613 - binary_accuracy: 0.9784 - binary_crossentropy: 0.2320
Epoch 8/50
 - 1s - loss: 0.2389 - binary_accuracy: 0.9892 - binary_crossentropy: 0.2076
Epoch 9/50
 - 1s - loss: 0.2232 - binary_accuracy: 0.9856 - binary_crossentropy: 0.1900
Epoch 10/50
 - 1s - loss: 0.2078 - binary_accuracy: 0.9856 - binary_crossentropy: 0.1728
Epoch 11/50
 - 1s - loss: 0.1962 - binary_accuracy: 0.9820 - binary_crossentropy: 0.1597
Epoch 12/50
 - 1s - loss: 0.18

In [5]:
# Score the fitted classifier on the samples it was trained on.
predicted_labels = shp_clf.predict(train_x)

# Compute accuracy, precision and recall in one pass over the metric functions.
train_accuracy, train_precision, train_recall = (
    metric(train_label, predicted_labels)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("Training accuracy:", train_accuracy)
print("Training precision:", train_precision)
print("Training recall:", train_recall)

Training accuracy: 0.9964028776978417
Training precision: 1.0
Training recall: 0.9946236559139785


In [6]:
# Score the fitted classifier on the held-out recordings.
test_plabels = shp_clf.predict(test_x)

# Compute accuracy, precision, recall and F1 in one pass over the metric functions.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(test_label, test_plabels)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Test accuracy:", test_accuracy)
print("Test precision:", test_precision)
print("Test recall:", test_recall)
print("Test F1:", test_f1)

Test accuracy: 0.883248730964467
Test precision: 1.0
Test recall: 0.8516129032258064
Test F1: 0.9198606271777003
