In [1]:
import numpy as np
import pandas as pd
import math
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

import matplotlib.pyplot as plt
import os

from scipy.stats import skew 


from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC
from functools import reduce

from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [57]:
# Config
directory = 'Data-sets/Dance_Data'
entries = os.listdir(directory)
lag = 7
window_sizes = [5,10,15,20,25]
feature_list = ['mean', 'std', 'skew', 'zero_crossing_counts', 'peaks_count']

dances = {}

for i,x in enumerate(entries):
    dances[i] = x[:-4]
print(dances)

{0: 'dummy_dance_1', 1: 'dummy_dance_10', 2: 'dummy_dance_2', 3: 'dummy_dance_4', 4: 'dummy_dance_5', 5: 'dummy_dance_6', 6: 'dummy_dance_7', 7: 'dummy_dance_8', 8: 'dummy_dance_9', 9: 'move_left', 10: 'move_right'}


In [3]:
def data_tagging(df,j):
    df_x = df.copy()
    df_x['tag'] = j+1
    return df_x

In [4]:
def add_sliding_window(df, window_size, overlap, shift):
    for col in df:
        df[col] = df[col].rolling(window_size,1,win_type ='triang').std()
    df = df.reshape(
        df.shape[1],
        df.shape[0],
        df.shape[2]
    )
    return df[1:]

In [5]:
def initialize(df):
    df.columns = ['init'] # dummy
    df = df.init.str.split(expand=True)
    df.columns = ['X', 'Y', 'Z']
    df = df.apply(pd.to_numeric)
    return df 

In [6]:
def plot_graphs(processed_df, j, axs):
    x = [i for i in range(len(processed_df))]
    colors = np.tile(np.array(['red', 'orange', 'green']), math.ceil(len(processed_df.columns)/3))
    for i, col in enumerate(processed_df.columns):
        axs[j, i].plot(x, processed_df[col], 'tab:'+colors[i])
        axs[j, i].set_axis_off()
        axs[j, i].axis([0, len(processed_df), min(processed_df[col]), max(processed_df[col])])
        if i == 15:
            break
    return processed_df

## Lagging

In [7]:
def lagging(df, lag):
    temps = DataFrame(df.values)
    dataframe = concat([temps, temps.shift(lag)], axis=1)
    dataframe.columns = ['x(t)', 'y(t)', 'z(t)', 'x(t+1)', 'y(t+1)', 'z(t+1)']
    return dataframe[lag+1:]

1. Mean
2. Min/Max
3. Standard deviation
4. Mean/Median Absolute
5. Deviation
6. Signal Mag. area
7. Spectral Energy
8. Interquartile range
9. Autoregression (AR) and
10. Correlation coefficients
11. Signal Entropy
12. Skewness
13. Peak Frequency

In [8]:
def make_sliding(df, O, N):
    dfs = [df.shift(-i).applymap(lambda x: [x]) for i in range(O, N+1)]
    return reduce(lambda x, y: x.add(y), dfs).to_numpy()

In [9]:
def set_windows(df, window_size):
    max_rows = window_size * math.floor(len(df)/window_size)
    df = df[:max_rows]
    return np.array(df).reshape(-1, df.shape[1], window_size)

In [10]:
def add_mean(data):
    return np.mean(data)

In [11]:
def add_std(data):
    return np.std(data)

In [12]:
def add_skew(data):
    return skew(data)

In [59]:
def add_zero_crossing_count(data):
    zero_crossings = np.where(np.diff(np.signbit(data)))[0]
    return len(zero_crossings)

In [68]:
from scipy.signal import find_peaks

def add_peaks(data):
    peaks, _ = find_peaks(data, height=0)
    # print(np.var(data[peaks]))
    return np.nan_to_num(np.var(data[peaks]))

In [55]:
def feature_extraction(slide):
    full_features = np.array([])
    axis = ['x', 'y', 'z']
    titles = np.ravel(np.array([i+'_'+j for i in feature_list for j in axis]))

    for window in slide:
        for ax in window:
            full_features = np.append(full_features, add_mean(ax))
            full_features = np.append(full_features, add_std(ax))
            full_features = np.append(full_features, add_skew(ax))
            full_features = np.append(full_features, add_zero_crossing_count(ax))
            full_features = np.append(full_features, add_peaks(ax))
    
    full_features = full_features.reshape(
        -1,
        len(titles)
    )
    full_features_df = pd.DataFrame(full_features)
    full_features_df.columns = titles
    return full_features_df

In [15]:
def svm(df):
    if isinstance(df, DataFrame):
        df = df.to_numpy()
    X, y = df[:,:-1], df[:,-1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    
    X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

    clf = TimeSeriesSVC(kernel="gak", gamma=.3)
    clf.fit(X_train[:300,], y_train[:300,])
    print("Correct classification rate:", clf.score(X_test, y_test))

    return
    # save the model to disk
    filename = 'svm_model.sav'
    pickle.dump(model, open(filename, 'wb'))

    # some time later...

    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    result = loaded_model.score(X_test, Y_test)
    print(result)
    
    return

In [73]:
# from sklearn.neighbors import KNeighborsClassifier
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def knn(df):
    if isinstance(df, DataFrame):
        df = df.to_numpy()
    X, y = df[:,:-1], df[:,-1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    pca = make_pipeline(StandardScaler(),
                        PCA(n_components=5, random_state=42))
    lda = make_pipeline(StandardScaler(),
                        LinearDiscriminantAnalysis(n_components=5))
    nca = make_pipeline(StandardScaler(),
                        NeighborhoodComponentsAnalysis(n_components=5,
                                                       random_state=42))

    knn = KNeighborsTimeSeriesClassifier()

    dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]
    for i, (name, model) in enumerate(dim_reduction_methods):
        model.fit(X_train, y_train)
        knn.fit(model.transform(X_train), y_train)
        acc_knn = knn.score(model.transform(X_test), y_test)
        print(acc_knn)

    return

    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)    
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    return

In [74]:
def butterworth():
    sos = signal.butter(10, 15, 'hp', fs=1000, output='sos')
    filtered = signal.sosfilt(sos, sig)
    return filtered

In [75]:
def run(directory):
    # fig, axs = plt.subplots(len(dances), len(feature_list)*3, figsize=(20,20))
    full_features = 0
    full_clean = 0
    for ws in window_sizes:
        for j, entry in enumerate(entries):
            df = pd.read_csv(directory + '/' + entry) 
            df = initialize(df)
            tagged_df = data_tagging(df, j)
            
            print(df.shape)
            slide = set_windows(df, ws)
            print(slide.shape)
            features = feature_extraction(slide)
            tagged_features = data_tagging(features, j)

            # slide = plot_graphs(features, j, axs)

            if isinstance(full_clean, int) or isinstance(full_features, int):
                full_features = tagged_features
                full_clean = tagged_df
            else:
                full_features = np.append(full_features, tagged_features, axis=0)
                full_clean = np.append(full_clean, tagged_df, axis=0)
        knn(full_features)
        svm(full_features)
        knn(full_clean)
        svm(full_clean)
        
    return

run(directory)

(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)
(1799, 3)
(359, 3, 5)


KeyboardInterrupt: 

## Rolling Mean

In [7]:
import pandas as pd
dataframe['y(t)_rm'] = dataframe['y(t)'].rolling(window=7).mean()
dataframe['z(t)_rm'] = dataframe['z(t)'].rolling(window=7).mean()

dataframe.head(10)

NameError: name 'dataframe' is not defined

## Expanding

In [None]:
data = pd.read_csv('Train_SU63ISt.csv')
data['Datetime'] = pd.to_datetime(data['Datetime'],format='%d-%m-%Y %H:%M')

data['expanding_mean'] = data['Count'].expanding(2).mean()
data = data[['Datetime','Count', 'expanding_mean']]
data.head(10)

## Hierarchical Clustering

In [None]:
from sktime.distances.elastic_cython import dtw_distance
          
series_list = data['dim_0'].values
for i in range(len(series_list)):
    length = len(series_list[i])
    series_list[i] = series_list[i].values.reshape((length, 1))

# Initialize distance matrix
n_series = len(series_list)
distance_matrix = np.zeros(shape=(n_series, n_series))

# Build distance matrix
for i in range(n_series):
    for j in range(n_series):
        x = series_list[i]
        y = series_list[j]
        if i != j:
            dist = dtw_distance(x, y)
            distance_matrix[i, j] = dist

In [None]:
from scipy.cluster.hierarchy import single, complete, average, ward, dendrogram

def hierarchical_clustering(dist_mat, method='complete'):
    if method == 'complete':
        Z = complete(distance_matrix)
    if method == 'single':
        Z = single(distance_matrix)
    if method == 'average':
        Z = average(distance_matrix)
    if method == 'ward':
        Z = ward(distance_matrix)
    
    fig = plt.figure(figsize=(16, 8))
    dn = dendrogram(Z)
    plt.title(f"Dendrogram for {method}-linkage with correlation distance")
    plt.show()
    
    return Z

linkage_matrix = hierarchical_clustering(distance_matrix)

To-Do List:
1. Hierarchical Clustering
2. KNN DTW
3. MLP
4. Pytorch/Tensorflow?
5. Validation methods testing
6. Hyperparameters?
7. Loss Function?
8. 

## Support Vector Machine

In [None]:
import numpy
import matplotlib.pyplot as plt

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC

from sklearn.model_selection import train_test_split

numpy.random.seed(0)
# X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
# print(X_train.shape, y_test.shape)
total = np.array([])
for j, entry in enumerate(entries):
    df = pd.read_csv(directory + '/' + entry)
    df = initialize(df)
    tagged = data_tagging(df, j)
    total = np.append(total, tagged)

reshaped = total.reshape(-1, 4)
X, y = reshaped[:,:-1], reshaped[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_test)
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

clf = TimeSeriesSVC(kernel="gak", gamma=.1)
clf.fit(X_train, y_train)
print("Correct classification rate:", clf.score(X_test, y_test))

n_classes = len(set(y_train))

plt.figure()
support_vectors = clf.support_vectors_
for i, cl in enumerate(set(y_train)):
    plt.subplot(n_classes, 1, i + 1)
    plt.title("Support vectors for class %d" % cl)
    for ts in support_vectors[i]:
        plt.plot(ts.ravel())

plt.tight_layout()
plt.show()

## KNN DTW

In [None]:
class KnnDtw(object):
    """K-nearest neighbor classifier using dynamic time warping
    as the distance measure between pairs of time series arrays
    
    Arguments
    ---------
    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for KNN
        
    max_warping_window : int, optional (default = infinity)
        Maximum warping window allowed by the DTW dynamic
        programming function
            
    subsample_step : int, optional (default = 1)
        Step size for the timeseries array. By setting subsample_step = 2,
        the timeseries length will be reduced by 50% because every second
        item is skipped. Implemented by x[:, ::subsample_step]
    """
    
    def __init__(self, n_neighbors=5, max_warping_window=10000, subsample_step=1):
        self.n_neighbors = n_neighbors
        self.max_warping_window = max_warping_window
        self.subsample_step = subsample_step
    
    def fit(self, x, l):
        """Fit the model using x as training data and l as class labels
        
        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
            Training data set for input into KNN classifer
            
        l : array of shape [n_samples]
            Training labels for input into KNN classifier
        """
        
        self.x = x
        self.l = l
        
    def _dtw_distance(self, ts_a, ts_b, d = lambda x,y: abs(x-y)):
        """Returns the DTW similarity distance between two 2-D
        timeseries numpy arrays.

        Arguments
        ---------
        ts_a, ts_b : array of shape [n_samples, n_timepoints]
            Two arrays containing n_samples of timeseries data
            whose DTW distance between each sample of A and B
            will be compared
        
        d : DistanceMetric object (default = abs(x-y))
            the distance measure used for A_i - B_j in the
            DTW dynamic programming function
        
        Returns
        -------
        DTW distance between A and B
        """

        # Create cost matrix via broadcasting with large int
        ts_a, ts_b = np.array(ts_a), np.array(ts_b)
        M, N = len(ts_a), len(ts_b)
        cost = sys.maxint * np.ones((M, N))

        # Initialize the first row and column
        cost[0, 0] = d(ts_a[0], ts_b[0])
        for i in xrange(1, M):
            cost[i, 0] = cost[i-1, 0] + d(ts_a[i], ts_b[0])

        for j in xrange(1, N):
            cost[0, j] = cost[0, j-1] + d(ts_a[0], ts_b[j])

        # Populate rest of cost matrix within window
        for i in xrange(1, M):
            for j in xrange(max(1, i - self.max_warping_window),
                            min(N, i + self.max_warping_window)):
                choices = cost[i - 1, j - 1], cost[i, j-1], cost[i-1, j]
                cost[i, j] = min(choices) + d(ts_a[i], ts_b[j])

        # Return DTW distance given window 
        return cost[-1, -1]
    
    def _dist_matrix(self, x, y):
        """Computes the M x N distance matrix between the training
        dataset and testing dataset (y) using the DTW distance measure
        
        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
        
        y : array of shape [n_samples, n_timepoints]
        
        Returns
        -------
        Distance matrix between each item of x and y with
            shape [training_n_samples, testing_n_samples]
        """
        
        # Compute the distance matrix        
        dm_count = 0
        
        # Compute condensed distance matrix (upper triangle) of pairwise dtw distances
        # when x and y are the same array
        if(np.array_equal(x, y)):
            x_s = np.shape(x)
            dm = np.zeros((x_s[0] * (x_s[0] - 1)) // 2, dtype=np.double)
            
            p = ProgressBar(shape(dm)[0])
            
            for i in xrange(0, x_s[0] - 1):
                for j in xrange(i + 1, x_s[0]):
                    dm[dm_count] = self._dtw_distance(x[i, ::self.subsample_step],
                                                      y[j, ::self.subsample_step])
                    
                    dm_count += 1
                    p.animate(dm_count)
            
            # Convert to squareform
            dm = squareform(dm)
            return dm

## Timer

In [30]:
def fib(n):
    if n <= 1:
        return n
    return fib(n-1) + fib(n-2)
%timeit fib(10)

23.8 µs ± 2.48 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
def convert_csv_to_h5():
    pass

## Saving Model

In [124]:
# MLP for Pima Indians Dataset Serialize to JSON and HDF5
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import numpy
import os
# fix random seed for reproducibility
numpy.random.seed(7)
# load pima indians dataset
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# split into input (X) and output (Y) variables
X = dataset[:,0:8]
Y = dataset[:,8]
# create model
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, Y, epochs=150, batch_size=10, verbose=0)
# evaluate the model
scores = model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Using TensorFlow backend.


OSError: pima-indians-diabetes.csv not found.