In [1]:
import os
import pickle
import numpy as np
import random
from scipy.ndimage import uniform_filter1d, gaussian_filter1d
from tslearn.utils import to_time_series_dataset
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, \
    KNeighborsTimeSeries
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:
# Root folder holding the train/validation splits. Override it with the
# IMU_DATA_DIR environment variable instead of editing the notebook; the default
# keeps the original machine-specific location so existing runs are unaffected.
DATA_DIR = os.environ.get(
    'IMU_DATA_DIR',
    '/home/tyler/Documents/Matt3r/imu-classification/DATA',
)
training_directory = os.path.join(DATA_DIR, 'train_8s_stage3')
validation_directory = os.path.join(DATA_DIR, 'val_8s_stage3')

In [19]:
class PreprocessData:
    ''' Load all data contained in the directory as X and Y.
        Labels are stored in labels.

        Attributes:
            X: list with one entry per class. Each entry starts as a list of the
               raw 'matt3r_format' objects and is replaced in place by the
               preprocessing methods (numpy arrays, then a time series dataset).
            Y: list with one entry per class, holding the integer class id
               repeated once per sample.
            labels: class folder names; labels[i] is the name of class id i.

        Typical call order: smoothen_discretize -> normalize -> (integrate) ->
        to_timeseries -> (cluster) -> get_arrays / produce_knn.
        The methods overwrite self.X in place, so each step should be run once.
    '''

    def __init__(self, directory):
        ''' Read every ".pkl" file in each class sub-folder of directory.

            directory: path whose sub-folders are the classes.
        '''
        self.X = []
        self.Y = []
        self.labels = []
        # os.listdir returns entries in arbitrary order, so sort to make the
        # class ids reproducible (and identical for a train and a validation
        # directory that contain the same folder names). Non-directory entries
        # are skipped because they cannot hold samples.
        class_folders = sorted(
            entry for entry in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, entry))
        )
        for class_id, folder in enumerate(class_folders):
            self.labels.append(folder)
            folder_list = []
            for file_name in sorted(os.listdir(os.path.join(directory, folder))):
                if file_name.endswith(".pkl"):
                    file_path = os.path.join(directory, folder, file_name)
                    # NOTE(security): pickle.load can execute arbitrary code;
                    # only point this loader at trusted data directories.
                    with open(file_path, "rb") as file:
                        data = pickle.load(file)
                    folder_list.append(data['matt3r_format'])
            self.X.append(folder_list)
            self.Y.append([class_id] * len(folder_list))

    @staticmethod
    def _to_array(timeseries):
        ''' Stack every column of a raw sample into a 2D array and drop row 0. '''
        # Row 0 is discarded as in the original pipeline; presumably it is a
        # timestamp/index column rather than an IMU channel -- TODO confirm.
        return np.vstack([timeseries[col] for col in timeseries])[1:, :]

    def smoothen_discretize(self, smoothing_method='gaussian'):
        ''' Smoothen and discretize the data to length 50.
            If method = gaussian then perform a gaussian smoothing
            If method = moving average then compute the moving average
            Any other value only converts the samples to numpy arrays.

            The gaussian branch assumes raw sequences of 400 points.
        '''
        if smoothing_method == 'gaussian':
            SIGMA = 2.5
            RAW_SEQ_LENGTH = 400
            FINAL_SEQ_LENGTH = 50
            # sample at the centre of each of the FINAL_SEQ_LENGTH equal bins
            offset = (RAW_SEQ_LENGTH / FINAL_SEQ_LENGTH) / 2
            DOWNSAMPLING_INDEXES = np.linspace(
                offset, RAW_SEQ_LENGTH - offset, FINAL_SEQ_LENGTH
            ).round().astype(int)
            for class_num in range(len(self.X)):
                for idx, timeseries in enumerate(self.X[class_num]):
                    # convert to numpy
                    data_np = self._to_array(timeseries)
                    # smooth data
                    data_smooth = gaussian_filter1d(data_np, sigma=SIGMA, mode='nearest')
                    # downsample
                    self.X[class_num][idx] = data_smooth[:, DOWNSAMPLING_INDEXES]
        elif smoothing_method == 'moving average':
            WINDOW_SIZE = 6
            DISC_STEP = 8
            for class_num in range(len(self.X)):
                for idx, timeseries in enumerate(self.X[class_num]):
                    # Build new rows instead of writing back into the loaded
                    # container, so the raw object is never mutated.
                    rows = []
                    for column in timeseries:
                        # compute moving average
                        sequence = uniform_filter1d(timeseries[column], size=WINDOW_SIZE)
                        # discretize: mean of each block of DISC_STEP points
                        n_steps = int(np.ceil(len(sequence) / DISC_STEP))
                        rows.append(np.array([
                            sequence[i * DISC_STEP:(i + 1) * DISC_STEP].mean()
                            for i in range(n_steps)
                        ]))
                    # convert to numpy, dropping row 0 as in _to_array
                    self.X[class_num][idx] = np.vstack(rows)[1:, :]
        else:
            # perform no smoothing or discretization
            for class_num in range(len(self.X)):
                for idx, timeseries in enumerate(self.X[class_num]):
                    # convert to numpy
                    self.X[class_num][idx] = self._to_array(timeseries)

    def normalize(self, normalize_method='standardized'):
        ''' Normalize the data.
            If method = standardized then divide by the precomputed standard deviation.
            If method = maxmin then perform max/min normalization between -1 and 1.
            If any other method is passed then no normalization will be done.
        '''
        if normalize_method == 'maxmin':
            samples = [data for class_data in self.X for data in class_data]
            if not samples:
                return
            # compute the max and min values per axis over the whole data set
            mins = np.min([np.min(data, axis=1) for data in samples], axis=0)
            maxs = np.max([np.max(data, axis=1) for data in samples], axis=0)
            ranges = maxs - mins
            # a constant channel has zero range; divide by 1 instead so the
            # result is a finite -1 rather than NaN
            ranges = np.where(ranges == 0, 1.0, ranges)
            mins = mins[:, np.newaxis]
            ranges = ranges[:, np.newaxis]
            # normalize to +/-1
            for class_num in range(len(self.X)):
                for idx, data in enumerate(self.X[class_num]):
                    self.X[class_num][idx] = ((data - mins) / ranges) * 2 - 1
        elif normalize_method == 'standardized':
            # precomputed standard deviations, one per channel (6 channels)
            STD_DEVS = np.array([[1.33343277], [1.59291318], [0.52805266], [0.07645243], [0.07536972], [0.22749133]])
            for class_num in range(len(self.X)):
                for idx, data in enumerate(self.X[class_num]):
                    self.X[class_num][idx] = data / STD_DEVS

    def integrate(self, axes=(0, 5), dt=8, N=50):
        ''' Compute the cumulative sum of each time series.
            The axis field allows you to pass a list of which IMU fields you would like to integrate.
            0 = lr_acc, 1 = bf_acc, 2 = vert_acc, 3 = lr_gyro, 4 = bf_gyro, 5 = vert_gyro
            dt is the length of time for each sample in seconds
            N is the length of each time series sample

            Only the channels listed in axes are integrated; every other channel
            and the (channels, length) shape of each sample are left unchanged.
            Must be called while samples are still (channels, length) arrays,
            i.e. before to_timeseries.
        '''
        for class_num in range(len(self.X)):
            for idx, data in enumerate(self.X[class_num]):
                # float copy: keeps the input array untouched and avoids
                # truncation if the data happens to be integer typed
                integrated = np.array(data, dtype=float)
                for axis in axes:
                    # cumulative sum along time for this channel only, scaled
                    # by the time step dt / N
                    integrated[axis] = dt * np.cumsum(integrated[axis]) / N
                self.X[class_num][idx] = integrated

    def preprocess(self, **kwargs):
        ''' Perform all of the preprocessing steps in a single function call.
            Pass in the arguments for the necessary preprocessing steps you wish to perform.

            Recognised keys: smoothing_method, normalize_method. A missing key
            falls back to the default of the corresponding method.
        '''
        # smoothen and discretize
        self.smoothen_discretize(kwargs.get('smoothing_method', 'gaussian'))
        # normalize
        self.normalize(kwargs.get('normalize_method', 'standardized'))

    def to_timeseries(self, num_samples, axes=(0, 5), seed=None):
        ''' Converts the data from each class to a timeseries object with a set number of samples.
            The axis field allows you to pass a list of which IMU fields you would like to include.
            0 = lr_acc, 1 = bf_acc, 2 = vert_acc, 3 = lr_gyro, 4 = bf_gyro, 5 = vert_gyro

            num_samples: maximum number of samples kept per class.
            seed: optional seed for the shuffle; None keeps the previous
                  behaviour of using the global random state.
        '''
        rng = random.Random(seed) if seed is not None else random
        channels = list(axes)
        for class_num in range(len(self.X)):
            # shuffle data for each instance
            indices = list(range(len(self.X[class_num])))
            rng.shuffle(indices)
            chosen = indices[:num_samples]
            # transpose each sample to (length, channels) as tslearn expects
            self.X[class_num] = to_time_series_dataset(
                [self.X[class_num][i][channels, :].T for i in chosen]
            )
            # every label within a class is identical, so truncating is enough
            self.Y[class_num] = self.Y[class_num][:num_samples]

    def cluster(self, num_clusters, cluster_method='dtw', **kwargs):
        ''' Cluster the data for each class into a set number of clusters.

            Each class's samples are replaced by its cluster centres, and the
            labels are resized to match so get_arrays stays consistent.
            Extra keyword arguments are forwarded to TimeSeriesKMeans.
        '''
        for class_num in range(len(self.X)):
            km = TimeSeriesKMeans(num_clusters, metric=cluster_method, **kwargs)
            km.fit(self.X[class_num])
            centers = km.cluster_centers_
            # reuse the class's existing label; fall back to its position
            existing = self.Y[class_num]
            class_label = existing[0] if len(existing) > 0 else class_num
            self.X[class_num] = centers
            self.Y[class_num] = [class_label] * len(centers)

    def get_arrays(self):
        ''' Returns the timeseries data, X, and the corresponding labels, Y
            X: ndarray of shape (num_samples, len_time_series, num_dimensions)
            Y: ndarray of shape (num_samples) containing int values corresponding to the class
        '''
        X_ts = np.concatenate(self.X, axis=0)
        Y_ts = np.concatenate(self.Y)
        return X_ts, Y_ts

    def produce_knn(self, k=1, distance_metric='dtw', **kwargs):
        ''' Produces a KNN model that can be used to predict maneuvers.

            k: number of neighbours.
            distance_metric: metric name passed to the classifier.
            Extra keyword arguments are forwarded to KNeighborsTimeSeriesClassifier.
            Returns the fitted classifier.
        '''
        knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=k, metric=distance_metric, **kwargs)
        # fit on the stacked arrays, not on the per-class lists held in self.X / self.Y
        X_ts, Y_ts = self.get_arrays()
        knn_clf.fit(X_ts, Y_ts)
        return knn_clf

In [20]:
# Load the training split, smooth/downsample each sample, and keep up to 100
# samples per class restricted to the default channels.
TrainingData = PreprocessData(training_directory)
TrainingData.smoothen_discretize('gaussian')
# Optional alternatives to the explicit call above:
# TrainingData.normalize('standardized')
# TrainingData.preprocess()
TrainingData.to_timeseries(100)
# get_arrays() returns the stacked (X, Y) pair; produce_knn() returns a single
# fitted classifier and cannot be unpacked into two names.
X, Y = TrainingData.get_arrays()

In [21]:
# Display the dimensions of X; the recorded output below is (600, 50, 2).
X.shape

(600, 50, 2)

In [90]:
# Inspect the first stored sample of the first class.
# NOTE(review): this cell's execution count (90) is out of sequence with the
# cells above, so the recorded output may come from a different kernel state
# than a fresh top-to-bottom run would produce -- confirm by re-running.
TrainingData.X[0][0]

array([[-6.36389335e-02,  1.30353234e-01, -1.82329873e-02,
        -2.62918848e-01, -4.26173395e-01, -4.76318001e-01,
        -7.76477039e-01, -9.36673138e-01, -7.17018226e-01,
        -5.98285387e-01, -5.87059265e-01, -4.76407980e-01,
        -3.84835215e-01, -4.04347984e-01, -2.58613583e-01,
        -1.39456814e-01, -3.23120733e-02,  7.71416120e-02,
         2.08742472e-02,  1.74646679e-02, -1.06795535e-02,
         1.20084438e-01,  1.34140183e-01,  1.87118399e-01,
         1.39884323e-01,  6.35695687e-02,  6.19733510e-02,
         5.76316952e-02, -1.06442210e-01, -1.84797089e-01,
        -4.06173067e-01, -3.37861429e-01, -1.48409476e-01,
        -9.46714928e-03, -1.26102302e-01, -1.16813296e-01,
         3.91221269e-03,  1.01776182e-01,  3.46962461e-02,
         6.27627627e-02,  8.71126200e-02, -7.82807616e-02,
        -3.52202702e-01, -2.31563640e-01, -8.98409020e-02,
        -1.45740244e-02,  3.06453594e-02,  1.27606884e-01,
         1.47813624e-01,  4.22987623e-01],
       [ 5.81