In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.style.use('ggplot')

%matplotlib inline

In [2]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [3]:
def save_model(classifier, directory, model_type, hz):
    '''Saves model to defined folder.'''

    import os
    import m2cgen as m2c
    
    BASE_PATH = f'models/{directory}/{model_type}'
    FILE_NAME = f'{model_type}_{hz}hz.py'

    if not os.path.isdir('models'):
        os.mkdir(BASE_PATH)

    code = m2c.export_to_python(classifier)
    
    with open(os.join(BASE_PATH + FILE_NAME), 'w') as f:
        f.writelines(code)

In [4]:
def transform_data_for_inference(df):
    '''
        Transoforms dataset for inference.
        ms,acc,gyro -> acc_x_0, gyro_x_0, acc_x_10, gyro_x_10, .... acc_x_n, gyro_x_n
    '''

    df_list=[]

    for time in df.index:
        _df = pd.DataFrame(df.loc[time]).T
        # _df.drop('ms',axis=1, inplace=True)
        df_list.append(_df.add_suffix(f'_{str(int(time))}').reset_index(drop=True))

    return pd.concat(df_list, axis=1)


def get_filter_string(start, step):
    '''
        Creates a string to filter dataset for defined timetimestamps.
        To be used with df.filter(regex='<string returned from this functions>')
        Example: 0|50|100
    '''

    keep = np.arange(start, start+1+1000, step=int(step))
    return '|'.join(map(str, keep.astype(int)))


def line_color(inf_result):
    '''Returns color associated with inference result.'''
    colors = {
        1:'blue',
        2:'red',
        3:'green'
    }
    return colors[inf_result]


def downsample_df(df, freq):
    '''Downsamples dataset.'''

    def get_period(frequency):
        return int(1000 / frequency)

    period = get_period(freq)

    last_index_ms = df.index[-1]
    keep = np.arange(last_index_ms, step=period)

    return df.loc[keep]


def run_inference(df, model, start, step):
    '''Runs inference.'''
    regex_filter = get_filter_string(start=start, step=step)
    data = list(df.filter(regex=f'_({regex_filter})$').loc[0])
    # print(len(data))
    return model.score(data)

def calculate_error(res, move_type):
    '''
        Calculates inference error rate in validation data.
    '''

    error_setup = {
        'circle': {'err_1':1,'err_2':2},
        'x':{'err_1':2, 'err_2':3},
        'y':{'err_1':1,'err_2':3}
    }

    err_1 = error_setup[move_type]['err_1']
    err_2 = error_setup[move_type]['err_2']

    val_counts = res['result'].value_counts().drop(0) # dropping `no movement`
    val_counts_keys = val_counts.keys()

    total_wrong = 0

    if err_1 in val_counts_keys:
        total_wrong += val_counts[err_1]
    if err_2 in val_counts_keys:
        total_wrong += val_counts[err_2]
    
    return (total_wrong / val_counts.sum()) * 100


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

def train_decision_tree(X_train, X_test, y_train, y_test, hz, direc):
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    save_model(classifier=clf, direc=direc, model_type='decision_tree', hz=hz)

    return accuracy, f1, precision, recall, clf

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

def train_random_forest(X_train, X_test, y_train, y_test, hz, direc):
    clf = RandomForestClassifier(random_state=42, n_estimators=4)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    save_model(classifier=clf, direc=direc, model_type='random_forest', hz=hz)

    return accuracy, f1, precision, recall, clf

In [7]:
from sklearn.svm import LinearSVC
from sklearn import metrics

def train_svc(X_train, X_test, y_train, y_test, hz, direc):
    clf = LinearSVC(random_state=42, max_iter=1000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    save_model(classifier=clf, direc=direc, model_type='svc', hz=hz)

    return accuracy, f1, precision, recall, clf

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

def train_logistic_regression(X_train, X_test, y_train, y_test, hz, direc):
    clf = LogisticRegression(random_state=42,  max_iter=10_000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    save_model(classifier=clf, direc=direc, model_type='logistic_regression', hz=hz)

    return accuracy, f1, precision, recall, clf

In [9]:
def get_df_baseline(df):
    return df[(df['shift'] == 0)]

def get_df_center(df):
    df = df[
            ((df['label'] == 1) & (df['shift'] == -20))
            | ((df['label'] == 2) & (df['shift'] == -20))
            | ((df['label'] == 3) & (df['shift'] == 0))
            | ((df['label'] == 0) & (df['shift'] == 0))
        ]
    return df

def get_df_shift_aug(df):
    df = df[
            ((df['label'] == 1) & (df['shift'] <= -15))
            | ((df['label'] == 2) & (df['shift'] <= -15))
            | ((df['label'] == 3) & (df['shift'] == -2))
            | ((df['label'] == 3) & (df['shift'] == -1))
            | ((df['label'] == 3) & (df['shift'] == 0))
            | ((df['label'] == 3) & (df['shift'] == 1))
            | ((df['label'] == 3) & (df['shift'] == 2))
            | ((df['label'] == 0) & (df['shift'] > 1))
        ]
    return df

def get_df_end(df):
    df = df[
            ((df['label'] == 1) & (df['shift'] == -37))
            | ((df['label'] == 2) & (df['shift'] == -37))
            | ((df['label'] == 3) & (df['shift'] == 0))
            | ((df['label'] == 0) & (df['shift'] == 0))
        ]
    return df

In [10]:
collect_metrics = {}
collect_metrics['decision_tree'] = {}
collect_metrics['random_forest'] = {}
collect_metrics['svc'] = {}
collect_metrics['logistic_regression'] = {}



for hz in [10,20,25,50,100]:
    print(f'Training on dataset with {hz} Hz')
    df = pd.read_csv(f'data/transformed/20210529_v2_data_all_{hz}hz.csv').reset_index(drop=True)
    
    df_train = get_df_baseline(df)
    df_train = df_train.dropna(axis=0)


    print('DF Shape', df_train.shape)
    X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['label','shift'],axis=1), df_train['label'], test_size=0.3, random_state=42)

    print('\n')
    print(f'Decision Tree {hz} Hz')
    accuracy, f1, precision, recall, dt = train_decision_tree(X_train, X_test, y_train, y_test, hz=hz, direc=MODELS)

    test_data = [X_train.iloc[0]]
    dt_time = get_ipython().run_line_magic('timeit', '-o dt.predict(test_data)')

    collect_metrics['decision_tree'][hz] = {
        'accuracy':accuracy,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'time':dt_time.timings
    }
    print('\n')

    print(f'Random Forest {hz} Hz')
    accuracy, f1, precision, recall, rfc = train_random_forest(X_train, X_test, y_train, y_test, hz=hz, direc=MODELS)

    test_data = [X_train.iloc[0]]
    rfc_time = get_ipython().run_line_magic('timeit', '-o rfc.predict(test_data)')

    collect_metrics['random_forest'][hz] = {
        'accuracy':accuracy,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'time': rfc_time.timings
    }
    print('\n')

    print(f'SVC {hz} Hz')
    accuracy, f1, precision, recall, svc = train_svc(X_train, X_test, y_train, y_test, hz=hz, direc=MODELS)

    test_data = [X_train.iloc[0]]
    svc_time = get_ipython().run_line_magic('timeit', '-o svc.predict(test_data)')

    collect_metrics['svc'][hz] = {
        'accuracy':accuracy,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'time':svc_time.timings
    }
    print('\n')
    
    print(f'Logistic regression {hz} Hz')
    accuracy, f1, precision, recall, lr = train_logistic_regression(X_train, X_test, y_train, y_test, hz=hz, direc=MODELS)

    test_data = [X_train.iloc[0]]
    lr_time = get_ipython().run_line_magic('timeit', '-o lr.predict(test_data)')

    collect_metrics['logistic_regression'][hz] = {
        'accuracy':accuracy,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'time':lr_time.timings
    }
    print('-' * 50)

Training on dataset with 10 Hz
DF Shape (228, 68)


Decision Tree 10 Hz


NameError: name 'MODELS' is not defined

In [None]:
def trainer_helper(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    
    accuracy = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    return accuracy, f1, precision, recall, clf


In [13]:
def run_training(hz, X_train, X_test, y_train, y_test, collect_time=False):
    collect_metrics = {}

    dt = DecisionTreeClassifier(random_state=42)
    rf = RandomForestClassifier(random_state=42, n_estimators=4)
    lsvc = LinearSVC(random_state=42, max_iter=1_000)
    lr = LogisticRegression(random_state=42,  max_iter=10_000)

    model_setup = [
                    (dt, 'decision_tree'),
                    (rf,'random_forest'),
                    (lsvc, 'svc'),
                    (lr, 'logistic_regression')
                ]

    for item in model_setup:
        clf = item[0]
        name = item[1]

        collect_metrics[name] = {}
        collect_metrics[name][hz] = {}

        print('\n')
        print(f'{name} {hz} Hz')

        accuracy, f1, precision, recall, model = clf(X_train, X_test, y_train, y_test, hz=hz, directory=name)

        collect_metrics['decision_tree'][hz] = {
                'accuracy':accuracy,
                'f1':f1,
                'precision':precision,
                'recall':recall}

        if collect_time:
            test_data = [X_train.iloc[0]]
            inf_time = get_ipython().run_line_magic('timeit', '-o model.predict(test_data)')
            collect_metrics[name][hz]['time'] = inf_time.timings

    return collect_metrics



In [14]:
for hz in [10,20,25,50,100][:1]:
    print(f'Training on dataset with {hz} Hz')
    df = pd.read_csv(f'data/transformed/20210529_v2_data_all_{hz}hz.csv').reset_index(drop=True)
    
    df_train = get_df_baseline(df)
    df_train = df_train.dropna(axis=0)


    print('DF Shape', df_train.shape)
    X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['label','shift'],axis=1), df_train['label'], test_size=0.3, random_state=42)

    run_training(hz, X_train, X_test, y_train, y_test)

Training on dataset with 10 Hz
DF Shape (228, 68)


decision_tree 10 Hz


TypeError: 'DecisionTreeClassifier' object is not callable