In [1]:
import pandas as pd
import keepsake
import numpy as np
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.style.use('ggplot')

%matplotlib inline

In [2]:
# only need to be run once to initialize

# ! echo 'repository: "file://.keepsake"' > keepsake.yaml 

In [3]:
# Generate combinations
from itertools import combinations

comb_list = []

for features in [1,2,3]:
    for com in combinations(['gyro_x','gyro_y','gyro_z'], features):
        comb_list.append(f'acc_x|acc_y|acc_z|{"|".join(list(com))}|label')
comb_list

['acc_x|acc_y|acc_z|gyro_x|label',
 'acc_x|acc_y|acc_z|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_y|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|gyro_z|label']

In [4]:
# df_10hz = pd.read_csv('data/transformed/20210529_v2_data_all_10hz.csv')
df_20hz = pd.read_csv('data/transformed/20210529_v2_data_all_20hz.csv')
df_25hz = pd.read_csv('data/transformed/20210529_v2_data_all_25hz.csv')
df_50hz = pd.read_csv('data/transformed/20210529_v2_data_all_50hz.csv')
# df_100hz = pd.read_csv('data/transformed/20210529_v2_data_all_100hz.csv')

In [5]:
def get_df_base(df):
    df = df[(df['shift'] == 0)]
    return df.dropna(axis=0)

In [6]:
df_20hz = get_df_base(df_20hz)
df_25hz = get_df_base(df_25hz)
df_50hz = get_df_base(df_50hz)

In [7]:
def save_model_optimized(classifier, stage, dataset, model_type, exp_id):
    '''
        Saves model to defined folder.

        stage - baseline/optimized
        dataset - base/centered/end/etc
        model_types - decision_tree, random_forest, ...
        hz - frequency
    '''

    import os
    import m2cgen as m2c
    
    BASE_PATH = f'models/{stage}/{dataset}/{model_type}/'
    FILE_NAME = f'{model_type}_{exp_id}.py'

    if not os.path.exists(BASE_PATH):
        os.makedirs(BASE_PATH)

    code = m2c.export_to_python(classifier)
    with open(BASE_PATH + FILE_NAME, 'w') as f:
        f.writelines(code)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

is_save_model=True
model_type = 'random_forest'
stage='optimized'
dataset='base'
quantization=None

cutoff=0.99
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            df_filtered = df_t[0].filter(regex=comb)
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            for estimators in [4,5,6]:
                clf = RandomForestClassifier(n_jobs=-1, n_estimators=estimators, random_state=42)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)

                accuracy = metrics.accuracy_score(y_test, y_pred)
                f1 = metrics.f1_score(y_test, y_pred, average='macro')
                precision = metrics.precision_score(y_test, y_pred, average='macro')
                recall = metrics.recall_score(y_test, y_pred, average='macro')

                if recall > cutoff:
                    signals = comb.replace('|label','').split('|')
                    print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")
                    
                    if is_save_model:
                        experiment = keepsake.init(
                            params={
                                'model':model_type,
                                'features': signals,
                                'feature_count': len(signals),
                                'n_estimators': estimators,
                                'dataset_test_size': dataset_test_size,
                                'hz':df_t[1],
                                'data_set':dataset,
                                'quantization': quantization,
                                'other_params': 'default',
                                'accuracy': accuracy,
                                'precision':precision,
                                'recall':recall,
                                'f1':f1
                                })

                        path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                        experiment.checkpoint(
                            path=path,
                            metrics={"accurracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                            primary_metric=("recall","maximize")
                        )
                        experiment.stop()

[2m═══╡ [0mCreating experiment 5ee0933...
[2m═══╡ [0mCreating checkpoint 6b64329...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0
Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 3705b82...
[2m═══╡ [0mCreating checkpoint a696a16...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment d75f1ea...
[2m═══╡ [0mCreating checkpoint f90d0e8...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 4d9ba42...
[2m═══╡ [0mCreating checkpoint 888b2c1...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment 706aea9...
[2m═══╡ [0mCreating checkpoint 166e4e7...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment e228c03...
[2m═══╡ [0mCreating checkpoint b63dacf...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 0136ff8...
[2m═══╡ [0mCreating checkpoint ee5c849...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment a8bc105...
[2m═══╡ [0mCreating checkpoint 82ed6d8...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 1a919a5...
[2m═══╡ [0mCreating checkpoint b9d7531...


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

is_save_model=True
model_type = 'decision_tree'
stage='optimized'
dataset='base'
quantization=None

cutoff=0.985
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            df_filtered = df_t[0].filter(regex=comb)
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            for estimators in [4,5,6]:
                clf = DecisionTreeClassifier(random_state=42)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)

                accuracy = metrics.accuracy_score(y_test, y_pred)
                f1 = metrics.f1_score(y_test, y_pred, average='macro')
                precision = metrics.precision_score(y_test, y_pred, average='macro')
                recall = metrics.recall_score(y_test, y_pred, average='macro')

                if recall > cutoff:
                    signals = comb.replace('|label','').split('|')
                    print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")
                    
                    if is_save_model:
                        experiment = keepsake.init(
                            params={
                                'model':model_type,
                                'features': signals,
                                'feature_count': len(signals),
                                'n_estimators': estimators,
                                'dataset_test_size': dataset_test_size,
                                'hz':df_t[1],
                                'data_set':dataset,
                                'quantization': quantization,
                                'other_params': 'default',
                                'accuracy': accuracy,
                                'precision':precision,
                                'recall':recall,
                                'f1':f1
                                })

                        path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                        experiment.checkpoint(
                            path=path,
                            metrics={"accurracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                            primary_metric=("recall","maximize")
                        )
                        experiment.stop()

Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 55a187f...
[2m═══╡ [0mCreating checkpoint f657040...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 4e9abf7...
[2m═══╡ [0mCreating checkpoint 6b901ac...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment a619ed2...
[2m═══╡ [0mCreating checkpoint ec9b383...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 8e89dfc...
[2m═══╡ [0mCreating checkpoint a4a74b1...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 6a6868f...
[2m═══╡ [0mCreating checkpoint b86968a...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 3dab78c...
[2m═══╡ [0mCreating checkpoint 9e2831a...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment d0d141e...
[2m═══╡ [0mCreating checkpoint 1eb8a47...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 20ee1e4...
[2m═══╡ [0mCreating checkpoint 1c7eb3b...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 647eac6...
[2m═══╡ [0mCreating checkpoint 4611a2c...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment f7dee0f...
[2m═══╡ [0mCreating checkpoint 0f8297b...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 2804e5d...
[2m═══╡ [0mCreating checkpoint cb57d7b...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 82db6a2...
[2m═══╡ [0mCreating checkpoint 094bfdf...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 9f1f49f...
[2m═══╡ [0mCreating checkpoint 8f7762d...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 20c41aa...
[2m═══╡ [0mCreating checkpoint 811575c...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment fd11ea3...
[2m═══╡ [0mCreating checkpoint 3a08a37...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 881d8a3...
[2m═══╡ [0mCreating checkpoint 0a81c72...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 37d13bd...
[2m═══╡ [0mCreating checkpoint 68e6639...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 50 >> Acc: 0.9875, Prec: 0.9722222222222222, Recall: 0.9895833333333334


[2m═══╡ [0mCreating experiment 752a24a...
[2m═══╡ [0mCreating checkpoint 2663202...


In [12]:
keepsake.experiments.list(lambda exp: exp.params['accuracy'] == 1.0 and exp.params['model'] == 'random_forest')

id,created,params,latest_checkpoint,best_checkpoint
5ee0933,2021-06-11 18:02:29.878801,"{'quantization': None, 'dataset_test_size': 0.35, 'other_params': 'default', 'f1': 1, 'data_set': 'base', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'precision': 1, 'model': 'random_forest', 'accuracy': 1, 'recall': 1, 'n_estimators': 4, 'feature_count': 5, 'hz': 20}",6b64329 (step 0; recall: 1),6b64329 (step 0; recall: 1)
3705b82,2021-06-11 18:02:30.950993,"{'f1': 1, 'model': 'random_forest', 'n_estimators': 5, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'other_params': 'default', 'data_set': 'base', 'dataset_test_size': 0.35, 'precision': 1, 'feature_count': 5, 'hz': 20, 'quantization': None, 'accuracy': 1, 'recall': 1}",a696a16 (step 0; recall: 1),a696a16 (step 0; recall: 1)
d75f1ea,2021-06-11 18:02:32.011452,"{'precision': 1, 'n_estimators': 6, 'feature_count': 5, 'data_set': 'base', 'other_params': 'default', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'accuracy': 1, 'hz': 20, 'quantization': None, 'dataset_test_size': 0.35, 'f1': 1, 'model': 'random_forest', 'recall': 1}",f90d0e8 (step 0; recall: 1),f90d0e8 (step 0; recall: 1)
4d9ba42,2021-06-11 18:02:33.080264,"{'accuracy': 1, 'recall': 1, 'other_params': 'default', 'data_set': 'base', 'hz': 20, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'], 'quantization': None, 'n_estimators': 4, 'f1': 1, 'precision': 1, 'feature_count': 5, 'model': 'random_forest', 'dataset_test_size': 0.35}",888b2c1 (step 0; recall: 1),888b2c1 (step 0; recall: 1)
0136ff8,2021-06-11 18:02:36.906082,"{'hz': 50, 'f1': 1, 'precision': 1, 'feature_count': 5, 'model': 'random_forest', 'n_estimators': 6, 'quantization': None, 'recall': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'dataset_test_size': 0.35, 'other_params': 'default', 'data_set': 'base', 'accuracy': 1}",ee5c849 (step 0; recall: 1),ee5c849 (step 0; recall: 1)
a8bc105,2021-06-11 18:02:38.004194,"{'recall': 1, 'n_estimators': 6, 'data_set': 'base', 'model': 'random_forest', 'quantization': None, 'other_params': 'default', 'dataset_test_size': 0.35, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'], 'precision': 1, 'f1': 1, 'accuracy': 1, 'hz': 50, 'feature_count': 5}",82ed6d8 (step 0; recall: 1),82ed6d8 (step 0; recall: 1)
1a919a5,2021-06-11 18:02:39.130068,"{'f1': 1, 'feature_count': 6, 'precision': 1, 'other_params': 'default', 'dataset_test_size': 0.35, 'data_set': 'base', 'recall': 1, 'hz': 50, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'], 'n_estimators': 5, 'accuracy': 1, 'model': 'random_forest', 'quantization': None}",b9d7531 (step 0; recall: 1),b9d7531 (step 0; recall: 1)


In [13]:
keepsake.experiments.list(lambda exp: exp.params['accuracy'] == 1.0 and exp.params['model'] == 'decision_tree')

id,created,params,latest_checkpoint,best_checkpoint
55a187f,2021-06-11 18:02:40.403022,"{'data_set': 'base', 'accuracy': 1, 'precision': 1, 'hz': 25, 'f1': 1, 'recall': 1, 'model': 'decision_tree', 'other_params': 'default', 'dataset_test_size': 0.35, 'n_estimators': 4, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'quantization': None, 'feature_count': 4}",f657040 (step 0; recall: 1),f657040 (step 0; recall: 1)
4e9abf7,2021-06-11 18:02:41.451743,"{'accuracy': 1, 'data_set': 'base', 'n_estimators': 5, 'quantization': None, 'model': 'decision_tree', 'other_params': 'default', 'dataset_test_size': 0.35, 'hz': 25, 'recall': 1, 'precision': 1, 'f1': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'feature_count': 4}",6b901ac (step 0; recall: 1),6b901ac (step 0; recall: 1)
a619ed2,2021-06-11 18:02:42.499350,"{'quantization': None, 'other_params': 'default', 'precision': 1, 'data_set': 'base', 'model': 'decision_tree', 'feature_count': 4, 'accuracy': 1, 'dataset_test_size': 0.35, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'f1': 1, 'n_estimators': 6, 'recall': 1, 'hz': 25}",ec9b383 (step 0; recall: 1),ec9b383 (step 0; recall: 1)
8e89dfc,2021-06-11 18:02:43.550260,"{'model': 'decision_tree', 'precision': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'f1': 1, 'other_params': 'default', 'feature_count': 4, 'quantization': None, 'data_set': 'base', 'recall': 1, 'n_estimators': 4, 'dataset_test_size': 0.35, 'hz': 25, 'accuracy': 1}",a4a74b1 (step 0; recall: 1),a4a74b1 (step 0; recall: 1)
6a6868f,2021-06-11 18:02:44.599815,"{'hz': 25, 'quantization': None, 'data_set': 'base', 'model': 'decision_tree', 'feature_count': 4, 'n_estimators': 5, 'precision': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'accuracy': 1, 'f1': 1, 'other_params': 'default', 'dataset_test_size': 0.35, 'recall': 1}",b86968a (step 0; recall: 1),b86968a (step 0; recall: 1)
3dab78c,2021-06-11 18:02:45.645298,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'model': 'decision_tree', 'feature_count': 4, 'recall': 1, 'f1': 1, 'accuracy': 1, 'n_estimators': 6, 'precision': 1, 'dataset_test_size': 0.35, 'data_set': 'base', 'other_params': 'default', 'hz': 25, 'quantization': None}",9e2831a (step 0; recall: 1),9e2831a (step 0; recall: 1)
d0d141e,2021-06-11 18:02:46.698828,"{'dataset_test_size': 0.35, 'n_estimators': 4, 'hz': 25, 'f1': 1, 'model': 'decision_tree', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'recall': 1, 'accuracy': 1, 'quantization': None, 'other_params': 'default', 'precision': 1, 'data_set': 'base', 'feature_count': 4}",1eb8a47 (step 0; recall: 1),1eb8a47 (step 0; recall: 1)
20ee1e4,2021-06-11 18:02:47.745302,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'feature_count': 4, 'accuracy': 1, 'model': 'decision_tree', 'f1': 1, 'precision': 1, 'data_set': 'base', 'recall': 1, 'quantization': None, 'dataset_test_size': 0.35, 'n_estimators': 5, 'hz': 25, 'other_params': 'default'}",1c7eb3b (step 0; recall: 1),1c7eb3b (step 0; recall: 1)
647eac6,2021-06-11 18:02:48.791788,"{'data_set': 'base', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'dataset_test_size': 0.35, 'n_estimators': 6, 'other_params': 'default', 'f1': 1, 'precision': 1, 'recall': 1, 'model': 'decision_tree', 'hz': 25, 'feature_count': 4, 'quantization': None, 'accuracy': 1}",4611a2c (step 0; recall: 1),4611a2c (step 0; recall: 1)
