In [1]:
import pandas as pd
import keepsake
import numpy as np
import scipy.stats as stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.style.use('ggplot')

%matplotlib inline

In [2]:
# only need to be run once to initialize

# ! echo 'repository: "file://.keepsake"' > keepsake.yaml 

In [3]:
# Generate combinations
from itertools import combinations

comb_list = []

for features in [1,2,3]:
    for com in combinations(['gyro_x','gyro_y','gyro_z'], features):
        comb_list.append(f'acc_x|acc_y|acc_z|{"|".join(list(com))}|label')
comb_list

['acc_x|acc_y|acc_z|gyro_x|label',
 'acc_x|acc_y|acc_z|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_y|gyro_z|label',
 'acc_x|acc_y|acc_z|gyro_x|gyro_y|gyro_z|label']

In [4]:
# df_10hz = pd.read_csv('data/transformed/20210529_v2_data_all_10hz.csv')
df_20hz = pd.read_csv('data/transformed/20210529_v2_data_all_20hz.csv')
df_25hz = pd.read_csv('data/transformed/20210529_v2_data_all_25hz.csv')
df_50hz = pd.read_csv('data/transformed/20210529_v2_data_all_50hz.csv')
# df_100hz = pd.read_csv('data/transformed/20210529_v2_data_all_100hz.csv')

In [5]:
def get_df_base(df):
    df = df[(df['shift'] == 0)]
    return df.dropna(axis=0)

In [6]:
df_20hz = get_df_base(df_20hz)
df_25hz = get_df_base(df_25hz)
df_50hz = get_df_base(df_50hz)

In [7]:
def save_model_optimized(classifier, stage, dataset, model_type, exp_id):
    '''
        Saves model to defined folder.

        stage - baseline/optimized
        dataset - base/centered/end/etc
        model_types - decision_tree, random_forest, ...
        hz - frequency
    '''

    import os
    import m2cgen as m2c
    
    BASE_PATH = f'models/{stage}/{dataset}/{model_type}/'
    FILE_NAME = f'{model_type}_{exp_id}.py'

    if not os.path.exists(BASE_PATH):
        os.makedirs(BASE_PATH)

    code = m2c.export_to_python(classifier)
    with open(BASE_PATH + FILE_NAME, 'w') as f:
        f.writelines(code)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

is_save_model = True
model_type = 'random_forest'
stage = 'optimized'
dataset = 'base'
quantization = None
estimators = None

cutoff=0.99
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            df_filtered = df_t[0].filter(regex=comb)
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            for estimators in [4,5,6]:
                clf = RandomForestClassifier(n_jobs=-1, n_estimators=estimators, random_state=42)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)

                accuracy = metrics.accuracy_score(y_test, y_pred)
                f1 = metrics.f1_score(y_test, y_pred, average='macro')
                precision = metrics.precision_score(y_test, y_pred, average='macro')
                recall = metrics.recall_score(y_test, y_pred, average='macro')

                if recall > cutoff:
                    signals = comb.replace('|label','').split('|')
                    print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")
                    
                    if is_save_model:
                        experiment = keepsake.init(
                            params={
                                'model':model_type,
                                'features': signals,
                                'feature_count': len(signals),
                                'n_estimators': estimators,
                                'dataset_test_size': dataset_test_size,
                                'hz':df_t[1],
                                'data_set':dataset,
                                'quantization': quantization,
                                'other_params': 'default',
                                'accuracy': accuracy,
                                'precision':precision,
                                'recall':recall,
                                'f1':f1
                                })

                        path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                        experiment.checkpoint(
                            path=path,
                            metrics={"accurracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                            primary_metric=("recall","maximize")
                        )
                        experiment.stop()

[2m═══╡ [0mCreating experiment 3d76d7f...
[2m═══╡ [0mCreating checkpoint f1a5d95...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0
Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 13bfccf...
[2m═══╡ [0mCreating checkpoint c95b470...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 108ab22...
[2m═══╡ [0mCreating checkpoint 4c4f248...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 20 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment b3dfc46...
[2m═══╡ [0mCreating checkpoint cbe53bd...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment eb10317...
[2m═══╡ [0mCreating checkpoint 178bd05...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 25 >> Acc: 0.9875, Prec: 0.984375, Recall: 0.9924242424242424


[2m═══╡ [0mCreating experiment 600aa47...
[2m═══╡ [0mCreating checkpoint e0254c0...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 801e911...
[2m═══╡ [0mCreating checkpoint 773d393...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 4dd6089...
[2m═══╡ [0mCreating checkpoint 20197d7...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'] @ 50 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment bd45d83...
[2m═══╡ [0mCreating checkpoint ab6ebed...


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

is_save_model = True
model_type = 'decision_tree'
stage = 'optimized'
dataset = 'base'
quantization = None
estimators = None

cutoff=0.99
dataset_test_sizes = [0.35]
datasets_setup = [(df_20hz, 20), (df_25hz, 25), (df_50hz, 50)]

for df_t in datasets_setup:
    for comb in comb_list:
        for dataset_test_size in dataset_test_sizes:
            df_filtered = df_t[0].filter(regex=comb)
            X_train, X_test, y_train, y_test = train_test_split(
                df_filtered.drop('label',axis=1), df_filtered['label'], test_size=dataset_test_size, random_state=42)

            for estimators in [4,5,6]:
                clf = DecisionTreeClassifier(random_state=42)
                clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)

                accuracy = metrics.accuracy_score(y_test, y_pred)
                f1 = metrics.f1_score(y_test, y_pred, average='macro')
                precision = metrics.precision_score(y_test, y_pred, average='macro')
                recall = metrics.recall_score(y_test, y_pred, average='macro')

                if recall > cutoff:
                    signals = comb.replace('|label','').split('|')
                    print(f"Signals: {signals} @ {df_t[1]} >> Acc: {accuracy}, Prec: {precision}, Recall: {recall}")
                    
                    if is_save_model:
                        experiment = keepsake.init(
                            params={
                                'model':model_type,
                                'features': signals,
                                'feature_count': len(signals),
                                'n_estimators': estimators,
                                'dataset_test_size': dataset_test_size,
                                'hz':df_t[1],
                                'data_set':dataset,
                                'quantization': quantization,
                                'other_params': 'default',
                                'accuracy': accuracy,
                                'precision':precision,
                                'recall':recall,
                                'f1':f1
                                })

                        path = save_model_optimized(clf, stage=stage, dataset=dataset, model_type=model_type, exp_id=experiment.id[:7])

                        experiment.checkpoint(
                            path=path,
                            metrics={"accurracy":accuracy, "f1": f1, "precision": precision, "recall": recall},
                            primary_metric=("recall","maximize")
                        )
                        experiment.stop()

Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 422cad5...
[2m═══╡ [0mCreating checkpoint 3598ade...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 7af5dfd...
[2m═══╡ [0mCreating checkpoint 1c59282...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_x'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment c21431e...
[2m═══╡ [0mCreating checkpoint 8b655fc...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 7f7e49b...
[2m═══╡ [0mCreating checkpoint d30e6c0...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment fe811f6...
[2m═══╡ [0mCreating checkpoint e899e1b...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_y'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 6eb4b55...
[2m═══╡ [0mCreating checkpoint 35dfca7...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment c68b1db...
[2m═══╡ [0mCreating checkpoint 170830a...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment 76a01f4...
[2m═══╡ [0mCreating checkpoint 1387cfe...


Signals: ['acc_x', 'acc_y', 'acc_z', 'gyro_z'] @ 25 >> Acc: 1.0, Prec: 1.0, Recall: 1.0


[2m═══╡ [0mCreating experiment b3ffc1f...
[2m═══╡ [0mCreating checkpoint a410ab0...


In [10]:
keepsake.experiments.list(lambda exp: exp.params['accuracy'] == 1.0 and exp.params['model'] == 'random_forest')

[33m═══╡ [0mFailed to load metadata from "metadata/experiments/.DS_Store": Parse error: invalid character '\x00' looking for beginning of value


id,created,params,latest_checkpoint,best_checkpoint
3d76d7f,2021-06-11 18:16:22.207957,"{'n_estimators': 4, 'model': 'random_forest', 'data_set': 'base', 'quantization': None, 'other_params': 'default', 'precision': 1, 'dataset_test_size': 0.35, 'recall': 1, 'hz': 20, 'feature_count': 5, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'f1': 1, 'accuracy': 1}",f1a5d95 (step 0; recall: 1),f1a5d95 (step 0; recall: 1)
13bfccf,2021-06-11 18:16:23.271930,"{'feature_count': 5, 'f1': 1, 'other_params': 'default', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'precision': 1, 'quantization': None, 'recall': 1, 'data_set': 'base', 'model': 'random_forest', 'accuracy': 1, 'n_estimators': 5, 'hz': 20, 'dataset_test_size': 0.35}",c95b470 (step 0; recall: 1),c95b470 (step 0; recall: 1)
108ab22,2021-06-11 18:16:24.331389,"{'feature_count': 5, 'accuracy': 1, 'data_set': 'base', 'dataset_test_size': 0.35, 'model': 'random_forest', 'recall': 1, 'n_estimators': 6, 'other_params': 'default', 'f1': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'precision': 1, 'quantization': None, 'hz': 20}",4c4f248 (step 0; recall: 1),4c4f248 (step 0; recall: 1)
b3dfc46,2021-06-11 18:16:25.398884,"{'hz': 20, 'precision': 1, 'data_set': 'base', 'other_params': 'default', 'n_estimators': 4, 'quantization': None, 'f1': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'], 'accuracy': 1, 'model': 'random_forest', 'feature_count': 5, 'recall': 1, 'dataset_test_size': 0.35}",cbe53bd (step 0; recall: 1),cbe53bd (step 0; recall: 1)
801e911,2021-06-11 18:16:29.230001,"{'other_params': 'default', 'precision': 1, 'dataset_test_size': 0.35, 'data_set': 'base', 'accuracy': 1, 'hz': 50, 'quantization': None, 'feature_count': 5, 'f1': 1, 'n_estimators': 6, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y'], 'model': 'random_forest', 'recall': 1}",773d393 (step 0; recall: 1),773d393 (step 0; recall: 1)
4dd6089,2021-06-11 18:16:30.348131,"{'n_estimators': 6, 'feature_count': 5, 'model': 'random_forest', 'dataset_test_size': 0.35, 'precision': 1, 'quantization': None, 'data_set': 'base', 'hz': 50, 'accuracy': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_z'], 'recall': 1, 'other_params': 'default', 'f1': 1}",20197d7 (step 0; recall: 1),20197d7 (step 0; recall: 1)
bd45d83,2021-06-11 18:16:31.482641,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'], 'model': 'random_forest', 'quantization': None, 'data_set': 'base', 'recall': 1, 'f1': 1, 'dataset_test_size': 0.35, 'n_estimators': 5, 'accuracy': 1, 'hz': 50, 'feature_count': 6, 'other_params': 'default', 'precision': 1}",ab6ebed (step 0; recall: 1),ab6ebed (step 0; recall: 1)


In [11]:
keepsake.experiments.list(lambda exp: exp.params['accuracy'] == 1.0 and exp.params['model'] == 'decision_tree')

id,created,params,latest_checkpoint,best_checkpoint
422cad5,2021-06-11 18:16:32.770409,"{'data_set': 'base', 'feature_count': 4, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'precision': 1, 'accuracy': 1, 'hz': 25, 'other_params': 'default', 'quantization': None, 'n_estimators': 4, 'f1': 1, 'recall': 1, 'dataset_test_size': 0.35, 'model': 'decision_tree'}",3598ade (step 0; recall: 1),3598ade (step 0; recall: 1)
7af5dfd,2021-06-11 18:16:33.820238,"{'n_estimators': 5, 'precision': 1, 'recall': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'other_params': 'default', 'model': 'decision_tree', 'quantization': None, 'accuracy': 1, 'dataset_test_size': 0.35, 'f1': 1, 'data_set': 'base', 'feature_count': 4, 'hz': 25}",1c59282 (step 0; recall: 1),1c59282 (step 0; recall: 1)
c21431e,2021-06-11 18:16:34.865517,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_x'], 'model': 'decision_tree', 'quantization': None, 'data_set': 'base', 'hz': 25, 'dataset_test_size': 0.35, 'f1': 1, 'feature_count': 4, 'other_params': 'default', 'recall': 1, 'precision': 1, 'accuracy': 1, 'n_estimators': 6}",8b655fc (step 0; recall: 1),8b655fc (step 0; recall: 1)
7f7e49b,2021-06-11 18:16:35.910328,"{'precision': 1, 'f1': 1, 'accuracy': 1, 'data_set': 'base', 'recall': 1, 'n_estimators': 4, 'model': 'decision_tree', 'dataset_test_size': 0.35, 'feature_count': 4, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'other_params': 'default', 'hz': 25, 'quantization': None}",d30e6c0 (step 0; recall: 1),d30e6c0 (step 0; recall: 1)
fe811f6,2021-06-11 18:16:36.961192,"{'f1': 1, 'precision': 1, 'n_estimators': 5, 'data_set': 'base', 'recall': 1, 'other_params': 'default', 'feature_count': 4, 'accuracy': 1, 'model': 'decision_tree', 'dataset_test_size': 0.35, 'quantization': None, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'hz': 25}",e899e1b (step 0; recall: 1),e899e1b (step 0; recall: 1)
6eb4b55,2021-06-11 18:16:38.009594,"{'f1': 1, 'model': 'decision_tree', 'accuracy': 1, 'feature_count': 4, 'recall': 1, 'hz': 25, 'quantization': None, 'n_estimators': 6, 'other_params': 'default', 'dataset_test_size': 0.35, 'precision': 1, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_y'], 'data_set': 'base'}",35dfca7 (step 0; recall: 1),35dfca7 (step 0; recall: 1)
c68b1db,2021-06-11 18:16:39.060393,"{'other_params': 'default', 'f1': 1, 'feature_count': 4, 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'model': 'decision_tree', 'dataset_test_size': 0.35, 'quantization': None, 'hz': 25, 'accuracy': 1, 'recall': 1, 'precision': 1, 'n_estimators': 4, 'data_set': 'base'}",170830a (step 0; recall: 1),170830a (step 0; recall: 1)
76a01f4,2021-06-11 18:16:40.109861,"{'recall': 1, 'f1': 1, 'model': 'decision_tree', 'accuracy': 1, 'n_estimators': 5, 'quantization': None, 'hz': 25, 'precision': 1, 'feature_count': 4, 'data_set': 'base', 'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'dataset_test_size': 0.35, 'other_params': 'default'}",1387cfe (step 0; recall: 1),1387cfe (step 0; recall: 1)
b3ffc1f,2021-06-11 18:16:41.154905,"{'features': ['acc_x', 'acc_y', 'acc_z', 'gyro_z'], 'hz': 25, 'quantization': None, 'accuracy': 1, 'f1': 1, 'other_params': 'default', 'precision': 1, 'data_set': 'base', 'model': 'decision_tree', 'feature_count': 4, 'dataset_test_size': 0.35, 'recall': 1, 'n_estimators': 6}",a410ab0 (step 0; recall: 1),a410ab0 (step 0; recall: 1)
