In [1]:
from datetime import datetime
from pathlib import Path
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
from statistics import multimode, mode
import numpy as np
import pandas as pd

In [8]:
def get_metrics(expected_y, predicted_y):
    return {
        "recall": metrics.recall_score(expected_y, predicted_y),
        "f1": metrics.f1_score(expected_y, predicted_y),
        "precision": metrics.precision_score(expected_y, predicted_y),
        "accuracy": metrics.accuracy_score(expected_y, predicted_y),
    }


def choose_model_params(params: list):
    return multimode(params)

In [9]:
models = {
    "KNN": {
        "estimator": KNN(metric="euclidean"),
        "params": {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
        },
    },
    "RF": {
        "estimator": RandomForestClassifier(
            criterion="gini", oob_score=metrics.recall_score
        ),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_features": ["sqrt", "log2"],
        },
    },
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    "MLP": {
        "estimator": MLPClassifier(solver="adam", max_iter=500),
        "params": {
            # "max_iter": [100, 500, 1000],  # number of epochs
            "hidden_layer_sizes": [
                (8,),
                (8, 8),
                (8, 16),
                (8, 32),
                (8, 64),
                (8, 128),
                (16,),
                (16, 8),
                (16, 16),
                (16, 32),
                (16, 64),
                (16, 128),
                (32,),
                (32, 8),
                (32, 16),
                (32, 32),
                (32, 64),
                (32, 128),
                (64,),
                (64, 8),
                (64, 16),
                (64, 32),
                (64, 64),
                (64, 128),
                (128,),
                (128, 8),
                (128, 16),
                (128, 32),
                (128, 64),
                (128, 128),
            ],
            # "activation": ["relu"],
            # "penalty(alpha?)": [0.0001],
            # "batch_size": ["auto"],
            # "learning_rate_init": [0.001],
            # "beta_1": [0.9],  # From docs: Only used when solver=’adam’.
            # "beta_2": [0.999],  # Froms docs: Only used when solver=’adam’.
            # "n_iter_no_change": [10],
        },
    },
}
algorithm_options = ["KNN", "RF", "MLP"]
algorithm = "MLP"
model = models[algorithm]

In [10]:
datasets_path = Path.cwd() / "datasets"
columns_not_used_for_training = ["Timestamp", "Subject", "Trial", "Activity", "Tag"]

skf = StratifiedKFold(n_splits=5, shuffle=True)

gs_cv_model = GridSearchCV(
    model["estimator"], model["params"], scoring="recall", n_jobs=-1, cv=2
)

In [11]:
output_dir_name = "results/" + datetime.today().strftime("%Y-%m-%d_%H-%M_") + algorithm
output_dir = Path.cwd() / output_dir_name
output_dir.mkdir(exist_ok=True, parents=True)

training_info_output_file = output_dir / "_training_info.txt"
training_info_output_file.write_text(f"{algorithm} - {model['params']}")

for dataset in datasets_path.iterdir():
    data = pd.read_csv(dataset, header=0).dropna()
    X = data.drop(columns=columns_not_used_for_training)
    print(X.shape)

    y = pd.DataFrame(
        {"is_fall": [0 if row > 5 else 1 for row in data["Tag"]]}
    ).values.reshape(
        -1,
    )
    print(y.shape)

    predicted_y = []
    expected_y = []
    n_fold = 0
    best_params = []

    output_file_path = (
        f"{str(dataset).replace('datasets', output_dir_name).replace('csv', 'txt')}"
    )
    output_file = open(output_file_path, mode="w", encoding="utf-8")
    output_file.write("n_fold, recall, f1, precision, accuracy, best_params\n")

    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]

        classifier = gs_cv_model.fit(X_train, y_train)

        prediction = classifier.predict(X_test)
        predicted_y.extend(prediction)
        expected_y.extend(y_test)

        fold_metrics = get_metrics(y_test, prediction)

        output_file.write(
            f"{n_fold}, {fold_metrics['recall']}, {fold_metrics['f1']}, {fold_metrics['precision']}, {fold_metrics['accuracy']}, {classifier.best_params_}\n"
        )

        best_params.append(
            {"params": classifier.best_params_, "recall": fold_metrics["recall"]}
        )
        n_fold += 1

    # Choose best model between folds
    sorted_by_recall = sorted(best_params, key=lambda d: d["recall"], reverse=True)
    print(sorted_by_recall)
    params = [str(d["params"]) for d in sorted_by_recall]
    print(params)
    chosen_params = multimode(params)

    general_metrics = get_metrics(expected_y, predicted_y)
    output_file.write(
        f"-1, {general_metrics['recall']}, {general_metrics['f1']}, {general_metrics['accuracy']}, {general_metrics['accuracy']}, {chosen_params}\n"
    )
    output_file.close()

    print(metrics.classification_report(expected_y, predicted_y))

(32290, 108)
(32290,)
[{'params': {'hidden_layer_sizes': (32,)}, 'recall': np.float64(0.675392670157068)}, {'params': {'hidden_layer_sizes': (128, 128)}, 'recall': np.float64(0.5968586387434555)}, {'params': {'hidden_layer_sizes': (16, 32)}, 'recall': np.float64(0.5287958115183246)}, {'params': {'hidden_layer_sizes': (16, 16)}, 'recall': np.float64(0.4607329842931937)}, {'params': {'hidden_layer_sizes': (64, 128)}, 'recall': np.float64(0.010471204188481676)}]
["{'hidden_layer_sizes': (32,)}", "{'hidden_layer_sizes': (128, 128)}", "{'hidden_layer_sizes': (16, 32)}", "{'hidden_layer_sizes': (16, 16)}", "{'hidden_layer_sizes': (64, 128)}"]
              precision    recall  f1-score   support

           0       0.98      0.95      0.97     31335
           1       0.21      0.45      0.29       955

    accuracy                           0.93     32290
   macro avg       0.60      0.70      0.63     32290
weighted avg       0.96      0.93      0.95     32290

(32290, 36)
(32290,)
[{'para

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (32,)}, 'recall': np.float64(0.6492146596858639)}, {'params': {'hidden_layer_sizes': (32, 32)}, 'recall': np.float64(0.5759162303664922)}, {'params': {'hidden_layer_sizes': (32,)}, 'recall': np.float64(0.44502617801047123)}, {'params': {'hidden_layer_sizes': (64, 128)}, 'recall': np.float64(0.06282722513089005)}, {'params': {'hidden_layer_sizes': (128, 16)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (32,)}", "{'hidden_layer_sizes': (32, 32)}", "{'hidden_layer_sizes': (32,)}", "{'hidden_layer_sizes': (64, 128)}", "{'hidden_layer_sizes': (128, 16)}"]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     31339
           1       0.36      0.35      0.35       955

    accuracy                           0.96     32294
   macro avg       0.67      0.66      0.67     32294
weighted avg       0.96      0.96      0.96     32294

(32290, 108)
(32290,)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (32, 16)}, 'recall': np.float64(0.31413612565445026)}, {'params': {'hidden_layer_sizes': (64, 32)}, 'recall': np.float64(0.24607329842931938)}, {'params': {'hidden_layer_sizes': (32, 128)}, 'recall': np.float64(0.015706806282722512)}, {'params': {'hidden_layer_sizes': (8, 32)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (8, 16)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (32, 16)}", "{'hidden_layer_sizes': (64, 32)}", "{'hidden_layer_sizes': (32, 128)}", "{'hidden_layer_sizes': (8, 32)}", "{'hidden_layer_sizes': (8, 16)}"]
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     31335
           1       0.12      0.12      0.12       955

    accuracy                           0.95     32290
   macro avg       0.55      0.55      0.55     32290
weighted avg       0.95      0.95      0.95     32290

(32290, 36)
(32290,)
[{'params': {'hidden_layer_sizes': (32, 32)}, 'recall': np

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (32,)}, 'recall': np.float64(0.387434554973822)}, {'params': {'hidden_layer_sizes': (32, 64)}, 'recall': np.float64(0.09947643979057591)}, {'params': {'hidden_layer_sizes': (64, 128)}, 'recall': np.float64(0.08376963350785341)}, {'params': {'hidden_layer_sizes': (128, 32)}, 'recall': np.float64(0.005235602094240838)}, {'params': {'hidden_layer_sizes': (128, 32)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (32,)}", "{'hidden_layer_sizes': (32, 64)}", "{'hidden_layer_sizes': (64, 128)}", "{'hidden_layer_sizes': (128, 32)}", "{'hidden_layer_sizes': (128, 32)}"]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     31339
           1       0.34      0.12      0.17       955

    accuracy                           0.97     32294
   macro avg       0.66      0.55      0.58     32294
weighted avg       0.95      0.97      0.96     32294

(28466, 540)
(28466,)
[{'params': {'hidden_layer_sizes': (128, 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (16, 64)}, 'recall': np.float64(0.27225130890052357)}, {'params': {'hidden_layer_sizes': (32, 128)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (16, 128)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (128, 64)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (128, 32)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (16, 64)}", "{'hidden_layer_sizes': (32, 128)}", "{'hidden_layer_sizes': (16, 128)}", "{'hidden_layer_sizes': (128, 64)}", "{'hidden_layer_sizes': (128, 32)}"]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97     31335
           1       0.04      0.05      0.05       955

    accuracy                           0.94     32290
   macro avg       0.51      0.51      0.51     32290
weighted avg       0.94      0.94      0.94     32290

(32294, 72)
(32294,)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (16, 16)}, 'recall': np.float64(0.450261780104712)}, {'params': {'hidden_layer_sizes': (32,)}, 'recall': np.float64(0.29842931937172773)}, {'params': {'hidden_layer_sizes': (64,)}, 'recall': np.float64(0.1099476439790576)}, {'params': {'hidden_layer_sizes': (16, 32)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (8, 16)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (16, 16)}", "{'hidden_layer_sizes': (32,)}", "{'hidden_layer_sizes': (64,)}", "{'hidden_layer_sizes': (16, 32)}", "{'hidden_layer_sizes': (8, 16)}"]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     31339
           1       0.29      0.17      0.22       955

    accuracy                           0.96     32294
   macro avg       0.63      0.58      0.60     32294
weighted avg       0.95      0.96      0.96     32294

(32290, 108)
(32290,)
[{'params': {'hidden_layer_sizes': (16,)}, 'recall': np.float64(0.4973821

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (128, 128)}, 'recall': np.float64(0.17801047120418848)}, {'params': {'hidden_layer_sizes': (128,)}, 'recall': np.float64(0.1256544502617801)}, {'params': {'hidden_layer_sizes': (16, 128)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (128,)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (16, 128)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (128, 128)}", "{'hidden_layer_sizes': (128,)}", "{'hidden_layer_sizes': (16, 128)}", "{'hidden_layer_sizes': (128,)}", "{'hidden_layer_sizes': (16, 128)}"]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     31335
           1       0.11      0.06      0.08       955

    accuracy                           0.96     32290
   macro avg       0.54      0.52      0.53     32290
weighted avg       0.95      0.96      0.95     32290

(32294, 72)
(32294,)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (128,)}, 'recall': np.float64(0.6073298429319371)}, {'params': {'hidden_layer_sizes': (16,)}, 'recall': np.float64(0.2774869109947644)}, {'params': {'hidden_layer_sizes': (64, 128)}, 'recall': np.float64(0.015706806282722512)}, {'params': {'hidden_layer_sizes': (32, 128)}, 'recall': np.float64(0.010471204188481676)}, {'params': {'hidden_layer_sizes': (32, 8)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (128,)}", "{'hidden_layer_sizes': (16,)}", "{'hidden_layer_sizes': (64, 128)}", "{'hidden_layer_sizes': (32, 128)}", "{'hidden_layer_sizes': (32, 8)}"]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     31339
           1       0.39      0.18      0.25       955

    accuracy                           0.97     32294
   macro avg       0.68      0.59      0.62     32294
weighted avg       0.96      0.97      0.96     32294

(28466, 108)
(28466,)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (64,)}, 'recall': np.float64(0.43529411764705883)}, {'params': {'hidden_layer_sizes': (64, 32)}, 'recall': np.float64(0.4235294117647059)}, {'params': {'hidden_layer_sizes': (64, 64)}, 'recall': np.float64(0.42105263157894735)}, {'params': {'hidden_layer_sizes': (128, 8)}, 'recall': np.float64(0.0)}, {'params': {'hidden_layer_sizes': (64, 128)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (64,)}", "{'hidden_layer_sizes': (64, 32)}", "{'hidden_layer_sizes': (64, 64)}", "{'hidden_layer_sizes': (128, 8)}", "{'hidden_layer_sizes': (64, 128)}"]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     27615
           1       0.32      0.26      0.29       851

    accuracy                           0.96     28466
   macro avg       0.65      0.62      0.63     28466
weighted avg       0.96      0.96      0.96     28466

(28466, 36)
(28466,)
[{'params': {'hidden_layer_sizes': (32, 32)}, 'recall': np.flo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[{'params': {'hidden_layer_sizes': (64, 64)}, 'recall': np.float64(0.13612565445026178)}, {'params': {'hidden_layer_sizes': (64,)}, 'recall': np.float64(0.09424083769633508)}, {'params': {'hidden_layer_sizes': (32, 32)}, 'recall': np.float64(0.005235602094240838)}, {'params': {'hidden_layer_sizes': (32, 128)}, 'recall': np.float64(0.005235602094240838)}, {'params': {'hidden_layer_sizes': (8, 16)}, 'recall': np.float64(0.0)}]
["{'hidden_layer_sizes': (64, 64)}", "{'hidden_layer_sizes': (64,)}", "{'hidden_layer_sizes': (32, 32)}", "{'hidden_layer_sizes': (32, 128)}", "{'hidden_layer_sizes': (8, 16)}"]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     31339
           1       0.23      0.05      0.08       955

    accuracy                           0.97     32294
   macro avg       0.60      0.52      0.53     32294
weighted avg       0.95      0.97      0.96     32294



In [None]:
print(sorted_by_recall)

In [None]:
gs_cv_model.best_params_