In [1]:
import csv
from datetime import datetime
from pathlib import Path
from statistics import multimode, mode

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier

In [2]:
def get_metrics(expected_y, predicted_y):
    """Score a set of predictions against the ground-truth labels.

    Args:
        expected_y: True labels.
        predicted_y: Labels predicted by the classifier, aligned with ``expected_y``.

    Returns:
        dict mapping ``"recall"``, ``"f1"``, ``"precision"`` and ``"accuracy"`` to the
        value returned by the matching ``sklearn.metrics`` scorer, each called with
        its default settings.
    """
    scorers = (
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
        ("precision", metrics.precision_score),
        ("accuracy", metrics.accuracy_score),
    )
    return {name: scorer(expected_y, predicted_y) for name, scorer in scorers}


def choose_model_params(params: list):
    """Return the most frequently occurring entries of ``params``.

    Hashable entries are delegated to ``statistics.multimode``. Unhashable entries
    (for example the ``best_params_`` dicts produced by a grid search) make
    ``multimode`` raise ``TypeError``, so they are tallied by equality instead.

    Args:
        params: Candidate values, one per observation (e.g. one per CV fold).

    Returns:
        list of the entries that occur most often, in first-seen order. More than
        one entry is returned on a tie; an empty input yields an empty list.
    """
    try:
        return multimode(params)
    except TypeError:
        # multimode counts with a hash table; fall back to equality comparison below.
        pass

    tallies = []  # [entry, count] pairs, kept in first-seen order
    for candidate in params:
        for tally in tallies:
            if tally[0] == candidate:
                tally[1] += 1
                break
        else:
            tallies.append([candidate, 1])

    if not tallies:
        return []

    highest = max(count for _, count in tallies)
    return [entry for entry, count in tallies if count == highest]

In [3]:
# Seed for the estimators that use randomness (random forest, MLP) so that repeated
# runs of the notebook train the same models.
ESTIMATOR_RANDOM_STATE = 42

# Widths tried for the first and (optional) second hidden layer of the MLP.
_mlp_layer_widths = (8, 16, 32, 64, 128)

# Registry of candidate algorithms: each entry pairs an unfitted estimator with the
# hyper-parameter grid that the grid search explores for it.
models = {
    "KNN": {
        "estimator": KNN(metric="euclidean"),
        "params": {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
        },
    },
    "RF": {
        # NOTE(review): the out-of-bag score is computed with recall on every fit, but
        # `oob_score_` is not read anywhere in this notebook — confirm it is needed.
        "estimator": RandomForestClassifier(
            criterion="gini",
            oob_score=metrics.recall_score,
            random_state=ESTIMATOR_RANDOM_STATE,
        ),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_features": ["sqrt", "log2"],
        },
    },
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    "MLP": {
        "estimator": MLPClassifier(
            solver="adam", max_iter=500, random_state=ESTIMATOR_RANDOM_STATE
        ),
        "params": {
            # "max_iter": [100, 500, 1000],  # number of epochs
            # For every first-layer width: the single-layer network, followed by each
            # two-layer network, i.e. (8,), (8, 8), (8, 16), ..., (128, 128).
            "hidden_layer_sizes": [
                sizes
                for first in _mlp_layer_widths
                for sizes in [
                    (first,),
                    *((first, second) for second in _mlp_layer_widths),
                ]
            ],
            # Candidate hyper-parameters that are currently not part of the grid:
            # "activation": ["relu"],
            # "penalty(alpha?)": [0.0001],
            # "batch_size": ["auto"],
            # "learning_rate_init": [0.001],
            # "beta_1": [0.9],  # From docs: Only used when solver='adam'.
            # "beta_2": [0.999],  # From docs: Only used when solver='adam'.
            # "n_iter_no_change": [10],
        },
    },
}
# Valid values for `algorithm`, derived from the registry so the two cannot drift apart.
algorithm_options = list(models)
algorithm = "RF"
model = models[algorithm]

In [4]:
# Directory scanned for the dataset CSV files.
datasets_path = Path.cwd() / "datasets"
# Columns dropped from each dataset before training (`Tag` is used to derive the
# label instead of being fed to the classifier).
columns_not_used_for_training = ["Timestamp", "Subject", "Trial", "Activity", "Tag"]

# Outer evaluation folds. The splitter shuffles, so it is seeded: without a fixed
# `random_state` every run would evaluate on different folds and the recorded
# metrics could not be reproduced.
CV_RANDOM_STATE = 42
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_RANDOM_STATE)

# Inner hyper-parameter search: 2-fold CV on each outer training split, picking the
# parameter combination with the best recall.
gs_cv_model = GridSearchCV(
    estimator=model["estimator"],
    param_grid=model["params"],
    scoring="recall",
    n_jobs=-1,
    cv=2,
)

In [5]:
def _parse_dataset_name(filename: str):
    """Split a dataset file name into ``(sensor_position, features_domain)``.

    File names are expected to look like ``<sensor>_<domain>_dataset.csv`` or
    ``<sensor>_dataset.csv``; the latter is reported with the domain ``"both"``.
    Suffixes are removed explicitly: ``str.rstrip("dataset.csv")`` strips any
    trailing run of those *characters*, which only works by accident when an
    underscore precedes ``dataset``.
    """
    stem = filename
    for suffix in (".csv", "dataset"):
        if stem.endswith(suffix):
            stem = stem[: -len(suffix)]
    sensor_position, _, features_domain = stem.rstrip("_").partition("_")
    return sensor_position, features_domain or "both"


# One timestamped results directory per run, e.g. results/2024-01-31_13-45_RF.
output_dir_name = "results/" + datetime.today().strftime("%Y-%m-%d_%H-%M_") + algorithm
output_dir = Path.cwd() / output_dir_name
output_dir.mkdir(exist_ok=True, parents=True)

# Record which algorithm and parameter grid produced the results in this directory.
training_info_output_file = output_dir / "_training_info.txt"
training_info_output_file.write_text(
    f"{algorithm} - {model['params']}", encoding="utf-8"
)

output_file_path = output_dir / f"{algorithm}_training_results.csv"

result_columns = [
    "sensor_position",
    "features_domain",
    "n_fold",
    "recall",
    "f1",
    "precision",
    "accuracy",
    "best_params",
]

# `with` guarantees the results file is closed even if a fit or a read fails midway.
# csv.writer quotes fields containing commas; the best-params dict always can, and
# writing it unquoted produced rows with more fields than the header.
with open(output_file_path, mode="w", encoding="utf-8", newline="") as output_file:
    results_writer = csv.writer(output_file)
    results_writer.writerow(result_columns)

    # Sorted glob: deterministic processing order, and stray non-CSV entries in the
    # datasets directory are ignored instead of crashing read_csv.
    for dataset in sorted(datasets_path.glob("*.csv")):
        data = pd.read_csv(dataset, header=0).dropna()
        X = data.drop(columns=columns_not_used_for_training)
        print(X.shape)

        # Binary "is fall" label: Tag values above 5 map to 0, all others to 1.
        y = np.where(data["Tag"] > 5, 0, 1)
        print(y.shape)

        sensor_position, features_domain = _parse_dataset_name(dataset.name)

        for n_fold, (train, test) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y[train], y[test]

            # Grid search on the training split only, then evaluate the refitted
            # best estimator on the held-out split.
            classifier = gs_cv_model.fit(X_train, y_train)
            prediction = classifier.predict(X_test)
            fold_metrics = get_metrics(y_test, prediction)

            results_writer.writerow(
                [
                    sensor_position,
                    features_domain,
                    n_fold,
                    fold_metrics["recall"],
                    fold_metrics["f1"],
                    fold_metrics["precision"],
                    fold_metrics["accuracy"],
                    classifier.best_params_,
                ]
            )
            # Flush after each fold so finished results survive an interrupted run.
            output_file.flush()

(32290, 108)
(32290,)


  _data = np.array(data, dtype=dtype, copy=copy,


(32290, 36)
(32290,)
(32294, 72)
(32294,)
(32290, 108)
(32290,)
(32290, 36)
(32290,)
(32294, 72)
(32294,)
(28466, 540)
(28466,)
(32290, 36)
(32290,)
(32294, 72)
(32294,)
(32290, 108)
(32290,)
(32290, 36)
(32290,)
(32294, 72)
(32294,)
(28466, 108)
(28466,)
(28466, 36)
(28466,)
(28468, 72)
(28468,)
(32290, 108)
(32290,)
(32290, 36)
(32290,)
(32294, 72)
(32294,)


In [None]:
# `sorted_by_recall` was never assigned in this notebook, so this cell raised NameError
# on a fresh kernel. Rebuild it from the per-fold results file written by the training
# cell, best recall first.
# The separator regex splits on commas that are NOT inside the `{...}` of the
# best-params column, so rows parse into the 8 header columns even though that column
# itself contains commas (quoted or not).
fold_results = pd.read_csv(
    output_file_path, sep=r",\s*(?![^{}]*\})", engine="python"
)
sorted_by_recall = fold_results.sort_values("recall", ascending=False)
print(sorted_by_recall)

In [None]:
# Displays the best hyper-parameters of the most recent `gs_cv_model.fit` call only.
# The training loop refits this same object on every fold of every dataset, so earlier
# winners are overwritten here; the per-fold values are stored in the results CSV.
gs_cv_model.best_params_