In [1]:
from datetime import datetime
from pathlib import Path
import sklearn
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import numpy as np
import pandas as pd

# Show which scikit-learn version this notebook is running against.
print(sklearn.__version__)

1.5.2


In [2]:
# Numeric activity tag -> activity name, numbered consecutively from 1.
# The first five entries (tags 1-5) are the fall activities; the remaining
# ones (tags 6-11) are ordinary daily activities.
ACTIVITY_TAG_MAP = dict(
    enumerate(
        (
            "fall_forward_hands",
            "fall_forward_knees",
            "fall_backwards",
            "fall_sideward",
            "fall_sitting_chair",
            "walking",
            "standing",
            "sitting",
            "picking_object",
            "jumping",
            "laying",
        ),
        start=1,
    )
)

In [3]:
def get_metrics(expected_y, predicted_y):
    """Score one set of predictions against the ground-truth labels.

    Args:
        expected_y: True labels.
        predicted_y: Labels predicted by the classifier, aligned with
            ``expected_y``.

    Returns:
        dict: Keys ``"recall"``, ``"f1"``, ``"precision"`` and ``"accuracy"``
        (in that order), each holding the value returned by the matching
        ``sklearn.metrics`` ``*_score`` function called with its defaults.
    """
    scorers = (
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
        ("precision", metrics.precision_score),
        ("accuracy", metrics.accuracy_score),
    )
    return {name: scorer(expected_y, predicted_y) for name, scorer in scorers}

In [4]:
# Seed for the stochastic estimators (random forest, MLP) so that repeated
# runs of the grid search produce the same scores.
MODEL_RANDOM_STATE = 42

# Layer widths explored for the MLP. Every width is tried as a single hidden
# layer and as the first layer followed by each width as a second layer.
_MLP_LAYER_WIDTHS = (8, 16, 32, 64, 128)

# Per-algorithm search configuration:
#   "estimator" - the base estimator handed to GridSearchCV
#   "params"    - the hyper-parameter grid searched for that estimator
models = {
    "KNN": {
        "estimator": KNN(metric="euclidean"),
        "params": {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
        },
    },
    "RF": {
        # Passing a callable as oob_score makes the forest score its
        # out-of-bag samples with that metric (recall here) instead of accuracy.
        "estimator": RandomForestClassifier(
            criterion="gini",
            oob_score=metrics.recall_score,
            random_state=MODEL_RANDOM_STATE,
        ),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_features": ["sqrt", "log2"],
        },
    },
    # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    "MLP": {
        # max_iter (number of epochs) is fixed at 1000 rather than searched.
        "estimator": MLPClassifier(
            solver="adam", max_iter=1000, random_state=MODEL_RANDOM_STATE
        ),
        "params": {
            # (8,), (8, 8), (8, 16), ..., (8, 128), (16,), (16, 8), ..., (128, 128)
            "hidden_layer_sizes": [
                (first,) if second is None else (first, second)
                for first in _MLP_LAYER_WIDTHS
                for second in (None, *_MLP_LAYER_WIDTHS)
            ],
        },
    },
    "SVM": {
        "estimator": SVC(),
        # Class 1 is the "fall" label; heavier weights penalise missed falls more.
        "params": {"class_weight": [{0: 1, 1: 1}, {0: 1, 1: 10}, {0: 1, 1: 20}]},
    },
}

# Algorithms to train, in order; every entry must be a key of `models`.
algorithm_options = ["KNN", "RF", "MLP", "SVM"]

In [5]:
# Directory that holds the dataset CSV files to train on.
datasets_path = Path.cwd() / "datasets"
# Bookkeeping / label columns that must not be fed to the models as features.
columns_not_used_for_training = ["Timestamp", "Subject", "Trial", "Activity", "Tag"]

# Outer evaluation folds. shuffle=True needs a fixed random_state, otherwise
# the folds (and every reported metric) change on each run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Tags 1..LAST_FALL_TAG are the fall activities (see ACTIVITY_TAG_MAP);
# everything above is a non-fall activity.
LAST_FALL_TAG = 5
# Rows carrying this tag are dropped before training.
# NOTE(review): presumably an "unknown / transition" label - confirm against the dataset docs.
EXCLUDED_TAG = 20


def _parse_dataset_name(filename):
    """Split a dataset file name into ``(sensor_position, features_domain)``.

    The text before the first ``_`` is the sensor position; whatever remains
    once the trailing ``dataset.csv`` is removed is the features domain. An
    empty domain is reported as ``"both"``.

    Args:
        filename: File name (not the full path) of a dataset CSV.

    Returns:
        tuple[str, str]: ``(sensor_position, features_domain)``.
    """
    # removesuffix, not rstrip: rstrip("dataset.csv") strips any trailing run of
    # those *characters* and would eat into names not ending in "_dataset.csv".
    stem = filename.removesuffix("dataset.csv")
    sensor_position, _, features_domain = stem.partition("_")
    return sensor_position, features_domain.rstrip("_") or "both"


for algorithm in algorithm_options:
    model = models[algorithm]

    # Inner hyper-parameter search, refit on every outer training fold.
    gs_cv_model = GridSearchCV(
        model["estimator"],
        model["params"],
        scoring="f1",
        n_jobs=-1,
        cv=2,
    )

    # One timestamped results directory per algorithm run.
    output_dir = (
        Path.cwd()
        / "results"
        / (datetime.today().strftime("%Y-%m-%d_%H-%M_") + algorithm)
    )
    output_dir.mkdir(exist_ok=True, parents=True)

    training_info_output_file = output_dir / "_training_info.txt"
    training_info_output_file.write_text(
        f"{algorithm} - {model['params']}", encoding="utf-8"
    )

    output_file_path = output_dir / f"{algorithm}_training_results.csv"
    mistakes_file_path = output_dir / f"{algorithm}_mistakes.csv"

    # Context managers guarantee both CSVs are flushed and closed even when a
    # fit or a dataset read fails part-way through the run.
    with open(output_file_path, mode="w", encoding="utf-8") as output_file, open(
        mistakes_file_path, mode="w", encoding="utf-8"
    ) as mistakes_file:
        output_file.write(
            "sensor_position,features_domain,n_fold,recall,f1,precision,accuracy,best_params\n"
        )
        mistakes_file.write(
            "sensor_position,features_domain,n_fold,"
            + ",".join(ACTIVITY_TAG_MAP.values())
            + "\n"
        )

        # Sorted glob: deterministic order, and stray non-CSV entries
        # (checkpoint folders, OS metadata files) are skipped.
        for dataset in sorted(datasets_path.glob("*.csv")):
            data = pd.read_csv(dataset, header=0).dropna()
            data = data[data["Tag"] != EXCLUDED_TAG]

            X = data.drop(columns=columns_not_used_for_training)
            print(X.shape)

            # Binary target: 1 for fall activities, 0 otherwise.
            tags = data["Tag"].to_numpy()
            y = np.where(tags > LAST_FALL_TAG, 0, 1)
            print(y.shape)

            sensor_position, features_domain = _parse_dataset_name(dataset.name)

            for n_fold, (train, test) in enumerate(skf.split(X, y)):
                X_train, X_test = X.iloc[train], X.iloc[test]
                y_train, y_test = y[train], y[test]

                # fit() returns the search object itself; predict() then uses
                # the estimator refit with the best parameters.
                classifier = gs_cv_model.fit(X_train, y_train)
                prediction = classifier.predict(X_test)

                fold_metrics = get_metrics(y_test, prediction)

                metrics_line = (
                    f"{sensor_position},{features_domain},{n_fold},{fold_metrics['recall']},"
                    f"{fold_metrics['f1']},{fold_metrics['precision']},{fold_metrics['accuracy']},\"{classifier.best_params_}\"\n"
                )
                output_file.write(metrics_line)
                print(metrics_line)

                # Count misclassified test samples per original activity tag.
                # Keys come from ACTIVITY_TAG_MAP so the columns always line up
                # with the header written above.
                fold_mistakes = dict.fromkeys(ACTIVITY_TAG_MAP, 0)
                for tag in tags[test][prediction != y_test]:
                    fold_mistakes[tag] += 1

                mistakes_line = (
                    f"{sensor_position},{features_domain},{n_fold},"
                    f"{','.join(str(count) for count in fold_mistakes.values())}\n"
                )
                mistakes_file.write(mistakes_line)
                print(mistakes_line)

                print(metrics.confusion_matrix(y_test, prediction))


# print(metrics.classification_report(expected_y, predicted_y))

In [7]:
# Requires the training cell above to have run: `classifier` is the fitted
# search object of the LAST algorithm trained. Only some estimators (e.g. the
# random forest) expose feature_importances_, so guard the lookup instead of
# raising AttributeError when the last algorithm was KNN, MLP or SVM.
best_estimator = classifier.best_estimator_
if hasattr(best_estimator, "feature_importances_"):
    print(len(best_estimator.feature_importances_))
else:
    print(f"{type(best_estimator).__name__} does not expose feature_importances_")

NameError: name 'classifier' is not defined