## **Import libraries**

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#TODO: SAVE EACH MODEL

## **Upload dataset**

In [6]:
#TODO: dane z gotowymi cechami -> czy próbować robić po swojemu ekstrakcję, skoro juz to zrobili?
dataset = pd.read_csv('masterTrain.csv')
dataset = dataset.iloc[:, 1:]
# 377010 x 34
# print(dataset.iloc[0:1,:])

In [7]:
from sklearn.model_selection import train_test_split

# Split dataset into users and impostors
num_classes = len(np.unique(dataset['class']))

user = {}
impostor = {}
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

for i in range(num_classes):
    user[i] = dataset.loc[dataset['class'] == i]
    impostor[i] = dataset.loc[dataset['class'] != i]
    # scale impostor to the same size as user
    impostor[i] = impostor[i].sample(n=len(user[i]))

    user[i].loc[:, 'class'] = 0
    impostor[i].loc[:, 'class'] = 1

    merged_data = pd.concat([user[i], impostor[i]])
    
    X = merged_data.loc[:, 'mean_x_speed':'numCritPoints']
    y = merged_data['class']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

#### Feature importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
extra_tree_forest = ExtraTreesClassifier(n_estimators = 5, criterion ='entropy', max_features = 30)

extra_tree_forest.fit(X, y)

feature_importance = extra_tree_forest.feature_importances_

feature_importance_normalized = np.std([tree.feature_importances_ for tree in extra_tree_forest.estimators_], axis = 0)

plt.figure(figsize=(10, 6))
plt.bar(X.columns, feature_importance_normalized)
plt.xticks(rotation=70)

plt.xlabel('Feature Labels')
plt.ylabel('Feature Importances')
plt.title('Comparison of different Feature Importances')
plt.tight_layout()
plt.show()

## **Random Forest**

### Hyperparameters tuning

Randomized Search

In [4]:
from sklearn.ensemble import RandomForestClassifier
import random

best_models_random = []

hyperparams = {
        'n_estimators': [700, 750, 850], #TODO: np.arange(700, 1000, step=50), # tested: 100-500, step=100 -> best 500
        'max_depth': [None] + list(np.arange(50, 100, step=10)), # tested: 10-50, step=10
        'min_samples_split': [1, 2, 4], #TODO: [2, 4, 6, 8, 10],
        'min_samples_leaf': [1], #TODO: [1, 2, 4], 
        'criterion': ['entropy'], #TODO: ['gini','entropy'],
        'max_features': ['sqrt'], #TODO: ['log2', 'sqrt'],
        'bootstrap': [False] #TODO: [True, False]
    }

classes = list(range(num_classes))
random.shuffle(classes)
random_classes = classes[:]

for i in random_classes:
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier()

    random_search = RandomizedSearchCV(rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1, n_iter=10)
    random_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', random_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = random_search.best_estimator_
    best_models_random.append(best_model)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCac

User  0
Best hyperparams: {'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8095807068145797
Mean Squared Error: 0.16066145793825382
R-squared (R2): 0.35735416454918656
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      6591
           1       0.85      0.82      0.84      6592

    accuracy                           0.84     13183
   macro avg       0.84      0.84      0.84     13183
weighted avg       0.84      0.84      0.84     13183



30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  8
Best hyperparams: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 70, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8111077694235588
Mean Squared Error: 0.16777608469003127
R-squared (R2): 0.32889565692342404
              precision    recall  f1-score   support

           0       0.81      0.86      0.84      6235
           1       0.85      0.80      0.83      6234

    accuracy                           0.83     12469
   macro avg       0.83      0.83      0.83     12469
weighted avg       0.83      0.83      0.83     12469



15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  1
Best hyperparams: {'n_estimators': 850, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8173700787401575
Mean Squared Error: 0.15614764424288233
R-squared (R2): 0.3754094230284707
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      7938
           1       0.86      0.82      0.84      7938

    accuracy                           0.84     15876
   macro avg       0.84      0.84      0.84     15876
weighted avg       0.84      0.84      0.84     15876



10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  7
Best hyperparams: {'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8170613737980915
Mean Squared Error: 0.1554597366274327
R-squared (R2): 0.3781610534902692
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      8581
           1       0.86      0.82      0.84      8581

    accuracy                           0.84     17162
   macro avg       0.85      0.84      0.84     17162
weighted avg       0.85      0.84      0.84     17162



25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  9
Best hyperparams: {'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8117147192716236
Mean Squared Error: 0.16521000242777373
R-squared (R2): 0.3391599902889051
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      8238
           1       0.85      0.81      0.83      8238

    accuracy                           0.83     16476
   macro avg       0.84      0.83      0.83     16476
weighted avg       0.84      0.83      0.83     16476



25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  2
Best hyperparams: {'n_estimators': 850, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8213025917336365
Mean Squared Error: 0.1482793867120954
R-squared (R2): 0.4068824503974877
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      7338
           1       0.86      0.83      0.85      7337

    accuracy                           0.85     14675
   macro avg       0.85      0.85      0.85     14675
weighted avg       0.85      0.85      0.85     14675



30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  3
Best hyperparams: {'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8138846135814128
Mean Squared Error: 0.1617490051441328
R-squared (R2): 0.35300397942346884
              precision    recall  f1-score   support

           0       0.81      0.88      0.84     10303
           1       0.87      0.80      0.83     10303

    accuracy                           0.84     20606
   macro avg       0.84      0.84      0.84     20606
weighted avg       0.84      0.84      0.84     20606



20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  4
Best hyperparams: {'n_estimators': 850, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8005267108978614
Mean Squared Error: 0.17772842639593908
R-squared (R2): 0.28908629441624367
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      7880
           1       0.86      0.77      0.81      7880

    accuracy                           0.82     15760
   macro avg       0.83      0.82      0.82     15760
weighted avg       0.83      0.82      0.82     15760



25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  6
Best hyperparams: {'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8222169121448213
Mean Squared Error: 0.15740659725260303
R-squared (R2): 0.3703736061693724
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      5715
           1       0.86      0.82      0.84      5714

    accuracy                           0.84     11429
   macro avg       0.84      0.84      0.84     11429
weighted avg       0.84      0.84      0.84     11429



20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Kuba\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalC

User  5
Best hyperparams: {'n_estimators': 700, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 90, 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.7879334862488993
Mean Squared Error: 0.18796022166552798
R-squared (R2): 0.24815910900520866
              precision    recall  f1-score   support

           0       0.83      0.78      0.81      6587
           1       0.79      0.84      0.82      6586

    accuracy                           0.81     13173
   macro avg       0.81      0.81      0.81     13173
weighted avg       0.81      0.81      0.81     13173



Grid Search

In [8]:
from sklearn.ensemble import RandomForestClassifier
import random

best_models_grid = []

hyperparams = {
        'n_estimators': list(np.arange(700, 800, step=20)),
        'max_depth': [None] + list(np.arange(70, 90, step=5)),
        'min_samples_split': [2],
        'min_samples_leaf': [1], 
        'criterion': ['entropy'],
        'max_features': ['sqrt'],
        'bootstrap': [False]
    }

classes = list(range(num_classes))
random.shuffle(classes)
random_classes = classes[:5]

for i in range(random_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier()

    grid_search = GridSearchCV(rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', grid_search.best_params_)
    print('Best score: ', grid_search.best_score_)

    best_model = grid_search.best_estimator_
    best_models_grid.append(best_model)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

User  0
Best hyperparams: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 70, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 780}
Best score:  0.8125391726819478
Mean Squared Error: 0.16369566866418872
R-squared (R2): 0.34521732157561125
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      6591
           1       0.85      0.82      0.83      6592

    accuracy                           0.84     13183
   macro avg       0.84      0.84      0.84     13183
weighted avg       0.84      0.84      0.84     13183

User  1
Best hyperparams: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 70, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 720}
Best score:  0.8166456692913385
Mean Squared Error: 0.16219450743260266
R-squared (R2): 0.35122197026958935
              precision    recall  f1-score   support

           0       0.83      0.86      0.8

### Final model

In [4]:
from sklearn.ensemble import RandomForestClassifier

models_rf = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("User: ", i)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))
    

User:  0
Mean Squared Error: 0.21603580368656602
R-squared (R2): 0.135856780281437
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      6592
           1       0.81      0.74      0.77      6591

    accuracy                           0.78     13183
   macro avg       0.79      0.78      0.78     13183
weighted avg       0.79      0.78      0.78     13183

User:  1
Mean Squared Error: 0.20105820105820105
R-squared (R2): 0.1957671957671958
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      7938
           1       0.82      0.77      0.79      7938

    accuracy                           0.80     15876
   macro avg       0.80      0.80      0.80     15876
weighted avg       0.80      0.80      0.80     15876

User:  2
Mean Squared Error: 0.21110732538330493
R-squared (R2): 0.15557069454568806
              precision    recall  f1-score   support

           0       0.77      0.83      

# **K Nearest Neighbors**

### Hyperparameters tuning

Randomized Search

In [None]:
from sklearn.ensemble import KNeighborsClassifier
import random

best_models_random = []

hyperparams = {
        'n_neighbors': list(range(2, 21)),
        'leaf_size': list(range(1, 31)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2, 3]
    }

classes = list(range(num_classes))
random.shuffle(classes)
random_classes = classes[:3]

for i in random_classes:
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()

    random_search = RandomizedSearchCV(knn, hyperparams, cv=5, scoring='accuracy', n_jobs=-1, n_iter=10)
    random_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', random_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = random_search.best_estimator_
    best_models_random.append(best_model)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

Grid Search

In [None]:
best_models_grid = []

hyperparams = {
    'n_neighbors': list(range(2, 21)),
    'leaf_size': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3]
}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()

    grid_search = GridSearchCV(knn, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', grid_search.best_params_)
    print('Best score: ', grid_search.best_score_)

    best_model = grid_search.best_estimator_
    best_models_grid.append(best_model)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

### Final model

In [None]:
from sklearn.ensemble import RandomForestClassifier

models_knn = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("User: ", i)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

# **Neural Network**

In [None]:
%pip install tensorflow

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import f1_score

models_nn = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

    model = Sequential()

    model.add(Dense(256, input_dim=31, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='leaky_relu'))
    model.add(Dropout(0.2))
    model.add(Dense(51, activation='softmax'))

    # metrics=['categorical_accuracy']) #TODO: sprawdzić czy to jest dobre
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=100, verbose=2)

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

    print("User: ", i)
    predictions = model.predict(X_test)
    predictions = (predictions > 0.95).astype(int)

    f1 = f1_score(y_test, predictions, average='macro')
    print(f'F1 Score (Macro): {f1}')


In [None]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(20, 16))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()