## **Import libraries**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#TODO: SAVE EACH MODEL

## **Upload dataset**

In [2]:
#TODO: dane z gotowymi cechami -> czy próbować robić po swojemu ekstrakcję, skoro juz to zrobili?
dataset = pd.read_csv('masterTrain.csv')
dataset = dataset.iloc[:, 1:]
# 377010 x 34
print(dataset.iloc[0:1,:])

   mean_x_speed  mean_y_speed  mean_speed     mean_acc    mean_jerk  mean_ang  \
0    603.874693     39.584684  607.956707  1980.903177  1373208.638 -0.807362   

   mean_curve  std_x_speed  std_y_speed   std_speed  ...      max_acc  \
0   -0.022807   370.019319    56.034093  369.694944  ...  87490.11908   

     max_ang     max_jerk  max_curve  elapsed_time  sum_of_angles  \
0  38.722551  5776079.105   0.160875      0.105366       0.611254   

   accTimeatBeg  traj_length  numCritPoints  class  
0      0.007992    46.307298              3      0  

[1 rows x 34 columns]


In [3]:
from sklearn.model_selection import train_test_split

# Split dataset into users and impostors
num_classes = len(np.unique(dataset['class']))

user = {}
impostor = {}
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

for i in range(num_classes):
    user[i] = dataset.loc[dataset['class'] == i]
    impostor[i] = dataset.loc[dataset['class'] != i]
    # scale impostor to the same size as user
    impostor[i] = impostor[i].sample(n=len(user[i]))

    user[i].loc[:, 'class'] = 0
    impostor[i].loc[:, 'class'] = 1

    merged_data = pd.concat([user[i], impostor[i]])
    
    X = merged_data.loc[:, 'mean_x_speed':'numCritPoints']
    y = merged_data['class']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

## **Random Forest**

In [4]:
from sklearn.ensemble import RandomForestClassifier

models_rf = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)

    predictions = rf.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("User: ", i)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))
    

User:  0
Mean Squared Error: 0.21603580368656602
R-squared (R2): 0.135856780281437
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      6592
           1       0.81      0.74      0.77      6591

    accuracy                           0.78     13183
   macro avg       0.79      0.78      0.78     13183
weighted avg       0.79      0.78      0.78     13183

User:  1
Mean Squared Error: 0.20105820105820105
R-squared (R2): 0.1957671957671958
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      7938
           1       0.82      0.77      0.79      7938

    accuracy                           0.80     15876
   macro avg       0.80      0.80      0.80     15876
weighted avg       0.80      0.80      0.80     15876

User:  2
Mean Squared Error: 0.21110732538330493
R-squared (R2): 0.15557069454568806
              precision    recall  f1-score   support

           0       0.77      0.83      

### Hyperparameters tuning

Randomized Search

In [7]:
from sklearn.ensemble import RandomForestClassifier
import random

best_hyperparams_random = []
best_models_random = []

hyperparams = {
        'n_estimators': [100], #np.arange(100, 500, step=50),
        # 'max_depth': [None] + list(np.arange(10, 100, step=20)), # TODO: ograniczyć max_depth do np. 50
        # 'min_samples_split': [2, 4, 6, 8, 10],
        # 'min_samples_leaf': [1, 2, 4], 
        'criterion':['gini','entropy'],
        'max_features': ['log2', 'sqrt'],
        'bootstrap': [False]
    }

classes = list(range(num_classes))
random.shuffle(classes)
random_classes = classes[:3]

for i in random_classes:
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier()

    random_search = RandomizedSearchCV(rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1, n_iter=10)
    random_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', random_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = random_search.best_estimator_
    best_models_random.append(best_model)
    best_hyperparams_random.append(random_search.best_params_)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
User 0
Best hyperparams: {'n_estimators': 100, 'max_features': 'log2', 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.800932977149361
Mean Squared Error: 0.17120534021087763
R-squared (R2): 0.3151786352160123
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      6591
           1       0.85      0.80      0.82      6592

    accuracy                           0.83     13183
   macro avg       0.83      0.83      0.83     13183
weighted avg       0.83      0.83      0.83     13183



Grid Search

In [None]:
best_hyperparams_grid = []
best_models_grid = []

hyperparams = {
        'n_estimators': list(range(10, 200)),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4], 
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier()

    grid_search = GridSearchCV(rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', grid_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = grid_search.best_estimator_
    best_models_grid.append(best_model)
    best_hyperparams_grid.append(grid_search.best_params_)

    predictions = best_model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))