## **Import libraries**

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#TODO: SAVE EACH MODEL

## **Load dataset**

In [8]:
# TODO: the data already ships with ready-made features -> is it worth trying
# our own feature extraction when the dataset authors have already done it?
# Read the CSV and discard its first column in one chained expression.
dataset = pd.read_csv('masterTrain.csv').iloc[:, 1:]
# Shape recorded by the author: 377010 x 34
# Disabled debug peek at the first row: print(dataset.iloc[0:1,:])

In [9]:
from sklearn.model_selection import train_test_split

# Build one balanced binary dataset per user (user-vs-impostor):
#   label 0 -> rows recorded for user i
#   label 1 -> an equally sized random sample of rows from every other user
# The resulting train/test parts are stored position-wise in the four lists.

# Fixed seed so the impostor sample and the train/test split are reproducible
# after a kernel restart.
SPLIT_SEED = 42

num_classes = len(np.unique(dataset['class']))

user = {}
impostor = {}
X_train_list = []
X_test_list = []
y_train_list = []
y_test_list = []

# NOTE(review): range(num_classes) assumes the 'class' column holds exactly the
# labels 0..num_classes-1 -- TODO confirm against the CSV.
for i in range(num_classes):
    is_user = dataset['class'] == i

    # .assign() returns a new frame, so relabelling never writes into a slice
    # of `dataset` (assigning through .loc on the slice raises
    # SettingWithCopyWarning and is fragile under copy-on-write).
    user[i] = dataset.loc[is_user].assign(**{'class': 0})

    # Down-sample the impostor pool to the user's size so both labels are balanced.
    impostor[i] = (
        dataset.loc[~is_user]
        .sample(n=len(user[i]), random_state=SPLIT_SEED)
        .assign(**{'class': 1})
    )

    merged_data = pd.concat([user[i], impostor[i]])

    # Feature columns are the label-based slice between these two column names.
    X = merged_data.loc[:, 'mean_x_speed':'numCritPoints']
    y = merged_data['class']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
    )

    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

# NOTE: after the loop, X / y (and X_train, ...) still refer to the LAST user's
# data; the feature-importance cell reads X and y from here.

#### Feature importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank the features with a small extremely-randomized-trees ensemble.
# NOTE: X and y are whatever the split cell left bound after its loop, i.e.
# the balanced user-vs-impostor data of the last user processed there.
extra_tree_forest = ExtraTreesClassifier(
    n_estimators=5,
    criterion='entropy',
    # Never request more candidate features per split than X actually has.
    max_features=min(30, X.shape[1]),
    # Only 5 trees -> importances are noisy; fix the seed so the plot is repeatable.
    random_state=42,
)

extra_tree_forest.fit(X, y)

# Ensemble importance of each feature (scikit-learn reports these already
# normalized so that they sum to 1).
feature_importance = extra_tree_forest.feature_importances_
feature_importance_normalized = feature_importance

# Spread of the importance across the individual trees. The previous version
# plotted THIS value as if it were the importance itself; it is only a
# measure of disagreement between trees, so it is drawn as an error bar.
feature_importance_std = np.std(
    [tree.feature_importances_ for tree in extra_tree_forest.estimators_],
    axis=0,
)

plt.figure(figsize=(10, 6))
plt.bar(X.columns, feature_importance_normalized, yerr=feature_importance_std, capsize=3)
plt.xticks(rotation=70)

plt.xlabel('Feature Labels')
plt.ylabel('Feature Importances')
plt.title('Comparison of different Feature Importances')
plt.tight_layout()
plt.show()

## **Random Forest**

### Hyperparameters tuning

Randomized Search

In [10]:
from sklearn.ensemble import RandomForestClassifier
import random

# Randomized hyper-parameter search for the Random Forest, run on a small
# random subset of users to keep the cost manageable.

# One fixed seed drives the user subset, the sampled candidates and the
# forests themselves, so a re-run reproduces the same report.
RANDOM_STATE = 42
N_TUNING_USERS = 3

best_models_random = []

hyperparams = {
        'n_estimators': [100, 200, 300, 400, 500], #np.arange(100, 500, step=50),
        # 'max_depth': [None] + list(np.arange(10, 100, step=20)), # TODO: cap max_depth at e.g. 50
        # 'min_samples_split': [2, 4, 6, 8, 10],
        # 'min_samples_leaf': [1, 2, 4], 
        'criterion': ['gini','entropy'],
        'max_features': ['log2', 'sqrt'],
        'bootstrap': [False] #TODO: [True, False]
    }

# Draw the users to tune on (never more than there are users).
random_classes = random.Random(RANDOM_STATE).sample(
    range(num_classes), min(N_TUNING_USERS, num_classes)
)

for i in random_classes:
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier(random_state=RANDOM_STATE)

    random_search = RandomizedSearchCV(
        rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1, n_iter=10,
        random_state=RANDOM_STATE,
    )
    random_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', random_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = random_search.best_estimator_
    best_models_random.append(best_model)

    predictions = best_model.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy);
    # accuracy itself is part of the classification report printed below.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

User  3
Best hyperparams: {'n_estimators': 500, 'max_features': 'sqrt', 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8153163637985514
Mean Squared Error: 0.15757546345724546
R-squared (R2): 0.36969814617101815
              precision    recall  f1-score   support

           0       0.82      0.88      0.85     10303
           1       0.87      0.81      0.84     10303

    accuracy                           0.84     20606
   macro avg       0.84      0.84      0.84     20606
weighted avg       0.84      0.84      0.84     20606

User  9
Best hyperparams: {'n_estimators': 500, 'max_features': 'sqrt', 'criterion': 'entropy', 'bootstrap': False}
Best score:  0.8068133535660091
Mean Squared Error: 0.1607186210245205
R-squared (R2): 0.357125515901918
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      8238
           1       0.86      0.81      0.83      8238

    accuracy                           0.84     16476
   macro a

Grid Search

In [None]:
# Exhaustive grid search for the Random Forest, one search per user.
# NOTE(review): this grid has 190 * 4 * 3 * 3 * 2 * 2 = 27,360 candidates,
# i.e. 136,800 forest fits per user with cv=5 -- consider coarsening
# 'n_estimators' before running it for every user.

# Seed the forests so that two runs of the same candidate score identically.
RANDOM_STATE = 42

best_models_grid = []

hyperparams = {
        'n_estimators': list(range(10, 200)),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4], 
        'criterion': ['gini','entropy'],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [False]
    }

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier(random_state=RANDOM_STATE)

    grid_search = GridSearchCV(rf, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', grid_search.best_params_)
    print('Best score: ', grid_search.best_score_)

    best_model = grid_search.best_estimator_
    best_models_grid.append(best_model)

    predictions = best_model.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy);
    # accuracy itself is part of the classification report printed below.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

### Final model

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Train and evaluate the final Random Forest for every user.
# models_rf maps the user index to its fitted classifier.
RANDOM_STATE = 42  # fixed seed so the reported metrics are reproducible

models_rf = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    rf.fit(X_train, y_train)

    # Keep the fitted model; previously the dict was created but never filled,
    # so every trained forest was discarded at the next iteration.
    models_rf[i] = rf

    predictions = rf.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy).
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("User: ", i)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))
    

User:  0
Mean Squared Error: 0.21603580368656602
R-squared (R2): 0.135856780281437
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      6592
           1       0.81      0.74      0.77      6591

    accuracy                           0.78     13183
   macro avg       0.79      0.78      0.78     13183
weighted avg       0.79      0.78      0.78     13183

User:  1
Mean Squared Error: 0.20105820105820105
R-squared (R2): 0.1957671957671958
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      7938
           1       0.82      0.77      0.79      7938

    accuracy                           0.80     15876
   macro avg       0.80      0.80      0.80     15876
weighted avg       0.80      0.80      0.80     15876

User:  2
Mean Squared Error: 0.21110732538330493
R-squared (R2): 0.15557069454568806
              precision    recall  f1-score   support

           0       0.77      0.83      

## **K Nearest Neighbors**

### Hyperparameters tuning

Randomized Search

In [None]:
# KNeighborsClassifier lives in sklearn.neighbors (importing it from
# sklearn.ensemble raises ImportError).
from sklearn.neighbors import KNeighborsClassifier
import random

# Randomized hyper-parameter search for k-NN, run on a small random subset
# of users to keep the cost manageable.

# One fixed seed drives both the user subset and the sampled candidates.
RANDOM_STATE = 42
N_TUNING_USERS = 3

# NOTE: this rebinds best_models_random, discarding the Random Forest models
# collected under the same name by the earlier randomized-search cell.
best_models_random = []

hyperparams = {
        'n_neighbors': list(range(2, 21)),
        'leaf_size': list(range(1, 31)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2, 3]
    }

# Draw the users to tune on (never more than there are users).
random_classes = random.Random(RANDOM_STATE).sample(
    range(num_classes), min(N_TUNING_USERS, num_classes)
)

for i in random_classes:
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()

    random_search = RandomizedSearchCV(
        knn, hyperparams, cv=5, scoring='accuracy', n_jobs=-1, n_iter=10,
        random_state=RANDOM_STATE,
    )
    random_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', random_search.best_params_)
    print('Best score: ', random_search.best_score_)

    best_model = random_search.best_estimator_
    best_models_random.append(best_model)

    predictions = best_model.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy);
    # accuracy itself is part of the classification report printed below.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

Grid Search

In [None]:
# Imported here so the cell does not depend on another cell having brought
# KNeighborsClassifier into scope.
from sklearn.neighbors import KNeighborsClassifier

# Exhaustive grid search for k-NN, one search per user.
# NOTE(review): this grid has 19 * 30 * 2 * 4 * 3 = 13,680 candidates,
# i.e. 68,400 fits per user with cv=5 -- consider shrinking 'leaf_size'
# before running it for every user.

# NOTE: this rebinds best_models_grid, discarding the Random Forest models
# collected under the same name by the earlier grid-search cell.
best_models_grid = []

hyperparams = {
    'n_neighbors': list(range(2, 21)),
    'leaf_size': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3]
}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()

    grid_search = GridSearchCV(knn, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print('User ', i)
    print('Best hyperparams:', grid_search.best_params_)
    print('Best score: ', grid_search.best_score_)

    best_model = grid_search.best_estimator_
    best_models_grid.append(best_model)

    predictions = best_model.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy);
    # accuracy itself is part of the classification report printed below.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

### Final model

In [None]:
# This cell trains k-NN models, so it needs KNeighborsClassifier (from
# sklearn.neighbors), not RandomForestClassifier.
from sklearn.neighbors import KNeighborsClassifier

# Train and evaluate the final k-NN classifier for every user.
# models_knn maps the user index to its fitted classifier.
models_knn = {}

for i in range(num_classes):
    X_train = X_train_list[i]
    X_test = X_test_list[i]
    y_train = y_train_list[i]
    y_test = y_test_list[i]

    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)

    # Keep the fitted model; previously the dict was created but never filled.
    models_knn[i] = knn

    predictions = knn.predict(X_test)

    # With 0/1 labels the MSE equals the misclassification rate (1 - accuracy).
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print("User: ", i)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared (R2): {r2}")
    print(classification_report(y_test, predictions))

## **Neural Network**

In [None]:
# Install TensorFlow into the running kernel's environment (no version is pinned).
# NOTE(review): presumably this is what provides the `keras` package imported
# by the next cell -- confirm for the TensorFlow version in use.
%pip install tensorflow

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import f1_score

# Train one binary (user = 0 / impostor = 1) neural network per user.
# models_nn maps the user index to its trained model.

# Share of each user's training split that is held out for validation.
VAL_FRACTION = 0.15
# Minimum predicted impostor probability required to output label 1.
DECISION_THRESHOLD = 0.95

models_nn = {}

for i in range(num_classes):
    # The test split prepared by the split cell stays untouched.
    X_test = X_test_list[i]
    y_test = y_test_list[i]

    # Carve the validation set out of THIS user's training split. The previous
    # version re-split the globals X / y, which hold only the last user's data,
    # so every iteration trained and evaluated on the same user.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_list[i],
        y_train_list[i],
        test_size=VAL_FRACTION,
        stratify=y_train_list[i],
    )

    model = Sequential()

    # Input width follows the data instead of a hard-coded 31.
    model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='leaky_relu'))
    model.add(Dropout(0.2))
    # Labels are plain 0/1 integers, so the head is a single sigmoid unit
    # trained with binary cross-entropy. A 51-way softmax with categorical
    # cross-entropy needs one-hot targets and does not match these labels.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=100, verbose=2)

    # Keep the trained model; previously the dict was created but never filled.
    models_nn[i] = model

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

    print("User: ", i)
    predictions = model.predict(X_test)
    # predict() returns shape (n, 1); flatten to 1-D labels for f1_score.
    predictions = (predictions > DECISION_THRESHOLD).astype(int).ravel()

    f1 = f1_score(y_test, predictions, average='macro')
    print(f'F1 Score (Macro): {f1}')


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix for the most recently trained network.
# NOTE: model, X_test and y_test are whatever the training loop left bound,
# i.e. they belong to the last user processed there.

# Same cut-off the training cell applies to the predicted probabilities.
DECISION_THRESHOLD = 0.95

y_score = np.asarray(model.predict(X_test))
if y_score.ndim > 1 and y_score.shape[1] > 1:
    # One score per class -> pick the most likely class.
    y_pred = np.argmax(y_score, axis=1)
else:
    # Single probability per sample -> threshold it.
    y_pred = (y_score.ravel() > DECISION_THRESHOLD).astype(int)

y_true = np.asarray(y_test)
if y_true.ndim > 1:
    # One-hot targets -> class indices. Plain 1-D labels are used as they are
    # (argmax over axis=1 fails on a 1-D array).
    y_true = np.argmax(y_true, axis=1)

# Pass the label set explicitly so the tick labels match the matrix rows.
labels = np.unique(np.concatenate([y_true, y_pred]))
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Drawn with scikit-learn's own display helper; the previous version called
# sns.heatmap although seaborn is never imported in this notebook.
fig, ax = plt.subplots(figsize=(20, 16))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot(
    ax=ax, cmap='Blues', values_format='d'
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()