In [1]:
# SVM with stratified k-fold cross validation
print('SVM with stratified k-fold cross validation without metadata:')

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn import metrics
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

# Combine train and val data
train_val_data = pd.concat([train_data, val_data], axis=0)

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['tetris_experience_binary']

X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_test = scaler.transform(X_test)

# Stratified k-fold cross validation
skf = StratifiedKFold(n_splits=5)
accuracy_scores_svm = []
precision_scores_svm = []
recall_scores_svm = []
f1_scores_svm = []
aucroc_scores_svm = []

for train_index, test_index in skf.split(X_train_val, y_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Create an SVM classifier
    clf_svm = SVC(probability=True)

    # Define the hyperparameters to tune
    parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}

    # Create the GridSearchCV object
    grid_search = GridSearchCV(clf_svm, parameters, cv=5, scoring='f1')

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train_val, y_train_val)
    clf_svm = grid_search.best_estimator_

    # Train the model using the training sets
    clf_svm.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = clf_svm.predict(X_test_cv)

    # Evaluation
    accuracy_scores_svm.append(metrics.accuracy_score(y_test_cv, y_pred))
    precision_scores_svm.append(metrics.precision_score(y_test_cv, y_pred))
    recall_scores_svm.append(metrics.recall_score(y_test_cv, y_pred))
    f1_scores_svm.append(metrics.f1_score(y_test_cv, y_pred))
    y_proba = clf_svm.predict_proba(X_test_cv)[:, 1]
    aucroc_scores_svm.append(metrics.roc_auc_score(y_test_cv, y_proba))

# Calculate average scores
print("\nCross-Validation Average Accuracy:", np.mean(accuracy_scores_svm))
print("Cross-Validation Average Precision:", np.mean(precision_scores_svm))
print("Cross-Validation Average Recall:", np.mean(recall_scores_svm))
print("Cross-Validation Average F1-score:", np.mean(f1_scores_svm))
print("Cross-Validation Average AUCROC:", np.mean(aucroc_scores_svm))

# Evaluate on test set
y_test_pred_svm = clf_svm.predict(X_test)
y_test_proba = clf_svm.predict_proba(X_test)[:, 1]

print("\nTest Accuracy:", metrics.accuracy_score(y_test, y_test_pred_svm))
print("Test Precision:", metrics.precision_score(y_test, y_test_pred_svm))
print("Test Recall:", metrics.recall_score(y_test, y_test_pred_svm))
print("Test F1-score:", metrics.f1_score(y_test, y_test_pred_svm))
print("Test AUCROC:", metrics.roc_auc_score(y_test, y_test_proba))



SVM with stratified k-fold cross validation without metadata:

Cross-Validation Average Accuracy: 0.7333333333333334
Cross-Validation Average Precision: 0.8
Cross-Validation Average Recall: 0.37
Cross-Validation Average F1-score: 0.4904761904761904
Cross-Validation Average AUCROC: 0.6880555555555555

Test Accuracy: 0.625
Test Precision: 0.75
Test Recall: 0.375
Test F1-score: 0.5
Test AUCROC: 0.84375


In [2]:
from sklearn.inspection import permutation_importance

# Calculate permutation importance
result = permutation_importance(
    clf_svm, X_train_val, y_train_val, n_repeats=10, random_state=42, scoring='f1'
)

# Store feature importances and their corresponding feature names in a list
feature_importances = list(zip(X_columns, result.importances_mean))

# Normalize the feature importances
normalized_feature_importances = [
    (feature, importance / sum(result.importances_mean))
    for feature, importance in feature_importances
]

# Sort the list based on the normalized importance values (from high to low)
sorted_feature_importances = sorted(normalized_feature_importances, key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nNormalized permutation importance (from high to low):")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")




Normalized permutation importance (from high to low):
total_left_averagepergame: 0.4022708639545511
total_rotations_averagepergame: 0.3231665743230315
total_clockwise_rotations_averagepergame: 0.29554728859769247
total_keystrokes_averagepergame: 0.017631830006201842
total_conterclockwise_rotations_averagepergame: 0.014693191671834869
total_down_averagepergame: 0.011754553337467893
total_left_right_averagepergame: 0.011754553337467893
total_right_averagepergame: -0.07681885522824752


In [67]:
from sklearn.metrics import confusion_matrix

# Bereken de confusion matrix
cm_svm = confusion_matrix(y_test, y_test_pred_svm)

# Maak een DataFrame van de confusion matrix met labels
cm_svm_df = pd.DataFrame(cm_svm, 
                     index = ['Actual Negative', 'Actual Positive'], 
                     columns = ['Predicted Negative', 'Predicted Positive'])

# Print de confusion matrix
print("Confusion Matrix for SVM:")
print(cm_svm_df)


Confusion Matrix for SVM:
                 Predicted Negative  Predicted Positive
Actual Negative                   7                   1
Actual Positive                   5                   3


In [68]:
from sklearn.ensemble import RandomForestClassifier


# Random Forest with stratified k-fold cross validation

print('Random Forest with stratified k-fold cross validation without metadata:')

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['tetris_experience_binary']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Stratified k-fold cross validation
skf = StratifiedKFold(n_splits=5)
accuracy_scores_rf = []
precision_scores_rf = []
recall_scores_rf = []
f1_scores_rf = []
aucroc_scores_rf = []

# Initialize the cumulative feature importances array
cumulative_feature_importances = np.zeros(len(X_columns))

for train_index, test_index in skf.split(X_train_val, y_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }

    # Create a Random Forest classifier
    clf_rf = RandomForestClassifier(n_estimators=100)

    # Perform grid search
    grid_search = GridSearchCV(clf_rf, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train_val, y_train_val)
    
    # Train the model using the best hyperparameters
    clf_rf = grid_search.best_estimator_

    # Train the model using the training sets
    clf_rf.fit(X_train_cv, y_train_cv)
    
    # Add the current model's feature importances to the cumulative feature importances array
    cumulative_feature_importances += clf_rf.feature_importances_

    # Predict the response for test dataset
    y_pred = clf_rf.predict(X_test_cv)

    # Evaluation
    accuracy_scores_rf.append(metrics.accuracy_score(y_test_cv, y_pred))
    precision_scores_rf.append(metrics.precision_score(y_test_cv, y_pred))
    recall_scores_rf.append(metrics.recall_score(y_test_cv, y_pred))
    f1_scores_rf.append(metrics.f1_score(y_test_cv, y_pred))
    y_proba = clf_rf.predict_proba(X_test_cv)[:, 1]
    aucroc_scores_rf.append(metrics.roc_auc_score(y_test_cv, y_proba))

# Calculate average scores
print("\nCross-Validation Average Accuracy:", np.mean(accuracy_scores_rf))
print("Cross-Validation Average Precision:", np.mean(precision_scores_rf))
print("Cross-Validation Average Recall:", np.mean(recall_scores_rf))
print("Cross-Validation Average F1-score:", np.mean(f1_scores_rf))
print("Cross-Validation Average AUCROC:", np.mean(aucroc_scores_rf))

# Evaluate on test set
y_test_pred_rf = clf_rf.predict(X_test)
y_test_proba = clf_rf.predict_proba(X_test)[:, 1]

print("\nTest Accuracy:", metrics.accuracy_score(y_test, y_test_pred_rf))
print("Test Precision:", metrics.precision_score(y_test, y_test_pred_rf))
print("Test Recall:", metrics.recall_score(y_test, y_test_pred_rf))
print("Test F1-score:", metrics.f1_score(y_test, y_test_pred_rf))
print("Test AUCROC:", metrics.roc_auc_score(y_test, y_test_proba))

# Calculate the average feature importances by dividing the cumulative feature importances array by the number of folds
average_feature_importances = cumulative_feature_importances / skf.get_n_splits()

# Print the average feature importances
feature_importances = pd.DataFrame(average_feature_importances, index=X_columns, columns=['importance']).sort_values('importance', ascending=False)
print("\nFeature Importance - Random Forest:")
for index, row in feature_importances.iterrows():
    print(f"{index:<30} importance: {row['importance']:.3f}")



Random Forest with stratified k-fold cross validation without metadata:

Cross-Validation Average Accuracy: 0.6192307692307693
Cross-Validation Average Precision: 0.4
Cross-Validation Average Recall: 0.32
Cross-Validation Average F1-score: 0.33714285714285713
Cross-Validation Average AUCROC: 0.7270833333333334

Test Accuracy: 0.625
Test Precision: 0.6666666666666666
Test Recall: 0.5
Test F1-score: 0.5714285714285715
Test AUCROC: 0.59375

Feature Importance - Random Forest:
total_down_averagepergame      importance: 0.175
total_rotations_averagepergame importance: 0.173
total_clockwise_rotations_averagepergame importance: 0.139
total_right_averagepergame     importance: 0.121
total_keystrokes_averagepergame importance: 0.116
total_left_averagepergame      importance: 0.101
total_left_right_averagepergame importance: 0.088
total_conterclockwise_rotations_averagepergame importance: 0.087


In [69]:
# Importeer de benodigde bibliotheken
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns

# Bereken de confusion matrix
cm_rf = confusion_matrix(y_test, y_test_pred_rf)

# Maak een DataFrame van de confusion matrix met labels
cm_rf_df = pd.DataFrame(cm_rf, 
                     index = ['Actual Negative', 'Actual Positive'], 
                     columns = ['Predicted Negative', 'Predicted Positive'])

# Print de confusion matrix
print("Confusion Matrix for Random Forest:")
print(cm_rf_df)


Confusion Matrix for Random Forest:
                 Predicted Negative  Predicted Positive
Actual Negative                   6                   2
Actual Positive                   4                   4


In [2]:
# Logistic Regression with stratified k-fold cross validation

print('Logistic Regression with stratified k-fold cross validation without metadata:')

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
from sklearn.preprocessing import StandardScaler

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

# Merge train_data and val_data
train_val_data = pd.concat([train_data, val_data], axis=0)

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['tetris_experience_binary']

X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_test = scaler.transform(X_test)

# Stratified k-fold cross validation
skf = StratifiedKFold(n_splits=5)
accuracy_scores_lr = []
precision_scores_lr = []
recall_scores_lr = []
f1_scores_lr = []
aucroc_scores_lr = []

for train_index, test_index in skf.split(X_train_val, y_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Create a logistic regression classifier
    clf_lr = LogisticRegression(max_iter=1000)

    # Define the hyperparameter grid
    param_grid = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'saga']}

    # Create a GridSearchCV object
    grid_search = GridSearchCV(clf_lr, param_grid, cv=5, scoring='f1')

    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train_val, y_train_val)
    
    # Use the best hyperparameters to create a new logistic regression classifier
    clf_lr = LogisticRegression(C=grid_search.best_params_['C'], solver=grid_search.best_params_['solver'], max_iter=1000)

    # Train the model using the training sets
    clf_lr.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = clf_lr.predict(X_test_cv)

    # Evaluation
    accuracy_scores_lr.append(metrics.accuracy_score(y_test_cv, y_pred))
    precision_scores_lr.append(metrics.precision_score(y_test_cv, y_pred))
    recall_scores_lr.append(metrics.recall_score(y_test_cv, y_pred))
    f1_scores_lr.append(metrics.f1_score(y_test_cv, y_pred))
    y_proba = clf_lr.predict_proba(X_test_cv)[:, 1]
    aucroc_scores_lr.append(metrics.roc_auc_score(y_test_cv, y_proba))

# Calculate average scores
print("\nCross-Validation Average Accuracy:", np.mean(accuracy_scores_lr))
print("Cross-Validation Average Precision:", np.mean(precision_scores_lr))
print("Cross-Validation Average Recall:", np.mean(recall_scores_lr))
print("Cross-Validation Average F1-score:", np.mean(f1_scores_lr))
print("Cross-Validation Average AUCROC:", np.mean(aucroc_scores_lr))

# Evaluate on test set
y_test_pred_lr = clf_lr.predict(X_test)
y_test_proba = clf_lr.predict_proba(X_test)[:, 1]

print("\nTest Accuracy:", metrics.accuracy_score(y_test, y_test_pred_lr))
print("Test Precision:", metrics.precision_score(y_test, y_test_pred_lr))
print("Test Recall:", metrics.recall_score(y_test, y_test_pred_lr))
print("Test F1-score:", metrics.f1_score(y_test, y_test_pred_lr))
print("Test AUCROC:", metrics.roc_auc_score(y_test, y_test_proba))


Logistic Regression with stratified k-fold cross validation without metadata:


NameError: name 'GridSearchCV' is not defined

In [5]:
# Feature importance - logistic regression
print("\nFeature Importance - Logistic Regression:")
importance = clf_lr.coef_[0]
importance_dict = {X_columns[i]: v for i, v in enumerate(importance)}
sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, score in sorted_importance:
    print(f'{feature:<30} importance: {score:.3f}')


Feature Importance - Logistic Regression:
total_clockwise_rotations_averagepergame importance: 0.229
total_rotations_averagepergame importance: 0.190
total_left_averagepergame      importance: 0.108
total_left_right_averagepergame importance: 0.094
total_right_averagepergame     importance: 0.076
total_keystrokes_averagepergame importance: 0.065
total_down_averagepergame      importance: 0.002
total_conterclockwise_rotations_averagepergame importance: -0.034


In [81]:
# Importeer de benodigde bibliotheken
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns

# Bereken de confusion matrix
cm_lr = confusion_matrix(y_test, y_test_pred_lr)

# Maak een DataFrame van de confusion matrix met labels
cm_lr_df = pd.DataFrame(cm_lr, 
                     index = ['Actual Negative', 'Actual Positive'], 
                     columns = ['Predicted Negative', 'Predicted Positive'])

# Print de confusion matrix
print("Confusion Matrix for Logistic Regression:")
print(cm_lr_df)


Confusion Matrix for Logistic Regression:
                 Predicted Negative  Predicted Positive
Actual Negative                   6                   2
Actual Positive                   4                   4


In [78]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error

# Baseline model with stratified k-fold cross validation
print('\nBaseline model with stratified k-fold cross validation:')

accuracy_scores_baseline = []
precision_scores_baseline = []
recall_scores_baseline = []
f1_scores_baseline = []
aucroc_scores_baseline = []

for train_index, test_index in skf.split(X_train_val, y_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Create a baseline classifier
    clf_baseline = DummyClassifier(strategy='most_frequent')

    # Train the model using the training sets
    clf_baseline.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred_baseline = clf_baseline.predict(X_test_cv)

    # Evaluation
    accuracy_scores_baseline.append(metrics.accuracy_score(y_test_cv, y_pred_baseline))
    precision_scores_baseline.append(metrics.precision_score(y_test_cv, y_pred_baseline))
    recall_scores_baseline.append(metrics.recall_score(y_test_cv, y_pred_baseline))
    f1_scores_baseline.append(metrics.f1_score(y_test_cv, y_pred_baseline))
    y_proba_baseline = clf_baseline.predict_proba(X_test_cv)[:, 1]
    aucroc_scores_baseline.append(metrics.roc_auc_score(y_test_cv, y_proba_baseline))

# Calculate average scores for the baseline model
print("\nCross-Validation Average Accuracy (Baseline):", np.mean(accuracy_scores_baseline))
print("Cross-Validation Average Precision (Baseline):", np.mean(precision_scores_baseline))
print("Cross-Validation Average Recall (Baseline):", np.mean(recall_scores_baseline))
print("Cross-Validation Average F1-score (Baseline):", np.mean(f1_scores_baseline))
print("Cross-Validation Average AUCROC (Baseline):", np.mean(aucroc_scores_baseline))

# Evaluate the baseline model on the test set
y_test_pred_baseline = clf_baseline.predict(X_test)
y_test_proba_baseline = clf_baseline.predict_proba(X_test)[:, 1]

print("\nTest Accuracy (Baseline):", metrics.accuracy_score(y_test, y_test_pred_baseline))
print("Test Precision (Baseline):", metrics.precision_score(y_test, y_test_pred_baseline))
print("Test Recall (Baseline):", metrics.recall_score(y_test, y_test_pred_baseline))
print("Test F1-score (Baseline):", metrics.f1_score(y_test, y_test_pred_baseline))
print("Test AUCROC (Baseline):", metrics.roc_auc_score(y_test, y_test_proba_baseline))



Baseline model with stratified k-fold cross validation:

Cross-Validation Average Accuracy (Baseline): 0.6512820512820513
Cross-Validation Average Precision (Baseline): 0.0
Cross-Validation Average Recall (Baseline): 0.0
Cross-Validation Average F1-score (Baseline): 0.0
Cross-Validation Average AUCROC (Baseline): 0.5

Test Accuracy (Baseline): 0.5
Test Precision (Baseline): 0.0
Test Recall (Baseline): 0.0
Test F1-score (Baseline): 0.0
Test AUCROC (Baseline): 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
