In [8]:
# Ridge regression with k-fold cross validation without metadata
print('Ridge regression with k-fold cross validation without metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1, 1, 10, 100]
}

# Create a Ridge regressor
regr_ridge = Ridge()

# Perform grid search
grid_search = GridSearchCV(regr_ridge, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_val, y_train_val)

# Train the model using the best hyperparameters
regr_ridge = grid_search.best_estimator_

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_ridge = []
r2_scores_ridge = []
mape_scores_ridge = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Train the model using the training sets
    regr_ridge.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_ridge.predict(X_test_cv)

    # Evaluation
    rmse_scores_ridge.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_ridge.append(r2_score(y_test_cv, y_pred))
    mape_scores_ridge.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_ridge))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_ridge))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_ridge))

# Evaluate on test set
y_test_pred = regr_ridge.predict(X_test)

#Evaluate on test set
print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))



Ridge regression with k-fold cross validation without metadata:

Cross-Validation Average RMSE: 2958.444268803276
Cross-Validation Average R-squared: 0.6469470348835437
Cross-Validation Average MAPE: 2.178177081705308

Test RMSE: 6360.084622223177
Test R-squared: 0.5861525893960047
Test MAPE: 3.0138622710884975


In [9]:
# Calculate feature importance using coefficients
feature_importance = np.abs(regr_ridge.coef_)

# Normalize the feature importances
normalized_feature_importance = feature_importance / np.sum(feature_importance)

# Create a dictionary of feature names and their corresponding importances
feature_importance_dict = dict(zip(X_columns, normalized_feature_importance))

# Sort the dictionary by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nFeature Importance (sorted from high to low):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")



Feature Importance (sorted from high to low):
total_down_averagepergame: 0.2699813139239217
total_left_averagepergame: 0.23241314247022246
total_right_averagepergame: 0.16118381489052613
total_keystrokes_averagepergame: 0.1542543093285493
total_left_right_averagepergame: 0.06429702066372019
total_conterclockwise_rotations_averagepergame: 0.061205585022997167
total_rotations_averagepergame: 0.05517198780772223
total_clockwise_rotations_averagepergame: 0.0014928258923407546


In [10]:
# Lasso regression with K-fold cross validation without metadata
print('Lasso regression with k-fold cross validation without metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_lasso = []
r2_scores_lasso = []
mape_scores_lasso = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Define the hyperparameter grid
    param_grid = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    }

    # Create a Lasso regressor
    regr_lasso = Lasso()

    # Perform grid search
    grid_search = GridSearchCV(regr_lasso, param_grid=param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_val, y_train_val)
    
    # Train the model using the best hyperparameters
    regr_lasso = grid_search.best_estimator_

    # Train the model using the training sets
    regr_lasso.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_lasso.predict(X_test_cv)

    # Evaluation
    rmse_scores_lasso.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_lasso.append(r2_score(y_test_cv, y_pred))
    mape_scores_lasso.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_lasso))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_lasso))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_lasso))

# Evaluate on test set
y_test_pred = regr_lasso.predict(X_test)

print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))


Lasso regression with k-fold cross validation without metadata:


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


Cross-Validation Average RMSE: 2891.721795871247
Cross-Validation Average R-squared: 0.6566656287837785
Cross-Validation Average MAPE: 2.1454990862144347

Test RMSE: 6386.676976545064
Test R-squared: 0.582684653139588
Test MAPE: 3.05172818892416


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [11]:
# Calculate feature importance using coefficients
feature_importance = np.abs(regr_lasso.coef_)

# Normalize the feature importances
normalized_feature_importance = feature_importance / np.sum(feature_importance)

# Create a dictionary of feature names and their corresponding importances
feature_importance_dict = dict(zip(X_columns, normalized_feature_importance))

# Sort the dictionary by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nFeature Importance (sorted from high to low):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance (sorted from high to low):
total_down_averagepergame: 0.6023762714128087
total_left_averagepergame: 0.33556852866608466
total_conterclockwise_rotations_averagepergame: 0.06205519992110663
total_keystrokes_averagepergame: 0.0
total_right_averagepergame: 0.0
total_left_right_averagepergame: 0.0
total_clockwise_rotations_averagepergame: 0.0
total_rotations_averagepergame: 0.0


In [12]:
# Random Forest regression with k-fold cross validation without metadata
print('Random Forest regression with k-fold cross validation without metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Initialize the cumulative feature importances array
cumulative_feature_importances = np.zeros(len(X_columns))

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50]
}

# Create a RandomForestRegressor
regr_rf = RandomForestRegressor()

# Perform grid search
grid_search = GridSearchCV(regr_rf, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_val, y_train_val)

# Train the model using the best hyperparameters
regr_rf = grid_search.best_estimator_

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_rf = []
r2_scores_rf = []
mape_scores_rf = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Train the model using the training sets
    regr_rf.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_rf.predict(X_test_cv)

    # Add the current model's feature importances to the cumulative feature importances array
    cumulative_feature_importances += regr_rf.feature_importances_

    # Evaluation
    rmse_scores_rf.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_rf.append(r2_score(y_test_cv, y_pred))
    mape_scores_rf.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_rf))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_rf))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_rf))

# Evaluate on test set
y_test_pred_rf = regr_rf.predict(X_test)

# set
print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred_rf)))
print("Test R-squared:", r2_score(y_test, y_test_pred_rf))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred_rf))

# Calculate the average feature importances by dividing the cumulative feature importances array by the number of folds
average_feature_importances = cumulative_feature_importances / kf.get_n_splits()

# Print the average feature importances
feature_importances = pd.DataFrame(average_feature_importances, index=X_columns, columns=['importance']).sort_values('importance', ascending=False)
print("\nFeature Importance - Random Forest:")
for index, row in feature_importances.iterrows():
    print(f"{index:<30} importance: {row['importance']:.3f}")



Random Forest regression with k-fold cross validation without metadata:

Cross-Validation Average RMSE: 2678.800719778771
Cross-Validation Average R-squared: 0.6747391570114347
Cross-Validation Average MAPE: 0.9180206323223864

Test RMSE: 7972.271668141207
Test R-squared: 0.3497525867108148
Test MAPE: 1.892681215600175

Feature Importance - Random Forest:
total_down_averagepergame      importance: 0.411
total_keystrokes_averagepergame importance: 0.201
total_right_averagepergame     importance: 0.143
total_left_averagepergame      importance: 0.099
total_left_right_averagepergame importance: 0.060
total_conterclockwise_rotations_averagepergame importance: 0.032
total_rotations_averagepergame importance: 0.028
total_clockwise_rotations_averagepergame importance: 0.025


In [13]:
# Baseline/dummy regression with K-fold cross validation metadata only
print('Baseline/dummy regression with k-fold cross validation metadata only:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

# All the data importing and preprocessing steps remain the same

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_dummy = []
r2_scores_dummy = []
mape_scores_dummy = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Create a dummy regressor
    dummy_regr = DummyRegressor(strategy="mean")

    # Train the model using the training sets
    dummy_regr.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = dummy_regr.predict(X_test_cv)

    # Evaluation
    rmse_scores_dummy.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_dummy.append(r2_score(y_test_cv, y_pred))
    mape_scores_dummy.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_dummy))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_dummy))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_dummy))

# Evaluate on test set
y_test_pred = dummy_regr.predict(X_test)

print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))


Baseline/dummy regression with k-fold cross validation metadata only:

Cross-Validation Average RMSE: 5557.865822470064
Cross-Validation Average R-squared: -0.1704769979939511
Cross-Validation Average MAPE: 6.161399700043671

Test RMSE: 10354.493059881766
Test R-squared: -0.09691288197550874
Test MAPE: 3.7145358397888057
