In [13]:
# Ridge regression with k-fold cross validation without metadata
print('Ridge regression with k-fold cross validation without metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['sex_num', "age", 'videogames_hours_a_week','total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1, 1, 10, 100]
}

# Create a Ridge regressor
regr_ridge = Ridge()

# Perform grid search
grid_search = GridSearchCV(regr_ridge, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_val, y_train_val)

# Train the model using the best hyperparameters
regr_ridge = grid_search.best_estimator_

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_ridge = []
r2_scores_ridge = []
mape_scores_ridge = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Train the model using the training sets
    regr_ridge.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_ridge.predict(X_test_cv)

    # Evaluation
    rmse_scores_ridge.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_ridge.append(r2_score(y_test_cv, y_pred))
    mape_scores_ridge.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_ridge))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_ridge))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_ridge))

# Evaluate on test set
y_test_pred = regr_ridge.predict(X_test)

#Evaluate on test set
print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))



Ridge regression with k-fold cross validation without metadata:

Cross-Validation Average RMSE: 3074.7469392219164
Cross-Validation Average R-squared: 0.6052995988525762
Cross-Validation Average MAPE: 2.2713371462818555

Test RMSE: 6707.870524629781
Test R-squared: 0.5396546118917163
Test MAPE: 2.861606444930138


In [2]:
# Calculate feature importance using coefficients
feature_importance = np.abs(regr_ridge.coef_)

# Normalize the feature importances
normalized_feature_importance = feature_importance / np.sum(feature_importance)

# Create a dictionary of feature names and their corresponding importances
feature_importance_dict = dict(zip(X_columns, normalized_feature_importance))

# Sort the dictionary by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nFeature Importance (sorted from high to low):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance (sorted from high to low):
total_down_averagepergame: 0.3048735590149824
total_keystrokes_averagepergame: 0.19653256669155217
total_left_averagepergame: 0.13735496989738993
total_left_right_averagepergame: 0.10346650407980712
videogames_hours_a_week: 0.06292146362629654
total_right_averagepergame: 0.05703661373865388
age: 0.05611207789968003
sex_num: 0.037834225841533294
total_clockwise_rotations_averagepergame: 0.0216138998009167
total_conterclockwise_rotations_averagepergame: 0.01299996813624622
total_rotations_averagepergame: 0.009254151272941718


In [11]:
# Calculate feature importance using coefficients
feature_importance = np.abs(regr_ridge.coef_)

# Normalize the feature importances
normalized_feature_importance = feature_importance / np.sum(feature_importance)

# Create a dictionary of feature names and their corresponding importances
feature_importance_dict = dict(zip(X_columns, normalized_feature_importance))

# Sort the dictionary by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nFeature Importance (sorted from high to low):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance (sorted from high to low):
total_down_averagepergame: 0.3048735590149824
total_keystrokes_averagepergame: 0.19653256669155217
total_left_averagepergame: 0.13735496989738993
total_left_right_averagepergame: 0.10346650407980712
videogames_hours_a_week: 0.06292146362629654
total_right_averagepergame: 0.05703661373865388
age: 0.05611207789968003
sex_num: 0.037834225841533294
total_clockwise_rotations_averagepergame: 0.0216138998009167
total_conterclockwise_rotations_averagepergame: 0.01299996813624622
total_rotations_averagepergame: 0.009254151272941718


In [14]:
# Lasso regression with K-fold cross validation without metadata
print('Lasso regression with k-fold cross validation without metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['sex_num', "age", 'videogames_hours_a_week','total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_lasso = []
r2_scores_lasso = []
mape_scores_lasso = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Define the hyperparameter grid
    param_grid = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    }

    # Create a Lasso regressor
    regr_lasso = Lasso()

    # Perform grid search
    grid_search = GridSearchCV(regr_lasso, param_grid=param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_val, y_train_val)
    
    # Train the model using the best hyperparameters
    regr_lasso = grid_search.best_estimator_

    # Train the model using the training sets
    regr_lasso.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_lasso.predict(X_test_cv)

    # Evaluation
    rmse_scores_lasso.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_lasso.append(r2_score(y_test_cv, y_pred))
    mape_scores_lasso.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_lasso))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_lasso))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_lasso))

# Evaluate on test set
y_test_pred = regr_lasso.predict(X_test)

print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))


Lasso regression with k-fold cross validation without metadata:


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


Cross-Validation Average RMSE: 3026.0209333480598
Cross-Validation Average R-squared: 0.6229924451390707
Cross-Validation Average MAPE: 2.2145333235238334

Test RMSE: 6514.20432124315
Test R-squared: 0.5658525977932807
Test MAPE: 2.9626210171253495


In [4]:
# Calculate feature importance using coefficients
feature_importance = np.abs(regr_lasso.coef_)

# Normalize the feature importances
normalized_feature_importance = feature_importance / np.sum(feature_importance)

# Create a dictionary of feature names and their corresponding importances
feature_importance_dict = dict(zip(X_columns, normalized_feature_importance))

# Sort the dictionary by importance
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
print("\nFeature Importance (sorted from high to low):")
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance (sorted from high to low):
total_down_averagepergame: 0.5241299816021844
total_left_averagepergame: 0.30981248148037926
total_conterclockwise_rotations_averagepergame: 0.06601694914312395
videogames_hours_a_week: 0.05844029181004331
age: 0.027491570047634965
sex_num: 0.01410872591663427
total_keystrokes_averagepergame: 0.0
total_right_averagepergame: 0.0
total_left_right_averagepergame: 0.0
total_clockwise_rotations_averagepergame: 0.0
total_rotations_averagepergame: 0.0


In [12]:
# Random Forest regression with k-fold cross validation including metadata
print('Random Forest regression with k-fold cross validation including metadata:')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import data from Excel
train_data = pd.read_excel("train_data.xlsx")
val_data = pd.read_excel("val_data.xlsx")
test_data = pd.read_excel("test_data.xlsx")

X_columns = ['sex_num', "age", 'videogames_hours_a_week','total_keystrokes_averagepergame', 'total_down_averagepergame', 'total_left_averagepergame', 'total_right_averagepergame', 'total_left_right_averagepergame', 'total_clockwise_rotations_averagepergame', 'total_conterclockwise_rotations_averagepergame', 'total_rotations_averagepergame']
y_column = ['average_score']

X_train = train_data[X_columns].values
y_train = train_data[y_column].values.ravel()
X_val = val_data[X_columns].values
y_val = val_data[y_column].values.ravel()
X_test = test_data[X_columns].values
y_test = test_data[y_column].values.ravel()

# Combine train_data and val_data
train_val_data = pd.concat([train_data, val_data])
X_train_val = train_val_data[X_columns].values
y_train_val = train_val_data[y_column].values.ravel()

# Scale the data
scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Initialize the cumulative feature importances array
cumulative_feature_importances = np.zeros(len(X_columns))

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50]
}

# Create a RandomForestRegressor
regr_rf = RandomForestRegressor()

# Perform grid search
grid_search = GridSearchCV(regr_rf, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_val, y_train_val)

# Train the model using the best hyperparameters
regr_rf = grid_search.best_estimator_

# K-fold cross validation
kf = KFold(n_splits=5)
rmse_scores_rf = []
r2_scores_rf = []
mape_scores_rf = []

for train_index, test_index in kf.split(X_train_val):
    X_train_cv, X_test_cv = X_train_val[train_index], X_train_val[test_index]
    y_train_cv, y_test_cv = y_train_val[train_index], y_train_val[test_index]

    # Train the model using the training sets
    regr_rf.fit(X_train_cv, y_train_cv)

    # Predict the response for test dataset
    y_pred = regr_rf.predict(X_test_cv)

    # Add the current model's feature importances to the cumulative feature importances array
    cumulative_feature_importances += regr_rf.feature_importances_

    # Evaluation
    rmse_scores_rf.append(np.sqrt(mean_squared_error(y_test_cv, y_pred)))
    r2_scores_rf.append(r2_score(y_test_cv, y_pred))
    mape_scores_rf.append(mean_absolute_percentage_error(y_test_cv, y_pred))

# Calculate average scores
print("\nCross-Validation Average RMSE:", np.mean(rmse_scores_rf))
print("Cross-Validation Average R-squared:", np.mean(r2_scores_rf))
print("Cross-Validation Average MAPE:", np.mean(mape_scores_rf))

# Evaluate on test set
y_test_pred_rf = regr_rf.predict(X_test)

# set
print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred_rf)))
print("Test R-squared:", r2_score(y_test, y_test_pred_rf))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred_rf))

# Calculate the average feature importances by dividing the cumulative feature importances array by the number of folds
average_feature_importances = cumulative_feature_importances / kf.get_n_splits()

# Print the average feature importances
feature_importances = pd.DataFrame(average_feature_importances, index=X_columns, columns=['importance']).sort_values('importance', ascending=False)
print("\nFeature Importance - Random Forest:")
for index, row in feature_importances.iterrows():
    print(f"{index:<30} importance: {row['importance']:.3f}")



Random Forest regression with k-fold cross validation including metadata:

Cross-Validation Average RMSE: 2652.341392365287
Cross-Validation Average R-squared: 0.691642802518293
Cross-Validation Average MAPE: 0.8888130009301289

Test RMSE: 7963.3598276019775
Test R-squared: 0.35120553828368106
Test MAPE: 1.8621672331800152

Feature Importance - Random Forest:
total_down_averagepergame      importance: 0.415
total_keystrokes_averagepergame importance: 0.192
total_right_averagepergame     importance: 0.117
total_left_averagepergame      importance: 0.101
total_left_right_averagepergame importance: 0.063
total_conterclockwise_rotations_averagepergame importance: 0.040
total_rotations_averagepergame importance: 0.027
total_clockwise_rotations_averagepergame importance: 0.025
age                            importance: 0.008
videogames_hours_a_week        importance: 0.008
sex_num                        importance: 0.004


In [15]:
# Baseline/dummy regression metadata only
print('Baseline/dummy regression metadata only:')

import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# All the data importing and preprocessing steps remain the same

# Create a dummy regressor
dummy_regr = DummyRegressor(strategy="mean")

# Train the model using the training sets
dummy_regr.fit(X_train, y_train)

# Predict the response for the validation set
y_val_pred = dummy_regr.predict(X_val)

# Evaluation
print("\nValidation RMSE:", np.sqrt(mean_squared_error(y_val, y_val_pred)))
print("Validation R-squared:", r2_score(y_val, y_val_pred))
print("Validation MAPE:", mean_absolute_percentage_error(y_val, y_val_pred))

# Predict the response for test dataset
y_test_pred = dummy_regr.predict(X_test)

print("\nTest RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Test R-squared:", r2_score(y_test, y_test_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_test_pred))


Baseline/dummy regression metadata only:

Validation RMSE: 3669.953059776524
Validation R-squared: -0.5622350256075279
Validation MAPE: 8.347702550483959

Test RMSE: 10321.63858341834
Test R-squared: -0.0899629861666651
Test MAPE: 3.8279604145607897
