In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor
import threading

# Lock used to serialise writes to the shared results / CSV when feature sets
# are evaluated from several threads.
lock = threading.Lock()

# Load the data
all_data = pd.read_csv('all_data.csv')

# Preprocessing
# 'team_name_mapping.csv' is assumed to have the columns 'original' and 'encoded'.
team_name_mapping = pd.read_csv('team_name_mapping.csv')
team_name_dict = dict(zip(team_name_mapping['original'], team_name_mapping['encoded']))

all_data = (
    all_data
    .drop(columns=['team'])
    # Team names that have no entry in the mapping become NaN here and are
    # therefore removed by the dropna() that follows.
    .assign(team_name=lambda frame: frame['team_name'].map(team_name_dict))
    .dropna(axis=0)
    # 'name' is dropped only after dropna(), so rows with a missing name are
    # still excluded (same order of operations as before).
    .drop(columns=['name'])
)

# Train and Test seasons
# Alternative split (requires '2023-24' rows to be present in all_data):
# train_seasons = ['2016-17', '2017-18','2018-19','2019-20','2020-21','2021-22','2022-23']
# test_season = ['2023-24']

train_seasons = ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22']
test_season = ['2022-23']


# Feature sets to test.
# Two columns are deliberately kept out of every set:
#   * 'total_points' - it is the prediction target, so using it as a feature
#     leaks the answer into the model.
#   * 'season'       - it holds strings such as '2016-17', which the random
#     forest cannot convert to float; it is only used to split train / test.
feature_sets = [
    ['minutes_rolling_5', 'opponent_team', 'element'],  # 1. Basic features
    ['ict_index_rolling_10', 'lagged_bps_rolling_5', 'creativity_rolling_5'],  # 2. Combined lagged and current features
    ['bps_rolling_5', 'lagged_influence_rolling_5', 'value_rolling_5'],  # 3. More combined lagged and current features
    ['influence_rolling_5', 'threat_rolling_5', 'team_name'],  # 4. Influence and threat
    ['lagged_value', 'lagged_selected', 'lagged_influence'],  # 5. Only lagged basic features
    ['lagged_bps_rolling_5', 'lagged_creativity_rolling_10'],  # 6. Only lagged rolling metrics
    ['ict_index_rolling_5', 'ict_index_rolling_10'],  # 7. Different rolling windows of the same metric
    ['minutes_rolling_5', 'minutes_rolling_10'],  # 8. Different rolling windows for minutes played
    ['transfers_in_rolling_5', 'transfers_in_rolling_10'],  # 9. Transfers in different rolling windows
    ['lagged_value_x_lagged_total_points', 'lagged_value_x_lagged_influence'],  # 10. Interactions between lagged features
    ['clean_sheets_rolling_5', 'goals_conceded_rolling_5'],  # 11. Defensive metrics
    ['lagged_value', 'lagged_ict_index', 'lagged_bps', 'lagged_minutes'],  # 12. Multiple lagged features
    ['value_rolling_5', 'influence_rolling_5', 'bps_rolling_5'],  # 13. Multiple rolling features
    ['ict_index_rolling_10', 'threat_rolling_5', 'creativity_rolling_5'],  # 14. Different types of rolling features
    ['team_name', 'was_home', 'element'],  # 15. Team and positional data
    ['lagged_minutes_squared', 'lagged_minutes_cubed'],  # 16. Polynomial features of lagged minutes
    ['minutes_rolling_5', 'clean_sheets_rolling_5'],  # 17. Minutes and clean sheets
    ['lagged_transfers_in_rolling_5', 'lagged_transfers_in_rolling_10'],  # 18. Lagged transfers in different rolling windows
    ['creativity_rolling_5', 'creativity_rolling_10'],  # 19. Different rolling windows of creativity
    ['bps_rolling_5', 'bps_rolling_10', 'threat_rolling_5', 'threat_rolling_10'],  # 20. Mix of bps and threat in different rolling windows
    ['minutes_rolling_5', 'ict_index_rolling_5', 'transfers_in_rolling_5', 'clean_sheets_rolling_5'],  # 21. A bigger set of rolling features
    ['minutes_rolling_5', 'lagged_minutes_rolling_5', 'lagged_minutes_rolling_10', 'lagged_value'],  # 22. A mix of lagged and current rolling minutes
    ['lagged_influence', 'lagged_ict_index', 'lagged_bps', 'lagged_minutes', 'lagged_value'],  # 23. Multiple lagged basic features
    ['ict_index_rolling_5', 'threat_rolling_5', 'creativity_rolling_5', 'influence_rolling_5'],  # 24. All rolling 5 metrics
    ['minutes_rolling_5', 'clean_sheets_rolling_5', 'threat_rolling_5', 'creativity_rolling_5'],  # 25. Mix of different types of rolling 5 metrics
    ['bps_rolling_5', 'bps_rolling_10', 'threat_rolling_5', 'threat_rolling_10', 'influence_rolling_5', 'influence_rolling_10'],  # 26. Mix of bps, threat, influence in different rolling windows
    ['element', 'was_home', 'team_name', 'opponent_team'],  # 27. All categorical features (string-valued 'season' excluded)
    ['ict_index_rolling_10', 'lagged_bps_rolling_5', 'lagged_bps_rolling_10', 'bps_rolling_5', 'creativity_rolling_5'],  # 28. Combined lagged and current features (including bps)
    ['lagged_value_x_lagged_influence', 'lagged_value_x_lagged_total_points', 'lagged_value_x_lagged_ict_index'],  # 29. Multiple interaction terms
    ['minutes_rolling_5', 'minutes_rolling_10', 'lagged_minutes_rolling_5', 'lagged_minutes_rolling_10', 'lagged_value', 'lagged_influence'],  # 30. Bigger combination of minutes and lagged features
    ['ict_index_rolling_5', 'ict_index_rolling_10', 'lagged_ict_index_rolling_5', 'lagged_ict_index_rolling_10'],  # 31. ICT index in multiple time frames
    ['lagged_bps_rolling_5', 'lagged_bps_rolling_10', 'lagged_transfers_in_rolling_5', 'lagged_transfers_in_rolling_10'],  # 32. Multiple lagged features (bps and transfers)
    ['influence_rolling_5', 'influence_rolling_10', 'bps_rolling_5', 'bps_rolling_10', 'creativity_rolling_5', 'creativity_rolling_10'],  # 33. Different types of rolling features
    ['lagged_ict_index_x_lagged_minutes', 'lagged_bps_x_lagged_minutes', 'lagged_minutes_squared', 'lagged_minutes_cubed'],  # 34. Interaction and polynomial terms of lagged minutes
    ['ict_index_rolling_5', 'threat_rolling_5', 'creativity_rolling_5', 'influence_rolling_5', 'value_rolling_5', 'minutes_rolling_5'],  # 35. Multiple types of rolling 5 metrics
    ['was_home', 'team_name', 'opponent_team', 'element', 'minutes_rolling_5', 'influence_rolling_5'],  # 36. Mix of categorical and numerical features
    ['transfers_in_rolling_5', 'transfers_in_rolling_10', 'lagged_transfers_in_rolling_5', 'lagged_transfers_in_rolling_10'],  # 37. Transfers in different lagged rolling windows
    ['clean_sheets_rolling_5', 'clean_sheets_rolling_10', 'goals_conceded_rolling_5', 'goals_conceded_rolling_10'],  # 38. Defensive metrics
    ['lagged_value_x_lagged_total_points', 'lagged_value_x_lagged_ict_index', 'lagged_value_x_lagged_bps', 'lagged_value_x_lagged_minutes'],  # 39. Interaction terms
    # 40. All features except the target and 'season' (WARNING: This could be prone to overfitting)
    # NOTE(review): 'lagged_index_x_lagged_minutes' looks like a typo of
    # 'lagged_ict_index_x_lagged_minutes' (also listed) - confirm the column exists.
    ['element', 'opponent_team', 'was_home', 'GW', 'team_name', 'ict_index_rolling_5', 'ict_index_rolling_10',
    'minutes_rolling_5', 'minutes_rolling_10', 'clean_sheets_rolling_5', 'clean_sheets_rolling_10', 'value_rolling_5', 'transfers_in_rolling_5',
    'transfers_in_rolling_10', 'influence_rolling_5', 'influence_rolling_10', 'goals_conceded_rolling_5', 'goals_conceded_rolling_10',
    'threat_rolling_5', 'threat_rolling_10', 'bps_rolling_5', 'bps_rolling_10', 'creativity_rolling_5', 'creativity_rolling_10',
    'selected_rolling_5', 'lagged_value', 'lagged_selected', 'lagged_influence', 'lagged_ict_index', 'lagged_bps', 'lagged_minutes',
    'lagged_ict_index_rolling_5', 'lagged_ict_index_rolling_10', 'lagged_minutes_rolling_5', 'lagged_minutes_rolling_10', 'lagged_clean_sheets_rolling_5',
    'lagged_clean_sheets_rolling_10', 'lagged_value_rolling_5', 'lagged_transfers_in_rolling_5', 'lagged_transfers_in_rolling_10', 'lagged_influence_rolling_5',
    'lagged_influence_rolling_10', 'lagged_goals_conceded_rolling_5', 'lagged_goals_conceded_rolling_10', 'lagged_threat_rolling_5', 'lagged_threat_rolling_10',
    'lagged_bps_rolling_5', 'lagged_bps_rolling_10', 'lagged_creativity_rolling_5', 'lagged_creativity_rolling_10', 'lagged_selected_rolling_5', 'lagged_total_points_rolling_5',
    'lagged_total_points_rolling_10', 'lagged_index_x_lagged_minutes', 'lagged_value_x_lagged_influence', 'lagged_value_x_lagged_total_points', 'lagged_value_x_lagged_ict_index',
    'lagged_value_x_lagged_bps', 'lagged_value_x_lagged_minutes', 'lagged_selected_x_lagged_minutes', 'lagged_ict_index_x_lagged_minutes', 'lagged_bps_x_lagged_minutes',
    'lagged_minutes_squared', 'lagged_minutes_cubed']
]

# Fail fast on misspelled / absent feature names instead of hitting a KeyError
# in the middle of a long evaluation run.
referenced_columns = {column for feature_set in feature_sets for column in feature_set}
missing_columns = sorted(referenced_columns - set(all_data.columns))
if missing_columns:
    raise KeyError(f"feature_sets reference columns that are not in all_data: {missing_columns}")

# Guard against re-introducing the target as a feature.
assert 'total_points' not in referenced_columns, "target 'total_points' must not be used as a feature"


In [19]:
# Sanity-check the train / test split against the seasons actually present
# in the data, so an empty split is caught here rather than inside model
# fitting / prediction.
available_seasons = all_data['season'].unique()
print("Unique seasons in all_data:", available_seasons)
print("Seasons in test_season:", test_season)

# A season used for both training and testing would leak test rows into training.
overlapping_seasons = sorted(set(train_seasons) & set(test_season))
assert not overlapping_seasons, f"Seasons used for both train and test: {overlapping_seasons}"

# Every configured season must exist, otherwise the corresponding split is
# smaller than intended (or empty).
absent_seasons = sorted((set(train_seasons) | set(test_season)) - set(available_seasons))
assert not absent_seasons, f"Seasons missing from all_data: {absent_seasons}"



Unique seasons in all_data: ['2016-17' '2017-18' '2018-19' '2019-20' '2020-21' '2021-22' '2022-23']
Seasons in test_season: ['2022-23']


## Parallel evaluation

In [20]:
# Column layout and destination of the evaluation results
RESULT_COLUMNS = ['Feature_Set', 'Train_RMSE', 'Test_RMSE']
RESULTS_CSV = 'feature_evaluation_results.csv'


def evaluate_feature_set(feature_set):
    """Fit a random forest on one feature set and report train / test RMSE.

    Rows of ``all_data`` whose 'season' is in ``train_seasons`` form the
    training split and rows whose 'season' is in ``test_season`` form the
    test split; the target is the 'total_points' column.

    Parameters
    ----------
    feature_set : list of str
        Column names of ``all_data`` to use as model inputs.

    Returns
    -------
    dict
        ``{'Feature_Set': feature_set, 'Train_RMSE': float, 'Test_RMSE': float}``.

    Raises
    ------
    ValueError
        If the train or the test split contains no rows.
    """
    print(f"Evaluating feature set: {feature_set}")

    # Compute each season mask once and reuse it for X and y
    train_mask = all_data['season'].isin(train_seasons)
    test_mask = all_data['season'].isin(test_season)
    if not train_mask.any() or not test_mask.any():
        raise ValueError(
            f"Empty split: {int(train_mask.sum())} train rows for seasons {train_seasons}, "
            f"{int(test_mask.sum())} test rows for seasons {test_season}"
        )

    # Define X and y based on feature set
    X = all_data[feature_set]
    y = all_data['total_points']

    # Create train and test data
    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    # Random Forest with the hyperparameters the notebook labels as "best parameters"
    rf_model = RandomForestRegressor(
        bootstrap=True,
        max_depth=9,
        max_features='sqrt',
        min_samples_leaf=7,
        min_samples_split=2,
        n_estimators=450,
        random_state=42,  # fixed seed so RMSEs are reproducible and comparable across feature sets
        n_jobs=-1  # This tells sklearn to use all available CPUs for this individual model
    )

    # Fit the model
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = rf_model.predict(X_train)
    y_pred_test = rf_model.predict(X_test)

    # Calculate RMSE for train and test sets
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

    print(f"Train RMSE: {rmse_train}")
    print(f"Test RMSE: {rmse_test}")
    print("-----")

    return {'Feature_Set': feature_set, 'Train_RMSE': rmse_train, 'Test_RMSE': rmse_test}


# Parallelizing the evaluation
# NOTE(review): every model already runs with n_jobs=-1, so fitting several
# models at once may oversubscribe the CPUs - consider passing a small
# max_workers to ThreadPoolExecutor after measuring.
result_records = []
failed_feature_sets = []
with ThreadPoolExecutor() as executor:
    pending = [(feature_set, executor.submit(evaluate_feature_set, feature_set))
               for feature_set in feature_sets]

    # Results are collected in the main thread, so no lock is required here.
    for feature_set, future in pending:
        try:
            # result() re-raises any exception from the worker; without
            # retrieving it, a failed evaluation would go unnoticed.
            result_records.append(future.result())
        except Exception as exc:
            failed_feature_sets.append((feature_set, repr(exc)))
            print(f"FAILED feature set {feature_set}: {exc!r}")
            continue

        # Save after every finished evaluation so partial progress survives an interruption
        pd.DataFrame(result_records, columns=RESULT_COLUMNS).to_csv(RESULTS_CSV, index=False)

# Build the results frame once from the collected records
results_df = pd.DataFrame(result_records, columns=RESULT_COLUMNS)


Evaluating feature set: ['minutes_rolling_5', 'opponent_team', 'element']
Evaluating feature set: ['ict_index_rolling_10', 'lagged_bps_rolling_5', 'creativity_rolling_5']
Evaluating feature set: ['bps_rolling_5', 'lagged_influence_rolling_5', 'value_rolling_5']
Evaluating feature set: ['influence_rolling_5', 'threat_rolling_5', 'team_name']
Evaluating feature set: ['lagged_value', 'lagged_selected', 'lagged_influence']
Evaluating feature set: ['lagged_bps_rolling_5', 'lagged_creativity_rolling_10']
Evaluating feature set: ['ict_index_rolling_5', 'ict_index_rolling_10']
Evaluating feature set: ['minutes_rolling_5', 'minutes_rolling_10']
Evaluating feature set: ['transfers_in_rolling_5', 'transfers_in_rolling_10']
Evaluating feature set: ['lagged_value_x_lagged_total_points', 'lagged_value_x_lagged_influence']
Evaluating feature set: ['clean_sheets_rolling_5', 'goals_conceded_rolling_5']
Evaluating feature set: ['lagged_value', 'lagged_ict_index', 'lagged_bps', 'lagged_minutes']
Train RM

## Sequential evaluation

In [16]:
def evaluate_feature_set(feature_set):
    """Fit a random forest on one feature set and report train / test RMSE.

    Sequential variant; running this cell replaces any earlier definition of
    ``evaluate_feature_set`` in the kernel. Rows of ``all_data`` whose
    'season' is in ``train_seasons`` form the training split and rows whose
    'season' is in ``test_season`` form the test split; the target is the
    'total_points' column.

    Parameters
    ----------
    feature_set : list of str
        Column names of ``all_data`` to use as model inputs.

    Returns
    -------
    dict
        ``{'Feature_Set': feature_set, 'Train_RMSE': float, 'Test_RMSE': float}``.

    Raises
    ------
    ValueError
        If the train or the test split contains no rows.
    """
    print(f"Evaluating feature set: {feature_set}")

    # Compute each season mask once and reuse it for X and y
    train_mask = all_data['season'].isin(train_seasons)
    test_mask = all_data['season'].isin(test_season)

    # Define X and y based on feature set
    X = all_data[feature_set]
    y = all_data['total_points']

    # Create train and test data
    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]
    print(X_train.shape)
    print(y_train.shape)
    print("Number of samples in X_test:", X_test.shape[0])

    # An empty split would otherwise only fail later, inside fit / predict,
    # with a message that does not mention the season configuration.
    if X_train.shape[0] == 0 or X_test.shape[0] == 0:
        raise ValueError(
            f"Empty split: {X_train.shape[0]} train rows for seasons {train_seasons}, "
            f"{X_test.shape[0]} test rows for seasons {test_season}; "
            "check that these seasons exist in all_data['season']"
        )

    # Random Forest with the hyperparameters the notebook labels as "best parameters"
    rf_model = RandomForestRegressor(
        bootstrap=True,
        max_depth=9,
        max_features='sqrt',
        min_samples_leaf=7,
        min_samples_split=2,
        n_estimators=450,
        random_state=42,  # fixed seed so RMSEs are reproducible and comparable across feature sets
        n_jobs=-1  # This tells sklearn to use all available CPUs for this individual model
    )

    # Fit the model
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = rf_model.predict(X_train)
    y_pred_test = rf_model.predict(X_test)

    # Calculate RMSE for train and test sets
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

    print(f"Train RMSE: {rmse_train}")
    print(f"Test RMSE: {rmse_test}")
    print("-----")

    return {'Feature_Set': feature_set, 'Train_RMSE': rmse_train, 'Test_RMSE': rmse_test}


# Evaluating each feature set sequentially, keeping the scores instead of
# only printing them
sequential_records = [evaluate_feature_set(feature_set) for feature_set in feature_sets]
sequential_results_df = pd.DataFrame(
    sequential_records, columns=['Feature_Set', 'Train_RMSE', 'Test_RMSE']
)

Evaluating feature set: ['minutes_rolling_5', 'opponent_team', 'element']
(159792, 3)
(159792,)
Number of samples in X_test: 0


ValueError: ignored