In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Tune two random-forest regressors (one for xG, one for xA) on Bukayo Saka's
# per-season player stats joined with Arsenal's team stats, then report the
# best hyper-parameters and cross-validated score for each.

CV_FOLDS = 5  # number of cross-validation folds used by both grid searches


def _run_grid_search(estimator, features, target, grid, folds):
    """Fit a GridSearchCV for ``estimator`` and return the fitted search object.

    Parameters
    ----------
    estimator : regressor handed to GridSearchCV unchanged.
    features : feature matrix passed to ``fit``.
    target : target vector passed to ``fit``.
    grid : parameter grid to search exhaustively.
    folds : value passed as ``cv``.

    Returns
    -------
    The fitted GridSearchCV; scoring is negative mean squared error, so
    ``best_score_`` is <= 0 and values closer to zero are better.
    """
    search = GridSearchCV(estimator=estimator, param_grid=grid, cv=folds,
                          scoring='neg_mean_squared_error')
    search.fit(features, target)
    return search


# Load Bukayo Saka's data
player_data = pd.read_csv('bukayo_player_data.csv')

# Keep only the part of 'Season' before the first '/' so it can be used as a
# merge key and later cast to float
player_data['Season'] = player_data['Season'].str.split('/').str[0]

# Read team data
team_data = pd.read_csv('arsenal_data.csv')

# Same 'Season' normalisation as for the player data, so the merge keys match
team_data['Season'] = team_data['Season'].str.split('/').str[0]

# Merge Bukayo Saka's data with team data based on season and team.
# Overlapping column names get the default suffixes: _x = player, _y = team.
merged_data = pd.merge(player_data, team_data, on=['Season', 'Team'])

# Rows with zero (or missing) player appearances cannot be scaled per
# appearance: the division below would produce inf/NaN targets, which the
# regressor rejects at fit time. Drop them before building the targets.
merged_data = merged_data.loc[merged_data['Apps_x'] > 0].copy()

# Fail early, with an actionable message, if there is not enough data left to
# cross-validate (e.g. the merge keys did not match and the join came back empty).
if len(merged_data) < CV_FOLDS:
    raise ValueError(
        f"Need at least {CV_FOLDS} merged seasons with appearances for "
        f"{CV_FOLDS}-fold cross-validation, got {len(merged_data)}; "
        "check that 'Season' and 'Team' match between the two CSV files."
    )

# Prepare data for modeling: per-appearance rate scaled to the predicted
# number of appearances
predicted_apps = 37  # Define the predicted number of appearances
merged_data['xG_trend'] = merged_data['xG_x'] / merged_data['Apps_x'] * predicted_apps
merged_data['xA_trend'] = merged_data['xA_x'] / merged_data['Apps_x'] * predicted_apps

# Define features and target variables
# NOTE(review): the features include xG_x, xA_x and Apps_x, from which both
# targets are computed directly above, so the scores below largely measure how
# well the forest reproduces that formula — confirm this is intended.
X_columns = ['Season', 'xG_x', 'xA_x', 'Apps_x', 'Min_x', 'G_x', 'A_x',
             'Sh90_x', 'KP90_x', 'xG90_x', 'xA90_x', 'Apps_y', 'G_y', 'A_y',
             'Sh90_y', 'KP90_y', 'xG_y', 'xA_y', 'xG90_y', 'xA90_y']
X = merged_data[X_columns].astype(float)
y_xG = merged_data['xG_trend'].values
y_xA = merged_data['xA_trend'].values

# Set up parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create Random Forest models (fixed seed so the search is reproducible)
model_xG = RandomForestRegressor(random_state=42)
model_xA = RandomForestRegressor(random_state=42)

# Run the same grid search for each target
grid_search_xG = _run_grid_search(model_xG, X, y_xG, param_grid, CV_FOLDS)
grid_search_xA = _run_grid_search(model_xA, X, y_xA, param_grid, CV_FOLDS)

# Retrieve best parameters and best scores
best_params_xG = grid_search_xG.best_params_
best_score_xG = grid_search_xG.best_score_

best_params_xA = grid_search_xA.best_params_
best_score_xA = grid_search_xA.best_score_

print("Best parameters for xG model:", best_params_xG)
print("Best negative mean squared error for xG model:", best_score_xG)
print("Best parameters for xA model:", best_params_xA)
print("Best negative mean squared error for xA model:", best_score_xA)


Best parameters for xG model: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 1000}
Best negative mean squared error for xG model: -32.08880892841858
Best parameters for xA model: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best negative mean squared error for xA model: -7.562981889239015
