##Part 1: Import Libraries and Unzip the Data

In [9]:
# Import required libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

##Part 2: Load the Data

In [13]:
# Load the game week data
gw_path = 'FPL/data/2023-24'  # Adjust this to the correct path where your 2023-24 game week data resides
gw_files = [f for f in os.listdir(gw_path) if os.path.isfile(os.path.join(gw_path, f))]
gw_dfs = [pd.read_csv(os.path.join(gw_path, f)) for f in gw_files]
merged_gw_df = pd.concat(gw_dfs, ignore_index=True)

# Load the teams data
teams_2023_24_df = pd.read_csv('FPL/data/master_team_list.csv')  # Adjust this to the correct path where your teams data resides


UnicodeDecodeError: ignored

##Part 3: Data Preprocessing and Merge


In [None]:
# Merge the game week data with the team data based on the 'team' column
merged_gw_df = pd.merge(merged_gw_df, teams_2023_24_df, left_on='team', right_on='id', how='left')

# Fill missing 'form' values with zero
merged_gw_df['form'].fillna(0, inplace=True)



##Part 4: Feature Engineering - Rolling Averages and Lag Variables

In [None]:
# Generate rolling averages and lag variables for key metrics
def generate_rolling_and_lag(df, column, window=3):
    df = df.sort_values(by=['name', 'GW'])
    df[f'{column}_rolling_avg_{window}'] = df.groupby('name')[column].transform(lambda x: x.rolling(window=window, min_periods=1).mean())
    df[f'{column}_lag_1'] = df.groupby('name')[column].shift(1)
    return df

key_metrics = ['total_points', 'assists', 'goals_scored', 'minutes', 'bps']
for metric in key_metrics:
    merged_gw_df = generate_rolling_and_lag(merged_gw_df, metric)


##Part 5: Additional Features and One-Hot Encoding

In [None]:
# Generate additional features
positional_strength_multiplier = merged_gw_df.groupby('position')['total_points'].transform('mean')
merged_gw_df['positional_strength'] = merged_gw_df['total_points'] * positional_strength_multiplier
merged_gw_df['player_team_interaction'] = merged_gw_df['total_points_rolling_avg_3'] * merged_gw_df['strength_overall_home']
merged_gw_df.fillna(0, inplace=True)

# One-hot encode categorical variables
merged_gw_df_encoded = pd.get_dummies(merged_gw_df, columns=['position', 'team'], drop_first=True)

##Part 6: Feature Selection Using Random Forest

In [None]:
# Identify non-numeric columns and remove them
non_numeric_columns = merged_gw_df_encoded.select_dtypes(include=['object']).columns.tolist()
X = merged_gw_df_encoded.drop(non_numeric_columns + ['total_points'], axis=1)
y = merged_gw_df_encoded['total_points']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Regressor for feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
feature_importances = pd.DataFrame(rf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)
