In [1]:
import pandas as pd

In [2]:
def add_fixture_difficulty(row, fixtures_dict):
    """Set ``row['fixture_difficulty']`` from a per-fixture lookup table.

    The fixture id is read from ``row['fixture']``. When ``row['was_home']``
    is truthy the ``'team_h_difficulty'`` entry of that fixture is used,
    otherwise the ``'team_a_difficulty'`` entry.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``'fixture'`` and ``'was_home'``. It is modified in
        place: the key ``'fixture_difficulty'`` is written onto it.
    fixtures_dict : dict
        Maps a fixture id to a dict containing ``'team_h_difficulty'`` and
        ``'team_a_difficulty'``.

    Returns
    -------
    The same ``row`` object that was passed in.

    Raises
    ------
    KeyError
        If the fixture id (or the difficulty key) is missing from
        ``fixtures_dict``.
    """
    match_id = row['fixture']
    side_key = 'team_h_difficulty' if row['was_home'] else 'team_a_difficulty'
    row['fixture_difficulty'] = fixtures_dict[match_id][side_key]
    return row

In [None]:
# read the raw merged gameweek export
merged_gws_24_25 = pd.read_csv('2425gw10.csv')

# keep only the columns the later cells work with
merged_gws_24_25 = merged_gws_24_25[[
    'element',  # player ID
    'name',
    'position',
    'GW',
    'total_points',
    'value',
    'minutes',
    'expected_goals',
    'expected_assists',
    'expected_goals_conceded',
    'goals_scored',
    'assists',
    'goals_conceded',
    'clean_sheets',
    'ict_index',
    'fixture',
    'was_home',  # 'fixture' + 'was_home' are kept to look up fixture difficulty below
]]

# build {fixture id: {'team_h_difficulty': ..., 'team_a_difficulty': ...}} from the fixtures file
fixtures_24_25 = pd.read_csv('../../data/raw/2024-25/fixtures.csv')
fixtures_dict_24_25 = (
    fixtures_24_25[['id', 'team_h_difficulty', 'team_a_difficulty']]
    .set_index('id')
    .T
    .to_dict()
)

# add the 'fixture_difficulty' column one row at a time;
# extra keyword arguments to DataFrame.apply are forwarded to the function
merged_gws_24_25 = merged_gws_24_25.apply(
    add_fixture_difficulty,
    axis=1,
    fixtures_dict=fixtures_dict_24_25,
)

# write the processed frame to disk, without the index column
merged_gws_24_25.to_csv('processed_2425gw10.csv', index=False)

In [4]:
# helper function to calculate rolling features
def calculate_rolling_features(df, window, min_p):
    """Add per-player rolling statistics computed from preceding rows.

    Rows are grouped by ``'element'``. Within each group every source column
    is shifted down by one row (so the current row never contributes to its
    own feature) and then aggregated over a rolling window. Rows are consumed
    in the order they already have in ``df``, so sort the frame beforehand if
    a particular chronological order is required.

    Columns added (``{w}`` is ``window``):

    ===========================  =========================  ===========
    new column                   source column              aggregation
    ===========================  =========================  ===========
    ``avg_score_last_{w}``       ``total_points``           mean
    ``avg_mins_last_{w}``        ``minutes``                mean
    ``goals_last_{w}``           ``goals_scored``           sum
    ``assists_last_{w}``         ``assists``                sum
    ``xG_last_{w}``              ``expected_goals``         sum
    ``xA_last_{w}``              ``expected_assists``       sum
    ``ict_last_{w}``             ``ict_index``              sum
    ``goals_conceded_last_{w}``  ``goals_conceded``         sum
    ``avg_xGC_last_{w}``         ``expected_goals_conceded``  mean
    ``clean_sheets_last_{w}``    ``clean_sheets``           sum
    ===========================  =========================  ===========

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'element'`` and every source column listed above.
        The new columns are written onto this frame in place.
    window : int
        Rolling window size, in rows.
    min_p : int
        ``min_periods`` for the rolling window; positions with fewer
        non-NaN observations produce NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    # (output column prefix, source column, rolling aggregation method name)
    feature_specs = (
        ('avg_score', 'total_points', 'mean'),
        ('avg_mins', 'minutes', 'mean'),
        ('goals', 'goals_scored', 'sum'),
        ('assists', 'assists', 'sum'),
        ('xG', 'expected_goals', 'sum'),
        ('xA', 'expected_assists', 'sum'),
        ('ict', 'ict_index', 'sum'),
        ('goals_conceded', 'goals_conceded', 'sum'),
        ('avg_xGC', 'expected_goals_conceded', 'mean'),
        ('clean_sheets', 'clean_sheets', 'sum'),
    )

    grouped = df.groupby('element')

    for prefix, source, how in feature_specs:
        # transform() always returns a Series aligned to df's own index, so the
        # assignment lands on the correct rows whatever the index looks like
        # and regardless of how the installed pandas handles group keys.
        # `how=how` binds the current aggregation name to this lambda.
        df[f'{prefix}_last_{window}'] = grouped[source].transform(
            lambda s, how=how: getattr(
                s.shift(1).rolling(window, min_periods=min_p), how
            )()
        )

    return df

In [5]:
# general function to facilitate the feature engineering process
def feature_engineering(df):
    """Build the rolling features for windows 3 and 1 and drop incomplete rows.

    Steps:

    1. Sort by ``'element'`` then ``'GW'`` so rolling windows see each
       player's rows in gameweek order.
    2. Add rolling features with ``window=3, min_p=2`` and then with
       ``window=1, min_p=1`` via ``calculate_rolling_features``.
    3. Drop every row that contains a NaN in *any* column (this includes
       each player's earliest rows, which have no history to roll over).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'element'``, ``'GW'`` and the columns required by
        ``calculate_rolling_features``. The caller's frame is not modified,
        because ``sort_values`` returns a new frame that all later steps
        operate on.

    Returns
    -------
    pandas.DataFrame
        A new frame with both sets of rolling feature columns and no NaN
        values.
    """
    # sort by player and GW to ensure proper rolling calculations
    ordered = df.sort_values(by=['element', 'GW'])

    # Feed each call the frame returned by the previous one, so the window-3
    # columns are kept whether or not the helper mutates its argument.
    fe_df = calculate_rolling_features(ordered, window=3, min_p=2)
    fe_df = calculate_rolling_features(fe_df, window=1, min_p=1)

    # drop invalid rows (with NaN values)
    return fe_df.dropna()

In [6]:
# read the processed gameweek data back from disk
processed_merged_gws_24_25 = pd.read_csv(filepath_or_buffer='processed_2425gw10.csv')

# add the rolling features
feature_engineered_24_25 = feature_engineering(df=processed_merged_gws_24_25)

# write the engineered frame to disk, without the index column
feature_engineered_24_25.to_csv(path_or_buf='engineered_2425gw10.csv', index=False)

In [7]:
def feature_selection(df):
    """Return the modelling columns of ``df`` under their final names.

    Selects a fixed list of identifier, target, rolling-feature and fixture
    columns (in the order listed below) and renames three of them:
    ``'element'`` -> ``'player_id'``, ``'name'`` -> ``'full_name'`` and
    ``'total_points'`` -> ``'gameweek_points'``. No rows are filtered out
    (rows with zero minutes are kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the selection list.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected, renamed columns; ``df``
        itself is left unchanged.

    Raises
    ------
    KeyError
        If any selected column is missing from ``df``.
    """
    selected_columns = [
        'element',
        'name',
        'position',
        'GW',
        'total_points',
        'value',
        'avg_score_last_1',
        'avg_score_last_3',
        'ict_last_1',
        'ict_last_3',
        'avg_mins_last_1',
        'avg_mins_last_3',
        'xG_last_1',
        'xG_last_3',
        'xA_last_1',
        'xA_last_3',
        'clean_sheets_last_1',
        'clean_sheets_last_3',
        'fixture_difficulty',
    ]
    final_names = {
        'element': 'player_id',
        'name': 'full_name',
        'total_points': 'gameweek_points',
    }

    subset = df[selected_columns]
    return subset.rename(columns=final_names)

In [8]:
# read the feature-engineered data back from disk
feature_engineered_24_25 = pd.read_csv(filepath_or_buffer='engineered_2425gw10.csv')

# keep and rename the modelling columns
feature_selected_24_25 = feature_selection(df=feature_engineered_24_25)

# write the selected features to disk, without the index column
feature_selected_24_25.to_csv(path_or_buf='selected_2425gw10.csv', index=False)