In [None]:
import os
import re
import pandas as pd
from IPython.display import display  # For displaying the DataFrame in Google Colab

# Per-gameweek DataFrames, appended in (season, gameweek) order
dfs = []

# Maps each season that contributed at least one file to the set of column names seen in its files
unique_columns_per_season = {}

# Root folder holding one sub-folder per season. The FPL_DATA_DIR environment variable,
# when set, overrides the default location so the notebook can run on another machine.
root_dir = os.environ.get('FPL_DATA_DIR', '/Users/takoda/Documents/FPL/data/')
seasons = [
    '2016-17', '2017-18', '2018-19',
    '2019-20', '2020-21', '2021-22',
    '2022-23', '2023-24'
]

# Matches exactly "gw<digits>.csv": the dot is escaped and the whole name must match,
# so names such as "gw1.csv.bak" are ignored instead of breaking the gameweek parsing.
gw_file_pattern = re.compile(r'gw(\d+)\.csv')

print("Starting to read files...")

for season in seasons:
    season_dir = os.path.join(root_dir, season, 'gws')

    # Skip seasons whose folder is absent
    if not os.path.isdir(season_dir):
        print(f"Skipping missing folder: {season_dir}")
        continue

    print(f"Processing folder: {season_dir}")

    # Column names seen in this season's files
    unique_columns_this_season = set()

    # os.listdir returns names in arbitrary order; pair each matching file with its
    # gameweek number and sort numerically so rows are appended in gameweek order.
    gw_files = []
    for gw_file in os.listdir(season_dir):
        match = gw_file_pattern.fullmatch(gw_file)
        if match:
            gw_files.append((int(match.group(1)), gw_file))
    gw_files.sort()

    for gameweek, gw_file in gw_files:
        filepath = os.path.join(season_dir, gw_file)

        # Read with the default encoding first, then retry as ISO-8859-1
        try:
            df = pd.read_csv(filepath)
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(filepath, encoding='ISO-8859-1')
            except Exception as e:
                print(f"Could not read {filepath}. Error: {e}")
                continue

        # Record the file's own columns before the helper columns are added
        unique_columns_this_season.update(df.columns.tolist())

        # Tag every row with its season and gameweek
        df['season'] = season
        df['GW'] = gameweek

        dfs.append(df)

    # Only seasons that produced data take part in the column comparison; an empty
    # set would otherwise make the intersection across seasons empty.
    if unique_columns_this_season:
        unique_columns_per_season[season] = unique_columns_this_season
    else:
        print(f"No gameweek files read from: {season_dir}")

# Fail with a clear message instead of an opaque error from pd.concat / set.intersection
if not dfs:
    raise ValueError(f"No gameweek CSV files were found under {root_dir}")

# Concatenate all the DataFrames to form the final DataFrame
print("Concatenating all DataFrames...")
all_data = pd.concat(dfs, ignore_index=True)

# Columns present in every season, and per season the columns not shared by all seasons
all_seasons_columns = set.intersection(*unique_columns_per_season.values())
unique_columns = {season: cols - all_seasons_columns for season, cols in unique_columns_per_season.items()}

print(f"Similar columns across all seasons: {all_seasons_columns}")
for season, cols in unique_columns.items():
    print(f"Unique columns in season {season}: {cols}")


In [None]:
# Keep only the columns shared by every season, plus the added 'season' and 'GW' columns.
# Sorting gives the same column order on every run (iterating a set of strings does not),
# and removing 'season'/'GW' from the shared part prevents selecting either column twice.
added_columns = ['season', 'GW']
final_columns = sorted(all_seasons_columns - set(added_columns)) + added_columns
all_data = all_data[final_columns]

In [None]:
# Columns currently in all_data that are not listed in final_columns
columns_to_remove = list(filter(lambda col: col not in final_columns, all_data.columns))

# Drop them in place; nothing happens when the list is empty
if len(columns_to_remove) > 0:
    all_data.drop(columns=columns_to_remove, inplace=True)


In [None]:
# Fixture-level columns to discard
fixture_columns = ['kickoff_time', 'team_h_score', 'fixture', 'team_a_score', 'round']

# Drop only the columns that are present, so the cell can be re-run and does not raise
# KeyError when one of them was not shared by every season; report any that are absent.
missing_fixture_columns = [col for col in fixture_columns if col not in all_data.columns]
if missing_fixture_columns:
    print(f"Columns not present, nothing to drop: {missing_fixture_columns}")
all_data = all_data.drop(columns=[col for col in fixture_columns if col in all_data.columns])

In [None]:
# Convert every column except 'name', 'season', and 'was_home' to numeric;
# values that cannot be parsed become NaN (errors='coerce').
non_numeric_columns = ['name', 'season', 'was_home']
cols_to_convert = [col for col in all_data.columns if col not in non_numeric_columns]
all_data[cols_to_convert] = all_data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert 'was_home' to boolean. astype(bool) on its own turns every non-empty string
# into True, including the text "False", so textual false values are mapped to False
# first. Non-string values (real booleans, numbers, NaN) keep the astype(bool) result.
false_strings = {'', 'false', 'f', '0', 'no', 'n'}
all_data['was_home'] = all_data['was_home'].map(
    lambda value: value.strip().lower() not in false_strings if isinstance(value, str) else value
).astype(bool)

In [None]:
import pandas as pd

# Write all_data to 'all_data.csv' in the working directory, without the row index
all_data.to_csv(path_or_buf='all_data.csv', index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

# Summary statistics of all_data
summary_stats = all_data.describe()
print(summary_stats)

# Names of the numeric columns
numeric_frame = all_data.select_dtypes(include=[np.number])
numeric_features = numeric_frame.columns.tolist()

# Heatmap of the pairwise correlations between the numeric columns
corrmat = all_data[numeric_features].corr()
plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt='.2f', cmap='coolwarm')

# NaN count per column, restricted to columns with at least one NaN, in ascending order
nan_counts = all_data.isnull().sum()
missing_values_count = nan_counts[nan_counts > 0].sort_values()

# Bar chart of the NaN counts, or a message when every column is complete
if missing_values_count.empty:
    print("No missing values to plot.")
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=missing_values_count.index, y=missing_values_count.values, palette='viridis')
    plt.title('Missing Values Count by Column')
    plt.xlabel('Columns')


plt.show()

In [None]:
import pandas as pd

# Load the team lookup table stored in the data root folder
team_df = pd.read_csv(os.path.join(root_dir, 'master_team_list.csv'))

# Left-join team_df onto all_data, matching (season, opponent_team) to (season, team).
# NOTE(review): presumably this brings in the opponent's team name - confirm against the CSV schema.
all_data = pd.merge(all_data, team_df, left_on=['season', 'opponent_team'], right_on=['season', 'team'], how='left')

# Order rows by season, then gameweek, so every rolling window below runs over matches in
# chronological order (season labels of the form 'YYYY-YY' sort chronologically as strings).
# drop=True keeps the old row labels from being inserted as a spurious 'index' column,
# which would otherwise be picked up as a feature by later cells.
all_data = all_data.sort_values(['season', 'GW']).reset_index(drop=True)

# List of columns to compute rolling averages for
cols_to_average = [
    "bonus", "transfers_balance", "ict_index",
    "penalties_saved", "opponent_team", "minutes", "clean_sheets",
    "value", "was_home", "saves", "transfers_in", "influence",
    "penalties_missed", "assists", "goals_conceded", "threat",
    "own_goals", "transfers_out", "red_cards", "goals_scored",
    "bps", "creativity", "selected", "yellow_cards"
]

# Window lengths, in rows per group, used for every rolling average
rolling_windows = (5, 10)

# Per-player rolling means. shift(1) moves each mean down one row so a row only sees
# earlier rows of its group, never its own value.
# NOTE(review): players are keyed by 'element' here but by 'name' in the lagged-feature
# cell - confirm that an element id refers to the same player in every season.
for col in cols_to_average:
    for window in rolling_windows:
        all_data[f"{col}_rolling_{window}"] = all_data.groupby("element")[col].transform(
            lambda x, window=window: x.rolling(window=window, min_periods=1).mean().shift(1)
        )

# Rolling means of 'total_points' for each (player, opponent) pair, again shifted by one
# row. One grouped transform replaces a nested Python loop over every player and opponent
# that re-scanned the whole frame on each iteration.
for window in rolling_windows:
    all_data[f"total_opp_points_rolling_{window}"] = all_data.groupby(
        ["element", "opponent_team"]
    )["total_points"].transform(
        lambda x, window=window: x.rolling(window=window, min_periods=1).mean().shift(1)
    )

# all_data now has the new columns 'total_opp_points_rolling_5' and 'total_opp_points_rolling_10'


Big data creation: lagged, interaction, and polynomial features

In [None]:
# Identifier/context columns that are not lagged; every other column is "dynamic"
static_columns = ['name', 'element', 'season', 'GW', 'team', 'team_name', 'opponent_team', 'was_home']
all_columns = all_data.columns.tolist()
dynamic_columns = [col for col in all_columns if col not in static_columns]  # 'total_points' is included

# Shift all dynamic columns down by one row within each 'name' group in a single pass,
# then store each shifted column under a 'lagged_' prefix
lagged_values = all_data.groupby('name')[dynamic_columns].shift(1)
for col in dynamic_columns:
    all_data[f'lagged_{col}'] = lagged_values[col]


In [None]:
# Rolling means of 'lagged_total_points' per 'name' over 5- and 10-row windows.
# The input column is already shifted by one row and the result is shifted once more.
for window_size in (5, 10):
    lagged_points_by_name = all_data.groupby('name')['lagged_total_points']
    all_data[f'lagged_total_points_rolling_{window_size}'] = lagged_points_by_name.transform(
        lambda points, w=window_size: points.rolling(window=w, min_periods=1).mean().shift(1)
    )

In [None]:
from itertools import combinations
import numpy as np
import pandas as pd

from itertools import combinations

def create_interaction_terms(df, columns, max_order=2):
    """Build product ("interaction") features from combinations of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the numeric columns to combine.
    columns : iterable of str
        Names of the columns of ``df`` to combine.
    max_order : int, default 2
        Largest number of columns multiplied together; every order from 2 up to
        ``max_order`` is generated.

    Returns
    -------
    pandas.DataFrame
        One column per combination, named by joining the member column names with
        ``'_x_'`` and indexed like ``df``. The frame has no columns when fewer than
        two columns are given or ``max_order`` is below 2.
    """
    # Materialise once so a one-shot iterable can be reused for every order
    columns = list(columns)
    all_interactions = {}

    for order in range(2, max_order + 1):  # From 2-way to max_order-way interactions
        for combo in combinations(columns, order):
            col_name = '_x_'.join(combo)
            # skipna=False makes a missing factor give a missing product; the default
            # skips NaN, which silently treats the missing factor as 1.
            all_interactions[col_name] = df[list(combo)].prod(axis=1, skipna=False)

    if not all_interactions:
        # Keep the row index so the empty result still lines up with df
        return pd.DataFrame(index=df.index)

    return pd.DataFrame(all_interactions)

def create_polynomial_and_other_features(df, column):
    """Derive power, root and log transforms of a single numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the numeric column to transform.

    Returns
    -------
    pandas.DataFrame
        Five columns named ``<column>_squared``, ``<column>_cubed``,
        ``<column>_sqrt``, ``<column>_cbrt`` and ``<column>_log``, indexed like ``df``.

    Notes
    -----
    The square root is NaN for negative inputs, and the log transform is NaN below
    -1 and -inf at exactly -1 (NumPy semantics); the cube root is defined for
    negative inputs.
    """
    values = df[column]
    polynomial_features = {
        f'{column}_squared': values ** 2,
        f'{column}_cubed': values ** 3,
        f'{column}_sqrt': np.sqrt(values),
        f'{column}_cbrt': np.cbrt(values),
        # log(1 + x): defined at x == 0, and log1p stays accurate for very small x
        # where computing x + 1 first would round the input away
        f'{column}_log': np.log1p(values)
    }
    return pd.DataFrame(polynomial_features)


# Lagged counterparts of the dynamic columns, leaving out the rolling-average columns
lagged_cols = ['lagged_' + col for col in dynamic_columns if 'rolling' not in col]

# Products, powers and roots are only defined for numeric columns
numeric_lagged_cols = [col for col in lagged_cols if pd.api.types.is_numeric_dtype(all_data[col])]

# interaction_df and polynomial_df must be defined before the concat below, otherwise the
# cell fails on a fresh kernel.
# NOTE(review): both are built from the lagged columns with the helpers defined in this
# cell - confirm this is the intended column set; the pairwise products grow
# quadratically with the number of columns.
interaction_df = create_interaction_terms(all_data, numeric_lagged_cols)
if numeric_lagged_cols:
    polynomial_df = pd.concat(
        [create_polynomial_and_other_features(all_data, col) for col in numeric_lagged_cols],
        axis=1,
    )
else:
    polynomial_df = pd.DataFrame(index=all_data.index)

# Concatenate all
all_data = pd.concat([all_data, interaction_df, polynomial_df], axis=1)


In [None]:
# Remove, in place, every row that has a missing value in any column
# (the alternative would be to fill the gaps, e.g. with fillna(0))
all_data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Show the dtype of each column
column_dtypes = all_data.dtypes
print(column_dtypes)


Save to CSV so we only have to do this once

In [None]:
# Identifier/context columns that are always kept
allowed_columns = ["name", "element", "opponent_team", "was_home", "season", "GW", "team_name"]

# Keep a column when it is explicitly allowed or when its name contains 'lagged'
columns_to_keep = []
for col in all_data.columns:
    if col in allowed_columns or 'lagged' in col:
        columns_to_keep.append(col)

# Filtered view of the data; all_data itself is left unchanged
all_data_filtered = all_data[columns_to_keep]


In [None]:
# Columns to remove from all_data
columns_to_drop = ["value", "influence", "ict_index", "creativity", "threat",
                   "bonus", "bps", "minutes", "clean_sheets", "goals_scored", "assists"]

# Drop only the columns that are still present so re-running the cell does not raise
# KeyError; report any that are absent rather than ignoring them silently.
columns_already_absent = [col for col in columns_to_drop if col not in all_data.columns]
if columns_already_absent:
    print(f"Columns not present, nothing to drop: {columns_already_absent}")
all_data = all_data.drop(columns=[col for col in columns_to_drop if col in all_data.columns])

# Now, all_data will not have the specified columns.


In [None]:
import pandas as pd
# Write all_data to 'all_data.csv' in the working directory, without the row index
all_data.to_csv(path_or_buf='all_data.csv', index=False)

In [None]:
# List every column name of all_data, one per line
for column_name in all_data.columns:
    print(column_name)