# Data Processing

In [14]:
import os
import pandas as pd
import time

Since the 24-25 Season has now concluded, the live fpl api will soon be wiped in preperation of the new season. As such, data for the rest of this project will use archived data from the entire season, from Vaastav's github repo: "https://github.com/vaastav/Fantasy-Premier-League"

In [None]:
print("--- Fetching Season Summary Data ---")
ARCHIVE_DIR = "archive_data/2024-25"
os.makedirs(ARCHIVE_DIR, exist_ok = True)

LOCAL_PATH_PS_DATA = os.path.join(ARCHIVE_DIR, "player_summaries.csv")

if os.path.exists(LOCAL_PATH_PS_DATA):
    print("Loading data from local archive...")
    df_fpl = pd.read_csv(LOCAL_PATH_PS_DATA)
else:
    print("Downloading archive...")
    archive_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/players_raw.csv"

if df_fpl.empty == False:
    print("Data Loaded.")
    print(df_fpl.head())
    df_fpl.to_csv(LOCAL_PATH_PS_DATA, index = False)
    print("Local copy saved.")
else:
    print("Error Loading Data")

Downloading archive...
Data downloaded successfully from GitHub.
Data Loaded.
   assists  birth_date  bonus  bps  can_select  can_transact  \
0        0         NaN      0    0       False          True   
1        2  1997-04-03      6  152       False          True   
2        2  1997-12-19      9  459       False          True   
3        3  1999-06-11     14  343       False          True   
4        0         NaN      0    0       False          True   

   chance_of_playing_next_round  chance_of_playing_this_round  clean_sheets  \
0                           0.0                           0.0             0   
1                           0.0                           0.0             2   
2                           0.0                           0.0            10   
3                         100.0                         100.0             7   
4                           0.0                           0.0             0   

   clean_sheets_per_90  ...  threat_rank_type  total_points  t

In [None]:
print("--- Fetching Gameweek Data ---")


gw_dataframes = {}
gameweeks = range(1,39)

for gameweek in gameweeks:
    local_gw_filename = f"gw_{gameweek}.csv"
    local_gw_path = os.path.join(ARCHIVE_DIR, 'gws', local_gw_filename)
    os.makedirs(os.path.join(ARCHIVE_DIR, 'gws'), exist_ok=True)

    archive_gw_url = f"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/gws/gw{gameweek}.csv"

    df_gw = pd.DataFrame()

    if os.path.exists(local_gw_path):
        print("Loading data from local archive...")
        df_gw = pd.read_csv(local_gw_path)
    if df_gw.empty:
        print("Downloading archive...")

        try:
            df_gw = pd.read_csv(archive_gw_url, on_bad_lines='warn', encoding='utf-8')
            print(f"Gameweek {gameweek} data downloaded successfully from GitHub.")
            
            df_gw.to_csv(local_gw_path, index=False)
            print(f"Local copy for GW{gameweek} saved to: {local_gw_path}")

        except Exception as e:
            print(f"Error downloading GW{gameweek} data: {e}")

    if df_gw.empty == False:
        # Store the DataFrame in dictionary
        gw_dataframes[gameweek] = df_gw
        print(f"Gameweek {gameweek} DataFrame added to 'gw_dataframes' dictionary (Shape: {df_gw.shape}).")
        
    else:
        print(f"Skipping Gameweek {gameweek}")

    time.sleep(0.5)        

--- Fetching Gameweek Data ---
Loading data from local archive...
Gameweek 1 DataFrame added to 'gw_dataframes' dictionary (Shape: (616, 41)).
Downloading archive...
Gameweek 2 data downloaded successfully from GitHub.
Local copy for GW2 saved to: archive_data/2024-25/gws/gw_2.csv
Gameweek 2 DataFrame added to 'gw_dataframes' dictionary (Shape: (627, 41)).
Downloading archive...
Gameweek 3 data downloaded successfully from GitHub.
Local copy for GW3 saved to: archive_data/2024-25/gws/gw_3.csv
Gameweek 3 DataFrame added to 'gw_dataframes' dictionary (Shape: (648, 41)).
Downloading archive...
Gameweek 4 data downloaded successfully from GitHub.
Local copy for GW4 saved to: archive_data/2024-25/gws/gw_4.csv
Gameweek 4 DataFrame added to 'gw_dataframes' dictionary (Shape: (659, 41)).
Downloading archive...
Gameweek 5 data downloaded successfully from GitHub.
Local copy for GW5 saved to: archive_data/2024-25/gws/gw_5.csv
Gameweek 5 DataFrame added to 'gw_dataframes' dictionary (Shape: (661,

At one point in the season, the manager chip was added, meaning that later gameweek csvs have more columns. Since the manager points could come in use for later analysis, the columns from gw 38 will be selected, and earlier weeks will have Na values added to those columns

In [16]:
print("--- Standardizing Columns Across Gameweek DataFrames (Using GW38 as Reference) ---")

# 1. Get the list of ALL columns from Gameweek 38
reference_columns = gw_dataframes[38].columns.tolist()
print(f"Reference columns from Gameweek {38} ({len(reference_columns)} columns)")

# 2. Iterate through all DataFrames in the dictionary and reindex them
standardized_gw_dataframes = {} # Create a new dictionary to store standardized DFs
for gw_num, df_gameweek in gw_dataframes.items():
    print(f"Standardizing Gameweek {gw_num} (Original columns: {len(df_gameweek.columns)})...")
    # Use reindex to align columns:
    # - Existing columns in reference_columns are kept (and reordered if needed).
    # - Columns in reference_columns that are missing are added and filled with NaN.
    # - Columns in df_gameweek NOT in reference_columns are dropped.
    df_standardized = df_gameweek.reindex(columns=reference_columns)
    standardized_gw_dataframes[gw_num] = df_standardized
    print(f"  -> Standardized GW {gw_num} now has {len(df_standardized.columns)} columns.")
    # Replace the original dictionary with the standardized one
    gw_dataframes = standardized_gw_dataframes
    print("\nColumn standardization complete for all loaded gameweek DataFrames.")

# --- Verification ---
print("\n--- Verification: Column Counts After Standardization ---")
for gw_num, df_gameweek in gw_dataframes.items():
    print(f"Gameweek {gw_num}: {len(df_gameweek.columns)} columns")
    # You can also check if all column sets are identical to the reference:
    # print(f"  Columns match GW{GW_REFERENCE}: {set(df_gameweek.columns) == set(reference_columns)}")

# Example: Check the head of an early gameweek (e.g., GW1) after standardization
if 1 in gw_dataframes:
    print(f"\nHead of Standardized Gameweek 1 DataFrame (expecting NaN for later columns):")
    display(gw_dataframes[1].head())
    print("\nNull counts for Gameweek 1 after reindexing:")
    display(gw_dataframes[1].isnull().sum()[gw_dataframes[1].isnull().sum() > 0])
else:
    print("\nGameweek 1 data not available for display.")

--- Standardizing Columns Across Gameweek DataFrames (Using GW38 as Reference) ---
Reference columns from Gameweek 38 (48 columns)
Standardizing Gameweek 1 (Original columns: 41)...
  -> Standardized GW 1 now has 48 columns.

Column standardization complete for all loaded gameweek DataFrames.
Standardizing Gameweek 2 (Original columns: 41)...
  -> Standardized GW 2 now has 48 columns.

Column standardization complete for all loaded gameweek DataFrames.
Standardizing Gameweek 3 (Original columns: 41)...
  -> Standardized GW 3 now has 48 columns.

Column standardization complete for all loaded gameweek DataFrames.
Standardizing Gameweek 4 (Original columns: 41)...
  -> Standardized GW 4 now has 48 columns.

Column standardization complete for all loaded gameweek DataFrames.
Standardizing Gameweek 5 (Original columns: 41)...
  -> Standardized GW 5 now has 48 columns.

Column standardization complete for all loaded gameweek DataFrames.
Standardizing Gameweek 6 (Original columns: 41)...
  -

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards
0,Alex Scott,MID,Bournemouth,1.6,0,0,11,0,12.8,77,...,1,1,0.0,2,0,0,0,50,False,0
1,Carlos Miguel dos Santos Pereira,GK,Nott'm Forest,2.2,0,0,0,0,0.0,427,...,1,1,0.0,0,0,0,0,45,True,0
2,Tomiyasu Takehiro,DEF,Arsenal,0.0,0,0,0,0,0.0,22,...,0,2,0.0,0,0,0,0,50,True,0
3,Malcolm Ebiowei,MID,Crystal Palace,0.0,0,0,0,0,0.0,197,...,1,2,0.0,0,0,0,0,45,False,0
4,Ben Brereton Díaz,MID,Southampton,1.0,0,0,-2,0,14.0,584,...,0,1,16.0,1,0,0,0,55,False,1



Null counts for Gameweek 1 after reindexing:


mng_clean_sheets     616
mng_draw             616
mng_goals_scored     616
mng_loss             616
mng_underdog_draw    616
mng_underdog_win     616
mng_win              616
dtype: int64

In [19]:
print("--- Merging Dataframes into one CSV ---")

MERGED_SEASON_CSV_PATH = os.path.join(ARCHIVE_DIR, "merged_gw_data.csv")

#prepare the dataframes
dfs_to_concat = []
for gw_num, df_gameweek in gw_dataframes.items():
    df_gameweek_copy = df_gameweek.copy()
    df_gameweek_copy['gameweek'] = gw_num # add a column to know og gw
    dfs_to_concat.append(df_gameweek_copy)

#merge
df_merged = pd.concat(dfs_to_concat, ignore_index=True)
print(f"Merged {len(dfs_to_concat)} gameweek DataFrames.")

#check new gameweek column
print("-- New DF --")
display(df_merged.head())
print("Gameweeks present in the merged DataFrame:")
print(df_merged['gameweek'].unique())

#save to csv
print(f"Saving merged DataFrame to: {MERGED_SEASON_CSV_PATH}")
df_merged.to_csv(MERGED_SEASON_CSV_PATH, index=False)
print("Merged season data saved successfully!")

--- Merging Dataframes into one CSV ---
Merged 38 gameweek DataFrames.
-- New DF --


Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,gameweek
0,Alex Scott,MID,Bournemouth,1.6,0,0,11,0,12.8,77,...,1,0.0,2,0,0,0,50,False,0,1
1,Carlos Miguel dos Santos Pereira,GK,Nott'm Forest,2.2,0,0,0,0,0.0,427,...,1,0.0,0,0,0,0,45,True,0,1
2,Tomiyasu Takehiro,DEF,Arsenal,0.0,0,0,0,0,0.0,22,...,2,0.0,0,0,0,0,50,True,0,1
3,Malcolm Ebiowei,MID,Crystal Palace,0.0,0,0,0,0,0.0,197,...,2,0.0,0,0,0,0,45,False,0,1
4,Ben Brereton Díaz,MID,Southampton,1.0,0,0,-2,0,14.0,584,...,1,16.0,1,0,0,0,55,False,1,1


Gameweeks present in the merged DataFrame:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38]
Saving merged DataFrame to: archive_data/2024-25/merged_gw_data.csv
Merged season data saved successfully!
