# DATA LOADING

In [1]:
import pandas as pd
import requests
import os
import json
import time

## Data Collection - GitHub Archive

Since the 2024-25 season has now concluded, the live FPL API will soon be wiped in preparation for the new season. As such, the rest of this project uses archived data covering the entire season, taken from Vaastav's GitHub repo: "https://github.com/vaastav/Fantasy-Premier-League"

### Raw Player Data

In [2]:
print("--- Fetching Season Summary Data ---")
ARCHIVE_DIR = "archive_data/2024-25"
os.makedirs(ARCHIVE_DIR, exist_ok = True)

LOCAL_PATH_PS_DATA = os.path.join(ARCHIVE_DIR, "player_summaries.csv")

# Start from an empty frame so df_fpl is always defined, even when neither the
# local cache nor the download yields data (previously a fresh run with no
# cache hit a NameError because the download was never performed).
df_fpl = pd.DataFrame()
downloaded = False

if os.path.exists(LOCAL_PATH_PS_DATA):
    print("Loading data from local archive...")
    df_fpl = pd.read_csv(LOCAL_PATH_PS_DATA)

# Fall back to the remote archive when there is no usable local copy.
if df_fpl.empty:
    print("Downloading archive...")
    archive_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/players_raw.csv"

    try:
        df_fpl = pd.read_csv(archive_url)
        downloaded = True
    except Exception as e:
        # Best-effort: report the failure and let the check below print the error state.
        print(f"Error downloading season summary data: {e}")

if not df_fpl.empty:
    print("Data Loaded.")
    print(df_fpl.head())
    # Only write the cache after a fresh download; re-saving a frame that was
    # just read from the same file is redundant work.
    if downloaded:
        df_fpl.to_csv(LOCAL_PATH_PS_DATA, index = False)
        print("Local copy saved.")
else:
    print("Error Loading Data")

--- Fetching Season Summary Data ---
Loading data from local archive...
Data Loaded.
   assists  birth_date  bonus  bps  can_select  can_transact  \
0        0         NaN      0    0       False          True   
1        2  1997-04-03      6  152       False          True   
2        2  1997-12-19      9  459       False          True   
3        3  1999-06-11     14  343       False          True   
4        0         NaN      0    0       False          True   

   chance_of_playing_next_round  chance_of_playing_this_round  clean_sheets  \
0                           0.0                           0.0             0   
1                           0.0                           0.0             2   
2                           0.0                           0.0            10   
3                         100.0                         100.0             7   
4                           0.0                           0.0             0   

   clean_sheets_per_90  ...  threat_rank_type  total_po

### Gameweek Data

In [3]:
print("--- Fetching Gameweek Data ---")
gw_dataframes = {}
gameweeks = range(1,39)

# The cache folder is the same for every gameweek, so create it once up front.
GW_DIR = os.path.join(ARCHIVE_DIR, 'gws')
os.makedirs(GW_DIR, exist_ok=True)

for gameweek in gameweeks:
    local_gw_filename = f"gw_{gameweek}.csv"
    local_gw_path = os.path.join(GW_DIR, local_gw_filename)

    archive_gw_url = f"https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/gws/gw{gameweek}.csv"

    df_gw = pd.DataFrame()
    hit_network = False

    if os.path.exists(local_gw_path):
        print("Loading data from local archive...")
        try:
            df_gw = pd.read_csv(local_gw_path)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
            # A truncated/corrupt cache file should not abort the whole loop;
            # leave df_gw empty so the download branch below replaces it.
            print(f"Local copy for GW{gameweek} is unreadable ({e}); re-downloading.")

    if df_gw.empty:
        print("Downloading archive...")
        hit_network = True

        try:
            df_gw = pd.read_csv(archive_gw_url, on_bad_lines='warn', encoding='utf-8')
            print(f"Gameweek {gameweek} data downloaded successfully from GitHub.")

            df_gw.to_csv(local_gw_path, index=False)
            print(f"Local copy for GW{gameweek} saved to: {local_gw_path}")

        except Exception as e:
            print(f"Error downloading GW{gameweek} data: {e}")

    if not df_gw.empty:
        # Store the DataFrame in dictionary
        gw_dataframes[gameweek] = df_gw
        print(f"Gameweek {gameweek} DataFrame added to 'gw_dataframes' dictionary (Shape: {df_gw.shape}).")

    else:
        print(f"Skipping Gameweek {gameweek}")

    # Pause only between remote requests; cache hits need no rate limiting.
    if hit_network:
        time.sleep(0.5)

--- Fetching Gameweek Data ---
Loading data from local archive...
Gameweek 1 DataFrame added to 'gw_dataframes' dictionary (Shape: (616, 41)).
Loading data from local archive...
Gameweek 2 DataFrame added to 'gw_dataframes' dictionary (Shape: (627, 41)).
Loading data from local archive...
Gameweek 3 DataFrame added to 'gw_dataframes' dictionary (Shape: (648, 41)).
Loading data from local archive...
Gameweek 4 DataFrame added to 'gw_dataframes' dictionary (Shape: (659, 41)).
Loading data from local archive...
Gameweek 5 DataFrame added to 'gw_dataframes' dictionary (Shape: (661, 41)).
Loading data from local archive...
Gameweek 6 DataFrame added to 'gw_dataframes' dictionary (Shape: (664, 41)).
Loading data from local archive...
Gameweek 7 DataFrame added to 'gw_dataframes' dictionary (Shape: (666, 41)).
Loading data from local archive...
Gameweek 8 DataFrame added to 'gw_dataframes' dictionary (Shape: (667, 41)).
Loading data from local archive...
Gameweek 9 DataFrame added to 'gw_data

## Data Collection - PL API

In [4]:
# Base endpoint of the Fantasy Premier League API; endpoint paths are appended to it.
data_url = "https://fantasy.premierleague.com/api/"

# Local folder for downloaded API data, created if it does not exist yet.
data_dir = "raw_data"
os.makedirs(name=data_dir, exist_ok=True)

In [5]:
#Fetch & Save Data
def get_save_data(url, filename, subdir = None, timeout = 30):
    """Fetch JSON from ``url`` and save it to disk under ``data_dir``.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.
    filename : str
        Name of the JSON file to write.
    subdir : str, optional
        Sub-folder of ``data_dir`` to write into; created when missing.
        When omitted the file is written directly into ``data_dir``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises instead of
        blocking the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    The decoded JSON payload on success, or ``None`` if the request, the
    JSON decoding, or the file write raised an exception (the error is
    printed, not re-raised).

    Notes
    -----
    Sleeps for one second before returning, on success and on failure alike,
    to space out consecutive requests.
    """
    target_dir = os.path.join(data_dir, subdir) if subdir else data_dir
    # Ensure the destination exists on both paths so the write below cannot
    # fail just because the folder was removed after the setup cell ran.
    os.makedirs(target_dir, exist_ok = True)
    filepath = os.path.join(target_dir, filename)

    print(f"Fetching data from: {url}")

    try:
        response = requests.get(url, timeout = timeout)
        response.raise_for_status()
        data = response.json()

        with open(filepath, 'w', encoding = "utf-8") as f:
            json.dump(data, f, indent = 5)
        print(f"Data saved to: {filepath}")
        return data
    except Exception as e:
        # Best-effort collection: report and signal failure via None so the
        # caller can continue with the next item.
        print(f"Error occurred at {url}: {e}")
        return None
    finally:
        time.sleep(1)

In [6]:
# Collect Season Data: fetch the bootstrap-static payload and store it as static_data.json.
bootstrap_url = f"{data_url}bootstrap-static/"
static_data = get_save_data(bootstrap_url, "static_data.json")

# get_save_data returns None on failure, so truthiness tells us whether it worked.
print("Saved" if static_data else "Error getting data")

Fetching data from: https://fantasy.premierleague.com/api/bootstrap-static/
Data saved to: raw_data/static_data.json
Saved


In [7]:
# Collect Player Details: one element-summary request per player id.
if not static_data:
    print("No data was collected")
else:
    all_player_ids = [element['id'] for element in static_data.get('elements', [])]
    print(f"Found {len(all_player_ids)} players")

    # Only the first 50 players are fetched to keep download times short.
    # For the full set, use: players_to_collect = all_player_ids
    players_to_collect = all_player_ids[:50]

    collect_count = 0
    for player_id in players_to_collect:
        filename = f"player_{player_id}_details.json"
        player_data = get_save_data(
            f"{data_url}element-summary/{player_id}/",
            filename,
            subdir = "player_details",
        )
        # Count only the requests that actually returned data.
        collect_count += 1 if player_data else 0
        time.sleep(0.1)

    print(f"\nCollected Details for {collect_count} players")
    

Found 804 players
Fetching data from: https://fantasy.premierleague.com/api/element-summary/1/
Data saved to: raw_data/player_details/player_1_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/2/
Data saved to: raw_data/player_details/player_2_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/3/
Data saved to: raw_data/player_details/player_3_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/4/
Data saved to: raw_data/player_details/player_4_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/5/
Data saved to: raw_data/player_details/player_5_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/6/
Data saved to: raw_data/player_details/player_6_details.json
Fetching data from: https://fantasy.premierleague.com/api/element-summary/7/
Data saved to: raw_data/player_details/player_7_details.json
Fetching data fr