In [1]:
# Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

save_initial = False

In [2]:
# Aggregate all seasons into one directory
if save_initial:
    input_paths    = [f"./data/20{i}-{i+1}/players/" for i in range(16, 23)]
    init_path      = "./data/2023-24/players/"
    save_path      = "./data/2016-24/"
    for i in os.listdir(init_path):
        if i[0] == '.':
            continue
        df         = pd.read_csv(os.path.join(init_path, i, "gw.csv"))
        for j in input_paths:
            for k in os.listdir(j):
                if k.split("_")[:-1] != i.split("_")[:-1]:
                    continue
                d2 = pd.read_csv(os.path.join(j, k, "gw.csv"))
                df = pd.concat([df, d2], ignore_index=True)
        df.to_csv(os.path.join(save_path, f'{"_".join(i.split("_")[:-1])}.csv'), index=False)

In [3]:
# Make cummulative frequency curve of games listed per player
# Figure out what columns are not present in all dfs

if save_initial:
    dfs            = []
    cols           = set()
    not_all        = set()
    for i in os.listdir(save_path):
        dfs       += [pd.read_csv(os.path.join(save_path, i))]
        cols       = cols.union(set(dfs[-1].columns))
        not_all    = not_all.union(set(cols).difference(set(dfs[-1].columns)))

    numGames       = [len(i) for i in dfs]
    total          = max(numGames)
    freq           = [0] * (total + 1)
    for i in numGames:
        freq[i]   += 1
    temp           = 0
    for i in range(len(frequency) - 1, -1, -1):
        freq[i]   += temp
        temp       = freq[i]
    plt.plot(range(len(freq)), freq)

    print(not_all)

In [4]:
# Create training dataset: 
    #    train.csv containing (nx5m values), where each row is a flattened stat summary from past 5 games for one player
    #    labels.csv containing (nx1) values, total points in next game for each player
    
not_all      = {'target_missed', 'fouls', 'recoveries', 'key_passes', 
                'clearances_blocks_interceptions', 'big_chances_created', 
                'errors_leading_to_goal_attempt', 'completed_passes', 'tackled', 
                'kickoff_time_formatted', 'attempted_passes', 'tackles', 'offside', 
                'big_chances_missed', 'loaned_in', 'errors_leading_to_goal', 'penalties_conceded', 
                'loaned_out', 'winning_goals', 'open_play_crosses', 'dribbles', 'id', 'ea_index'}
columnNames  = ['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'element',
                'expected_assists', 'expected_goal_involvements', 'expected_goals',
                'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
                'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
                'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
                'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
                'threat', 'total_points', 'transfers_balance', 'transfers_in',
                'transfers_out', 'value', 'was_home', 'yellow_cards',
                'attempted_passes', 'big_chances_created', 'big_chances_missed',
                'clearances_blocks_interceptions', 'completed_passes', 'dribbles',
                'ea_index', 'errors_leading_to_goal', 'errors_leading_to_goal_attempt',
                'fouls', 'id', 'key_passes', 'kickoff_time_formatted', 'loaned_in',
                'loaned_out', 'offside', 'open_play_crosses', 'penalties_conceded',
                'recoveries', 'tackled', 'tackles', 'target_missed', 'winning_goals']
drop_columns = {'kickoff_time', 'transfers_balance', 'fixture', 'creativity', 'element',
                'expected_assists', 'expected_goal_involvements', 'expected_goals',
                'fixture', 'ict_index', 'influence', 'starts', 'team_a_score', 
                'threat', 'team_h_score', 'expected_goals_conceded', 'opponent_team', 'was_home'}.union(not_all)

columnNames  = [i for i in columnNames if i not in drop_columns]
print(columnNames)

['assists', 'bonus', 'bps', 'clean_sheets', 'goals_conceded', 'goals_scored', 'minutes', 'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves', 'selected', 'total_points', 'transfers_in', 'transfers_out', 'value', 'yellow_cards']


In [31]:
input_path       = "./data/2023-24/players"
players          = set(["_".join(i.split("_")[:-1]) for i in os.listdir(input_path) if i[0] != '.'])
cleaner_map      = {" ".join(i.split("_")) : i for i in players}

position_map     = {i : set() for i in ["FWD", "MID", "DEF", "GK"]}
input_path       = "./data/2023-24/gws/"
for game in os.listdir(input_path):
    if game[:2] != "gw":
        continue
    df           = pd.read_csv(os.path.join(input_path, game))
    for row in range(len(df)):
        row = df.iloc[row]
        position_map[row["position"]].add(cleaner_map[row["name"]])
        players  = players.difference({cleaner_map[row["name"]]})
    if not players:
        break
print([(i, len(position_map[i])) for i in position_map])
print(players)

[('FWD', 104), ('MID', 354), ('DEF', 261), ('GK', 93)]
set()


In [37]:
for position, players in position_map.items():
    # NaN analysis
    input_path       = "./data/2016-24/"
    dfs              = [pd.read_csv(os.path.join(input_path, i + ".csv")).loc[:, columnNames] 
                        for i in players]
    megaDf           = pd.concat(dfs, axis=0, ignore_index=True)
    counts           = megaDf.isna().sum()
    assert counts.sum() == 0 # Insert NaN handling if NaNs

    # Normalization
    save_path        = f"./data/2016-24_processed/{position}/"
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    means            = {col:count for col, count in megaDf.mean().items()}
    sds              = {col:count for col, count in megaDf.std().items()}
    print(position)
    print(pd.concat([megaDf.mean(), megaDf.std()], axis = 1))

    # Don't normalize total_points
    means['total_points'] = 0
    sds['total_points'] = 1

    for name, df in zip(os.listdir(input_path), dfs):
        for col, mean in means.items():
            df[col] -= mean
            df[col] /= sds[col]
        df.to_csv(os.path.join(save_path, name), index = False)

FWD
                              0             1
assists                0.066149  2.828338e-01
bonus                  0.207820  6.888757e-01
bps                    5.687199  1.207310e+01
clean_sheets           0.103374  3.044676e-01
goals_conceded         0.500268  9.303449e-01
goals_scored           0.144081  4.145099e-01
minutes               31.859802  3.831453e+01
own_goals              0.000536  2.313879e-02
penalties_missed       0.003080  5.541420e-02
penalties_saved        0.000000  0.000000e+00
red_cards              0.001875  4.325970e-02
round                 18.657338  1.102580e+01
saves                  0.000000  0.000000e+00
selected          457434.295795  1.078711e+06
total_points           1.793117  3.055606e+00
transfers_in       36256.564676  1.022056e+05
transfers_out      32662.341591  9.011547e+04
value                 60.849223  1.801343e+01
yellow_cards           0.048473  2.147789e-01
141
MID
                              0              1
assists              

In [38]:
trainColumns   = []
for j in ["_1", "_2", "_3", "_4", "_5"]: # 5 games
    trainColumns += [i+j for i in columnNames]

for position, players in position_map.items():
    train          = []
    labels         = []

    input_path     = f"./data/2016-24_processed/{position}/"
    for i in tqdm(os.listdir(input_path)):
        if i[0] == '.':
            continue
        curr_path  = os.path.join(input_path, i)
        df         = pd.read_csv(curr_path)
        for label_row in range(5, len(df)):
            vector = df.iloc[label_row - 5:label_row].values.flatten()
            label  = df['total_points'].iloc[label_row]
            train += [vector]
            labels+= [[label]]

    train          = pd.DataFrame(train, columns = trainColumns)
    labels         = pd.DataFrame(labels, columns = ["Points"])
    save_path      = f"./train_2016-24/{position}/"
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    train.to_csv(os.path.join(save_path, 'inputs.csv'), index=False)
    labels.to_csv(os.path.join(save_path, 'labels.csv'), index=False)

100%|██████████| 104/104 [00:00<00:00, 288.29it/s]
100%|██████████| 354/354 [00:01<00:00, 292.21it/s]
100%|██████████| 261/261 [00:00<00:00, 296.62it/s]
100%|██████████| 93/93 [00:00<00:00, 214.51it/s]


In [12]:
train.head

<bound method NDFrame.head of      assists_1   bonus_1     bps_1  clean_sheets_1  goals_conceded_1  \
0    -0.204959 -0.239029 -0.478723       -0.368031          1.434485   
1    -0.204959 -0.239029 -0.675123       -0.368031         -0.559123   
2    -0.204959  1.655498  2.467279        2.717120         -0.559123   
3    -0.204959 -0.239029  0.601478       -0.368031          0.437681   
4    -0.204959 -0.239029 -0.282323       -0.368031         -0.559123   
..         ...       ...       ...             ...               ...   
777  -0.204959 -0.239029  1.779879       -0.368031          0.437681   
778  -0.204959 -0.239029 -0.675123       -0.368031         -0.559123   
779  -0.204959 -0.239029 -0.282323       -0.368031         -0.559123   
780  -0.204959 -0.239029 -0.675123       -0.368031         -0.559123   
781  -0.204959 -0.239029 -0.380523       -0.368031         -0.559123   

     goals_scored_1  minutes_1  own_goals_1  penalties_missed_1  \
0         -0.210107   1.287610    -0.0