In [1]:
import json
import pandas as pd
import numpy as np

# Load player season history and create features

In [2]:
# Load all player season history into a single data frame
with open('data/during-season/bootstrap-static.json', 'r') as json_file:
    data = json.loads(json_file.read())
    
df_player_overview = pd.json_normalize(data, record_path='elements')

df_players_history = pd.DataFrame()
for index, player in df_player_overview.iterrows():
    player_id = str(player['id'])
    first_name = player['first_name']
    last_name = player['second_name']
    team_id = player['team']
    file_name = f"{player_id.rjust(3, '0')}_{first_name}_{last_name}".replace(' ', '_')

    with open(f'data/during-season/players/{file_name}.json', 'r') as json_file:
        player_data = json.loads(json_file.read())
        
    df_player_history = pd.json_normalize(player_data, record_path='history')
    df_player_history['name'] = f'{first_name} {last_name}'
    df_player_history['team_id'] = team_id
        
    df_players_history = df_players_history.append(df_player_history, ignore_index=True)

# Rename columns
df_players_history.rename(columns={
    'element': 'player_id',
    'opponent_team': 'opponent_team_id'
}, inplace=True)

# Set correct data types
df_players_history['round'] = df_players_history['round'].astype(int)
df_players_history['player_id'] = df_players_history['player_id'].astype(int)
df_players_history['opponent_team_id'] = df_players_history['opponent_team_id'].astype(int)
    
df_players_history.shape

(14422, 35)

In [3]:
df_players_history.sample(3).T

Unnamed: 0,11032,14197,5310
player_id,368,503,177
fixture,184,155,233
opponent_team_id,17,16,12
total_points,0,0,0
was_home,True,False,False
kickoff_time,2022-09-18T13:00:00Z,2022-08-27T15:30:00Z,2022-11-06T14:00:00Z
team_h_score,1,3,0
team_a_score,1,1,1
round,23,20,30
minutes,0,0,0


In [4]:
# Load teams data
df_teams = pd.json_normalize(data, record_path='teams')
df_teams.set_index('id', inplace=True)
df_teams

Unnamed: 0_level_0,code,draw,form,loss,name,played,points,position,short_name,strength,team_division,unavailable,win
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,993,0,,0,Malmö FF,0,0,0,MFF,,,False,0
2,1987,0,,0,Hammarby,0,0,0,HAM,,,False,0
3,3056,0,,0,IK Sirius,0,0,0,IKS,,,False,0
4,244,0,,0,IFK Göteborg,0,0,0,GBG,,,False,0
5,375,0,,0,IF Elfsborg,0,0,0,IFE,,,False,0
6,1959,0,,0,BK Häcken,0,0,0,BKH,,,False,0
7,1406,0,,0,Djurgården,0,0,0,DIF,,,False,0
8,2596,0,,0,Örebro SK,0,0,0,ÖSK,,,True,0
9,483,0,,0,IFK Norrköping,0,0,0,NOR,,,False,0
10,2594,0,,0,Kalmar FF,0,0,0,KFF,,,False,0


In [5]:
# Map team ids to team names for better readability
df_players_history['team'] = df_players_history['team_id'].map(df_teams['name'])
df_players_history['opponent_team'] = df_players_history['opponent_team_id'].map(df_teams['name'])

In [6]:
df_players_history.tail(5).T

Unnamed: 0,14417,14418,14419,14420,14421
player_id,530,531,531,532,532
fixture,237,226,238,232,240
opponent_team_id,4,17,13,13,16
total_points,0,0,0,0,0
was_home,True,False,True,False,True
kickoff_time,2022-11-06T14:00:00Z,2022-10-30T14:00:00Z,2022-11-06T14:00:00Z,2022-10-30T16:30:00Z,2022-11-06T14:00:00Z
team_h_score,1,2,2,2,2
team_a_score,4,3,3,5,2
round,30,29,30,29,30
minutes,0,0,0,0,0


In [7]:
df_players_history.nunique()

player_id                           532
fixture                             240
opponent_team_id                     16
total_points                         26
was_home                              2
kickoff_time                        140
team_h_score                          6
team_a_score                          7
round                                30
minutes                              89
goals_scored                          4
assists                               4
clean_sheets                          2
goals_conceded                        7
penalties_saved                       2
penalties_missed                      2
yellow_cards                          2
red_cards                             2
saves                                14
own_goals                             2
attacking_bonus                       3
defending_bonus                       3
winning_goals                         2
crosses                              10
key_passes                            9


In [8]:
df_players_history.to_csv('data/during-season/player_history.csv', index=False)

## Create data file for player future events

In [9]:
# Load all player season future events into a single csv
with open('data/during-season/bootstrap-static.json', 'r') as json_file:
    data = json.loads(json_file.read())
    
df_player_overview = pd.json_normalize(data, record_path='elements')

df_all_player_fixtures = pd.DataFrame()
for index, player in df_player_overview.iterrows():
    player_id = str(player['id'])
    first_name = player['first_name']
    last_name = player['second_name']
    team_id = player['team']
    file_name = f"{player_id.rjust(3, '0')}_{first_name}_{last_name}".replace(' ', '_')

    with open(f'data/during-season/players/{file_name}.json', 'r') as json_file:
        player_data = json.loads(json_file.read())
        
    df_player_fixtures = pd.json_normalize(player_data, record_path='fixtures')
    df_player_fixtures['player_id'] = player_id
    df_player_fixtures['name'] = f'{first_name} {last_name}'
        
    df_all_player_fixtures = df_all_player_fixtures.append(df_player_fixtures, ignore_index=True)

In [10]:
df_all_player_fixtures.sample()

ValueError: a must be greater than 0 unless no samples are taken

In [None]:
# Set home and away team
df_all_player_fixtures['team_id'] = np.where(df_all_player_fixtures['is_home'], df_all_player_fixtures['team_h'], df_all_player_fixtures['team_a'])
df_all_player_fixtures['opponent_team_id'] = np.where(df_all_player_fixtures['is_home'] == False, df_all_player_fixtures['team_h'], df_all_player_fixtures['team_a'])

# Map team ids to team names for better readability
df_all_player_fixtures['team'] = df_all_player_fixtures['team_id'].map(df_teams['name'])
df_all_player_fixtures['opponent_team'] = df_all_player_fixtures['opponent_team_id'].map(df_teams['name'])

In [None]:
df_all_player_fixtures.tail(20)

In [None]:
df_all_player_fixtures.to_csv('data/during-season/player_fixtures.csv', index=False)