Imports for data retrieval and player data

In [1]:
from data_retriever import match_reports, players_with_team_position, score_and_fixtures, file_create
from player_data import PlayerData

In [41]:
# Player data for season code 17 (the same code is stored under the "17/18" key further down).
p = PlayerData(17)
# data_lister() returns a mapping; its keys are inspected in the next cell.
f = p.data_lister()

In [42]:
# Inspect the top-level keys of the lister output (team names plus 'teams_stats' and 'played_fixtures').
f.keys()

dict_keys(['Arsenal', 'Bournemouth', 'Brighton', 'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Huddersfield', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester Utd', 'Newcastle Utd', 'Southampton', 'Stoke City', 'Swansea City', 'Tottenham', 'Watford', 'West Brom', 'West Ham', 'teams_stats', 'played_fixtures'])

Data generation

In [None]:
# Disabled data-generation step, kept for reference. When uncommented it loops
# over season codes 17-21, writes each season's player info to a JSON file via
# file_create, and then calls score_and_fixtures and match_reports for that season.
# NOTE(review): presumably commented out because the data files already exist
# on disk -- confirm before re-enabling.
# for season in range(17, 22):
#     player_info = players_with_team_position(season)
#     file_create(
#         file_details=player_info,
#         file_name=f"20{season}-20{season + 1} player_info.json",
#         file_path=f"data/Premier League/player information"
#         )

#     score_and_fixtures(season)
#     match_reports(season)


In [None]:
# One PlayerData object per season, keyed by its "YY/YY" label.
all_data = {
  f"{start_year}/{start_year + 1}": PlayerData(start_year)
  for start_year in (17, 18, 19)
}

# Stat names taken from the 17/18 object's header table.
categories = all_data["17/18"].headers['header']

# Further stat names kept separately from `categories`
# (they look like Fantasy Premier League fields -- TODO confirm).
others = [
  'transfers_balance', 'creativity', 'transfers_in', 'sub_ins', 'ict_index',
  'sub_outs', 'played_60', 'bonus', 'appearances', 'total_points',
  'transfers_out', 'bps', 'starts', 'threat', 'influence', 'value',
  'value_change',
]

# Season labels in load order: ['17/18', '18/19', '19/20'].
seasons = list(all_data)

# Alphabetical team list per season, with an 'all_teams' entry appended last;
# later cells slice it off with [:-1] when they only want real clubs.
teams = {
  label: sorted(season_data.players.keys()) + ['all_teams']
  for label, season_data in all_data.items()
}


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
from pprint import pprint
import warnings

# VisibleDeprecationWarning was removed from NumPy's top-level namespace in
# NumPy 2.0; it lives in numpy.exceptions from 1.25 onwards. Look there first
# and fall back to the old location on older installs.
try:
  from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
  VisibleDeprecationWarning = np.VisibleDeprecationWarning

# Silence that warning category for the rest of the notebook.
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
# Default matplotlib figure size for the notebook's plots.
# Note: the first heatmap cell later calls sns.set(rc=...) with a different figure size.
plt.rcParams['figure.figsize'] = [15, 20]

In [None]:
def column_creator(df: pd.DataFrame, expression: str):
  """Add (or overwrite) a column on ``df`` computed from a simple formula.

  ``expression`` has the form ``"new_col = col_a <op> col_b <op> col_c ..."``.
  Every operand must be an existing column of ``df`` and ``<op>`` is one of
  ``+``, ``-``, ``*``, ``/``. Tokens on the right-hand side are separated by
  whitespace. The formula is evaluated strictly left to right: there is no
  operator precedence and no support for parentheses or numeric literals.

  NaN and +/-inf results (for example from dividing by zero) are stored as 0.

  Args:
    df: Frame to modify in place.
    expression: Formula string; the part before ``" = "`` is the name of the
      column to create.

  Returns:
    None. The result is written to ``df[new_col]``.

  Raises:
    ValueError: If the expression has no single ``" = "`` separator, the
      right-hand side does not alternate operand/operator/operand, or an
      operator is not supported.
    KeyError: If an operand is not a column of ``df``.
  """
  import operator

  operations = {
    '+': operator.add,
    '-': operator.sub,
    '*': operator.mul,
    '/': operator.truediv,
  }

  column_name, formula = expression.split(" = ")
  tokens = formula.split()

  # A well-formed formula is operand (operator operand)*, i.e. an odd token count.
  if len(tokens) % 2 == 0:
    raise ValueError(f"Malformed formula in expression: {expression!r}")

  result = df[tokens[0]]
  for symbol, operand in zip(tokens[1::2], tokens[2::2]):
    if symbol not in operations:
      raise ValueError(f"Unsupported operator {symbol!r} in expression: {expression!r}")
    result = operations[symbol](result, df[operand])

  # Assign the cleaned Series back explicitly: an in-place replace on
  # df[column_name] acts on a temporary object and is not guaranteed to reach
  # df (it never does under pandas Copy-on-Write).
  df[column_name] = result.replace([np.nan, np.inf, -np.inf], 0)



In [None]:
# Derived player-level columns, written as "new_column = operand op operand"
# formulas for column_creator. Operands are columns of each team's
# 'player_stats' table.
extras = [
          # Attacking output
          'minutes_per_goal = minutes / goals',
          'minutes_per_assist = minutes / assists',
          'goals_xg_diff = goals - xg',
          'assists_xa_diff = assists - xa',
          'minutes_per_sca = minutes / sca',
          'minutes_per_gca = minutes / gca',
          'goals_to_shot_ratio = goals / shots_total',
          'target_to_shot_ratio = shots_on_target / shots_total',

          # Per-appearance and per-minute rates
          'dribbles_per_app = dribbles / appearances',
          'blocks_per_app = blocks / appearances',
          'clearances_per_app = clearances / appearances',
          'errors_per_app = errors / appearances',
          'points_per_app = total_points / appearances',
          'tackles_pct = tackles_won / tackles',
          'interceptions_per_app = interceptions / appearances',
          'touches_per_app = touches / appearances',
          'minutes_per_cross = minutes / crosses',
          'minutes_per_fouled = minutes / fouled',
          'minutes_per_foul = minutes / fouls',
          'minutes_per_offside = minutes / offsides',
          'ball_recoveries_per_app = ball_recoveries / appearances',
          'through_pass_ratio = through_balls / passes',
          'dist_per_carry = carry_distance / carries',
          'prog_dist_per_carry = carry_progressive_distance / progressive_carries',
          'prog_to_total_ratio = passes_progressive_distance / passes_total_distance',
          'sub_outs_per_start = sub_outs / starts',
          'sub_ins_per_app = sub_ins / appearances',
          # Minutes played per yellow card, in line with the other
          # minutes_per_* formulas (minutes is the numerator).
          'minutes_per_yellow = minutes / cards_yellow',
          'fouls_per_yellow = fouls / cards_yellow',
          'miscontrols_per_app = miscontrols / appearances',
          'dispossessed_per_app = dispossessed / appearances',
          'played_60_per_app = played_60 / appearances',
          'oob_per_pass = passes_oob / passes',
          'intercepted_per_pass = passes_intercepted / passes',
          'blocked_per_pass = passes_blocked / passes',
          # "recieved" (sic): the `columns` dict refers to this exact spelling.
          'prog_per_pass_recieved = progressive_passes_received / passes_received',
          'bonus_per_app = bonus / appearances',
          'transfers_ratio = transfers_in / transfers_out',

          # Share of carries reaching advanced areas
          'carries_into_final_third_pct = carries_into_final_third / carries',
          'carries_into_pen_area_pct = carries_into_penalty_area / carries',

          # Pass completion by distance band
          'passes_pct_short = passes_completed_short / passes_short',
          'passes_pct_long = passes_completed_long / passes_long',
          'passes_pct_medium = passes_completed_medium / passes_medium',

          # Share of touches by pitch zone
          'touches_att_pen_area_pct = touches_att_pen_area / touches',
          'touches_att_3rd_pct = touches_att_3rd / touches',
          'touches_mid_3rd_pct = touches_mid_3rd / touches',
          'touches_def_3rd_pct = touches_def_3rd / touches',
          'touches_def_pen_area_pct = touches_def_pen_area / touches'
]

In [None]:
# Label used in the folder and file names of the EDA CSV exports.
batch = 'entire_season'

# Lister output of every loaded season, keyed by season label.
data = {
  label: season_data.data_lister()
  for label, season_data in all_data.items()
}

In [None]:
# Cut-off on the 'minutes' column: the next cell counts players with more than this many minutes.
threshold = 380

In [None]:
# For every club in every season, count the players whose 'minutes' exceed
# `threshold`. The last entry of teams[season] is the 'all_teams' placeholder,
# so it is skipped.
observations = {
  season: {
    team: int((data[season][team]['player_stats']['minutes'] > threshold).sum())
    for team in teams[season][:-1]
  }
  for season in teams
}

# Print the average of those per-club counts for each season.
for season, per_team in observations.items():
  counts = list(per_team.values())
  print(f"{season}: {sum(counts) / len(counts)}")


In [None]:
def _primary_place(position_counts):
  """Map one player's 'position' entry to a coarse place label.

  The key holding the largest value is treated as the main position and is
  classified by its last character: 'K' -> 'Gkp', 'B' -> 'Def', 'M' -> 'Mid',
  anything else -> 'Fwd'. An empty entry gives 'Nil'.
  """
  if not position_counts:
    return "Nil"
  # max() returns the first key with the highest value, so ties go to the earliest key.
  top_position = max(position_counts, key=position_counts.get)
  return {'K': 'Gkp', 'B': 'Def', 'M': 'Mid'}.get(top_position[-1:], 'Fwd')


def _age_in_years(age):
  """Return the whole-number part of an age value.

  String ages are split on '-' and the leading part is converted to int.
  Non-string values (the 0 placeholder, or ages already converted by an
  earlier run of this cell) are passed through int(), so the cell can be
  re-run without failing.
  """
  if isinstance(age, str):
    return int(age.split('-')[0])
  return int(age)


for season in seasons:
  # teams[season] ends with the 'all_teams' placeholder; only real clubs are processed.
  for team in teams[season][:-1]:
    player_stats = data[season][team]['player_stats']
    player_stats['places'] = [_primary_place(tally) for tally in player_stats['position']]
    player_stats['age'] = [_age_in_years(age) for age in player_stats['age']]


In [None]:
# Column groups used to select stats for the EDA. Every group except
# "player_info" is flattened into `cols` further down.
columns = {
  "player_info":
  ['player',
  'position',
  'age'],

  "appearances":
  ['minutes',
  'appearances',
  'starts',
  'played_60',
  'own_goals'],

  "passing":
  ['passes_pct',
  'passes_pct_short',
  'passes_pct_medium',
  'passes_pct_long',
  'passes_received_pct'],

  "fpl":
  ['influence',
  'creativity',
  'threat',
  'ict_index',
  'total_points',
  'value',
  'value_change'],

  "expected":
  ['xg',
  'npxg',
  'xa'],

  "attack_ratios":
  ['minutes_per_goal',
  'minutes_per_assist',
  'goals_xg_diff',
  'assists_xa_diff',
  'minutes_per_sca',
  'minutes_per_gca',
  'goals_to_shot_ratio',
  'target_to_shot_ratio'],

  "fpl_ratios":
  ['points_per_app',
  'transfers_ratio',
  'bonus_per_app'],

  "defense":
  ['tackles_pct',
  'interceptions_per_app',
  'blocks_per_app',
  'clearances_per_app',
  'ball_recoveries_per_app',
  'dribble_tackles_pct',
  'pressure_regain_pct'],

  "error_ratios":
  ['errors_per_app',
  'dispossessed_per_app',
  'miscontrols_per_app',
  'oob_per_pass',
  'intercepted_per_pass',
  'blocked_per_pass',
  'minutes_per_yellow',
  'fouls_per_yellow'],

  "foul_ratios":
  ['minutes_per_fouled',
  'minutes_per_foul'],

  "possession_ratios":
  ['minutes_per_offside',
  'through_pass_ratio',
  'dist_per_carry',
  'prog_dist_per_carry',
  'prog_per_pass_recieved',
  'dribbles_per_app',
  'touches_per_app',
  'minutes_per_cross',
  'dribbles_completed_pct',
  'aerials_won_pct',
  'carries_into_pen_area_pct',
  # Use the derived share from `extras`, matching carries_into_pen_area_pct,
  # rather than the raw carries_into_final_third count.
  'carries_into_final_third_pct'
  ],

  "start_ratios":
  ['sub_outs_per_start',
  'sub_ins_per_app',
  'played_60_per_app'],

  "touches":
  ['touches_att_pen_area_pct',
  'touches_att_3rd_pct',
  'touches_mid_3rd_pct',
  'touches_def_3rd_pct',
  'touches_def_pen_area_pct']
}

In [None]:
# Names of the derived player columns: the text before " = " in each formula.
extra_column_names = [formula.partition(' = ')[0] for formula in extras]

# Add every derived column to every club's player table
# (the trailing 'all_teams' entry of teams[season] is skipped).
for season in seasons:
  for team in teams[season][:-1]:
    team_stats = data[season][team]['player_stats']
    for exprssn in extras:
      column_creator(df=team_stats, expression=exprssn)

In [None]:
for season in seasons:
  # Stack every club's player table into one league-wide frame. The trailing
  # 'all_teams' entry of teams[season] is the slot being filled, so skip it.
  club_frames = [data[season][club]['player_stats'] for club in teams[season][:-1]]

  # reset_index() without drop=True keeps the old per-club row index as a regular column.
  data[season]["all_teams"] = {"player_stats": pd.concat(club_frames).reset_index()}

In [None]:
# Flatten every column group except "player_info" into one list, keeping group order.
cols = [
  column
  for group, members in columns.items()
  if group != "player_info"
  for column in members
]

In [None]:
# Inspect the column names of the 17/18 'teams_stats' table.
data['17/18']['teams_stats'].columns

In [None]:
# Derived team-level columns, written as "new_column = operand op operand"
# formulas for column_creator. Operands are columns of the 'teams_stats' table.
extra_team_columns = [
    # Points
    "pts_per_game = pts / matches_played",
    "home_pts_ratio = home_pts / pts",
    "away_pts_ratio = away_pts / pts",

    # Results per match
    "wins_ratio = wins / matches_played",
    "draws_ratio = draws / matches_played",

    # Clean sheets split by venue
    "h_cleansheets_ratio = home_cleansheets / cleansheets",
    "a_cleansheets_ratio = away_cleansheets / cleansheets",

    # NOTE(review): this formula overwrites its own input column, so applying
    # the formulas a second time without reloading divides it again.
    "pct_possession = pct_possession / matches_played",

    # Goals for versus goals against
    "goal_ratio = goals_for / goals_against",
    "home_goal_ratio = home_goals_for / home_goals_against",
    "away_goal_ratio = away_goals_for / away_goals_against",

    # Home/away share of each result type
    "home_win_ratio = home_wins / wins",
    "away_win_ratio = away_wins / wins",
    "home_draw_ratio = home_draws / draws",
    "away_draw_ratio = away_draws / draws",
    "home_loss_ratio = home_losses / losses",
    "away_loss_ratio = away_losses / losses",
]

In [None]:
# Names of the derived team columns: the text before " = " in each formula.
extra_team_column_names = [formula.partition(' = ')[0] for formula in extra_team_columns]

# Add every derived column to each season's 'teams_stats' table.
for season in seasons:
  season_team_stats = data[season]['teams_stats']
  for exprssn in extra_team_columns:
    column_creator(df=season_team_stats, expression=exprssn)

In [None]:
# Display the 17/18 'teams_stats' table, which now includes the derived team columns added in the previous cell.
data['17/18']['teams_stats']

In [None]:
import os

# DataFrame.to_csv does not create missing folders, so make sure the export
# directory exists before writing.
export_dir = f"eda/{batch}"
os.makedirs(export_dir, exist_ok=True)

for season in seasons:
  df = data[season]['all_teams']['player_stats']

  # Shared file stem, e.g. "eda/entire_season/entire_season_17" for season "17/18".
  file_stem = f"{export_dir}/{batch}_{season.split('/')[0]}"

  # Pairwise correlation of the selected stat columns.
  correlation = df[cols].corr()
  correlation.to_csv(f"{file_stem}_correlation.csv")

  # Pairwise covariance of the same columns.
  covariance = df[cols].cov()
  covariance.to_csv(f"{file_stem}_covariance.csv")

  # Summary statistics of the same columns.
  description = df[cols].describe()
  description.to_csv(f"{file_stem}_description.csv")



In [None]:
def _load_eda_tables(kind):
  """Read one exported EDA table per season into a dict keyed by season label.

  Args:
    kind: File-name suffix of the table to load: "description", "correlation"
      or "covariance".

  Returns:
    dict mapping each entry of `seasons` to the DataFrame read from
    eda/<batch>/<batch>_<first half of the season label>_<kind>.csv, with the
    first CSV column used as the index.
  """
  # Forward slashes work on Windows and POSIX alike and avoid the invalid "\e"
  # escape of a backslash path; `batch` keeps the location in step with the
  # export cell that wrote these files.
  return {
    season: pd.read_csv(
      f"eda/{batch}/{batch}_{season.split('/')[0]}_{kind}.csv",
      index_col=0,
    )
    for season in seasons
  }


desc = _load_eda_tables("description")
corr = _load_eda_tables("correlation")
covr = _load_eda_tables("covariance")

In [None]:
# Relative difference of each descriptive statistic between seasons,
# (17/18 - 18/19) / 17/18, with undefined cells (NaN) shown as 0.
desc_17_compare = (
  desc['17/18']
  .sub(desc['18/19'])
  .div(desc['17/18'])
  .fillna(0)
)

# Use a square 20x20 figure for the heatmaps.
sns.set(rc={'figure.figsize': (20, 20)})
sns.heatmap(desc_17_compare, cmap="YlGnBu")

In [None]:
# Relative difference between the season correlation matrices: (17/18 - 18/19) / 17/18.
baseline_corr = corr['17/18']
corr_17_compare = baseline_corr.sub(corr['18/19']).div(baseline_corr)
sns.heatmap(corr_17_compare)

In [None]:
# Relative difference between the season covariance matrices: (17/18 - 18/19) / 17/18.
baseline_covr = covr['17/18']
covr_17_compare = baseline_covr.sub(covr['18/19']).div(baseline_covr)
sns.heatmap(covr_17_compare)

In [None]:
# NOTE(review): this rebinds `data`, which the earlier cells use as the
# per-season dict of lister outputs, to a single PlayerData object. Re-running
# those earlier cells after this one will fail until `data` is rebuilt.
data = PlayerData(17)
# gameweek_range=1 presumably limits the output to the first gameweek -- TODO confirm against PlayerData.data_lister.
a = data.data_lister(gameweek_range=1)

In [None]:
# Display Arsenal's player table from the `a` output built in the previous cell.
a['Arsenal']['player_stats']