# EDA

In [1]:
# Import packages
import numpy as np
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Preparing data
The data acquired from the APIs is unpickled, merged, and binned so that the effect of precipitation, wind, and temperature on a particular event (ground passes, aerial passes, etc.) may be analyzed.

In [2]:
# Load pickled data
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only unpickle files that were produced by this project's own collection step.
def load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


events = load_pickle('events.p')
teams = load_pickle('teams.p')
matches = load_pickle('matches.p')
players = load_pickle('players.p')

In [3]:
# Index the match records by match_id so each event can look up its match directly
match_dicts = dict((match['match_id'], match) for match in matches)

In [4]:
# Function to include weather data to each event
def add_weather(event):
    """Copy the weather readings and location of the event's match onto the event.

    The match record is looked up in ``match_dicts`` by ``event['match_id']``.
    ``event`` is modified in place and also returned.
    """
    match_info = match_dicts[event['match_id']]
    for field in ('temp_c', 'wind_kmph', 'precip_mm', 'location'):
        event[field] = match_info[field]
    return event

In [5]:
# Attach weather and location to every event (add_weather mutates each dict in place)
events = [add_weather(event) for event in events]

In [6]:
# Build the event-level DataFrame from the list of event records
df = pd.DataFrame(events)

In [7]:
# Preview the first rows of the event table
df.head()

Unnamed: 0,match_id,time,period,player,team,tags,event,subevent,start_pos_x,start_pos_y,end_pos_x,end_pos_y,temp_c,wind_kmph,precip_mm,location
0,2499719,2.758649,1H,A. Lacazette,Arsenal,[Accurate],Pass,Simple pass,49,49,31,78,19,15,0.0,London
1,2499719,4.94685,1H,R. Holding,Arsenal,[Accurate],Pass,High pass,31,78,51,75,19,15,0.0,London
2,2499719,6.542188,1H,M. Özil,Arsenal,[Accurate],Pass,Head pass,51,75,35,71,19,15,0.0,London
3,2499719,8.143395,1H,Mohamed Elneny,Arsenal,[Accurate],Pass,Head pass,35,71,41,95,19,15,0.0,London
4,2499719,10.302366,1H,Bellerín,Arsenal,[Accurate],Pass,Simple pass,41,95,72,88,19,15,0.0,London


In [8]:
# Create pass specific DataFrame
# Rows and columns are selected in a single .loc call, then copied so later
# column assignments do not touch `df`.
pass_df = df.loc[
    df['event'] == 'Pass',
    ['match_id', 'location', 'team', 'player', 'tags', 'subevent', 'precip_mm', 'wind_kmph', 'temp_c'],
].copy()

In [9]:
# Function to code action success
def acc_or_not(tags):
    """Return 1 if 'Accurate' is contained in ``tags``, otherwise 0."""
    return int('Accurate' in tags)

In [10]:
# Recode each pass's tag collection to 1 (accurate) / 0 (not accurate).
# Not re-runnable: afterwards 'tags' holds ints, and `'Accurate' in <int>` raises a TypeError.
pass_df['tags'] = pass_df['tags'].apply(acc_or_not)

In [11]:
# Bin precipitation
x = pass_df['precip_mm']

conditions = [
    x == 0,
    (x > 0) & (x <= 1),
    (x > 1) & (x <= 2),
    x > 2
]

choices = [
    'no',
    'light',
    'medium',
    'heavy'
]

# Pass an explicit string default: np.select's implicit default is the integer 0,
# which has no common dtype with the string labels (TypeError on recent NumPy,
# the string '0' on older releases). Rows matching no condition (missing or
# negative readings) are labelled 'unknown'.
pass_df['precipitation'] = np.select(conditions, choices, default='unknown')

In [12]:
# Bin wind
x = pass_df['wind_kmph']

conditions = [
    x <= 11,
    (x > 11) & (x <= 17),
    (x > 17) & (x <= 22),
    x > 22
]

choices = [
    'very_light',
    'light',
    'medium',
    'heavy'
]

# Pass an explicit string default: np.select's implicit default is the integer 0,
# which has no common dtype with the string labels (TypeError on recent NumPy,
# the string '0' on older releases). The conditions cover every number, so only
# missing readings fall through and are labelled 'unknown'.
pass_df['wind'] = np.select(conditions, choices, default='unknown')

In [13]:
# Bin temperature
x = pass_df['temp_c']

conditions = [
    x <= 0,
    (x > 0) & (x <= 10),
    (x > 10) & (x <= 20),
    x > 20
]

choices = [
    'freezing',
    'cool',
    'mild',
    'warm'
]

# Pass an explicit string default: np.select's implicit default is the integer 0,
# which has no common dtype with the string labels (TypeError on recent NumPy,
# the string '0' on older releases). The conditions cover every number, so only
# missing readings fall through and are labelled 'unknown'.
pass_df['temperature'] = np.select(conditions, choices, default='unknown')

In [14]:
# Excluding throw-ins (outfield and goalkeeper), goalkeeper long balls, and headed passes
pdf = pd.DataFrame(
    pass_df.loc[~pass_df['subevent'].isin(['Hand pass', 'Launch', 'Head pass'])]
)

In [15]:
# The raw readings are no longer needed once the binned columns exist
pdf = pdf.drop(columns=['precip_mm', 'wind_kmph', 'temp_c'])

In [16]:
# Preview the filtered pass table with the binned weather columns
pdf.head()

Unnamed: 0,match_id,location,team,player,tags,subevent,precipitation,wind,temperature
0,2499719,London,Arsenal,A. Lacazette,1,Simple pass,no,light,mild
1,2499719,London,Arsenal,R. Holding,1,High pass,no,light,mild
4,2499719,London,Arsenal,Bellerín,1,Simple pass,no,light,mild
5,2499719,London,Arsenal,M. Özil,0,Simple pass,no,light,mild
11,2499719,London,Arsenal,S. Kolašinac,1,High pass,no,light,mild


## Overall Rates
Note: Smart passes (by WyScout) are defined as passes made between 2+ opposing players

Completion rates (excluding hand passes, launches, and headed passes), rounded to the nearest percent:
- Overall: 85%
- Simple: 92%
- High: 54%
- Cross: 31%
- Smart: 39%

In [17]:
# Overall accuracy rate of passes: share of rows with tags == 1 (accurate) vs 0 (not accurate)
pdf['tags'].value_counts(normalize=True)

1    0.848726
0    0.151274
Name: tags, dtype: float64

In [18]:
# Overall rate of pass types: share of the remaining passes that falls in each subevent
pdf['subevent'].value_counts(normalize=True)

Simple pass    0.853366
High pass      0.085087
Cross          0.041585
Smart pass     0.019962
Name: subevent, dtype: float64

In [19]:
# Overall success rate of pass types: within each subevent, the share of
# accurate (1) and not accurate (0) passes
pdf.groupby('subevent')['tags'].value_counts(normalize=True)

subevent     tags
Cross        0       0.692923
             1       0.307077
High pass    1       0.537160
             0       0.462840
Simple pass  1       0.916859
             0       0.083141
Smart pass   0       0.607550
             1       0.392450
Name: tags, dtype: float64

In [49]:
# Function to group categories
def group_passes(df, options=None):
    """Count accurate and inaccurate passes for every combination of grouping columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Pass-level data with a 'tags' column coded 1 (accurate) / 0 (not accurate).
    options : list of str, optional
        Columns to group by. Defaults to every column of ``df`` except 'tags'.

    Returns
    -------
    pandas.DataFrame
        One row per observed combination of the grouping columns, followed by
        integer 'not_accurate', 'accurate' and 'total' count columns. A
        'subevent' grouping column is returned under the name 'pass_type'.
    """
    if options is None:
        options = [col for col in df.columns if col != 'tags']

    # Pivot the 0/1 outcome into columns; a combination that never produced one
    # of the outcomes gets a count of 0 rather than NaN.
    counts = df.groupby(options)['tags'].value_counts().unstack(fill_value=0)

    # Guarantee both outcome columns exist, in this order, even when the data
    # holds only accurate or only inaccurate passes (otherwise the lookups
    # below would raise a KeyError).
    counts = counts.reindex(columns=[0, 1], fill_value=0)
    counts.columns = ['not_accurate', 'accurate']

    data = counts.reset_index().rename(columns={'subevent': 'pass_type'})
    data['total'] = data['not_accurate'] + data['accurate']

    return data

In [72]:
# Count accurate / not accurate passes per match, team, pass type and weather bin
options = ['match_id', 'team', 'subevent', 'temperature', 'wind', 'precipitation']
data = group_passes(pdf, options=options)

In [85]:
# Mean of the per-row pass counts in `data` for each temperature bin. Each row of
# `data` is one (match, team, pass type, weather) combination, so this averages
# across pass types rather than giving passes per team per match.
data.groupby(['temperature'])['total'].mean()

temperature
cool        97.553515
freezing    97.526786
mild        96.631537
warm        99.281250
Name: total, dtype: float64

In [75]:
# Mean per-row pass count for each temperature bin and pass type
data.groupby(['temperature', 'pass_type'])['total'].mean()

temperature  pass_type  
cool         Cross           16.295455
             High pass       33.204545
             Simple pass    331.851010
             Smart pass       7.728900
freezing     Cross           12.892857
             High pass       32.678571
             Simple pass    335.142857
             Smart pass       9.392857
mild         Cross           16.237179
             High pass       32.910256
             Simple pass    328.153846
             Smart pass       7.801303
warm         Cross           15.458333
             High pass       30.625000
             Simple pass    342.666667
             Smart pass       8.375000
Name: total, dtype: float64

In [74]:
def choose_cat(df, col, cat):
    """Return the rows of ``df`` whose value in column ``col`` equals ``cat``."""
    matches_cat = df[col] == cat
    return df.loc[matches_cat]

In [68]:
# Keep only Arsenal's rows of the grouped pass counts
plot_df = choose_cat(data, 'team', 'Arsenal')

In [70]:
# Mean of 'total' over Arsenal's rows within each temperature bin
plot_df.groupby('temperature')['total'].mean()

temperature
cool        571.285714
freezing    461.000000
mild        546.230769
warm        555.000000
Name: total, dtype: float64