# EDA

In [1]:
# Import packages
import numpy as np
import pandas as pd
import pickle
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Preparing data
The data acquired from the APIs is unpickled, merged, and binned so that the effect of precipitation, wind, and temperature on a particular event (ground passes, aerial passes, etc.) may be analyzed.

In [2]:
# Load pickled data
# Each file is read into a module-level name used by the cells below.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this cell on pickle files from a trusted source.
with open('events.p', 'rb') as f:
    events = pickle.load(f)

with open('teams.p', 'rb') as f:
    teams = pickle.load(f)
    
with open('matches.p', 'rb') as f:
    matches = pickle.load(f)
    
with open('players.p', 'rb') as f:
    players = pickle.load(f)

In [3]:
# Index the match records by match_id so add_weather can look a match up directly
match_dicts = { match['match_id']: match for match in matches }

In [4]:
# Function to include weather data to each event
def add_weather(event):
    """Copy match-level weather and location fields onto an event record.

    The event's match is looked up in the module-level ``match_dicts`` mapping
    by ``event['match_id']``. The ``temp_c``, ``wind_kmph``, ``precip_mm`` and
    ``location`` values of that match are written onto ``event`` in place.

    Returns the same (mutated) ``event`` object. Raises ``KeyError`` if the
    match id or one of the copied fields is missing.
    """
    match = match_dicts[event['match_id']]
    for field in ('temp_c', 'wind_kmph', 'precip_mm', 'location'):
        event[field] = match[field]
    return event

In [5]:
# Attach weather/location fields to every event (add_weather mutates each record in place)
events = list(map(add_weather, events))

In [6]:
# One row per event, including the weather fields added above
df = pd.DataFrame(events)

In [7]:
# Preview the first rows of the event table
df.head()

Unnamed: 0,match_id,time,period,player,team,tags,event,subevent,start_pos_x,start_pos_y,end_pos_x,end_pos_y,temp_c,wind_kmph,precip_mm,location
0,2499719,2.758649,1H,A. Lacazette,Arsenal,[Accurate],Pass,Simple pass,49,49,31,78,19,15,0.0,London
1,2499719,4.94685,1H,R. Holding,Arsenal,[Accurate],Pass,High pass,31,78,51,75,19,15,0.0,London
2,2499719,6.542188,1H,M. Özil,Arsenal,[Accurate],Pass,Head pass,51,75,35,71,19,15,0.0,London
3,2499719,8.143395,1H,Mohamed Elneny,Arsenal,[Accurate],Pass,Head pass,35,71,41,95,19,15,0.0,London
4,2499719,10.302366,1H,Bellerín,Arsenal,[Accurate],Pass,Simple pass,41,95,72,88,19,15,0.0,London


In [8]:
# Create pass specific DataFrame
# Row filter and column selection are done in a single .loc call; .copy()
# detaches the result from df so later column assignments do not touch df.
pass_df = df.loc[
    df['event'] == 'Pass',
    ['match_id', 'location', 'team', 'player', 'tags', 'subevent', 'precip_mm', 'wind_kmph', 'temp_c'],
].copy()

In [9]:
# Function to code action success
def acc_or_not(tags):
    """Return 1 when 'Accurate' is contained in ``tags``, otherwise 0.

    ``tags`` may be any object supporting the ``in`` operator.
    """
    return 1 if 'Accurate' in tags else 0

In [10]:
# Recode each pass's tags as 1 (contains 'Accurate') or 0 (does not).
# NOTE: this overwrites 'tags' in place, so the cell is not re-runnable: on a
# second run acc_or_not would receive integers, which do not support `in`.
pass_df['tags'] = pass_df['tags'].map(acc_or_not)

In [11]:
# Bin precipitation
# Thresholds are in the units of the 'precip_mm' column.
x = pass_df['precip_mm']

conditions = [
    x == 0,
    (x > 0) & (x <= 1),
    (x > 1) & (x <= 2),
    x > 2
]

choices = [
    'no',
    'light',
    'medium',
    'heavy'
]

# An explicit string default is needed: np.select's implicit default is the
# integer 0, which cannot be combined with string choices (TypeError on recent
# NumPy, a stray '0' label on older versions). Rows matching no condition
# (missing or negative readings) are labelled 'unknown'.
pass_df['precipitation'] = np.select(conditions, choices, default='unknown')

In [12]:
# Bin wind
# Thresholds are in the units of the 'wind_kmph' column.
x = pass_df['wind_kmph']

conditions = [
    x <= 11,
    (x > 11) & (x <= 17),
    (x > 17) & (x <= 22),
    x > 22
]

choices = [
    'very_light',
    'light',
    'medium',
    'heavy'
]

# An explicit string default is needed: np.select's implicit default is the
# integer 0, which cannot be combined with string choices (TypeError on recent
# NumPy, a stray '0' label on older versions). Rows matching no condition
# (missing readings) are labelled 'unknown'.
pass_df['wind'] = np.select(conditions, choices, default='unknown')

In [13]:
# Bin temperature
# Thresholds are in the units of the 'temp_c' column.
x = pass_df['temp_c']

conditions = [
    x <= 0,
    (x > 0) & (x <= 10),
    (x > 10) & (x <= 20),
    x > 20
]

choices = [
    'freezing',
    'cool',
    'mild',
    'warm'
]

# An explicit string default is needed: np.select's implicit default is the
# integer 0, which cannot be combined with string choices (TypeError on recent
# NumPy, a stray '0' label on older versions). Rows matching no condition
# (missing readings) are labelled 'unknown'.
pass_df['temperature'] = np.select(conditions, choices, default='unknown')

In [14]:
# Excluding throw-ins (outfield and goalkeeper), goalkeeper long balls, and headed passes
# Keep every row whose subevent is none of the three excluded labels.
pdf = pd.DataFrame(
    pass_df.loc[
        ~(
            (pass_df['subevent'] == 'Hand pass') |
            (pass_df['subevent'] == 'Launch') |
            (pass_df['subevent'] == 'Head pass')
        )
    ]
)

In [15]:
# Drop the raw weather measurements; only the binned columns are used from here on.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second execution is a no-op instead of raising KeyError.
pdf = pdf.drop(columns=['precip_mm', 'wind_kmph', 'temp_c'], errors='ignore')

In [16]:
# Preview the filtered pass table with binned weather columns
pdf.head()

Unnamed: 0,match_id,location,team,player,tags,subevent,precipitation,wind,temperature
0,2499719,London,Arsenal,A. Lacazette,1,Simple pass,no,light,mild
1,2499719,London,Arsenal,R. Holding,1,High pass,no,light,mild
4,2499719,London,Arsenal,Bellerín,1,Simple pass,no,light,mild
5,2499719,London,Arsenal,M. Özil,0,Simple pass,no,light,mild
11,2499719,London,Arsenal,S. Kolašinac,1,High pass,no,light,mild


## Passing Rates
### Overall Rate
Note: WyScout defines a smart pass as a pass played between two or more opposing players.

Completion Rates:
- Overall: 85%
- Simple: 92%
- High: 54%
- Cross: 31%
- Smart: 39%

In [17]:
# Overall accuracy rate of passes
# 'tags' is coded 1 = accurate, 0 = not accurate; normalize=True gives proportions.
pdf['tags'].value_counts(normalize=True)

1    0.848726
0    0.151274
Name: tags, dtype: float64

In [18]:
# Overall rate of pass types
# Share of all retained passes that falls into each subevent category.
pdf['subevent'].value_counts(normalize=True)

Simple pass    0.853366
High pass      0.085087
Cross          0.041585
Smart pass     0.019962
Name: subevent, dtype: float64

In [19]:
# Overall success rate of pass types
# Proportions are normalised within each subevent (1 = accurate, 0 = not accurate).
pdf.groupby('subevent')['tags'].value_counts(normalize=True)

subevent     tags
Cross        0       0.692923
             1       0.307077
High pass    1       0.537160
             0       0.462840
Simple pass  1       0.916859
             0       0.083141
Smart pass   0       0.607550
             1       0.392450
Name: tags, dtype: float64

### Rates of Passing by Weather
At first glance, increasing precipitation, stronger winds, and lower temperatures appear to lead to a greater number of ground passes (and fewer aerial passes) and more passes in total.  One possible confound was that later in the season (when teams were more cohesive) winds or precipitation were greater and temperatures were lower.  However, after examining the mean wind, precipitation, and temperature by matchday, spring turned out to be less windy, less rainy, and warmer.

Unfortunately, zero precipitation was a dominant class (278 out of 380 matches).  Further inspections are required before any inference can be drawn from the data.

Note: Heavy precipitation and warm or freezing temperatures were extremely rare relative to the other binned categories.  Additionally, even without statistical tests, precipitation appears to have the greatest effect on passing rates.

In [20]:
# Function to group categories
def group_passes(df, options=None):
    """Count accurate and inaccurate passes for each combination of grouping columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Pass-level data containing a ``tags`` column coded 1 (accurate) /
        0 (not accurate) plus the columns to group by.
    options : list of str, optional
        Columns to group by. Defaults to every column of ``df`` except ``tags``.

    Returns
    -------
    pandas.DataFrame
        One row per observed group: the grouping columns (``subevent`` is
        renamed to ``pass_type``) followed by ``not_accurate``, ``accurate``
        and ``total`` counts. Groups lacking one outcome get a count of 0.
    """
    if options is None:
        options = [col for col in df.columns if col != 'tags']

    counts = (
        df.groupby(options)['tags']
        .value_counts()
        .unstack(fill_value=0)
        # Guarantee both outcome columns exist even when the data contains
        # only accurate (or only inaccurate) passes; otherwise the column
        # lookups below would raise KeyError.
        .reindex(columns=[0, 1], fill_value=0)
    )
    counts.columns = ['not_accurate', 'accurate']

    data = counts.reset_index().rename(columns={'subevent': 'pass_type'})
    data['total'] = data['not_accurate'] + data['accurate']

    return data

In [198]:
# Count accurate / inaccurate passes per match, team and pass type; the weather
# bins are included as grouping keys so they are carried into the result.
options = ['match_id', 'team', 'subevent', 'temperature', 'wind', 'precipitation']
data = group_passes(pdf, options=options)

In [297]:
# Among rows binned as 'no' precipitation, count how many grouped rows each match contributes
data.loc[data['precipitation'] == 'no'].groupby('match_id')['precipitation'].value_counts()

match_id  precipitation
2499719   no               8
2499721   no               8
2499722   no               8
2499725   no               8
2499726   no               8
                          ..
2500091   no               8
2500092   no               8
2500093   no               8
2500094   no               8
2500096   no               8
Name: precipitation, Length: 276, dtype: int64

In [221]:
def choose_team_avg(df, weather, team=None):
    """Average per-match pass totals for each category of one weather column.

    Parameters
    ----------
    df : pandas.DataFrame
        Grouped pass counts with ``match_id``, ``team``, ``pass_type``,
        ``total`` and the ``weather`` column.
    weather : str
        Name of the weather column to break the averages down by.
    team : str, optional
        When given (and truthy), only rows of that team are used.

    Returns
    -------
    pandas.DataFrame
        Columns ``weather``, ``pass_type`` and ``total``: first the mean
        per-match total for every weather category / pass type pair, then one
        ``'Total'`` row per weather category covering all pass types.
    """
    passes = df.loc[df['team'] == team] if team else df

    def mean_per_match(keys):
        # Sum within each match first, then average those match sums per key.
        per_match = passes.groupby(['match_id'] + keys)['total'].sum().reset_index()
        return per_match.groupby(keys)['total'].mean().reset_index()

    by_pass_type = mean_per_match([weather, 'pass_type'])

    overall = mean_per_match([weather])
    overall['pass_type'] = 'Total'

    return pd.concat([by_pass_type, overall]).reset_index(drop=True)

In [282]:
def plot_passes(df, team=None):
    """Show one grouped bar chart of average passes for each weather dimension.

    For temperature, precipitation and wind in turn, the averages returned by
    ``choose_team_avg`` are drawn as bars grouped by pass type and coloured by
    weather category, and the figure is displayed immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Grouped pass counts in the form expected by ``choose_team_avg``.
    team : str, optional
        Restrict the averages to a single team; all teams when omitted.
    """
    # Fixed legend/bar order per weather dimension; identical for every chart,
    # so it is built once outside the loop.
    category_orders = {
        'temperature': ['freezing', 'cool', 'mild', 'warm'],
        'precipitation': ['no', 'light', 'medium', 'heavy'],
        'wind': ['very_light', 'light', 'medium', 'heavy']
    }

    # Each dimension gets its own standalone figure. The previous
    # make_subplots() figure was overwritten before use and has been removed.
    for weather in ('temperature', 'precipitation', 'wind'):
        data = choose_team_avg(df, weather, team)
        fig = px.bar(
            data,
            x='pass_type',
            y='total',
            color=weather,
            barmode='group',
            category_orders=category_orders,
            title=f'Average Passes by {weather.capitalize()}'
        )
        fig.show()

In [283]:
# By match
# No team is passed, so the averages cover all teams in each match.
plot_passes(data)

In [258]:
# Match-level table (one row per match record) for the weather-by-date charts
mdf = pd.DataFrame(matches)

In [292]:
# Mean weather by matchday
# One bar chart per weather measurement, averaged over the matches sharing each 'date' value.
for w in ['temp_c', 'wind_kmph', 'precip_mm']:
    fig = px.bar(mdf.groupby('date')[w].mean())
    fig.show()