In [1]:
import json
from pandas import json_normalize
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import seaborn as sns

In [2]:
# !pip install statsbombpy
# !pip install mplsoccer

pd.set_option('display.max_seq_items', None)

In [3]:
# Code for pitch dimensions
from mplsoccer.pitch import Pitch
from mplsoccer import Pitch, Sbopen
# from mplsoccer.statsbomb import read_event, EVENT_SLUG
from matplotlib import rcParams

In [1]:
#Code for pulling out data from statsbomb. 
#Statsbomb is a UK-based football analytics and data visualisation company introducing common data analytics practices seen in business and tech to the world of football analytics.
from statsbombpy import sb

In [5]:
# list of competitions in statsbomb data
sb.competitions()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,16,4,Europe,Champions League,male,False,False,2018/2019,2022-08-14T16:57:15.866765,2021-06-13T16:17:31.694,,2022-08-14T16:57:15.866765
1,16,1,Europe,Champions League,male,False,False,2017/2018,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2021-01-23T21:55:30.425330
2,16,2,Europe,Champions League,male,False,False,2016/2017,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
3,16,27,Europe,Champions League,male,False,False,2015/2016,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
4,16,26,Europe,Champions League,male,False,False,2014/2015,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
5,16,25,Europe,Champions League,male,False,False,2013/2014,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
6,16,24,Europe,Champions League,male,False,False,2012/2013,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2021-07-10T13:41:45.751
7,16,23,Europe,Champions League,male,False,False,2011/2012,2021-08-27T11:26:39.802832,2021-06-13T16:17:31.694,,2020-07-29T05:00
8,16,22,Europe,Champions League,male,False,False,2010/2011,2022-01-26T21:07:11.033473,2021-06-13T16:17:31.694,,2022-01-26T21:07:11.033473
9,16,21,Europe,Champions League,male,False,False,2009/2010,2022-02-12T16:13:49.294747,2021-06-13T16:17:31.694,,2022-02-12T16:13:49.294747


In [6]:
# do a function to extract passing data, dropping null values, separates location data to enable plotting


def clean_data(comp_id, season_id):
    
    # extract passing data from statsbomb
    df = sb.matches(competition_id=comp_id, season_id=season_id)
    
    matches_touse = df.match_id.values.tolist()
    
    all_events = pd.DataFrame()
    
    for match in matches_touse:
        events = sb.events(match_id=match)
        all_events = pd.concat([all_events, events], axis = 0)
    
    all_passes = all_events[['period',
                             'match_id',
                             'pass_aerial_won', 
                        'pass_angle', 
                        'pass_assisted_shot_id',
                        'pass_body_part', 
                        'pass_cross', 
                        'pass_cut_back', 
                        'pass_deflected',
                        'location',
                        'pass_end_location', 
                        'pass_goal_assist', 
                        'pass_height',
                        'pass_inswinging', 
                        'pass_length', 
                        'pass_miscommunication',
                        'pass_no_touch', 
                        'pass_outcome', 
                        'pass_outswinging', 
                        'pass_recipient',
                        'pass_shot_assist', 
                        'pass_straight', 
                        'pass_switch', 
                        'pass_technique',
                        'pass_through_ball', 
                        'pass_type',
                        'possession_team',
                        'possession_team_id',
                        'player',
                        'player_id']]

    data_passes = all_passes[['period',
                              'location',
                          'pass_end_location',
                        'pass_outcome',
                          'pass_angle',
                          'pass_height',
                          'pass_length',
                          'pass_recipient',
                         'possession_team',
                         'possession_team_id',
                         'player',
                         'player_id',
                        'match_id']]

    # under pass_outcome, as completed passes are blank, to fill in 'complete' for them
    data_passes.pass_outcome.fillna(value='Complete', inplace = True)
    data_passes = pd.get_dummies(data_passes, columns=["pass_outcome"])
    
    # dropping null values
    data_passes_clean = data_passes.dropna()
    data_passes_clean.reset_index(inplace = True)
    
    #separates location data to enable plotting
    data_passes_clean_v1 = data_passes_clean['location'].apply(pd.Series)
    data_passes_clean_v1['start_x'] = data_passes_clean_v1[0]
    data_passes_clean_v1['start_y'] = data_passes_clean_v1[1]
    data_passes_clean_v1.drop([0,1], axis=1, inplace=True)

    data_passes_clean_v2 = data_passes_clean['pass_end_location'].apply(pd.Series)
    data_passes_clean_v2['end_x'] = data_passes_clean_v2[0]
    data_passes_clean_v2['end_y'] = data_passes_clean_v2[1]
    data_passes_clean_v2.drop([0,1], axis=1, inplace=True)

    dataset = pd.merge(data_passes_clean, data_passes_clean_v1, how="left", left_index=True, right_index=True)
    dataset_1 = pd.merge(dataset, data_passes_clean_v2, how="left", left_index=True, right_index=True)
    dataset_1 = dataset_1.drop(columns = ['index','location','pass_end_location'])
    dataset_1 = pd.get_dummies(dataset_1, columns=["pass_height"])
    
    
    return dataset_1


In [7]:
# extract Women's world cup 2019 data
pd.set_option('display.max_columns', None)
wc19 = clean_data(72,30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
wc19.to_pickle('wc19.pkl')

In [9]:
wc19.to_csv('wc19.csv', index=False)

In [10]:
# extract euro 2022 data
pd.set_option('display.max_columns', None)
euro22 = clean_data(53,106)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [11]:
euro22.to_pickle('euro22.pkl')