In [16]:
import os
import pathlib
import warnings

import pandas as pd
import numpy as np
import json

from itertools import combinations_with_replacement
import statsmodels.api as sm
import statsmodels.formula.api as smf

# modelling
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# plotting
from mplsoccer import Pitch
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

In [17]:
df = pd.DataFrame()
file_name = 'events_Germany.json'
path = os.path.join(str(pathlib.Path().resolve().parents[0]), 'data', 'Wyscout', 'events', file_name)
with open(path) as f:
    data = json.load(f)
df = pd.concat([df, pd.DataFrame(data)])
df = df.reset_index()

In [18]:
df.head()

Unnamed: 0,index,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,0,8,Simple pass,[{'id': 1801}],15231,"[{'y': 50, 'x': 50}, {'y': 48, 'x': 50}]",2516739,Pass,2446,1H,2.409746,85,179896442
1,1,8,Simple pass,[{'id': 1801}],14786,"[{'y': 48, 'x': 50}, {'y': 22, 'x': 22}]",2516739,Pass,2446,1H,2.506082,85,179896443
2,2,8,Simple pass,[{'id': 1801}],14803,"[{'y': 22, 'x': 22}, {'y': 46, 'x': 6}]",2516739,Pass,2446,1H,6.946706,85,179896444
3,3,8,Simple pass,[{'id': 1801}],14768,"[{'y': 46, 'x': 6}, {'y': 10, 'x': 20}]",2516739,Pass,2446,1H,10.786491,85,179896445
4,4,8,Simple pass,[{'id': 1801}],14803,"[{'y': 10, 'x': 20}, {'y': 4, 'x': 27}]",2516739,Pass,2446,1H,12.684514,85,179896446


In [None]:
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]
df["kickedOut"] = df.apply(lambda x: 1 if x.nextEvent == "Ball out of the field" else 0, axis = 1)

#interruptions out
interruption = df.loc[df["eventName"] == "Interruption"]
#probably need to drop "others on the ball event" - nope

# filter out non-accurate duels - in wyscout they are 2 way - attacking and defending
lost_duels = df.loc[df["eventName"] == "Duel"]
lost_duels = lost_duels.loc[lost_duels.apply (lambda x:{'id':1802} in x.tags, axis = 1)]
df = df.drop(lost_duels.index)

# filter ball out of the field - I can get this anyway
out_of_ball = df.loc[df["subEventName"] == "Ball out of the field"]
df = df.drop(out_of_ball.index)

# save attempts can be dropped
goalies = df.loc[df["subEventName"].isin(["Goalkeeper leaving line", "Save attempt", "Reflexes"])]
df = df.drop(goalies.index)

In [None]:
df.head()

Unnamed: 0,index,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,kickedOut
0,0,8,Simple pass,[{'id': 1801}],15231,"[{'y': 50, 'x': 50}, {'y': 48, 'x': 50}]",2516739,Pass,2446,1H,2.409746,85,179896442,Simple pass,0
1,1,8,Simple pass,[{'id': 1801}],14786,"[{'y': 48, 'x': 50}, {'y': 22, 'x': 22}]",2516739,Pass,2446,1H,2.506082,85,179896443,Simple pass,0
2,2,8,Simple pass,[{'id': 1801}],14803,"[{'y': 22, 'x': 22}, {'y': 46, 'x': 6}]",2516739,Pass,2446,1H,6.946706,85,179896444,Simple pass,0
3,3,8,Simple pass,[{'id': 1801}],14768,"[{'y': 46, 'x': 6}, {'y': 10, 'x': 20}]",2516739,Pass,2446,1H,10.786491,85,179896445,Simple pass,0
4,4,8,Simple pass,[{'id': 1801}],14803,"[{'y': 10, 'x': 20}, {'y': 4, 'x': 27}]",2516739,Pass,2446,1H,12.684514,85,179896446,Simple pass,0


In [21]:
def isolateChains(df):
    """
    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data.

    Returns
    -------
    df: dataframe
        dataframe with isolated possession chains

    """
    df["nextTeamId"] = df.shift(-1, fill_value=0)["teamId"]
    #potential +0s
    chain_team = df.iloc[0]["teamId"]
    period = df.iloc[0]["matchPeriod"]
    stop_criterion = 0
    chain = 0
    df["possession_chain"] = 0
    df["possession_chain_team"] = 0

    for i, row in df.iterrows():
        #add value
        df.at[i, "possession_chain"] = chain
        df.at[i, "possession_chain_team"] = chain_team
        # if pass not accurate/lost duel, add 1 to stop criterion
        if row["eventName"] == "Pass" or row["eventName"] == "Duel":
            if row["teamId"] == chain_team and {"id": 1802} in row["tags"]:
                stop_criterion += 1
            if row["teamId"] != chain_team and {"id": 1801} in row["tags"]:
                stop_criterion += 1
        #if ball intercepted, add 2 to stop criterion
        if row["eventName"] == "Others on the ball":
            if row["teamId"] == row["nextTeamId"]:
                stop_criterion += 2
        #if shot, add 2 to stop criterion
        if row["eventName"] in ["Shot", "Foul", "Offside"]:
            stop_criterion += 2
        #if ball out of field, add 2 to stop criterion
        if row["kickedOut"] == 1:
            stop_criterion += 2
        #criterion for stopping when half ended
        if row["matchPeriod"] != period:
            chain += 1
            stop_criterion = 0
            chain_team = row['teamId']
            period = row["matchPeriod"]
            df.at[i, "possession_chain"] = chain
            df.at[i, "possession_chain_team"] = chain_team
        #possession chain ended
        if stop_criterion >= 2:
            chain += 1
            stop_criterion = 0
            chain_team = row['nextTeamId']
    return df

df = isolateChains(df)
#investigate a chain
df.loc[df["possession_chain"] == 4][["eventName", "possession_chain"]]

Unnamed: 0,eventName,possession_chain
24,Pass,4
25,Others on the ball,4
26,Pass,4


In [22]:
df.head()

Unnamed: 0,index,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,nextEvent,kickedOut,nextTeamId,possession_chain,possession_chain_team
0,0,8,Simple pass,[{'id': 1801}],15231,"[{'y': 50, 'x': 50}, {'y': 48, 'x': 50}]",2516739,Pass,2446,1H,2.409746,85,179896442,Simple pass,0,2446,0,2446
1,1,8,Simple pass,[{'id': 1801}],14786,"[{'y': 48, 'x': 50}, {'y': 22, 'x': 22}]",2516739,Pass,2446,1H,2.506082,85,179896443,Simple pass,0,2446,0,2446
2,2,8,Simple pass,[{'id': 1801}],14803,"[{'y': 22, 'x': 22}, {'y': 46, 'x': 6}]",2516739,Pass,2446,1H,6.946706,85,179896444,Simple pass,0,2446,0,2446
3,3,8,Simple pass,[{'id': 1801}],14768,"[{'y': 46, 'x': 6}, {'y': 10, 'x': 20}]",2516739,Pass,2446,1H,10.786491,85,179896445,Simple pass,0,2446,0,2446
4,4,8,Simple pass,[{'id': 1801}],14803,"[{'y': 10, 'x': 20}, {'y': 4, 'x': 27}]",2516739,Pass,2446,1H,12.684514,85,179896446,Simple pass,0,2446,0,2446


In [23]:
def calulatexG(df):
    """
    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data.

    Returns
    -------
    xG_sum: dataframe
        dataframe with xG for each shot

    """
    #very basic xG model based on
    shots = df.loc[df["eventName"] == "Shot"].copy()
    shots["X"] = shots.positions.apply(lambda cell: (100 - cell[0]['x']) * 105/100)
    shots["Y"] = shots.positions.apply(lambda cell: cell[0]['y'] * 68/100)
    shots["C"] = shots.positions.apply(lambda cell: abs(cell[0]['y'] - 50) * 68/100)
    
    #calculate distance and angle
    shots["Distance"] = np.sqrt(shots["X"]**2 + shots["C"]**2)
    shots["Angle"] = np.where(np.arctan(7.32 * shots["X"] / (shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) > 0, 
                              np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)), 
                              np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) + np.pi)
    
    #if you ever encounter problems (like you have seen that model treats 0 as 1 and 1 as 0) while modelling - change the dependant variable to object
    shots["Goal"] = shots.tags.apply(lambda x: 1 if {'id':101} in x else 0).astype(object)
    
    #headers have id = 403
    headers = shots.loc[shots.apply (lambda x:{'id':403} in x.tags, axis = 1)]
    non_headers = shots.drop(headers.index)
    headers_model = smf.glm(formula="Goal ~ Distance + Angle" , data=headers, family=sm.families.Binomial()).fit()
    nonheaders_model = smf.glm(formula="Goal ~ Distance + Angle" , data=non_headers, family=sm.families.Binomial()).fit()
    
    #assigning xG
    df["xG"] = 0.0
    #headers
    b_head = headers_model.params
    xG = 1/(1+np.exp(b_head[0]+b_head[1]*headers['Distance'] + b_head[2]*headers['Angle']))
    headers = headers.assign(xG = xG)
    for index, row in headers.iterrows():
        df.at[index, "xG"] = row["xG"]
    #non-headers
    b_nhead = nonheaders_model.params
    xG = 1/(1+np.exp(b_nhead[0]+b_nhead[1]*non_headers['Distance'] + b_nhead[2]*non_headers['Angle']))
    non_headers = non_headers.assign(xG = xG)
    for index, row in non_headers.iterrows():
        df.at[index, "xG"] = row["xG"]

    penalties = df.loc[df["subEventName"] == "Penalty"]
    #treating penalties like shots
    penalties["X"] = 11
    #calculate distance and angle
    penalties["Distance"] = 11
    penalties["Angle"] = np.arctan(7.32 * 11 /(11**2 - (7.32/2)**2))
    #if you ever encounter problems (like you have seen that model treats 0 as 1 and 1 as 0) while modelling - change the dependant variable to object
    penalties["Goal"] = penalties.tags.apply(lambda x: 1 if {'id':101} in x else 0).astype(object)
    penalties = penalties.assign(xG = xG)
    for index, row in penalties.iterrows():
        df.at[index, "xG"] = row["xG"]
    return df

df = calulatexG(df)
#investigate a chain
df.loc[df["possession_chain"].isin([3,4])][["eventName", "possession_chain", "xG"]]

Unnamed: 0,eventName,possession_chain,xG
15,Pass,3,0.0
17,Duel,3,0.0
18,Pass,3,0.0
19,Pass,3,0.0
20,Duel,3,0.0
23,Duel,3,0.0
24,Pass,4,0.0
25,Others on the ball,4,0.0
26,Pass,4,0.0


In [24]:
def prepareChains(df):
    """
    Parameters
    ----------
    df : dataframe
        dataframe with Wyscout event data.

    Returns
    -------
    xG_sum: dataframe
        dataframe with assigned values for chains

    """
    df["non_shot_end"] = 1
    #get number of chains
    no_chains = max(df["possession_chain"].unique())
    indicies = []
    for i in range(no_chains+1):
        #all events get possession chain
        possession_chain_df = df.loc[df["possession_chain"] == i]
        #check if the possession chain is not empty
        if len(possession_chain_df) > 0:
            #if ended with shot
            if possession_chain_df.iloc[-1]["eventName"] == "Shot":
                #assign values
                df.loc[df["possession_chain"] == i, "shot_end"] = 0
                xG = possession_chain_df.iloc[-1]["xG"]
                df.loc[df["possession_chain"] == i, "xG"] = xG
                #check if the previous ones did not end with foul
                k = i-1
                if k > 0:
                    try:
                        prev = df.loc[df["possession_chain"] == k]
                        #create a loop if e.g. 2 chains before and 1 chain before didn;t end with shot
                        while prev.iloc[-1]["eventName"] == "Foul":
                            #assign value for them
                            df.loc[df["possession_chain"] == k, "xG"] = xG
                            df.loc[df["possession_chain"] == k, "shot_end"] = 0
                            k = k-1
                            prev = df.loc[df["possession_chain"] == k]
                    except:
                        k = k-1
            #get indiices of events made by possession team
            team_indicies = possession_chain_df.loc[possession_chain_df["teamId"] == possession_chain_df.teamId.mode().iloc[0]].index.values.tolist()
            indicies.extend(team_indicies)

    df = df.loc[indicies]
    return df

df = prepareChains(df)
df.loc[df["possession_chain"].isin([3,4])][["eventName", "possession_chain", "xG"]]

Unnamed: 0,eventName,possession_chain,xG
15,Pass,3,0.0
17,Duel,3,0.0
18,Pass,3,0.0
19,Pass,3,0.0
24,Pass,4,0.0
26,Pass,4,0.0


In [25]:
df.head(3)

Unnamed: 0,index,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,...,subEventId,id,nextEvent,kickedOut,nextTeamId,possession_chain,possession_chain_team,xG,non_shot_end,shot_end
0,0,8,Simple pass,[{'id': 1801}],15231,"[{'y': 50, 'x': 50}, {'y': 48, 'x': 50}]",2516739,Pass,2446,1H,...,85,179896442,Simple pass,0,2446,0,2446,0.0,1,
1,1,8,Simple pass,[{'id': 1801}],14786,"[{'y': 48, 'x': 50}, {'y': 22, 'x': 22}]",2516739,Pass,2446,1H,...,85,179896443,Simple pass,0,2446,0,2446,0.0,1,
2,2,8,Simple pass,[{'id': 1801}],14803,"[{'y': 22, 'x': 22}, {'y': 46, 'x': 6}]",2516739,Pass,2446,1H,...,85,179896444,Simple pass,0,2446,0,2446,0.0,1,


In [26]:
df.loc[df["possession_chain"].isin([0,1,3,4])][["eventName", "possession_chain", "possession_chain_team", "xG", "non_shot_end"]]

Unnamed: 0,eventName,possession_chain,possession_chain_team,xG,non_shot_end
0,Pass,0,2446,0.0,1
1,Pass,0,2446,0.0,1
2,Pass,0,2446,0.0,1
3,Pass,0,2446,0.0,1
4,Pass,0,2446,0.0,1
5,Pass,0,2446,0.0,1
6,Pass,0,2446,0.0,1
7,Pass,0,2446,0.0,1
9,Pass,1,2444,0.0,1
10,Pass,1,2444,0.0,1


In [27]:
# preparing data for modelling
#filter out dodgy
df = df.loc[df.apply(lambda x: len(x.positions) == 2, axis = 1)]

#columns with coordinates for plotting
df["x0"] = df.positions.apply(lambda cell: (cell[0]['x']) * 105/100)
df["c0"] = df.positions.apply(lambda cell: abs(50 - cell[0]['y']) * 68/100)
df["x1"] = df.positions.apply(lambda cell: (cell[1]['x']) * 105/100)
df["c1"] = df.positions.apply(lambda cell: abs(50 - cell[1]['y']) * 68/100)
df["y0"] = df.positions.apply(lambda cell: (100 - cell[0]['y']) * 68/100)
df["y1"] = df.positions.apply(lambda cell: (100 - cell[1]['y']) * 68/100)

#assign (105, 0) to end of the shot
df.loc[df["eventName"] == "Shot", "x1"] = 105
df.loc[df["eventName"] == "Shot", "c1"] = 0
df.loc[df["eventName"] == "Shot", "y1"] = 34

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348914 entries, 0 to 519405
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   index                  348914 non-null  int64  
 1   eventId                348914 non-null  int64  
 2   subEventName           348914 non-null  object 
 3   tags                   348914 non-null  object 
 4   playerId               348914 non-null  int64  
 5   positions              348914 non-null  object 
 6   matchId                348914 non-null  int64  
 7   eventName              348914 non-null  object 
 8   teamId                 348914 non-null  int64  
 9   matchPeriod            348914 non-null  object 
 10  eventSec               348914 non-null  float64
 11  subEventId             348914 non-null  object 
 12  id                     348914 non-null  int64  
 13  nextEvent              348914 non-null  object 
 14  kickedOut              348914 non-nu

In [29]:
# save the df containing possession chains for future use
df.to_json('../data/Wyscout/events/def_possession_chains_Germany.json')

In [30]:
#model variables
var = ["x0", "x1", "c0", "c1"]

#combinations
inputs = []
#one variable combinations
inputs.extend(combinations_with_replacement(var, 1))
#2 variable combinations
inputs.extend(combinations_with_replacement(var, 2))
#3 variable combinations
inputs.extend(combinations_with_replacement(var, 3))

#make new columns
for i in inputs:
    #columns length 1 already exist
    if len(i) > 1:
        #column name
        column = ''
        x = 1
        for c in i:
            #add column name to be x0x1c0 for example
            column += c
            #multiply values in column
            x = x*df[c]
        #create a new column in df
        df[column] = x
        #add column to model variables
        var.append(column)
#investigate 3 columns
df[var[-3:]].head(3)

Unnamed: 0,c0c0c1,c0c1c1,c1c1c1
0,0.0,0.0,2.515456
1,35.216384,493.029376,6902.411264
2,986.058752,140.865536,20.123648


In [31]:
### TRAINING, it's not perfect ML procedure, but results in AUC 0.2 higher than Logistic Regression ###
# note that this is different df, with data from BL loaded into a possession chain df
passes = df.loc[ df["eventName"].isin(["Pass"])]
X = passes[var].values
y = passes["non_shot_end"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123, stratify=y)
xgb_model = xgb.XGBRegressor(n_estimators=100, ccp_alpha=0, max_depth=4, min_samples_leaf=10, random_state=123)

scores = cross_val_score(estimator=xgb_model, X=X_train, y=y_train, cv=10, n_jobs=-1)
print(np.mean(scores), np.std(scores))

xgb_model.fit(X_train, y_train)
print(xgb_model.score(X_train, y_train))

y_pred = xgb_model.predict(X_test)
print(xgb_model.score(X_test, y_test))

0.0 0.0
Parameters: { "ccp_alpha", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0.0
0.0


In [32]:
# save model for future reference
file_name = "../data/Wyscout/models/def_xgb_model_Germany.pkl"
# save
pickle.dump(xgb_model, open(file_name, "wb"))

Parameters: { "ccp_alpha", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such c