In [1]:
from statsbombpy import sb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import warnings; warnings.simplefilter('ignore')


In [2]:
euro_matches = sb.matches(competition_id=55, season_id=282)

In [3]:
matches = euro_matches["match_id"]

In [4]:
df_list = []
for i in matches:
    df = sb.events(i)
    df_list.append(df)

events = pd.concat(df_list, ignore_index=True)

In [5]:
passes = events[events["type"] == "Pass"]
passes = passes[["pass_length", "pass_angle","pass_height","pass_end_location","pass_deflected","pass_cross","pass_switch", "pass_body_part","pass_type","pass_outcome","pass_technique"]]
passes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53890 entries, 6 to 185489
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pass_length        53890 non-null  float64
 1   pass_angle         53890 non-null  float64
 2   pass_height        53890 non-null  object 
 3   pass_end_location  53890 non-null  object 
 4   pass_deflected     38 non-null     object 
 5   pass_cross         1228 non-null   object 
 6   pass_switch        1356 non-null   object 
 7   pass_body_part     51250 non-null  object 
 8   pass_type          8490 non-null   object 
 9   pass_outcome       8512 non-null   object 
 10  pass_technique     571 non-null    object 
dtypes: float64(2), object(9)
memory usage: 4.9+ MB


In [6]:
# passes_comp = passes[passes["pass_outcome"].isnull()]
# passes_comp = passes_comp[["pass_length", "pass_angle","pass_height","pass_end_location","pass_deflected","pass_cross","pass_switch", "pass_body_part","pass_type","pass_outcome","pass_technique"]]
# passes_comp.info()

# deflected dropped cause non null too low
passes.drop(columns=["pass_deflected", "pass_technique"],axis=1, inplace=True)
passes

Unnamed: 0,pass_length,pass_angle,pass_height,pass_end_location,pass_cross,pass_switch,pass_body_part,pass_type,pass_outcome
6,34.738163,-2.982596,Ground Pass,"[25.7, 34.5]",,,Right Foot,Kick Off,
7,65.695740,0.462286,High Pass,"[89.2, 53.4]",,,Left Foot,,Incomplete
8,30.236732,0.340577,High Pass,"[59.4, 36.8]",,,Head,Recovery,Incomplete
9,25.678005,0.316825,High Pass,"[85.1, 51.3]",,,Head,Recovery,
10,15.160475,2.601173,Ground Pass,"[37.1, 24.1]",,,Left Foot,Free Kick,
...,...,...,...,...,...,...,...,...,...
185485,27.655200,-0.808412,Ground Pass,"[118.3, 2.2]",,,Left Foot,,
185486,17.042593,3.012143,Ground Pass,"[100.2, 4.2]",,,Left Foot,,
185487,15.110923,1.804540,Ground Pass,"[99.2, 19.2]",,,Right Foot,,
185488,12.553884,0.392340,Ground Pass,"[110.3, 23.9]",,,Left Foot,,


In [7]:
height_mapping = {"Ground Pass": 1, "Low Pass": 2, "High Pass": 3}
cross_mapping = {"True": 1}
switch_mapping = {"True": 1}
body_part_mapping = {"Drop Kick": 1, "Head": 2, "Keeper Arm": 3, "Left Foot": 4, "Other": 5, "Right Foot": 6, "No Touch": 7}
type_mapping = {"Corner": 1, "Free Kick": 2, "Goal Kick": 3, "Interception": 4, "Kick Off": 5, "Recover": 6, "Throw-in": 7}
outcome_mapping = {"Incomplete": 0, "Injury Clearance": 0, "Out": 0, "Pass Offside": 0, "Unknown": 0}


In [8]:
passes["pass_height"]  = passes["pass_height"].map(height_mapping)
passes["pass_cross"]  = passes["pass_cross"].map(cross_mapping)
passes["pass_switch"]  = passes["pass_switch"].map(switch_mapping)
passes["pass_body_part"]  = passes["pass_body_part"].map(body_part_mapping)
passes["pass_type"]  = passes["pass_type"].map(type_mapping)
passes["pass_outcome"]  = passes["pass_outcome"].map(outcome_mapping)

In [9]:
# Replace NaN values with the specified number
passes["pass_cross"].fillna(0, inplace=True)
passes["pass_switch"].fillna(0, inplace=True)
passes["pass_body_part"].fillna(0, inplace=True)
passes["pass_type"].fillna(0, inplace=True)
passes["pass_outcome"].fillna(1, inplace=True)

In [10]:
passes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53890 entries, 6 to 185489
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pass_length        53890 non-null  float64
 1   pass_angle         53890 non-null  float64
 2   pass_height        53890 non-null  int64  
 3   pass_end_location  53890 non-null  object 
 4   pass_cross         53890 non-null  float64
 5   pass_switch        53890 non-null  float64
 6   pass_body_part     53890 non-null  float64
 7   pass_type          53890 non-null  float64
 8   pass_outcome       53890 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.1+ MB


In [11]:
passes = passes.reset_index(drop=True)

In [12]:
def classify_location(location):
    x = location[0] 
    if x <= 60:
        return 1
    elif x <= 120:
        return 2
    else:
        return None 

In [13]:
passes["pass_end_location"] = passes["pass_end_location"].apply(classify_location)

In [14]:
passes

Unnamed: 0,pass_length,pass_angle,pass_height,pass_end_location,pass_cross,pass_switch,pass_body_part,pass_type,pass_outcome
0,34.738163,-2.982596,1,1,0.0,0.0,6.0,5.0,1.0
1,65.695740,0.462286,3,2,0.0,0.0,4.0,0.0,0.0
2,30.236732,0.340577,3,1,0.0,0.0,2.0,0.0,0.0
3,25.678005,0.316825,3,2,0.0,0.0,2.0,0.0,1.0
4,15.160475,2.601173,1,1,0.0,0.0,4.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...
53885,27.655200,-0.808412,1,2,0.0,0.0,4.0,0.0,1.0
53886,17.042593,3.012143,1,2,0.0,0.0,4.0,0.0,1.0
53887,15.110923,1.804540,1,2,0.0,0.0,6.0,0.0,1.0
53888,12.553884,0.392340,1,2,0.0,0.0,4.0,0.0,1.0


In [15]:
X = passes[["pass_length", "pass_angle","pass_height","pass_end_location","pass_cross","pass_switch", "pass_body_part","pass_type"]]
y = passes["pass_outcome"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = LinearRegression()
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

In [19]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.0967788103486658
R^2 Score: 0.2811068914124908


In [20]:
# Statsmodels for detailed summary
X_train_sm = sm.add_constant(X_train)  # Adding a constant
ols_model = sm.OLS(y_train, X_train_sm).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:           pass_outcome   R-squared:                       0.274
Model:                            OLS   Adj. R-squared:                  0.273
Method:                 Least Squares   F-statistic:                     2705.
Date:                Sat, 20 Jul 2024   Prob (F-statistic):               0.00
Time:                        13:09:02   Log-Likelihood:                -10732.
No. Observations:               43112   AIC:                         2.148e+04
Df Residuals:                   43105   BIC:                         2.154e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 1.3125      0.00

## Part 2

In [28]:
X2 = passes[["pass_length"]]
y2= passes["pass_outcome"]

In [29]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [30]:
model = LinearRegression()
model.fit(X2_train, y2_train)

In [31]:
y2_pred = model.predict(X2_test)

In [33]:
mse2 = mean_squared_error(y2_test, y2_pred)
r22 = r2_score(y2_test, y2_pred)

print(f'Mean Squared Error: {mse2}')
print(f'R^2 Score: {r22}')

Mean Squared Error: 0.12672990024933925
R^2 Score: 0.05862397343998327


In [34]:
# Statsmodels for detailed summary
X_train_sm2 = sm.add_constant(X2_train)  # Adding a constant
ols_model2 = sm.OLS(y2_train, X_train_sm2).fit()
print(ols_model2.summary())

                            OLS Regression Results                            
Dep. Variable:           pass_outcome   R-squared:                       0.066
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     3042.
Date:                Sat, 20 Jul 2024   Prob (F-statistic):               0.00
Time:                        13:13:54   Log-Likelihood:                -16151.
No. Observations:               43112   AIC:                         3.231e+04
Df Residuals:                   43110   BIC:                         3.232e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.9807      0.003    324.460      