In [138]:
import numpy as np
import duckdb
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt

In [139]:
data = pd.read_csv("BDC_2024_Womens_Data.csv")

In [140]:
data.columns

Index(['Date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2'],
      dtype='object')

# Part 3: The Relationship Between Traffic and Scoring a Goal
# By Lauren Mok
In this portion, we will investigate the relationship between the presence of traffic and scoring a goal, as well as the relationship between one-timers and scoring a goal. Traffic refers to the gathering of many players in one area, usually in front of the net. A one-timer is a shot that occurs when a player meets a teammate's pass with an immediate slapshot, without any attempt to control the puck on their stick. We will first create a dataframe containing the necessary columns: 'Event', 'Detail 3', and 'Detail 4' ('Detail 3' notes the presence or absence of traffic, and 'Detail 4' notes whether or not the shot was a one-timer ). Then, we will prepare the data for analysis by converting the data to binary values. We will proceed by fitting two logistic regression models to the data, each with either traffic or one-timer shot as the only input variable, respectively. The input coefficients generated by the model will be analysed to eludicate the extent to which traffic and one-timer shots influence goal-scoring patterns and their statistical significance in predicting goal outcomes. 

### 3.1 Data Preprocessing

In [141]:
# Selecting the relevant columns from the dataframe
filtered_data = duckdb.sql("""
                            SELECT Event, "Detail 3", "Detail 4"
                           FROM data
                           WHERE (Event = 'Goal' OR Event = 'Shot')
                            """).df()

# Renaming the dataframe columns
filtered_data = filtered_data.rename(columns={'Detail 3': 'Traffic', 'Detail 4': 'One Timer'})

# Removing rows with NaN values
filtered_data = filtered_data.dropna()

filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,Goal,f,f
1,Goal,f,f
2,Goal,f,t
3,Goal,f,f
4,Shot,f,f
...,...,...,...
418,Shot,t,f
419,Shot,f,f
420,Shot,t,f
421,Shot,f,f


In [142]:
# Checking that 't' and 'f' are the only unique values in 'Traffic' and 'One Timer'
print(filtered_data['Traffic'].unique())
print(filtered_data['One Timer'].unique())

['f' 't']
['f' 't']


In [143]:
# Creating a function that converts 't' to 1 and 'f' to 0
def convert_bool(value):
    if value == 't':
        return 1
    else:
        assert value == 'f'
        return 0
    
# Applying the function to 'Traffic' and 'One Timer'
filtered_data['Traffic'] = filtered_data['Traffic'].apply(convert_bool)
filtered_data['One Timer'] = filtered_data['One Timer'].apply(convert_bool)

# Printing the dataframe to check that 'Traffic' and 'One Timer' has now been converted to binary values
filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,Goal,0,0
1,Goal,0,0
2,Goal,0,1
3,Goal,0,0
4,Shot,0,0
...,...,...,...
418,Shot,1,0
419,Shot,0,0
420,Shot,1,0
421,Shot,0,0


In [144]:
# Creating a function that converts 'Goal' (successful) to 1 and 'Shot'(unsucessful) to 0.
def convert_event(value):
    if value == "Goal":
        return 1
    else:
        assert value == 'Shot'
        return 0
    
# Applying the function to 'Event'
filtered_data['Event'] = filtered_data['Event'].apply(convert_event)

filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,1,0,0
1,1,0,0
2,1,0,1
3,1,0,0
4,0,0,0
...,...,...,...
418,0,1,0
419,0,0,0
420,0,1,0
421,0,0,0


### 3.2 Analyzing the Statistical Significance of Traffic in Predicting Goal Outcomes

In [145]:
# Extracting features and target variable
X = filtered_data[['Traffic']].values
y = filtered_data['Event'].values

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fitting the logistic regression model
model = sm.Logit(endog = y, exog = X).fit()

# Print the model summary and identify the input coefficient 
print (model.summary())

Optimization terminated successfully.
         Current function value: 0.378973
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  423
Model:                          Logit   Df Residuals:                      422
Method:                           MLE   Df Model:                            0
Date:                Sat, 17 Feb 2024   Pseudo R-squ.:                 -0.9901
Time:                        13:11:25   Log-Likelihood:                -160.31
converged:                       True   LL-Null:                       -80.552
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -4.6347      0.711     -6.523      0.000      -6.027      -3.242


From the model summary, we can see that the input variable 'Traffic' has a coefficient of -4.637, suggesting an inverse relationship between the presence of traffic and scoring a goal. Despite this result being statistically significant, indicated by the p-value of 0, when interpreted within the context of a logistic model, if traffic is present, we expect the odds of a goal being scored to be multiplied by e^(-4.6347), which approximately equals 0.0097. In other words, when traffic is present, the probability of scoring a goal is approximately 0.97%. This implies that the influence of traffic on the likelihood of scoring a goal is quite negligible. 

In [146]:
# Printing the predicted probabilities 
y_prob = model.predict(X_test)
print(y_prob)

[0.00961538 0.00961538 0.5        0.5        0.5        0.5
 0.5        0.00961538 0.00961538 0.00961538 0.5        0.00961538
 0.5        0.5        0.5        0.5        0.00961538 0.5
 0.5        0.00961538 0.5        0.00961538 0.00961538 0.00961538
 0.5        0.5        0.00961538 0.5        0.00961538 0.5
 0.00961538 0.5        0.00961538 0.5        0.5        0.5
 0.5        0.00961538 0.5        0.5        0.5        0.00961538
 0.00961538 0.00961538 0.00961538 0.5        0.5        0.00961538
 0.5        0.5        0.00961538 0.00961538 0.5        0.00961538
 0.00961538 0.5        0.5        0.5        0.00961538 0.00961538
 0.5        0.5        0.5        0.00961538 0.00961538 0.00961538
 0.5        0.5        0.00961538 0.00961538 0.5        0.00961538
 0.00961538 0.5        0.00961538 0.00961538 0.00961538 0.5
 0.00961538 0.00961538 0.00961538 0.00961538 0.00961538 0.5
 0.00961538 0.00961538 0.5        0.5        0.00961538 0.00961538
 0.00961538 0.00961538 0.5        0.0

From the predicted probabilities above, we can see that the only two distinct values are 0.00961538 and 0.5. The former value confirms our interpretation of the model's input coffefficient, which is that the probability of scoring a goal is approximately 0.0097 or 0.97%. The latter value suggests that in the absence of traffic, there is a 50% of scoring a goal. To evluate the performance of the model, we will print the accuracy, precision, and recall scores. In order to do so, we must first decide on a threshold for classifying the predictions as positive or negative. It is already apparent that the evaluation metrics will yield limited insights due to the lack of variability in the predicted probabilities. For instance, with a threshold of 0.5, the model will likely struggle to identify true negatives (actual goals scored). However, setting the threshold any higher would lead the model to predict all instances as negatives. Regardless, we will set the threshold to 0.5 to confirm our prediction.

In [147]:
# Converting probabilities to binary predictions
y_pred = np.zeros(len(y_prob))
for i in range (len(y_pred)):
    if y_prob[i] >= 0.5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.5196850393700787
Precision: 0.0625
Recall: 0.8


As anticipated, the precision score is close to null, as setting a threshold of 0.5 has led to a significant number of false positives. This means that the model has misclassified numerous instances as positive (goal scored) when they were actually negative (no goal scored). 

### 3.3 Analyzing the Statistical Signifiance of One Timer Shots in Predicting Goal Outcomes

In [148]:
X = filtered_data[['One Timer']].values
y = filtered_data['Event'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = sm.Logit(endog = y, exog = X).fit()
print (model.summary())

y_prob = model.predict(X_test)

# coefficient is not big enough to make a different in the predicted value for y
# therefore, although the model has a coefficent of -4.6347, it converts to approximately
# 0.0097 in probability of the goal being scored. So although this result is significant,
# there is not a huge impact on whether there is traffic on whether the goal is made.

y_pred = np.zeros(len(y_prob))
for i in range (len(y_pred)):
    if y_prob[i] >= 0.5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0
y_prob

Optimization terminated successfully.
         Current function value: 0.618092
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  423
Model:                          Logit   Df Residuals:                      422
Method:                           MLE   Df Model:                            0
Date:                Sat, 17 Feb 2024   Pseudo R-squ.:                  -2.246
Time:                        13:11:25   Log-Likelihood:                -261.45
converged:                       True   LL-Null:                       -80.552
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -2.5953      0.464     -5.598      0.000      -3.504      -1.687


array([0.5       , 0.5       , 0.5       , 0.06944444, 0.5       ,
       0.5       , 0.5       , 0.5       , 0.06944444, 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.06944444, 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.06944444, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.06944444, 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.06944444,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.06944444, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.06944444, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.06944444, 0.06944444, 0.5       ,
       0.5       , 0.5       , 0.06944444, 0.5       , 0.5    

In [149]:
model_2 = LogisticRegression().fit(X, y)
model_2.coef_

predicted_y = model_2.predict(X_test)

In [150]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.1968503937007874
Precision: 0.04672897196261682
Recall: 1.0
