In [324]:
import numpy as np
import duckdb
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [325]:
data = pd.read_csv("BDC_2024_Womens_Data.csv")

In [326]:
data.columns

Index(['Date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2'],
      dtype='object')

# Part 3: The Relationship Between Traffic and Scoring a Goal
# By Lauren Mok
In this portion, we will investigate the relationship between the presence of traffic and scoring a goal, as well as the relationship between one-timers and scoring a goal. Traffic refers to the gathering of many players in one area, usually in front of the net. A one-timer is a shot that occurs when a player meets a teammate's pass with an immediate slapshot, without any attempt to control the puck on their stick. We will first create a dataframe containing the necessary columns: 'Event', 'Detail 3', and 'Detail 4' ('Detail 3' notes the presence or absence of traffic, and 'Detail 4' notes whether or not the shot was a one-timer ). Then, we will prepare the data for analysis by converting the data to binary values. We will proceed by fitting two logistic regression models to the data, each with either traffic or one-timer shot as the only input variable, respectively. The input coefficients generated by the model will be analysed to eludicate the extent to which traffic and one-timer shots influence goal-scoring patterns and their statistical significance in predicting goal outcomes. 

### 3.1 Data Preprocessing

In [327]:
# Selecting the relevant columns from the dataframe
filtered_data = duckdb.sql("""
                            SELECT Event, "Detail 3", "Detail 4"
                           FROM data
                           WHERE (Event = 'Goal' OR Event = 'Shot')
                            """).df()

# Renaming the dataframe columns
filtered_data = filtered_data.rename(columns={'Detail 3': 'Traffic', 'Detail 4': 'One Timer'})

# Removing rows with NaN values
filtered_data = filtered_data.dropna()

filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,Goal,f,f
1,Goal,f,f
2,Goal,f,t
3,Goal,f,f
4,Shot,f,f
...,...,...,...
418,Shot,t,f
419,Shot,f,f
420,Shot,t,f
421,Shot,f,f


In [328]:
# Checking that 't' and 'f' are the only unique values in 'Traffic' and 'One Timer'
print(filtered_data['Traffic'].unique())
print(filtered_data['One Timer'].unique())

['f' 't']
['f' 't']


In [329]:
# Creating a function that converts 't' to 1 and 'f' to 0
def convert_bool(value):
    if value == 't':
        return 1
    else:
        assert value == 'f'
        return 0
    
# Applying the function to 'Traffic' and 'One Timer'
filtered_data['Traffic'] = filtered_data['Traffic'].apply(convert_bool)
filtered_data['One Timer'] = filtered_data['One Timer'].apply(convert_bool)

# Printing the dataframe to check that 'Traffic' and 'One Timer' has now been converted to binary values
filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,Goal,0,0
1,Goal,0,0
2,Goal,0,1
3,Goal,0,0
4,Shot,0,0
...,...,...,...
418,Shot,1,0
419,Shot,0,0
420,Shot,1,0
421,Shot,0,0


In [330]:
# Creating a function that converts 'Goal' (successful) to 1 and 'Shot'(unsucessful) to 0.
def convert_event(value):
    if value == "Goal":
        return 1
    else:
        assert value == 'Shot'
        return 0
    
# Applying the function to 'Event'
filtered_data['Event'] = filtered_data['Event'].apply(convert_event)

filtered_data

Unnamed: 0,Event,Traffic,One Timer
0,1,0,0
1,1,0,0
2,1,0,1
3,1,0,0
4,0,0,0
...,...,...,...
418,0,1,0
419,0,0,0
420,0,1,0
421,0,0,0


### 3.2 Analyzing the Statistical Significance of Traffic in Predicting Goal Outcomes

In [331]:
# Extracting features and target variable
X = filtered_data['Traffic'].values
y = filtered_data[['Event']].values

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fitting the logistic regression model
model = sm.Logit(endog = y_train, exog = X_train).fit()

# Print the model summary and identify the input coefficient 
print (model.summary())

Optimization terminated successfully.
         Current function value: 0.373779
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  296
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            0
Date:                Mon, 19 Feb 2024   Pseudo R-squ.:                 -0.8642
Time:                        00:00:21   Log-Likelihood:                -110.64
converged:                       True   LL-Null:                       -59.348
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -4.9698      1.003     -4.953      0.000      -6.937      -3.003


From the model summary, we can see that the input variable 'Traffic' has a coefficient of -4.9698, suggesting an inverse relationship between the presence of traffic and scoring a goal. Despite this result being statistically significant, indicated by the p-value of 0, when interpreted within the context of a logistic model, if traffic is present, we expect the odds of a goal being scored to be multiplied by e^(-4.9698), which approximately equals 0.0069. In other words, when traffic is present, the probability of scoring a goal is approximately 0.69%. This implies that the influence of traffic on the likelihood of scoring a goal is quite negligible. 

In [332]:
# Printing the predicted probabilities 
y_prob = model.predict(X_test)
print(y_prob)

[0.00689655 0.00689655 0.5        0.5        0.5        0.5
 0.5        0.00689655 0.00689655 0.00689655 0.5        0.00689655
 0.5        0.5        0.5        0.5        0.00689655 0.5
 0.5        0.00689655 0.5        0.00689655 0.00689655 0.00689655
 0.5        0.5        0.00689655 0.5        0.00689655 0.5
 0.00689655 0.5        0.00689655 0.5        0.5        0.5
 0.5        0.00689655 0.5        0.5        0.5        0.00689655
 0.00689655 0.00689655 0.00689655 0.5        0.5        0.00689655
 0.5        0.5        0.00689655 0.00689655 0.5        0.00689655
 0.00689655 0.5        0.5        0.5        0.00689655 0.00689655
 0.5        0.5        0.5        0.00689655 0.00689655 0.00689655
 0.5        0.5        0.00689655 0.00689655 0.5        0.00689655
 0.00689655 0.5        0.00689655 0.00689655 0.00689655 0.5
 0.00689655 0.00689655 0.00689655 0.00689655 0.00689655 0.5
 0.00689655 0.00689655 0.5        0.5        0.00689655 0.00689655
 0.00689655 0.00689655 0.5        0.0

From the predicted probabilities above, we can see that the only two distinct values are 0.00689655 and 0.5. The former value confirms our interpretation of the model's input coffefficient, which is that the probability of scoring a goal is approximately 0.0069 or 0.69%. The latter value suggests that in the absence of traffic, there is a 50% of scoring a goal. To evluate the performance of the model, we will print the accuracy, precision, and recall scores. In order to do so, we must first decide on a threshold for classifying the predictions as positive or negative. It is already apparent that the evaluation metrics will yield limited insights due to the lack of variability in the predicted probabilities. For instance, with a threshold of 0.5, the model will likely struggle to identify true negatives (actual goals scored). However, setting the threshold any higher would lead the model to predict all instances as negatives. Regardless, we will set the threshold to 0.5 to confirm our prediction.

In [333]:
# Converting probabilities to binary predictions
y_pred = np.zeros(len(y_prob))
for i in range (len(y_pred)):
    if y_prob[i] >= 0.5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.5196850393700787
Precision: 0.0625
Recall: 0.8


As anticipated, the precision score is close to null, as setting a threshold of 0.5 has led to a significant number of false positives. This means that the model has misclassified numerous instances as positive (goal scored) when they were actually negative (no goal scored). 

### 3.3 Analyzing the Statistical Signifiance of One Timer Shots in Predicting Goal Outcomes

In [334]:
# Extracting features and target variable
X = filtered_data[['One Timer']].values
y = filtered_data['Event'].values

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fitting the logistic regression model
model = sm.Logit(endog = y_train, exog = X_train).fit()
print (model.summary())

y_prob = model.predict(X_test)

# coefficient is not big enough to make a different in the predicted value for y
# therefore, although the model has a coefficent of -4.6347, it converts to approximately
# 0.0097 in probability of the goal being scored. So although this result is significant,
# there is not a huge impact on whether there is traffic on whether the goal is made.

y_pred = np.zeros(len(y_prob))
for i in range (len(y_pred)):
    if y_prob[i] >= 0.5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0
y_prob

Optimization terminated successfully.
         Current function value: 0.626988
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  296
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            0
Date:                Mon, 19 Feb 2024   Pseudo R-squ.:                  -2.127
Time:                        00:00:21   Log-Likelihood:                -185.59
converged:                       True   LL-Null:                       -59.348
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -2.2407      0.470     -4.763      0.000      -3.163      -1.319


array([0.5       , 0.5       , 0.5       , 0.09615385, 0.5       ,
       0.5       , 0.5       , 0.5       , 0.09615385, 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.09615385, 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.09615385, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.09615385, 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.09615385,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.09615385, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.09615385, 0.5       , 0.5       ,
       0.5       , 0.5       , 0.09615385, 0.09615385, 0.5       ,
       0.5       , 0.5       , 0.09615385, 0.5       , 0.5    

In [335]:
model_2 = LogisticRegression().fit(X, y)
model_2.coef_

predicted_y = model_2.predict(X_test)

In [336]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.1968503937007874
Precision: 0.04672897196261682
Recall: 1.0


## Part 4: Do teams that win more faceoffs tend to win games more often?

In [337]:
faceoff_df = duckdb.sql("""
                   SELECT Date, Period, Clock, "Home Team Goals", "Away Team Goals", Team
                   FROM data
                   WHERE (Event = 'Faceoff Win')
                  """).df()
faceoff_df

Unnamed: 0,Date,Period,Clock,Home Team Goals,Away Team Goals,Team
0,2023-11-08,1,20:00,0,0,Women - Canada
1,2023-11-08,1,14:10,0,0,Women - United States
2,2023-11-08,1,13:50,0,0,Women - United States
3,2023-11-08,1,12:45,0,0,Women - United States
4,2023-11-08,1,12:10,0,0,Women - United States
...,...,...,...,...,...,...
204,2023-12-16,3,3:05,2,2,Women - United States
205,2023-12-16,3,0:20,2,2,Women - United States
206,2023-12-16,4,5:00,2,2,Women - United States
207,2023-12-16,4,3:54,2,2,Women - United States


In [338]:
grouped_by_game = faceoff_df.groupby(['Date'])

In [339]:
for game, data in grouped_by_game:
    print(game)
    print(data)

('2023-11-08',)
          Date  Period  Clock  Home Team Goals  Away Team Goals  \
0   2023-11-08       1  20:00                0                0   
1   2023-11-08       1  14:10                0                0   
2   2023-11-08       1  13:50                0                0   
3   2023-11-08       1  12:45                0                0   
4   2023-11-08       1  12:10                0                0   
5   2023-11-08       1  11:29                0                0   
6   2023-11-08       1  10:40                0                0   
7   2023-11-08       1  10:05                0                0   
8   2023-11-08       1   8:18                0                0   
9   2023-11-08       1   8:05                0                0   
10  2023-11-08       1   7:33                0                0   
11  2023-11-08       1   5:39                0                0   
12  2023-11-08       1   3:40                0                0   
13  2023-11-08       1   3:03                0

In [340]:
total_faceoff_wins_usa = 0
total_faceoff_wins_canada = 0


for date, game in grouped_by_game:
    faceoff_wins_usa = (game['Team'] == 'Women - United States').sum()
    faceoff_wins_canada = (game['Team'] == 'Women - Canada').sum()    

    total_faceoff_wins_usa += faceoff_wins_usa
    total_faceoff_wins_canada += faceoff_wins_canada

    print(f"Date: {date}, Faceoff Wins USA: {faceoff_wins_usa}, Faceoff Wins Canada: {faceoff_wins_canada}")
    
print(f"Total Faceoff Wins USA: {total_faceoff_wins_usa}")
print(f"Total Faceoff Wins Canada: {total_faceoff_wins_canada}")

Date: ('2023-11-08',), Faceoff Wins USA: 30, Faceoff Wins Canada: 22
Date: ('2023-11-11',), Faceoff Wins USA: 25, Faceoff Wins Canada: 25
Date: ('2023-12-14',), Faceoff Wins USA: 19, Faceoff Wins Canada: 28
Date: ('2023-12-16',), Faceoff Wins USA: 40, Faceoff Wins Canada: 20
Total Faceoff Wins USA: 114
Total Faceoff Wins Canada: 95


In [341]:
wins_usa = 0
wins_canada = 0
ties = 0

# Iterate through each game group
for date, game_group in grouped_by_game:
    # Determine the outcome of the game
    last_row = game_group.iloc[-1]  # Access the last row of the group
    goals_usa = last_row['Home Team Goals']
    goals_canada = last_row['Away Team Goals']
    
    if goals_usa > goals_canada:
        wins_usa += 1
    elif goals_usa < goals_canada:
        wins_canada += 1
    else:
        ties += 1

total_games = wins_usa + wins_canada + ties

# Calculate the win percentage for each team
win_percentage_usa = (wins_usa / total_games) * 100
win_percentage_canada = (wins_canada / total_games) * 100
tie_percentage = (ties/total_games) * 100

# Calculate the faceoff win percentage for each team
total_faceoff_wins = total_faceoff_wins_usa + total_faceoff_wins_canada
faceoff_win_percentage_usa = (total_faceoff_wins_usa / total_faceoff_wins) * 100
faceoff_win_percentage_canada = (total_faceoff_wins_canada / total_faceoff_wins) * 100

# Perform correlation analysis to determine if there is a correlation between win percentage and faceoff win percentage

print("Win Percentage for Women's United States:", win_percentage_usa)
print("Win Percentage for Women's Canada:", win_percentage_canada)
print("Tie Percentage:", tie_percentage)
print("Faceoff Win Percentage for Women's United States:", faceoff_win_percentage_usa)
print("Faceoff Win Percentage for Women's Canada:", faceoff_win_percentage_canada)

Win Percentage for Women's United States: 50.0
Win Percentage for Women's Canada: 0.0
Tie Percentage: 50.0
Faceoff Win Percentage for Women's United States: 54.54545454545454
Faceoff Win Percentage for Women's Canada: 45.45454545454545


In [342]:
win_percentages = np.array([win_percentage_usa, win_percentage_canada])
faceoff_win_percentages = np.array([faceoff_win_percentage_usa, faceoff_win_percentage_canada])

# Perform Pearson correlation analysis
correlation_coefficient, p_value = pearsonr(win_percentages, faceoff_win_percentages)

print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)

Pearson correlation coefficient: 1.0
P-value: 1.0
