In [2]:
import numpy as np
import duckdb
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
# Load the event-level dataset from the CSV in the working directory.
# The name `data` is referenced by the SQL query in a later cell.
data = pd.read_csv(filepath_or_buffer='BDC_2024_Womens_Data.csv')

# Focus Question: How do power plays influence the success of goals?
To prepare for this bigger question, we modeled each variable against the success of a goal. With this, we aim to reduce the number of variables by eliminating less significant covariates, and so reduce the complexity of our final model.

Below are the links to our code, where we ran simple logistic models to analyze how each individual factor (Player, Distance to Goal, Shot Type, Traffic, One-Timer) relates to whether or not the goal was successful.

In [7]:
# Keep only the rows whose Event is 'Goal' or 'Shot', along with the shot type
# stored in the "Detail 1" column. The table name `data` in the query refers to
# the pandas DataFrame loaded earlier in the notebook.
query = """
        SELECT Event, "Detail 1" AS "Shot Type"
        FROM data
        WHERE Event IN ('Goal', 'Shot')
        """
shots_df = duckdb.sql(query).df()

# One-hot encode the shot type. drop_first=True omits the first category so it
# acts as the reference level; dtype=int produces 0/1 integer columns directly,
# so no per-column boolean-to-integer conversion is needed afterwards.
shot_dummies = pd.get_dummies(shots_df["Shot Type"], drop_first=True, dtype=int)
shot_df = pd.concat([shots_df, shot_dummies], axis=1)

# Binary response: 1 where Event is "Goal", 0 where it is "Shot".
# Assigning the mapped column back (instead of an in-place replace on a column
# selection) guarantees the frame itself is updated.
shot_df["Event"] = shot_df["Event"].map({"Shot": 0, "Goal": 1})

# Predictors are every column except the response and the raw category label.
shot_x = shot_df[shot_df.columns.difference(["Event", "Shot Type"])]
goal_y = shot_df["Event"]

# Fit an ordinary least squares model of the 0/1 goal indicator on the
# shot-type dummy columns.
lm_goal = sm.OLS(endog=goal_y, exog=shot_x).fit()

print(lm_goal.pvalues)
# We can see here that the significant shot types are snapshot and wristshot.
# Coefficients are looked up by column label rather than by position, so the
# lookup stays correct if the set or order of shot-type columns changes.
print("The coefficient for snapshot is", lm_goal.params["Snapshot"])
print("The coefficient for wristshot is", lm_goal.params["Wristshot"])
# However, no constant column is added to the predictors (sm.add_constant is
# not called), so the model has no intercept and each p-value tests whether
# that shot type's coefficient differs from zero, not whether it differs from
# the other shot types.
# NOTE(review): confirm that a no-intercept fit is the intended comparison.

Deflection     0.072377
Fan            1.000000
Poke           1.000000
Slapshot       0.368292
Snapshot       0.003964
Wrap Around    1.000000
Wristshot      0.002089
dtype: float64
The coefficient for snapshot is 0.04790419161676644
The coefficient for wristshot is 0.048648648648648624
