In [5]:
import numpy as np
import duckdb
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from scipy.stats import norm

# Hypothesis 1: What type of shots yield more success?

There are many different types of shots that a hockey player can play, however some type of shots might have a greater success rate than others in making a goal.

In [6]:
df = pd.read_csv('BDC_2024_Womens_Data.csv')

In [7]:
df.columns

Index(['Date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2'],
      dtype='object')

In [8]:
query = """
        SELECT Event, "Detail 1" AS "Shot Type", 
        FROM df
        WHERE Event = 'Goal' OR Event = 'Shot'
        """

shots_df = duckdb.sql(query).df()
shots_df

Unnamed: 0,Event,Shot Type
0,Goal,Wristshot
1,Goal,Wristshot
2,Goal,Snapshot
3,Goal,Wristshot
4,Shot,Snapshot
...,...,...
418,Shot,Wristshot
419,Shot,Wristshot
420,Shot,Wristshot
421,Shot,Wristshot


In [9]:
shots_df['Shot Type'].unique()

array(['Wristshot', 'Snapshot', 'Fan', 'Slapshot', 'Deflection',
       'Wrap Around', 'Bat', 'Poke'], dtype=object)

There are 8 different types of shots: Wristshot, Snapshot, Fan, Slapshot, Deflection, Wrap Around, Bat, and Poke.

In [10]:
query = """
        SELECT "Shot Type", 
        COUNT(Event) AS "Total Attempted",
        COUNT (CASE WHEN Event = 'Goal' THEN 1 END) AS "Success"
        FROM shots_df
        GROUP BY "Shot Type"
        """

shot_type_df = duckdb.sql(query).df()
shot_type_df

Unnamed: 0,Shot Type,Total Attempted,Success
0,Wristshot,185,9
1,Snapshot,167,8
2,Fan,11,0
3,Slapshot,27,1
4,Deflection,27,2
5,Wrap Around,4,0
6,Bat,1,0
7,Poke,1,0


In [11]:
shot_type_df['Success Rate'] = round((shot_type_df['Success'] / shot_type_df['Total Attempted'] ) , 4) * 100
shot_type_df

Unnamed: 0,Shot Type,Total Attempted,Success,Success Rate
0,Wristshot,185,9,4.86
1,Snapshot,167,8,4.79
2,Fan,11,0,0.0
3,Slapshot,27,1,3.7
4,Deflection,27,2,7.41
5,Wrap Around,4,0,0.0
6,Bat,1,0,0.0
7,Poke,1,0,0.0


From this data set, we can see that out of all the shots, Deflection shots have the highest success rate. To test whether this is actually true we will conduct a hypothesis test.

## Hypothesis Test:

Null Hypothesis: All types of shots have the same chance of yielding success.

Alternative Hypothesis: One type of shot has a higher chance of yielding success, in this case, a deflection shot.

In [12]:
total_attempts = shot_type_df["Total Attempted"].values
successes = shot_type_df["Success"].values

overall_success_rate = successes.sum()/total_attempts.sum()
overall_success_rate

0.04728132387706856

In [13]:
z_scores = []
p_values = []


#Z-test
for attempts, success in zip(total_attempts, successes):
    success_rate = success/attempts

    #standard error
    se = np.sqrt((success_rate * (1 - success_rate)) / attempts)

    #z-score
    if se == 0:
        z = np.nan
        p = np.nan
    else:
        z = (success_rate - overall_success_rate) / se
        p = 2 * (1 - norm.cdf(np.abs(z))) 

    z_scores.append(z)
    p_values.append(p)

In [14]:
sig_shot_types = []
for shot_type, p in zip(shot_type_df['Shot Type'], p_values):
    print(shot_type + ": p-value = " + str(p))
    if p < 0.05:
        sig_shot_types.append(shot_type)

Wristshot: p-value = 0.9311108077426082
Snapshot: p-value = 0.9699347902522513
Fan: p-value = nan
Slapshot: p-value = 0.7780472513035825
Deflection: p-value = 0.5950090286606828
Wrap Around: p-value = nan
Bat: p-value = nan
Poke: p-value = nan


Since all of the p-values are less than 0.05, or equal to nan, we cannot come to a conclusion right now. So to further analyze this, I will perform a chi-square test.

In [15]:
#observed
total_attempts = shot_type_df["Total Attempted"].values
successes = shot_type_df["Success"].values

In [38]:
overall_success_rate = successes.sum() / total_attempts.sum()
#expected = np.array([[rate * overall_success_rate for rate in total_attempts]])

#had to make a small adjustment to avoid getting 0 elements
expected = np.array([[rate * overall_success_rate + 1e-10 for rate in total_attempts]])

In [39]:
x2, p = chi2_contingency(successes)

ValueError: The internally computed table of expected frequencies has a zero element at (2,).

In [40]:
print("Chi square statistic: " + str(x2))
print("P-value: " + str(p))

NameError: name 'x2' is not defined

In [33]:
X = shot_type_df["Shot Type"]
df_dummy = pd.get_dummies(X, columns = ["Shot Type"], drop_first=True)
df_dummy

Unnamed: 0,Deflection,Fan,Poke,Slapshot,Snapshot,Wrap Around,Wristshot
0,False,False,False,False,False,False,True
1,False,False,False,False,True,False,False
2,False,True,False,False,False,False,False
3,False,False,False,True,False,False,False
4,True,False,False,False,False,False,False
5,False,False,False,False,False,True,False
6,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False


In [37]:
X = sm.add_constant(df_dummy).values
X
y = shot_type_df['Success']
dummy_fit = sm.OLS(endog = y, exog = X)
model = dummy_fit.fit()

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''