In [127]:
import numpy as np
import duckdb
import pandas as pd
#import seaborn as sns
import statsmodels.formula.api as sfm
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
from scipy.stats import norm

# Hypothesis 1: What type of shots yield more success?

There are many different types of shots that a hockey player can play, however some type of shots might have a greater success rate than others in making a goal.

In [111]:
df = pd.read_csv('BDC_2024_Womens_Data.csv')

In [112]:
df.columns

Index(['Date', 'Home Team', 'Away Team', 'Period', 'Clock',
       'Home Team Skaters', 'Away Team Skaters', 'Home Team Goals',
       'Away Team Goals', 'Team', 'Player', 'Event', 'X Coordinate',
       'Y Coordinate', 'Detail 1', 'Detail 2', 'Detail 3', 'Detail 4',
       'Player 2', 'X Coordinate 2', 'Y Coordinate 2'],
      dtype='object')

In [113]:
query = """
        SELECT Event, "Detail 1" AS "Shot Type", 
        FROM df
        WHERE Event = 'Goal' OR Event = 'Shot'
        """

shots_df = duckdb.sql(query).df()
shots_df

Unnamed: 0,Event,Shot Type
0,Goal,Wristshot
1,Goal,Wristshot
2,Goal,Snapshot
3,Goal,Wristshot
4,Shot,Snapshot
...,...,...
418,Shot,Wristshot
419,Shot,Wristshot
420,Shot,Wristshot
421,Shot,Wristshot


In [114]:
shots_df['Shot Type'].unique()

array(['Wristshot', 'Snapshot', 'Fan', 'Slapshot', 'Deflection',
       'Wrap Around', 'Bat', 'Poke'], dtype=object)

There are 8 different types of shots: Wristshot, Snapshot, Fan, Slapshot, Deflection, Wrap Around, Bat, and Poke.

In [115]:
query = """
        SELECT "Shot Type", 
        COUNT(Event) AS "Total Attempted",
        COUNT (CASE WHEN Event = 'Goal' THEN 1 END) AS "Success"
        FROM shots_df
        GROUP BY "Shot Type"
        """

shot_type_df = duckdb.sql(query).df()
shot_type_df

Unnamed: 0,Shot Type,Total Attempted,Success
0,Wristshot,185,9
1,Wrap Around,4,0
2,Snapshot,167,8
3,Poke,1,0
4,Slapshot,27,1
5,Deflection,27,2
6,Bat,1,0
7,Fan,11,0


In [116]:
shot_type_df['Success Rate'] = round((shot_type_df['Success'] / shot_type_df['Total Attempted'] ) , 4) * 100
shot_type_df

Unnamed: 0,Shot Type,Total Attempted,Success,Success Rate
0,Wristshot,185,9,4.86
1,Wrap Around,4,0,0.0
2,Snapshot,167,8,4.79
3,Poke,1,0,0.0
4,Slapshot,27,1,3.7
5,Deflection,27,2,7.41
6,Bat,1,0,0.0
7,Fan,11,0,0.0


From this data set, we can see that out of all the shots, Deflection shots have the highest success rate. To test whether this is actually true we will conduct a hypothesis test.

## Hypothesis Test:

Null Hypothesis: All types of shots have the same chance of yielding success.

Alternative Hypothesis: One type of shot has a higher chance of yielding success, in this case, a deflection shot.

In [117]:
total_attempts = shot_type_df["Total Attempted"].values
successes = shot_type_df["Success"].values

overall_success_rate = successes.sum()/total_attempts.sum()
overall_success_rate

0.04728132387706856

In [118]:
z_scores = []
p_values = []


#Z-test
for attempts, success in zip(total_attempts, successes):
    success_rate = success/attempts

    #standard error
    se = np.sqrt((success_rate * (1 - success_rate)) / attempts)

    #z-score
    if se == 0:
        z = np.nan
        p = np.nan
    else:
        z = (success_rate - overall_success_rate) / se
        p = 2 * (1 - norm.cdf(np.abs(z))) 

    z_scores.append(z)
    p_values.append(p) 

In [119]:
sig_shot_types = []
for shot_type, p in zip(shot_type_df['Shot Type'], p_values):
    print(shot_type + ": p-value = " + str(p))
    if p < 0.05:
        sig_shot_types.append(shot_type)

Wristshot: p-value = 0.9311108077426082
Wrap Around: p-value = nan
Snapshot: p-value = 0.9699347902522513
Poke: p-value = nan
Slapshot: p-value = 0.7780472513035825
Deflection: p-value = 0.5950090286606828
Bat: p-value = nan
Fan: p-value = nan


Since all of the p-values are less than 0.05, or equal to nan, we cannot come to a conclusion right now. So to further analyze this, I will perform a chi-square test.

In [120]:
#observed
total_attempts = shot_type_df["Total Attempted"].values
successes = shot_type_df["Success"].values

In [121]:
overall_success_rate = successes.sum() / total_attempts.sum()
#expected = np.array([[rate * overall_success_rate for rate in total_attempts]])

#had to make a small adjustment to avoid getting 0 elements
expected_successes = total_attempts * overall_success_rate
contingency_table = np.array([successes, expected_successes])

In [122]:
chi2, p, _, _ = chi2_contingency(contingency_table)

In [123]:
print("Chi square statistic: " + str(chi2))
print("P-value: " + str(p))

Chi square statistic: 1.0013863559852174
P-value: 0.9948061416640558


Given that the p-value is greater than the significance level of 0.05, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude that there is a significant difference in success rates among the different types of shots. 

(Measuring effect size to see if there is actually an effect of shot type).
We will now calculate the Phi coefficient to measure the association between the success of a shot, the total number of attempts, and the type of shot. To this, we need  to construct a contingency table for each pair of variables. Since the Phi coefficient is typically used for two binary variables, we need to choose one variable as the binary variable and the other as the categorical variable.

Create binary variables for "Success" and "Total Attempted".
Create binary variables for each type of shot (e.g., "Deflection", "Fan", "Poke", etc.).
Calculate the Phi coefficient for each pair of variables (Success vs. Shot Type, Total Attempted vs. Shot Type).


In [135]:
# total_attempts = shot_type_df["Total Attempted"].values
# successes = shot_type_df["Success"].values

# # Determine the categories for each variable
# success_binary = [1 if s > 0 else 0 for s in success]
# total_attempted_binary = [1 if t > 0 else 0 for t in total_attempted]

# # Create a contingency table
# contingency_table = np.array([
#     [sum((success_binary == 0) & (total_attempted_binary == 0)), sum((success_binary == 1) & (total_attempted_binary == 0))],
#     [sum((success_binary == 0) & (total_attempted_binary == 1)), sum((success_binary == 1) & (total_attempted_binary == 1))]
# ])

# # Calculate the Phi coefficient
# def phi_coefficient(contingency_table):
#     chi2 = np.sum(contingency_table) / contingency_table.sum().sum()
#     rows, cols = contingency_table.shape
#     phi = np.sqrt(chi2 / (rows * cols))
#     return phi

# phi = phi_coefficient(contingency_table)
# print("Phi coefficient:", phi)

In [136]:
success = shot_type_df["Success"].values
total_attempted = shot_type_df["Total Attempted"].values
shot_types = shot_type_df[['Shot Type_Deflection', 'Shot Type_Fan', 'Shot Type_Poke', 'Shot Type_Slapshot', 'Shot Type_Snapshot', 'Shot Type_Wrap Around', 'Shot Type_Wristshot']].values

# Convert Success and Total Attempted to binary form
success_binary = np.where(success > 0, 1, 0)
total_attempted_binary = np.where(total_attempted > 0, 1, 0)

# Calculate Phi coefficient for Success vs. Shot Type
phi_success_shot_type = phi_coefficient(np.corrcoef(success_binary, shot_types.T)[0, 1:])

# Calculate Phi coefficient for Total Attempted vs. Shot Type
phi_attempted_shot_type = phi_coefficient(np.corrcoef(total_attempted_binary, shot_types.T)[0, 1:])

print("Phi coefficient for Success vs. Shot Type:", phi_success_shot_type)
print("Phi coefficient for Total Attempted vs. Shot Type:", phi_attempted_shot_type)

KeyError: "None of [Index(['Shot Type_Deflection', 'Shot Type_Fan', 'Shot Type_Poke',\n       'Shot Type_Slapshot', 'Shot Type_Snapshot', 'Shot Type_Wrap Around',\n       'Shot Type_Wristshot'],\n      dtype='object')] are in the [columns]"

Since we have a Phi Coefficient of 0.5, we know that there is a moderate association between the total number of succees

In [124]:
dummy_df = pd.get_dummies(shot_type_df, columns=['Shot Type'])
dummy_df.columns

Index(['Total Attempted', 'Success', 'Success Rate', 'Shot Type_Bat',
       'Shot Type_Deflection', 'Shot Type_Fan', 'Shot Type_Poke',
       'Shot Type_Slapshot', 'Shot Type_Snapshot', 'Shot Type_Wrap Around',
       'Shot Type_Wristshot'],
      dtype='object')

In [128]:
X = df_dummy
y = shot_type_df['Success']
X = sm.add_constant(X)
# logit_model = sm.Logit(y, X)
# result = logit_model.fit()
model = sm.OLS(y, X)
results = model.fit()
#print(results.summary())


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
# reg_exp = 'Success_Rate ~ `Shot Type_Bat` + `Shot Type_Deflection` + `Shot Type_Fan` + `Shot Type_Poke` + `Shot Type_Slapshot` + `Shot Type_Snapshot` + `Shot Type_Wrap Around` + `Shot Type_Wristshot`'
# olsr_model = sfm.ols(formula=reg_exp, data=dummy_df)
# olsr_model_results = olsr_model.fit()
# print(olsr_model_results.summary())