In [47]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown in this session.
warnings.simplefilter("ignore")

In [48]:
import pandas as pd
# Allow up to 500 columns in rendered frames so wide tables are not truncated.
pd.options.display.max_columns = 500

import numpy as np

In [49]:
# Read the game log from 'canes.csv' (path is relative to the working directory).
hockey_df = pd.read_csv(filepath_or_buffer='canes.csv')

# Show (rows, columns) of the loaded frame.
hockey_df.shape

(1876, 29)

In [50]:
# Preview the first five rows of the raw game log.
hockey_df.head(n=5)

Unnamed: 0,year,date_game,game_location,opp_name,goals,opp_goals,game_outcome,overtimes,shots,pen_min,goals_pp,chances_pp,goals_sh,shots_against,pen_min_opp,goals_against_pp,opp_chances_pp,goals_against_sh,corsi_for,corsi_against,corsi_pct,fenwick_for,fenwick_against,fenwick_pct,faceoff_wins,faceoff_losses,faceoff_percentage,zs_offense_pct,pdo
0,1998,1997-10-01,A,Tampa Bay Lightning,2.0,4.0,L,,29.0,39.0,1.0,,0.0,27.0,39.0,2.0,,0.0,,,,,,,,,,,
1,1998,1997-10-03,H,Pittsburgh Penguins,3.0,4.0,L,,28.0,72.0,2.0,,0.0,27.0,43.0,1.0,,0.0,,,,,,,,,,,
2,1998,1997-10-04,A,Ottawa Senators,2.0,3.0,L,,18.0,16.0,1.0,,0.0,27.0,10.0,2.0,,0.0,,,,,,,,,,,
3,1998,1997-10-07,H,Los Angeles Kings,3.0,3.0,T,OT,29.0,18.0,0.0,,0.0,30.0,10.0,0.0,,0.0,,,,,,,,,,,
4,1998,1997-10-10,H,New Jersey Devils,2.0,1.0,W,,27.0,8.0,0.0,,0.0,17.0,8.0,0.0,,0.0,,,,,,,,,,,


In [51]:
# Keep only home games. The explicit .copy() makes games_home an independent
# frame, so the column assignments made on it in this notebook modify it
# directly instead of writing to a slice of hockey_df (the chained-assignment
# pattern pandas reports as SettingWithCopyWarning).
games_home = hockey_df[hockey_df['game_location'] == 'H'].copy()

# Binary target: 1 for a win ('W'), 0 for every other outcome (so losses, ties
# and missing values all map to 0).
games_home['game_outcome'] = (games_home['game_outcome'] == 'W').astype('int64')

games_home.shape

(940, 29)

In [52]:
# Preview the first five home games, now with the 0/1 encoded outcome.
games_home.head(n=5)

Unnamed: 0,year,date_game,game_location,opp_name,goals,opp_goals,game_outcome,overtimes,shots,pen_min,goals_pp,chances_pp,goals_sh,shots_against,pen_min_opp,goals_against_pp,opp_chances_pp,goals_against_sh,corsi_for,corsi_against,corsi_pct,fenwick_for,fenwick_against,fenwick_pct,faceoff_wins,faceoff_losses,faceoff_percentage,zs_offense_pct,pdo
1,1998,1997-10-03,H,Pittsburgh Penguins,3.0,4.0,0,,28.0,72.0,2.0,,0.0,27.0,43.0,1.0,,0.0,,,,,,,,,,,
3,1998,1997-10-07,H,Los Angeles Kings,3.0,3.0,0,OT,29.0,18.0,0.0,,0.0,30.0,10.0,0.0,,0.0,,,,,,,,,,,
4,1998,1997-10-10,H,New Jersey Devils,2.0,1.0,1,,27.0,8.0,0.0,,0.0,17.0,8.0,0.0,,0.0,,,,,,,,,,,
8,1998,1997-10-15,H,Buffalo Sabres,3.0,3.0,0,OT,37.0,12.0,2.0,,1.0,28.0,10.0,3.0,,0.0,,,,,,,,,,,
14,1998,1997-10-22,H,St. Louis Blues,4.0,3.0,1,,26.0,8.0,0.0,,0.0,31.0,4.0,0.0,,0.0,,,,,,,,,,,


In [53]:
# Target vector: the 0/1 outcome of each home game as a standalone numpy array.
result = games_home['game_outcome'].to_numpy(copy=True)

In [54]:
# Opponent penalty minutes per home game; missing values are counted as zero.
opp_pim = games_home['pen_min_opp'].fillna(value=0).to_numpy(copy=True)

In [55]:
# Turn the feature into a single-column 2-D array (n_samples, 1).
opp_pim = opp_pim.reshape(-1, 1)

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [57]:
range_c = 3

# Search C over powers of ten from 10**-range_c to 10**range_c (10**0 is left
# out), each with and without an intercept term.
param_grid = dict(
    logistic__C=[10**exponent for exponent in range(-range_c, range_c+1) if exponent != 0],
    logistic__fit_intercept=[True, False],
)

# Two pipelines that differ only in the scaling step placed before the classifier.
pipe_min_max = Pipeline(steps=[('scale', MinMaxScaler()), ('logistic', LogisticRegression())])
pipe_standard = Pipeline(steps=[('scale', StandardScaler()), ('logistic', LogisticRegression())])

# One grid search per pipeline with cv=5; fit() returns the fitted search object.
grid_search_min_max = GridSearchCV(estimator=pipe_min_max, param_grid=param_grid, cv=5).fit(opp_pim, result)
grid_search_standard = GridSearchCV(estimator=pipe_standard, param_grid=param_grid, cv=5).fit(opp_pim, result)

# Score each fitted search on the same data it was fit on (one value per line).
print(
    grid_search_min_max.score(opp_pim, result),
    grid_search_standard.score(opp_pim, result),
    sep='\n',
)

0.5553191489361702
0.5553191489361702


In [58]:
# Display the best pipeline selected by the MinMaxScaler grid search.
grid_search_min_max.best_estimator_

Pipeline(memory=None,
         steps=[('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('logistic',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [59]:
# Correlation matrix between the 0/1 outcome and opponent penalty minutes
# (missing values counted as zero); each column is treated as one variable.
corr = np.corrcoef(
    games_home[['game_outcome', 'pen_min_opp']].fillna(0).to_numpy(),
    rowvar=False,
)

print(corr)

[[1.         0.12249083]
 [0.12249083 1.        ]]


## Cheater Definition 2

What if a "cheater" is whichever team has more penalty minutes?

In [60]:
# Count missing penalty minutes as zero for both teams.
games_home[['pen_min', 'pen_min_opp']] = games_home[['pen_min', 'pen_min_opp']].fillna(0)

# 1 when 'pen_min' is strictly greater than 'pen_min_opp', otherwise 0.
games_home['more_pim'] = games_home['pen_min'].gt(games_home['pen_min_opp']).astype('int64')

In [61]:
# The 0/1 "more penalty minutes" flag as a standalone numpy array.
more_pim = games_home['more_pim'].to_numpy(copy=True)

In [62]:
# Element-wise agreement between the more-PIM flag and the game outcome.
equal_results = np.equal(more_pim, result)

# Fraction of games where the two flags agree.
equal_results.sum() / len(equal_results)

0.48404255319148937

In [64]:
# Correlation matrix between the 0/1 outcome and the more-PIM flag; each
# column is treated as one variable.
corr = np.corrcoef(
    games_home[['game_outcome', 'more_pim']].to_numpy(),
    rowvar=False,
)

corr

array([[ 1.        , -0.03382926],
       [-0.03382926,  1.        ]])

Explore which minimum threshold of opponent penalty minutes lets us predict the game outcome best.

In [65]:
# Absolute gap between the two teams' penalty minutes in each game.
games_home['penalty_diff'] = (games_home['pen_min'] - games_home['pen_min_opp']).abs()

In [66]:
# Largest penalty-minute gap and largest opponent penalty-minute total, as ints.
max_pen_diff, max_opp_pim = (
    int(games_home[column].max()) for column in ('penalty_diff', 'pen_min_opp')
)

print(max_pen_diff, max_opp_pim, sep='\n')

58
87


In [67]:
best_acc = 0
best_pim = 0

# For each minimum opponent-PIM threshold, keep only the home games at or above
# it, fit a one-feature logistic regression on that subset and score it on the
# same subset. The reported value is therefore training accuracy, and the
# subset size (second printed column) changes with the threshold.
for pim in range(0, max_opp_pim+1):
    games_limit = games_home[games_home['pen_min_opp'] >= pim]
    game_results = np.array(games_limit['game_outcome'])

    # A classifier cannot be fit when the subset holds fewer than two outcome
    # classes. Skip those thresholds explicitly instead of catching ValueError,
    # so that unrelated fitting errors are no longer silently hidden.
    if np.unique(game_results).size < 2:
        continue

    opp_pim_limit = np.array(games_limit['pen_min_opp'])
    opp_pim_limit = opp_pim_limit.reshape((opp_pim_limit.shape[0], 1))

    lr = LogisticRegression()
    lr.fit(opp_pim_limit, game_results)

    acc = lr.score(opp_pim_limit, game_results)
    print(pim, opp_pim_limit.shape[0], acc)
    # '>=' means that on equal accuracy the larger threshold wins.
    if acc >= best_acc:
        best_acc = acc
        best_pim = pim

print(best_pim, best_acc)

0 940 0.5553191489361702
1 936 0.5555555555555556
2 936 0.5555555555555556
3 903 0.5592469545957918
4 903 0.5592469545957918
5 824 0.5643203883495146
6 824 0.5643203883495146
7 695 0.5597122302158274
8 692 0.5592485549132948
9 554 0.5631768953068592
10 541 0.5637707948243993
11 441 0.5532879818594104
12 428 0.5560747663551402
13 360 0.575
14 332 0.5813253012048193
15 278 0.579136690647482
16 256 0.5859375
17 217 0.5944700460829493
18 192 0.6041666666666666
19 160 0.6125
20 142 0.6338028169014085
21 123 0.6422764227642277
22 111 0.6306306306306306
23 95 0.6526315789473685
24 84 0.6309523809523809
25 74 0.6486486486486487
26 65 0.6615384615384615
27 60 0.6666666666666666
28 56 0.6785714285714286
29 47 0.6808510638297872
30 40 0.7
31 35 0.6857142857142857
32 32 0.6875
33 30 0.6666666666666666
34 27 0.7037037037037037
35 26 0.6923076923076923
36 24 0.7083333333333334
37 23 0.6956521739130435
38 17 0.6470588235294118
39 15 0.6666666666666666
40 14 0.6428571428571429
41 13 0.7692307692307693