In [1]:
import sklearn as sk
import pandas as pd
import glob

In [2]:
def calculate_goal_dif(home_or_away, home_score, away_score):
    if home_or_away == 'HOME':
        return home_score - away_score
    else:
        return away_score - home_score

In [16]:
file_path = '/Users/tylerviducic/dev/hockey_analytics/gamescore_model/data/cleaned_data/'
files = glob.glob(file_path + '*.csv')
df = pd.concat([pd.read_csv(f) for f in files], ignore_index = True)
df.drop(columns = ['Unnamed: 0'], inplace = True)

In [17]:
df['game_goal_differential'] = df.apply(lambda row: calculate_goal_dif(row['home_or_away'], row['home_score'], row['away_score']), axis = 1)

In [18]:
df = df.sample(frac = 1).reset_index(drop = True)

In [19]:
longest = 0
for column in df.columns:
    if len(column) > longest:
        longest = len(column)

for i, column in enumerate(df.columns):
    end = ' ' * (longest - len(column)) + ' | '
    if (i+1) % 2 == 0:
        end = '\n'
    print(column, end = end)

playerId                                                | season
name                                                    | gameId
playerTeam                                              | opposingTeam
home_or_away                                            | gameDate
position                                                | icetime
shifts                                                  | gameScore
iceTimeRank                                             | I_F_xOnGoal
I_F_xGoals                                              | I_F_xRebounds
I_F_xFreeze                                             | I_F_xPlayStopped
I_F_xPlayContinuedInZone                                | I_F_xPlayContinuedOutsideZone
I_F_flurryAdjustedxGoals                                | I_F_scoreVenueAdjustedxGoals
I_F_flurryScoreVenueAdjustedxGoals                      | I_F_primaryAssists
I_F_secondaryAssists                                    | I_F_shotsOnGoal
I_F_missedShots                                        

In [20]:
features = ['iceTimeRank', 'I_F_primaryAssists', 'I_F_secondaryAssists', 'I_F_goals', 'I_F_hits', 'I_F_takeaways', 
            'I_F_dZoneGiveaways', 'shotsBlockedByPlayer','faceoffsWon', 'faceoffsLost', 'I_F_penalityMinutes', 
            'penaltiesDrawn', 'OnIce_F_xGoals', 'OnIce_F_goals', 'OnIce_A_xGoals', 'OnIce_A_goals']
target = ['game_goal_differential']


In [21]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df[features], df[target], test_size = 0.2)


In [22]:
from sklearn import linear_model

model = linear_model.LinearRegression()

In [23]:
model.fit(x_train, y_train)

In [24]:
[print(len(model.coef_[0]))]
print(f'coeficcients: ')
for feature, coef in zip(features, model.coef_[0]):
    print(f'{feature}: {coef}')

16
coeficcients: 
iceTimeRank: 0.035527668608758764
I_F_primaryAssists: 0.44793536872343276
I_F_secondaryAssists: 0.4690085878827579
I_F_goals: 0.4601027380248033
I_F_hits: -0.03318778238079951
I_F_takeaways: 0.07268891672766946
I_F_dZoneGiveaways: 0.04870647191082887
shotsBlockedByPlayer: 0.1244882689319765
faceoffsWon: 0.007530787922081278
faceoffsLost: -0.01828666532868575
I_F_penalityMinutes: -0.01454264211256473
penaltiesDrawn: 0.011854260376789699
OnIce_F_xGoals: -0.1443101315080474
OnIce_F_goals: 0.7349116416497683
OnIce_A_xGoals: 0.10479149876051994
OnIce_A_goals: -0.9859682891947973


In [25]:
pred_y = model.predict(x_test)

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

print(f'MSE: {mean_squared_error(y_test, pred_y)}')
print(f'R2: {r2_score(y_test, pred_y)}')

MSE: 4.392751765116583
R2: 0.1944436502638598
