## NHL Game Analysis

#### Import Dependencies

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from scipy.stats import pearsonr

In [50]:
file = "data/game.csv"
raw_df = pd.read_csv(file)
raw_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT


In [59]:
raw_df.count()

game_id                   11434
season                    11434
type                      11434
date_time                 11434
date_time_GMT             11434
away_team_id              11434
home_team_id              11434
away_goals                11434
home_goals                11434
outcome                   11434
home_rink_side_start      11140
venue                     11434
venue_link                11434
venue_time_zone_id        11434
venue_time_zone_offset    11434
venue_time_zone_tz        11434
dtype: int64

In [60]:
drop_na_df = raw_df.dropna(how='any')
drop_na_df.count()

game_id                   11140
season                    11140
type                      11140
date_time                 11140
date_time_GMT             11140
away_team_id              11140
home_team_id              11140
away_goals                11140
home_goals                11140
outcome                   11140
home_rink_side_start      11140
venue                     11140
venue_link                11140
venue_time_zone_id        11140
venue_time_zone_offset    11140
venue_time_zone_tz        11140
dtype: int64

In [61]:
drop_na_df.columns

Index(['game_id', 'season', 'type', 'date_time', 'date_time_GMT',
       'away_team_id', 'home_team_id', 'away_goals', 'home_goals', 'outcome',
       'home_rink_side_start', 'venue', 'venue_link', 'venue_time_zone_id',
       'venue_time_zone_offset', 'venue_time_zone_tz'],
      dtype='object')

In [62]:
rename_df = drop_na_df.rename(columns={
    'game_id':'Game ID',
    'season':'Season',
    'type':'Type',
    'date_time':'Date',
    'date_time_GMT':'Unix Date',
    'away_team_id':'Away Team ID',
    'home_team_id':'Home Team ID',
    'away_goals':'Away Goals',
    'home_goals':'Home Goals',
    'outcome':'Outcome',
    'home_rink_side_start':'Home Rink Side Start',
    'venue':'Venue',
    'venue_link':'Venue Link',
    'venue_time_zone_id':'Venue Time Zone ID',
    'venue_time_zone_offset':'Venue Time Zone Offset',
    'venue_time_zone_tz':'Venue Time Zone TZ'
})

In [63]:
rename_df.head()

Unnamed: 0,Game ID,Season,Type,Date,Unix Date,Away Team ID,Home Team ID,Away Goals,Home Goals,Outcome,Home Rink Side Start,Venue,Venue Link,Venue Time Zone ID,Venue Time Zone Offset,Venue Time Zone TZ
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT


In [72]:
clean_df = rename_df.drop(columns=['Venue Link','Venue Time Zone ID'])
clean_df.head()

Unnamed: 0,Game ID,Season,Type,Date,Unix Date,Away Team ID,Home Team ID,Away Goals,Home Goals,Outcome,Home Rink Side Start,Venue,Venue Time Zone Offset,Venue Time Zone TZ
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,-4,EDT


In [73]:
winner = []
reg_ot = []

clean_df['Outcome'] = clean_df['Outcome'].str.split(' ',2)
for x in clean_df['Outcome']:
    winner.append(x[0])
    reg_ot.append(x[2])

In [82]:
clean_df['Winner'] = winner
clean_df['Final Period'] = reg_ot
data_df = clean_df[[
    'Game ID',
    'Season',
    'Type',
    'Date',
    'Unix Date',
    'Away Team ID',
    'Home Team ID',
    'Away Goals',
    'Home Goals',
    'Winner',
    'Final Period',
    'Home Rink Side Start',
    'Venue',
    'Venue Time Zone Offset',
    'Venue Time Zone TZ'
]]

In [83]:
data_df.head()

Unnamed: 0,Game ID,Season,Type,Date,Unix Date,Away Team ID,Home Team ID,Away Goals,Home Goals,Winner,Final Period,Home Rink Side Start,Venue,Venue Time Zone Offset,Venue Time Zone TZ
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home,OT,right,Wells Fargo Center,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away,REG,right,Wells Fargo Center,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home,OT,left,Prudential Center,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home,REG,left,Prudential Center,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away,REG,right,Wells Fargo Center,-4,EDT


In [84]:
data_df.dtypes

Game ID                    int64
Season                     int64
Type                      object
Date                      object
Unix Date                 object
Away Team ID               int64
Home Team ID               int64
Away Goals                 int64
Home Goals                 int64
Winner                    object
Final Period              object
Home Rink Side Start      object
Venue                     object
Venue Time Zone Offset     int64
Venue Time Zone TZ        object
dtype: object

In [85]:
data = pd.get_dummies(data_df)
data.head(5)

Unnamed: 0,Game ID,Season,Away Team ID,Home Team ID,Away Goals,Home Goals,Venue Time Zone Offset,Type_P,Type_R,Date_2010-10-07,...,Venue_Xcel Energy Center,Venue_Yankee Stadium,Venue Time Zone TZ_CDT,Venue Time Zone TZ_CST,Venue Time Zone TZ_EDT,Venue Time Zone TZ_EST,Venue Time Zone TZ_MDT,Venue Time Zone TZ_MST,Venue Time Zone TZ_PDT,Venue Time Zone TZ_PST
0,2011030221,20112012,1,4,3,4,-4,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2011030222,20112012,1,4,4,1,-4,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2011030223,20112012,4,1,3,4,-4,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2011030224,20112012,4,1,2,4,-4,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,2011030225,20112012,1,4,3,1,-4,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
cor = data.corr()
cor_target = abs(cor["Winner"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.1]
relevant_features