## Things to try:
* Look at other notebooks for data preprocessing
* Look at other notebooks for data visualization
* Bet against odds ... %

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# loading CSV files downloaded from Kaggle
path = "data/"
df = pd.read_csv(path + "spreadspoke_scores.csv")
teams = pd.read_csv(path + "nfl_teams.csv")
games_elo = pd.read_csv(path + "nfl_elo.csv")

In [3]:
df.shape

(4750, 17)

In [4]:
df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/9/01,2001,1,False,Baltimore Ravens,17,6,Chicago Bears,BAL,-10.5,33.5,M&T Bank Stadium,False,72.0,6.0,79.0,
1,9/9/01,2001,1,False,Buffalo Bills,6,24,New Orleans Saints,NO,-1.5,37.5,Ralph Wilson Stadium,False,80.0,12.0,58.0,
2,9/9/01,2001,1,False,Cleveland Browns,6,9,Seattle Seahawks,SEA,-4.0,38.5,FirstEnergy Stadium,False,79.0,14.0,68.0,
3,9/9/01,2001,1,False,Dallas Cowboys,6,10,Tampa Bay Buccaneers,TB,-9.0,34.0,Texas Stadium,False,75.0,13.0,78.0,
4,9/9/01,2001,1,False,Green Bay Packers,28,6,Detroit Lions,GB,-5.5,41.5,Lambeau Field,False,61.0,7.0,93.0,


In [5]:
teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_id_pfr,team_conference,team_division
0,Arizona Cardinals,Cardinals,ARI,CRD,NFC,NFC West
1,Atlanta Falcons,Falcons,ATL,ATL,NFC,NFC South
2,Baltimore Ravens,Ravens,BAL,RAV,AFC,AFC North
3,Buffalo Bills,Bills,BUF,BUF,AFC,AFC East
4,Carolina Panthers,Panthers,CAR,CAR,NFC,NFC South


In [6]:
games_elo.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2
0,9/9/01,2001,0,,TEN,MIA,1603.825,1568.09,0.641039,0.358961,1574.303,1597.611,23,31
1,9/9/01,2001,0,,PHI,LAR,1546.139,1550.697,0.586116,0.413884,1529.43,1567.407,17,20
2,9/9/01,2001,0,,BUF,NO,1521.265,1498.313,0.623937,0.376063,1482.992,1536.586,6,24
3,9/9/01,2001,0,,CIN,NE,1380.427,1453.308,0.48866,0.51134,1400.399,1433.336,23,17
4,9/9/01,2001,0,,CLE,SEA,1332.236,1461.802,0.408137,0.591863,1321.242,1472.795,6,9


## Combine data into one dataframe
**Note: use merge to join dataframe**
1. df - add team_id for home and away (data from teams)
2. df - add ELO prob for home and away 

In [7]:
df.team_home.describe()

count                     4750
unique                      32
top       New England Patriots
freq                       171
Name: team_home, dtype: object

In [8]:
df.team_away.describe()

count                 4750
unique                  32
top       Baltimore Ravens
freq                   156
Name: team_away, dtype: object

In [9]:
df.team_favorite_id.describe()

count     4750
unique      32
top         NE
freq       263
Name: team_favorite_id, dtype: object

In [10]:
# mapping team_id to the correct teams
df['team_home'] = df.team_home.map(teams.set_index('team_name')['team_id'].to_dict())
df['team_away'] = df.team_away.map(teams.set_index('team_name')['team_id'].to_dict())

In [11]:
# Get division info. in each team to create 'division_game' flag

In [12]:
new_df = pd.merge(df, teams[["team_id", "team_division"]], how='inner', left_on=['team_home'], right_on=['team_id'])
new_df = pd.merge(new_df, teams[["team_id", "team_division"]], how='inner', left_on=['team_away'], right_on=['team_id'])
new_df = new_df.rename(columns={'team_division_x': 'team_home_division', 'team_division_y': 'team_away_division'})

# drop unused columns
new_df = new_df.drop(['team_id_x', 'team_id_y'], axis=1)

new_df["division_game"] = new_df["team_home_division"] == new_df["team_away_division"]

In [13]:
new_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_division,team_away_division,division_game
0,9/9/01,2001,1,False,BAL,17,6,CHI,BAL,-10.5,33.5,M&T Bank Stadium,False,72.0,6.0,79.0,,AFC North,NFC North,False
1,12/20/09,2009,15,False,BAL,31,7,CHI,BAL,-11.0,39.0,M&T Bank Stadium,False,30.0,12.0,68.0,,AFC North,NFC North,False
2,10/15/17,2017,6,False,BAL,24,27,CHI,BAL,-6.5,38.5,M&T Bank Stadium,False,73.0,10.0,,,AFC North,NFC North,False
3,9/29/02,2002,4,False,BUF,33,27,CHI,BUF,-3.0,46.5,Ralph Wilson Stadium,False,57.0,9.0,76.0,,AFC East,NFC North,False
4,11/7/10,2010,9,False,BUF,19,22,CHI,CHI,-3.0,41.5,Rogers Centre,True,72.0,0.0,,DOME,AFC East,NFC North,False


In [14]:
new_df.division_game.describe()

count      4750
unique        2
top       False
freq       3042
Name: division_game, dtype: object

### Merge elo data

In [15]:
new_df = pd.merge(df, teams[["team_id", "team_division"]], how='inner', left_on=['team_home'], right_on=['team_id'])
new_df = pd.merge(new_df, teams[["team_id", "team_division"]], how='inner', left_on=['team_away'], right_on=['team_id'])
new_df = new_df.rename(columns={'team_division_x': 'team_home_division', 'team_division_y': 'team_away_division'})

# drop unused columns
new_df = new_df.drop(['team_id_x', 'team_id_y'], axis=1)

new_df["division_game"] = new_df["team_home_division"] == new_df["team_away_division"]

# change data type of date columns
new_df['schedule_date'] = pd.to_datetime(new_df['schedule_date'])
games_elo['date'] = pd.to_datetime(games_elo['date'])

In [16]:
new_df = new_df.merge(games_elo[['date', 'team1', 'team2', 'elo_prob1', 'elo_prob2']], left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')

In [17]:
new_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,weather_humidity,weather_detail,team_home_division,team_away_division,division_game,date,team1,team2,elo_prob1,elo_prob2
0,2001-09-09,2001,1,False,BAL,17,6,CHI,BAL,-10.5,...,79.0,,AFC North,NFC North,False,2001-09-09,BAL,CHI,0.847023,0.152977
1,2009-12-20,2009,15,False,BAL,31,7,CHI,BAL,-11.0,...,68.0,,AFC North,NFC North,False,2009-12-20,BAL,CHI,0.781479,0.218521
2,2017-10-15,2017,6,False,BAL,24,27,CHI,BAL,-6.5,...,,,AFC North,NFC North,False,2017-10-15,BAL,CHI,0.751455,0.248545
3,2002-09-29,2002,4,False,BUF,33,27,CHI,BUF,-3.0,...,76.0,,AFC East,NFC North,False,2002-09-29,BUF,CHI,0.401823,0.598177
4,2010-11-07,2010,9,False,BUF,19,22,CHI,CHI,-3.0,...,,DOME,AFC East,NFC North,False,2010-11-07,BUF,CHI,0.421488,0.578512


In [18]:
# merge to fix neutral games where team_home and team_away are switched
games_elo2 = games_elo.rename(columns={'team1' : 'team2', 
                                       'team2' : 'team1', 
                                       'elo1' : 'elo2', 
                                       'elo2' : 'elo1',
                                       'elo_prob1' : 'elo_prob2', 
                                       'elo_prob2' : 'elo_prob1'})
# games_elo2 = games_elo.rename(columns={'team1' : 'team2', 'team2' : 'team1', 'elo1' : 'elo2', 'elo2' : 'elo1'})

In [19]:
# new_df = new_df.merge(games_elo, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')
# new_df = new_df.merge(games_elo2, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')
new_df = new_df.merge(games_elo2[['date', 'team1', 'team2', 'elo_prob1', 'elo_prob2']], left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')
new_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,date_x,team1_x,team2_x,elo_prob1_x,elo_prob2_x,date_y,team1_y,team2_y,elo_prob1_y,elo_prob2_y
0,2001-09-09,2001,1,False,BAL,17,6,CHI,BAL,-10.5,...,2001-09-09,BAL,CHI,0.847023,0.152977,NaT,,,,
1,2009-12-20,2009,15,False,BAL,31,7,CHI,BAL,-11.0,...,2009-12-20,BAL,CHI,0.781479,0.218521,NaT,,,,
2,2017-10-15,2017,6,False,BAL,24,27,CHI,BAL,-6.5,...,2017-10-15,BAL,CHI,0.751455,0.248545,NaT,,,,
3,2002-09-29,2002,4,False,BUF,33,27,CHI,BUF,-3.0,...,2002-09-29,BUF,CHI,0.401823,0.598177,NaT,,,,
4,2010-11-07,2010,9,False,BUF,19,22,CHI,CHI,-3.0,...,2010-11-07,BUF,CHI,0.421488,0.578512,NaT,,,,


In [20]:
new_df.columns.values

array(['schedule_date', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'stadium', 'stadium_neutral',
       'weather_temperature', 'weather_wind_mph', 'weather_humidity',
       'weather_detail', 'team_home_division', 'team_away_division',
       'division_game', 'date_x', 'team1_x', 'team2_x', 'elo_prob1_x',
       'elo_prob2_x', 'date_y', 'team1_y', 'team2_y', 'elo_prob1_y',
       'elo_prob2_y'], dtype=object)

In [21]:
# new_df[new_df["elo_prob1"].isnull()]

In [22]:
# separating merged columns into x and y cols
x_cols = ['date_x', 'team1_x', 'team2_x', 'elo_prob1_x', 'elo_prob2_x']
y_cols = ['date_y', 'team1_y', 'team2_y', 'elo_prob1_y','elo_prob2_y']

# filling null values for games_elo merged cols
for x, y in zip(x_cols, y_cols):
    new_df[x] = new_df[x].fillna(new_df[y]) 

# removing y_cols from dataframe    
new_df = new_df[['schedule_date', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'stadium', 'stadium_neutral',
       'weather_temperature', 'weather_wind_mph', 'weather_humidity',
       'weather_detail', 'team_home_division', 'team_away_division',
       'division_game', 'date_x', 'team1_x', 'team2_x', 'elo_prob1_x',
       'elo_prob2_x']]

# remove _x ending from column names
new_df.columns = new_df.columns.str.replace('_x', '')

In [23]:
new_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,weather_humidity,weather_detail,team_home_division,team_away_division,division_game,date,team1,team2,elo_prob1,elo_prob2
0,2001-09-09,2001,1,False,BAL,17,6,CHI,BAL,-10.5,...,79.0,,AFC North,NFC North,False,2001-09-09,BAL,CHI,0.847023,0.152977
1,2009-12-20,2009,15,False,BAL,31,7,CHI,BAL,-11.0,...,68.0,,AFC North,NFC North,False,2009-12-20,BAL,CHI,0.781479,0.218521
2,2017-10-15,2017,6,False,BAL,24,27,CHI,BAL,-6.5,...,,,AFC North,NFC North,False,2017-10-15,BAL,CHI,0.751455,0.248545
3,2002-09-29,2002,4,False,BUF,33,27,CHI,BUF,-3.0,...,76.0,,AFC East,NFC North,False,2002-09-29,BUF,CHI,0.401823,0.598177
4,2010-11-07,2010,9,False,BUF,19,22,CHI,CHI,-3.0,...,,DOME,AFC East,NFC North,False,2010-11-07,BUF,CHI,0.421488,0.578512


In [24]:
new_df.elo_prob1.describe()

count    4460.000000
mean        0.583040
std         0.170068
min         0.092782
25%         0.465182
50%         0.593258
75%         0.711751
max         0.964578
Name: elo_prob1, dtype: float64

In [25]:
# Drop column where there are eno Elo information
new_df = new_df.dropna(subset=['elo_prob1', 'elo_prob2'])

In [26]:
# creating result column df.loc[(df.score_home > df.score_away), 'result'
new_df['result'] = (new_df.score_home > new_df.score_away).astype(int)

In [27]:
new_df.shape

(4460, 26)

## Exploratory Data Analysis

In [31]:
new_df.columns.values

array(['schedule_date', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'stadium', 'stadium_neutral',
       'weather_temperature', 'weather_wind_mph', 'weather_humidity',
       'weather_detail', 'team_home_division', 'team_away_division',
       'division_game', 'date', 'team1', 'team2', 'elo_prob1', 'elo_prob2',
       'result'], dtype=object)

In [30]:
new_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,weather_detail,team_home_division,team_away_division,division_game,date,team1,team2,elo_prob1,elo_prob2,result
0,2001-09-09,2001,1,False,BAL,17,6,CHI,BAL,-10.5,...,,AFC North,NFC North,False,2001-09-09,BAL,CHI,0.847023,0.152977,1
1,2009-12-20,2009,15,False,BAL,31,7,CHI,BAL,-11.0,...,,AFC North,NFC North,False,2009-12-20,BAL,CHI,0.781479,0.218521,1
2,2017-10-15,2017,6,False,BAL,24,27,CHI,BAL,-6.5,...,,AFC North,NFC North,False,2017-10-15,BAL,CHI,0.751455,0.248545,0
3,2002-09-29,2002,4,False,BUF,33,27,CHI,BUF,-3.0,...,,AFC East,NFC North,False,2002-09-29,BUF,CHI,0.401823,0.598177,1
4,2010-11-07,2010,9,False,BUF,19,22,CHI,CHI,-3.0,...,DOME,AFC East,NFC North,False,2010-11-07,BUF,CHI,0.421488,0.578512,0


In [33]:
new_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
schedule_season,4460.0,2009.506951,5.17036,2001.0,2005.0,2010.0,2014.0,2018.0
score_home,4460.0,23.356726,10.424167,0.0,16.0,23.0,30.0,62.0
score_away,4460.0,20.693498,10.085172,0.0,13.0,20.0,27.0,59.0
spread_favorite,4460.0,-5.461771,3.395551,-26.5,-7.0,-4.5,-3.0,-1.0
over_under_line,4460.0,43.291928,4.895715,30.0,40.0,43.5,46.5,63.5
weather_temperature,4353.0,60.778773,15.630705,-6.0,50.0,65.0,72.0,97.0
weather_wind_mph,4353.0,6.216173,5.462556,0.0,0.0,6.0,10.0,40.0
weather_humidity,2228.0,65.045781,16.761103,4.0,54.0,67.0,77.0,100.0
elo_prob1,4460.0,0.58304,0.170068,0.092782,0.465182,0.593258,0.711751,0.964578
elo_prob2,4460.0,0.41696,0.170068,0.035422,0.288249,0.406742,0.534818,0.907218


In [35]:
# null values by column
new_df.isnull().sum(axis=0)

schedule_date             0
schedule_season           0
schedule_week             0
schedule_playoff          0
team_home                 0
score_home                0
score_away                0
team_away                 0
team_favorite_id          0
spread_favorite           0
over_under_line           0
stadium                   0
stadium_neutral           0
weather_temperature     107
weather_wind_mph        107
weather_humidity       2232
weather_detail         3171
team_home_division        0
team_away_division        0
division_game             0
date                      0
team1                     0
team2                     0
elo_prob1                 0
elo_prob2                 0
result                    0
dtype: int64

In [41]:
df = new_df

In [42]:
# creating home favorite and away favorite columns (fill na with 0's)
df.loc[df.team_favorite_id == df.team_home, 'home_favorite'] = 1
df.loc[df.team_favorite_id == df.team_away, 'away_favorite'] = 1
df.home_favorite.fillna(0, inplace=True)
df.away_favorite.fillna(0, inplace=True)

# fixing some schedule_week column errors and converting column to integer data type
df.loc[(df.schedule_week == '18'), 'schedule_week'] = '17'
df.loc[(df.schedule_week == 'Wildcard') | (df.schedule_week == 'WildCard'), 'schedule_week'] = '18'
df.loc[(df.schedule_week == 'Division'), 'schedule_week'] = '19'
df.loc[(df.schedule_week == 'Conference'), 'schedule_week'] = '20'
df.loc[(df.schedule_week == 'Superbowl') | (df.schedule_week == 'SuperBowl'), 'schedule_week'] = '21'
df['schedule_week'] = df.schedule_week.astype(int)

In [86]:
# some percentages to take into consideration when betting
# home_win = "{:.2f}".format((sum((df.result == 1) & (df.stadium_neutral == 0)) / len(df)) * 100)
home_win = "{:.2f}".format((sum((df.result == 1) & (df.stadium_neutral == 0)) / sum(df.stadium_neutral == 0)) * 100)
# away_win = "{:.2f}".format((sum((df.result == 0) & (df.stadium_neutral == 0)) / len(df)) * 100)
away_win = "{:.2f}".format((sum((df.result == 0) & (df.stadium_neutral == 0)) / sum(df.stadium_neutral == 0)) * 100)

under_line = "{:.2f}".format((sum((df.score_home + df.score_away) < df.over_under_line) / len(df)) * 100)
over_line = "{:.2f}".format((sum((df.score_home + df.score_away) > df.over_under_line) / len(df)) * 100)
equal_line = "{:.2f}".format((sum((df.score_home + df.score_away) == df.over_under_line) / len(df)) * 100)

favored = "{:.2f}".format((sum(((df.home_favorite == 1) & (df.result == 1)) | ((df.away_favorite == 1) & (df.result == 0)))
                           / len(df)) * 100)


cover = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) < df.spread_favorite)) | 
                             ((df.away_favorite == 1) & ((df.score_home - df.score_away) < df.spread_favorite))) # use score_home - score_away because the fav are swap
                         / len(df)) * 100)

ats = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) > df.spread_favorite)) | 
                           ((df.away_favorite == 1) & ((df.score_home - df.score_away) > df.spread_favorite))) 
                       / len(df)) * 100)

In [87]:
# print all percentages
print("Number of Games: " + str(len(df)))
print("Home Straight Up Win Percentage: " + home_win + "%")
print("Away Straight Up Win Percentage: " + away_win + "%")
print("Under Percentage: " + under_line + "%")
print("Over Percentage: " + over_line + "%")
print("Equal Percentage: " + equal_line + "%")
print("Favored Win Percentage: " + favored + "%")
print("Cover The Spread Percentage: " + cover + "%")
print("Against The Spread Percentage: " + ats + "%")

Number of Games: 4460
Home Straight Up Win Percentage: 57.79%
Away Straight Up Win Percentage: 42.21%
Under Percentage: 49.48%
Over Percentage: 48.79%
Equal Percentage: 1.73%
Favored Win Percentage: 66.84%
Cover The Spread Percentage: 47.67%
Against The Spread Percentage: 49.46%


In [78]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_division,team_away_division,division_game,date,team1,team2,elo_prob1,elo_prob2,result,home_favorite,away_favorite
0,2001-09-09,2001,1,False,BAL,17,6,CHI,BAL,-10.5,33.5,M&T Bank Stadium,False,72.0,6.0,79.0,,AFC North,NFC North,False,2001-09-09,BAL,CHI,0.847023,0.152977,1,1.0,0.0
1,2009-12-20,2009,15,False,BAL,31,7,CHI,BAL,-11.0,39.0,M&T Bank Stadium,False,30.0,12.0,68.0,,AFC North,NFC North,False,2009-12-20,BAL,CHI,0.781479,0.218521,1,1.0,0.0
2,2017-10-15,2017,6,False,BAL,24,27,CHI,BAL,-6.5,38.5,M&T Bank Stadium,False,73.0,10.0,,,AFC North,NFC North,False,2017-10-15,BAL,CHI,0.751455,0.248545,0,1.0,0.0
3,2002-09-29,2002,4,False,BUF,33,27,CHI,BUF,-3.0,46.5,Ralph Wilson Stadium,False,57.0,9.0,76.0,,AFC East,NFC North,False,2002-09-29,BUF,CHI,0.401823,0.598177,1,1.0,0.0
4,2010-11-07,2010,9,False,BUF,19,22,CHI,CHI,-3.0,41.5,Rogers Centre,True,72.0,0.0,,DOME,AFC East,NFC North,False,2010-11-07,BUF,CHI,0.421488,0.578512,0,0.0,1.0


# Feature and Model Testing 
## Predict probability the the Home team will win p(y=1; x)

In [88]:
df.columns.values

array(['schedule_date', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'team_favorite_id', 'spread_favorite',
       'over_under_line', 'stadium', 'stadium_neutral',
       'weather_temperature', 'weather_wind_mph', 'weather_humidity',
       'weather_detail', 'team_home_division', 'team_away_division',
       'division_game', 'date', 'team1', 'team2', 'elo_prob1', 'elo_prob2',
       'result', 'home_favorite', 'away_favorite'], dtype=object)

In [97]:
# Change boolean column to 0/1
df.division_game = df.division_game.astype(int)
df.home_favorite = df.home_favorite.astype(int)
df.away_favorite = df.away_favorite.astype(int)

In [98]:
# initial features possible for model
# X = df[['schedule_season', 'schedule_week', 'over_under_line', 'spread_favorite', 'weather_temperature', 'weather_wind_mph',
#         'home_favorite', 'hm_avg_pts_diff','aw_avg_pts_diff', 'elo1', 'elo2', 'elo_prob1']]
X = df[['schedule_week', 'spread_favorite', 'division_game', 'elo_prob1', 'elo_prob2', 'result', 'home_favorite', 'away_favorite']]

y = df['result']

In [100]:
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
schedule_week,4460.0,9.510314,5.275639,1.0,5.0,10.0,14.0,21.0
spread_favorite,4460.0,-5.461771,3.395551,-26.5,-7.0,-4.5,-3.0,-1.0
division_game,4460.0,0.358969,0.479752,0.0,0.0,0.0,1.0,1.0
elo_prob1,4460.0,0.58304,0.170068,0.092782,0.465182,0.593258,0.711751,0.964578
elo_prob2,4460.0,0.41696,0.170068,0.035422,0.288249,0.406742,0.534818,0.907218
result,4460.0,0.575336,0.494347,0.0,0.0,1.0,1.0,1.0
home_favorite,4460.0,0.671076,0.469875,0.0,0.0,1.0,1.0,1.0
away_favorite,4460.0,0.328924,0.469875,0.0,0.0,0.0,1.0,1.0


In [101]:
# required machine learning packages
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
import xgboost as xgb

In [102]:
# training and testing data (2017 and 2018)
train = df.copy()
test = df.copy()
train = train.loc[train['schedule_season'] < 2016]
test = test.loc[test['schedule_season'] > 2015]
X_train = train[['schedule_week', 'spread_favorite', 'division_game', 'elo_prob1', 'elo_prob2', 'result', 'home_favorite', 'away_favorite']]
y_train = train['result']
X_test = test[['schedule_week', 'spread_favorite', 'division_game', 'elo_prob1', 'elo_prob2', 'result', 'home_favorite', 'away_favorite']]
y_test = test['result']

In [105]:
# calibrate probabilities and fit model to training data
boost = xgb.XGBClassifier()
dtc = DecisionTreeClassifier(max_depth=5, criterion='entropy')
lrg = LogisticRegression(solver='liblinear')
vote = VotingClassifier(estimators=[('boost', boost), ('dtc', dtc), ('lrg', lrg)], voting='soft')

# model = CCV(vote, method='isotonic', cv=3)
# model.fit(X_train, y_train)

In [107]:
model = lrg.fit(X_train, y_train)

In [112]:
# predict probabilities
predicted = model.predict_proba(X_test)[:,1]

In [120]:
model.score(X_test, y_test)

1.0

In [124]:
sum((predicted > 0.5) & (y_test == 1))

435

In [132]:
predictions = model.predict(X_test)

In [133]:
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

[[304   0]
 [  0 435]]
