In [22]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [65]:
# Load the raw per-game table from the CSV sitting next to the notebook
full_df = pd.read_csv(filepath_or_buffer='data.csv')

In [66]:
# Preview the loaded data; 2894 rows x 174 columns.
# The bare expression on the last line of the cell makes the notebook render
# the frame as the cell's output.
full_df

Unnamed: 0,game,date,away_team,away_score,mp_pg_away,fg_pg_away,fga_pg_away,fg%_pg_away,3p_pg_away,3pa_pg_away,...,ftr_lr_home,3par_lr_home,efg%_off_lr_home,tov%_off_lr_home,orb%_off_lr_home,ft/fga_off_lr_home,efg%_def_lr_home,tov%_def_lr_home,drb%_def_lr_home,ft/fga_def_lr_home
0,Game 1,May 31 2018,Cleveland Cavaliers,114,240.6,40.4,84.8,0.476,12.0,32.1,...,20,15,1,28,23,16,5,21,25,11
1,Game 2,Jun 3 2018,Cleveland Cavaliers,103,240.6,40.4,84.8,0.476,12.0,32.1,...,20,15,1,28,23,16,5,21,25,11
2,Game 3,Jun 6 2018,Golden State Warriors,110,240.6,42.8,85.1,0.503,11.3,28.9,...,8,4,3,11,27,6,28,26,17,2
3,Game 4,Jun 8 2018,Golden State Warriors,108,240.6,42.8,85.1,0.503,11.3,28.9,...,8,4,3,11,27,6,28,26,17,2
4,Game 1,May 13 2018,Cleveland Cavaliers,83,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
5,Game 2,May 15 2018,Cleveland Cavaliers,94,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
6,Game 3,May 19 2018,Boston Celtics,86,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2
7,Game 4,May 21 2018,Boston Celtics,102,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2
8,Game 5,May 23 2018,Cleveland Cavaliers,83,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
9,Game 6,May 25 2018,Boston Celtics,99,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2


In [67]:
# Score differential column is currently disabled:
# full_df['score_diff'] = full_df['home_score'] - full_df['away_score']
# Binary label: 1 when the home team outscored the away team, 0 otherwise
home_won = full_df['home_score'].gt(full_df['away_score'])
full_df['winner'] = home_won.astype(int)

In [68]:
# Strip the "Game " prefix so that "Game i" becomes just "i"
full_df['game'] = full_df['game'].replace(to_replace='Game ', value='', regex=True)

In [69]:
# The date column is discarded for now; no features are engineered from it yet
full_df = full_df.drop(columns='date')

In [70]:
# Create a map from team name to the alphabetical index of its franchise.
# Renamed/relocated franchises share the index of their current name.
# For now, just merge:
# NJ Nets -> BK Nets
# Supersonics -> Thunder
# NOLA Hornets -> Pelicans
# Bobcats -> Charlotte Hornets
# Kansas City -> Sacramento
# Bullets -> Wizards
merges = {'New Jersey Nets': 'Brooklyn Nets',
          'Seattle SuperSonics': 'Oklahoma City Thunder',
          'New Orleans Hornets': 'New Orleans Pelicans',
          'Charlotte Bobcats': 'Charlotte Hornets',
          'Kansas City Kings': 'Sacramento Kings',
          'Washington Bullets': 'Washington Wizards'
          }

# Collect names from BOTH team columns: a team that only ever shows up as the
# away side would otherwise get no index, stay a string after the replacement
# step, and break the later numeric conversion.
seen_teams = pd.concat([full_df['home_team'], full_df['away_team']]).dropna().unique()

# Fold each legacy name into its current name before indexing. This avoids
# list.remove() raising ValueError when a legacy name is absent from the data,
# and guarantees the current name exists whenever its legacy name does.
teams = sorted({merges.get(team, team) for team in seen_teams})
team_to_index = {team: i for i, team in enumerate(teams)}

# Alias every legacy name to its franchise's index. A franchise that never
# appears in the data has no index and needs no alias, so it is skipped.
for old_team, new_team in merges.items():
    if new_team in team_to_index:
        team_to_index[old_team] = team_to_index[new_team]

In [71]:
# Swap team names for their integer indices in both team columns
team_columns = ('home_team', 'away_team')
full_df = full_df.replace({column: team_to_index for column in team_columns})

In [72]:
# Coerce every column to a numeric dtype (raises if a value cannot be parsed)
full_df = full_df.apply(pd.to_numeric, axis=0)

In [73]:
# Extract X and y; need to drop the columns we used to calculate labels,
# otherwise the model could read the result straight from the scores.
features = full_df.drop(columns=['winner', 'home_score', 'away_score'])
# 'Unnamed: 0' is the row index that was saved into the CSV, not a game
# statistic; leaving it in would let the model learn from row position.
# errors='ignore' keeps this working if the file is re-exported without it.
features = features.drop(columns=['Unnamed: 0'], errors='ignore')
X = features.to_numpy()
# Standardize each feature column to zero mean and unit variance
X = scale(X)
y = full_df['winner'].to_numpy()

In [74]:
# Metrics reported by cross-validation for now: result name -> sklearn scorer name
scoring = dict(accuracy='accuracy', f1='f1')

In [75]:
# Gradient-boosted trees evaluated with 10-fold cross-validation.
# random_state is pinned so that re-running the notebook reproduces the same
# scores; without it the estimator's internal randomness changes results
# from run to run.
clf = GradientBoostingClassifier(n_estimators=100, random_state=0)
# `scores` is a dict of per-fold arrays, one entry per metric in `scoring`
# plus fit/score timings.
scores = cross_validate(clf, X, y, cv=10, scoring=scoring)