In [100]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from neupy import algorithms
from sklearn.neural_network import MLPClassifier

from sklearn.naive_bayes import BernoulliNB


In [101]:
full_df = pd.read_csv('data.csv')

In [102]:
# Preview the data; 2894 rows x 174 columns
full_df

Unnamed: 0,game,date,away_team,away_score,mp_pg_away,fg_pg_away,fga_pg_away,fg%_pg_away,3p_pg_away,3pa_pg_away,...,ftr_lr_home,3par_lr_home,efg%_off_lr_home,tov%_off_lr_home,orb%_off_lr_home,ft/fga_off_lr_home,efg%_def_lr_home,tov%_def_lr_home,drb%_def_lr_home,ft/fga_def_lr_home
0,Game 1,May 31 2018,Cleveland Cavaliers,114,240.6,40.4,84.8,0.476,12.0,32.1,...,20,15,1,28,23,16,5,21,25,11
1,Game 2,Jun 3 2018,Cleveland Cavaliers,103,240.6,40.4,84.8,0.476,12.0,32.1,...,20,15,1,28,23,16,5,21,25,11
2,Game 3,Jun 6 2018,Golden State Warriors,110,240.6,42.8,85.1,0.503,11.3,28.9,...,8,4,3,11,27,6,28,26,17,2
3,Game 4,Jun 8 2018,Golden State Warriors,108,240.6,42.8,85.1,0.503,11.3,28.9,...,8,4,3,11,27,6,28,26,17,2
4,Game 1,May 13 2018,Cleveland Cavaliers,83,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
5,Game 2,May 15 2018,Cleveland Cavaliers,94,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
6,Game 3,May 19 2018,Boston Celtics,86,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2
7,Game 4,May 21 2018,Boston Celtics,102,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2
8,Game 5,May 23 2018,Cleveland Cavaliers,83,240.6,40.4,84.8,0.476,12.0,32.1,...,18,8,15,15,18,20,2,14,11,16
9,Game 6,May 25 2018,Boston Celtics,99,241.5,38.3,85.1,0.450,11.5,30.4,...,8,4,3,11,27,6,28,26,17,2


In [103]:
# Add column for score differential
# full_df['score_diff'] = full_df['home_score'] - full_df['away_score']
# Add column for winner
full_df['winner'] = (full_df['home_score'] - full_df['away_score'] > 0).astype(int)

In [104]:
# Replace "Game i" with "i"
full_df['game'] = full_df['game'].replace('Game ', '', regex=True)

In [105]:
# Drop dates for now until we can think of a way to engineer features
full_df = full_df.drop('date', axis=1)

In [106]:
# Create map from team name to their alphabetic indices
teams = full_df['home_team'].unique().tolist()
# For now, just merge:
# NJ Nets -> BK Nets
# Supersonics -> Thunder
# NOLA Hornets -> Pelicans
# Bobcats -> Charlotte Hornets
# Kansas City -> Sacramento
# Bullets -> Wizards
merges = {'New Jersey Nets': 'Brooklyn Nets',
         'Seattle SuperSonics': 'Oklahoma City Thunder',
         'New Orleans Hornets': 'New Orleans Pelicans',
          'Charlotte Bobcats': 'Charlotte Hornets',
          'Kansas City Kings': 'Sacramento Kings',
          'Washington Bullets': 'Washington Wizards'
         }
for team in merges:
    teams.remove(team)
teams.sort()
team_to_index = {}
for i, team in enumerate(teams):
    team_to_index[team] = i
for old_team, new_team in merges.items():
    team_to_index[old_team] = team_to_index[new_team]

In [107]:
# Replace teams with their indices
full_df = full_df.replace({'home_team': team_to_index, 'away_team': team_to_index})

In [312]:
# Make sure all columns are numeric
full_df = full_df.apply(pd.to_numeric)
full_df = full_df[:500]

In [425]:
# Extract X and y; need to drop the columns we used to calculate labels
X = full_df.drop(['winner', 'home_score', 'away_score'], axis=1).values


features = [(78, -0.06983, 'tov%_off_lr_away'), (25, -0.06763, 'fga_lr_away'), (3, 0.06033, 'fga_pg_away'), (75, 0.05996, 'ftr_lr_away'), (42, -0.05778, 'tov_lr_away'), (80, 0.0569, 'ft/fga_off_lr_away'), (43, -0.05378, 'pf_lr_away'), (84, -0.05362, 'ft/fga_def_lr_away'), (24, -0.05269, 'fg_lr_away'), (58, -0.05141, 'tov%_off_away'), (55, -0.04907, 'ftr_away'), (2, 0.04883, 'fg_pg_away'), (16, 0.048, 'trb_pg_away'), (60, -0.04529, 'ft/fga_off_away'), (82, 0.0438, 'tov%_def_lr_away'), (34, 0.04264, 'fta_lr_away'), (38, -0.04216, 'trb_lr_away'), (22, 0.04154, 'pts_pg_away'), (52, 0.03937, 'ortg_away'), (64, -0.03918, 'ft/fga_def_away'), (62, -0.03809, 'tov%_def_away'), (72, -0.03783, 'ortg_lr_away'), (33, 0.03723, 'ft_lr_away'), (44, -0.03496, 'pts_lr_away'), (122, 0.03414, 'trb_lr_home'), (17, 0.03399, 'ast_pg_away'), (37, -0.03387, 'drb_lr_away'), (28, -0.03383, '3pa_lr_away'), (27, -0.03331, '3p_lr_away'), (54, 0.03057, 'pace_away'), (8, 0.03041, '2p_pg_away'), (88, 0.02921, 'fg%_pg_home'), (31, -0.02869, '2pa_lr_away'), (30, -0.02825, '2p_lr_away'), (9, 0.02822, '2pa_pg_away'), (14, 0.02807, 'orb_pg_away'), (121, 0.02644, 'drb_lr_home'), (36, -0.02637, 'orb_lr_away'), (39, -0.02599, 'ast_lr_away'), (53, 0.02544, 'drtg_away'), (45, 0.02488, 'w_away'), (101, 0.0238, 'ast_pg_home'), (21, -0.0235, 'pf_pg_away'), (76, -0.02331, '3par_lr_away'), (40, 0.02329, 'stl_lr_away'), (92, 0.02307, '2p_pg_home'), (65, -0.02303, 'w_lr_away'), (137, 0.0223, 'drtg_home'), (99, -0.02229, 'drb_pg_home'), (100, -0.02176, 'trb_pg_home'), (110, -0.02144, 'fg%_lr_home'), (105, 0.02139, 'pf_pg_home'), (86, 0.02137, 'fg_pg_home'), (145, 0.02082, 'efg%_def_home'), (94, 0.02048, '2p%_pg_home'), (71, -0.02032, 'srs_lr_away'), (47, 0.02025, 'pw_away'), (15, 0.02024, 'drb_pg_away'), (148, 0.02021, 'ft/fga_def_home'), (12, -0.0202, 'fta_pg_away'), (69, -0.01997, 'mov_lr_away'), (20, -0.01995, 'tov_pg_away'), (74, -0.01988, 'pace_lr_away'), (166, -0.01985, 'tov%_def_lr_home'), (140, -0.01957, '3par_home'), (90, -0.0194, '3pa_pg_home'), (4, 0.01926, 'fg%_pg_away'), (67, -0.01896, 'pw_lr_away'), (68, -0.01892, 'pl_lr_away'), (29, -0.01883, '3p%_lr_away'), (147, -0.01879, 'drb%_def_home'), (93, 0.01877, '2pa_pg_home'), (51, 0.01875, 'srs_away'), (146, 0.01873, 'tov%_def_home'), (49, 0.01865, 'mov_away'), (150, -0.01849, 'l_lr_home'), (138, 0.01844, 'pace_home'), (89, -0.01837, '3p_pg_home'), (97, 0.01836, 'ft%_pg_home'), (102, 0.01819, 'stl_pg_home'), (106, 0.0181, 'pts_pg_home'), (165, 0.01777, 'efg%_def_lr_home'), (116, -0.017, '2p%_lr_home'), (119, -0.01665, 'ft%_lr_home'), (59, 0.01656, 'orb%_off_away'), (130, 0.01641, 'l_home'), (104, 0.01621, 'tov_pg_home'), (18, -0.01604, 'stl_pg_away'), (46, -0.01582, 'l_away'), (132, 0.01569, 'pl_home'), (11, -0.01561, 'ft_pg_away'), (161, -0.01549, 'efg%_off_lr_home'), (120, 0.01485, 'orb_lr_home'), (124, -0.01479, 'stl_lr_home'), (95, 0.01457, 'ft_pg_home'), (168, 0.0141, 'ft/fga_def_lr_home'), (85, -0.01388, 'mp_pg_home'), (13, 0.01385, 'ft%_pg_away'), (57, 0.01385, 'efg%_off_away'), (61, 0.01333, 'efg%_def_away'), (154, -0.01332, 'sos_lr_home'), (167, 0.01321, 'drb%_def_lr_home'), (141, 0.01317, 'efg%_off_home'), (123, -0.01289, 'ast_lr_home'), (142, 0.01268, 'tov%_off_home'), (73, 0.0126, 'drtg_lr_away'), (79, -0.01247, 'orb%_off_lr_away'), (10, 0.01232, '2p%_pg_away'), (109, 0.01228, 'fga_lr_home'), (157, 0.01227, 'drtg_lr_home'), (77, -0.01221, 'efg%_off_lr_away'), (66, 0.01146, 'l_lr_away'), (144, 0.01104, 'ft/fga_off_home'), (136, 0.01074, 'ortg_home'), (127, 0.01057, 'pf_lr_home'), (133, -0.01049, 'mov_home'), (48, -0.01042, 'pl_away'), (113, -0.01035, '3p%_lr_home'), (63, -0.01029, 'drb%_def_away'), (35, -0.01018, 'ft%_lr_away'), (91, -0.00994, '3p%_pg_home'), (125, 0.00979, 'blk_lr_home'), (135, -0.00978, 'srs_home'), (87, 0.00963, 'fga_pg_home'), (149, 0.00904, 'w_lr_home'), (96, 0.00881, 'fta_pg_home'), (114, -0.00872, '2p_lr_home'), (0, 0.00854, 'game'), (164, -0.00853, 'ft/fga_off_lr_home'), (112, 0.00846, '3pa_lr_home'), (56, -0.00842, '3par_away'), (162, 0.00795, 'tov%_off_lr_home'), (163, 0.00778, 'orb%_off_lr_home'), (107, 0.00678, 'mp_lr_home'), (153, 0.00673, 'mov_lr_home'), (134, 0.00665, 'sos_home'), (160, 0.00659, '3par_lr_home'), (152, 0.00655, 'pl_lr_home'), (151, 0.00654, 'pw_lr_home'), (111, 0.00608, '3p_lr_home'), (155, 0.00604, 'srs_lr_home'), (103, -0.0054, 'blk_pg_home'), (81, 0.00537, 'efg%_def_lr_away'), (83, -0.00506, 'drb%_def_lr_away'), (23, -0.00505, 'mp_lr_away'), (129, -0.00485, 'w_home'), (156, -0.00485, 'ortg_lr_home'), (139, 0.00466, 'ftr_home'), (32, -0.00457, '2p%_lr_away'), (7, -0.00444, '3p%_pg_away'), (41, -0.00438, 'blk_lr_away'), (26, -0.00413, 'fg%_lr_away'), (131, -0.0041, 'pw_home'), (108, -0.004, 'fg_lr_home'), (6, -0.00357, '3pa_pg_away'), (19, 0.00331, 'blk_pg_away'), (118, 0.00322, 'fta_lr_home'), (143, 0.0026, 'orb%_off_home'), (158, 0.00258, 'pace_lr_home'), (126, 0.00244, 'tov_lr_home'), (117, -0.00231, 'ft_lr_home'), (159, -0.002, 'ftr_lr_home'), (5, -0.00189, '3p_pg_away'), (50, -0.00179, 'sos_away'), (115, -0.0017, '2pa_lr_home'), (98, -0.00111, 'orb_pg_home'), (70, -0.00035, 'sos_lr_away'), (1, -0.00021, 'mp_pg_away'), (128, -4e-05, 'pts_lr_home')]

#new_features = ['tov%_off_lr_away', 'fga_lr_away', 'fga_pg_away', 'ftr_lr_away', 'tov_lr_away', 'ft/fga_off_lr_away', 'pf_lr_away', 'ft/fga_def_lr_away', 'fg_lr_away', 'tov%_off_away', '']
new_features = features[:100]
new_features = [x[2] for x in new_features]

X = full_df[new_features].values
X = scale(X)
y = full_df['winner'].values

In [426]:
# We'll use these two metrics for now
scoring = {'accuracy': 'accuracy'}

In [445]:
#clf = ExtraTreesClassifier(n_estimators=100)

clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=750, activation='logistic')

#clf = algorithms.PNN(std=15, verbose = False)

scores = cross_validate(clf, X, y, cv=15, scoring=scoring)




In [446]:
print(scores['test_accuracy'].mean())
#print(scores['test_f1'].mean())

0.7055555555555555
