# NBA Business Track Viewership Model

In [1]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
import sklearn as skl
import sklearn.linear_model as lm
import numpy as np

In [2]:
# Open the SQLite database that holds the competition tables.
connection = sqlite3.connect('business_track.db')
# Raw cursor kept for ad-hoc statements; run_query below does not use it.
cursor = connection.cursor()

def run_query(qry, params=None):
    """Run a SQL query on the notebook's SQLite connection and return a DataFrame.

    Parameters
    ----------
    qry : str
        SQL text to execute. Use ``?`` placeholders for values supplied
        through ``params`` instead of formatting them into the string.
    params : sequence or dict, optional
        Values bound to the placeholders in ``qry``. Defaults to ``None``
        (no bound parameters), which matches the original one-argument call.

    Returns
    -------
    pandas.DataFrame
        One row per result row, one column per selected column.
    """
    return pd.read_sql_query(qry, connection, params=params)

In [83]:
## Create MAPE function to check MAPE 
##(which is scoring metric used for competition)

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero entry makes the corresponding term a
        division by zero (inf/nan under NumPy rules).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Coerce to float arrays so plain lists are accepted and two pandas
    # Series are compared element-by-element instead of being aligned on
    # their index labels.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

## Baseline Models
The features utilized:
* Day of Week
* Team Playing

In [10]:
# Collapse train_data to one row per game: count the Country rows and sum
# RoundedViewers within each Game_ID group.
baseline_data = run_query(
    'SELECT Season, Game_ID, Game_Date, Away_Team, Home_Team,'
    'count(Country) as NumCountries, '
    'sum(RoundedViewers) as TotalViews, '
    'Day_of_Week '
    'FROM train_data '
    'GROUP BY Game_ID'
)
# Parse Game_Date into pandas datetimes, keeping the column in place.
baseline_data = baseline_data.assign(
    Game_Date=pd.to_datetime(baseline_data['Game_Date'])
)

In [12]:
# Preview the first rows of the per-game frame built above.
baseline_data.head()

Unnamed: 0,Season,Game_ID,Game_Date,Away_Team,Home_Team,NumCountries,TotalViews,Day_of_Week
0,2016-17,21600001,2016-10-25,NYK,CLE,177,57691,Tuesday
1,2016-17,21600002,2016-10-25,UTA,POR,169,40740,Tuesday
2,2016-17,21600003,2016-10-25,SAS,GSW,185,60428,Tuesday
3,2016-17,21600004,2016-10-26,MIA,ORL,148,22756,Wednesday
4,2016-17,21600005,2016-10-26,DAL,IND,162,33922,Wednesday


In [25]:
# Strip the target (TotalViews) and the identifier/date/count columns; what
# remains are the categorical predictors used by the baseline models.
base_predictors = baseline_data.drop(
    columns=['TotalViews', 'Season', 'Game_ID', 'Game_Date', 'NumCountries']
)
base_predictors.head()

Unnamed: 0,Away_Team,Home_Team,Day_of_Week
0,NYK,CLE,Tuesday
1,UTA,POR,Tuesday
2,SAS,GSW,Tuesday
3,MIA,ORL,Wednesday
4,DAL,IND,Wednesday


### Convert the three categorical variables above to OneHot Binary Variables:
* 7 days of week + 30 teams × 2 (home & away) = 67 columns


In [26]:
# One-hot encode every categorical predictor column.
# Docs: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
base_X = pd.get_dummies(base_predictors)
base_X.head()

Unnamed: 0,Away_Team_ATL,Away_Team_BKN,Away_Team_BOS,Away_Team_CHA,Away_Team_CHI,Away_Team_CLE,Away_Team_DAL,Away_Team_DEN,Away_Team_DET,Away_Team_GSW,...,Home_Team_TOR,Home_Team_UTA,Home_Team_WAS,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
# Regression target: the TotalViews column of the per-game frame.
base_Y = baseline_data['TotalViews']
base_Y.head()

0    57691
1    40740
2    60428
3    22756
4    33922
Name: TotalViews, dtype: int64

In [31]:
# Verify matrix dimensions.
# DataFrame and Series expose .shape directly, so no conversion through the
# deprecated (and later removed) .as_matrix() is needed.
print('X matrix shape', base_X.shape)
print('Y column shape', base_Y.shape)

X matrix shape (2000, 67)
Y column shape (2000,)


### Try Ordinary Least Squares

In [32]:
# Ordinary least squares with an intercept, fitted on the one-hot design matrix.
OLS = lm.LinearRegression(fit_intercept=True)
# .values yields the same ndarray as the deprecated (later removed) .as_matrix().
OLS.fit(base_X.values, base_Y.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [37]:
# Pair each one-hot column name with its fitted OLS coefficient.
# This cell belongs to the OLS section, so it reads OLS.coef_ (the original read
# ridge.coef_, which is not defined until the Ridge section further down).
# The frame is built in one constructor call instead of appending row by row.
coeff_df = pd.DataFrame(
    {'Name': base_X.columns, 'Value': OLS.coef_},
    columns=['Name', 'Value'],
)

Away_Team_ATL: 1.6368388152484906e+16
Away_Team_BKN: 1.6368388152483824e+16
Away_Team_BOS: 1.6368388152495448e+16
Away_Team_CHA: 1.6368388152484772e+16
Away_Team_CHI: 1.6368388152488098e+16
Away_Team_CLE: 1.636838815250718e+16
Away_Team_DAL: 1.6368388152483786e+16
Away_Team_DEN: 1.6368388152485054e+16
Away_Team_DET: 1.6368388152484052e+16
Away_Team_GSW: 1.636838815250943e+16
Away_Team_HOU: 1.6368388152493906e+16
Away_Team_IND: 1.6368388152485448e+16
Away_Team_LAC: 1.6368388152487256e+16
Away_Team_LAL: 1.6368388152492484e+16
Away_Team_MEM: 1.6368388152483668e+16
Away_Team_MIA: 1.6368388152486528e+16
Away_Team_MIL: 1.6368388152486552e+16
Away_Team_MIN: 1.6368388152489158e+16
Away_Team_NOP: 1.6368388152487062e+16
Away_Team_NYK: 1.6368388152489798e+16
Away_Team_OKC: 1.6368388152497118e+16
Away_Team_ORL: 1.6368388152483124e+16
Away_Team_PHI: 1.6368388152489462e+16
Away_Team_PHX: 1.6368388152483478e+16
Away_Team_POR: 1.6368388152487368e+16
Away_Team_SAC: 1.6368388152484146e+16
Away_Team_SAS:

In [78]:
# R^2 value on the training data.
# .values replaces the deprecated (later removed) .as_matrix(); same ndarray.
OLS.score(base_X.values, base_Y.values)

0.62391225672263562

In [86]:
# In-sample MAPE for OLS. Predict on the plain ndarray, the same form the
# model was fitted on, rather than on the DataFrame.
MAPE(base_Y, OLS.predict(base_X.values))

35.968439121789544

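OLS gives nearly identical, enormous coefficients (about $1.6 \times 10^{16}$) to every AWAY/HOME team dummy. This is a symptom of perfect collinearity: each one-hot group (away team, home team, day of week) sums to 1 and so duplicates the intercept, which means the individual coefficients are not identifiable even though the fitted values are. Might need more features.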

### Try Ridge Regression
One tuning parameter, $\alpha$. Candidate values are evenly spaced on a log scale between $10^{-10}$ and $10^{10}$.

Two ways to run CV are tried below: Shuffle Split (5 random splits with 30% held out, 40 candidate values) and 20-fold CV (20 candidate values).

In [132]:
from sklearn.model_selection import ShuffleSplit
# Five random train/test splits, each holding out 30% of the rows, with a fixed seed.
# NOTE(review): the saved RidgeCV repr further down shows random_state=18, so that
# output appears to predate this seed — re-run to refresh it.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=2018)

In [141]:
# Tuning grid: 40 candidate penalties, log-spaced from 1e-10 to 1e10.
alphas = np.logspace(start=-10, stop=10, num=40)
# Ridge regression that selects alpha by cross-validating over the `cv` splitter.
ridge = lm.RidgeCV(
    alphas=alphas,
    cv=cv,
    fit_intercept=True,
    normalize=True,
)

In [124]:
# Alternative setup: 20 log-spaced candidate alphas scored with 20-fold cross-validation.
# NOTE(review): this rebinds `alphas` and `ridge` from the ShuffleSplit cell above, so on
# a top-to-bottom run the fit below uses this 20-fold model, while the saved fit output
# shows cv=ShuffleSplit — confirm which configuration is intended.
alphas = np.logspace(-10,10,20) # tuning parameters
num_folds = 20
ridge = lm.RidgeCV(alphas=alphas, fit_intercept=True, normalize=True, cv=num_folds)

In [142]:
# Fit RidgeCV on the one-hot design matrix; alpha is chosen by cross-validation.
# .values replaces the deprecated (later removed) .as_matrix(); same ndarray.
ridge.fit(base_X.values, base_Y.values)

RidgeCV(alphas=array([  1.00000e-10,   3.25702e-10,   1.06082e-09,   3.45511e-09,
         1.12534e-08,   3.66524e-08,   1.19378e-07,   3.88816e-07,
         1.26638e-06,   4.12463e-06,   1.34340e-05,   4.37548e-05,
         1.42510e-04,   4.64159e-04,   1.51178e-03,   4.92388e-03,
         1.60372e-02,   5...6,   2.72833e+07,   8.88624e+07,
         2.89427e+08,   9.42668e+08,   3.07029e+09,   1.00000e+10]),
    cv=ShuffleSplit(n_splits=5, random_state=18, test_size=0.3, train_size=None),
    fit_intercept=True, gcv_mode=None, normalize=True, scoring=None,
    store_cv_values=False)

In [143]:
# Report the smallest cross-validation MSE and the alpha selected by RidgeCV.
# NOTE(review): `ridge_mse` is not defined in any cell visible in this notebook —
# confirm where it comes from; on a fresh kernel this line would raise NameError
# unless it is defined elsewhere.
print('Minimum MSE Error:', min(ridge_mse))
print('Corresponds to alpha:', ridge.alpha_)

Minimum MSE Error: 47689838.4953
Corresponds to alpha: 1.80472176683


In [144]:
# Store coefficients to see which ones are most influential
## Check if intuition about teams and Days of Week is correct
# Build the frame in a single constructor call: appending one row per loop
# iteration is quadratic, and DataFrame.append no longer exists in pandas 2.x.
coeff_df = pd.DataFrame(
    {'Name': base_X.columns, 'Value': ridge.coef_},
    columns=['Name', 'Value'],
)

In [145]:
# Rank features from the largest to the smallest coefficient value.
coeff_df.sort_values(by='Value', ascending=False)

Unnamed: 0,Name,Value
9,Away_Team_GSW,20043.756376
35,Home_Team_CLE,18382.830333
5,Away_Team_CLE,17797.191329
39,Home_Team_GSW,15294.486948
50,Home_Team_OKC,9571.678511
20,Away_Team_OKC,8089.712824
32,Home_Team_BOS,6425.287876
2,Away_Team_BOS,6061.987195
10,Away_Team_HOU,4636.912973
43,Home_Team_LAL,4489.422849


In [146]:
# R^2 value on the training data.
# .values replaces the deprecated (later removed) .as_matrix(); same ndarray.
ridge.score(base_X.values, base_Y.values)
## Slightly better than OLS

0.62405834439406915

In [147]:
# In-sample MAPE for ridge. Predict on the plain ndarray, the same form the
# model was fitted on, rather than on the DataFrame.
MAPE(base_Y, ridge.predict(base_X.values))

36.33409636880521

Ridge has slightly better R^2 but worse MAPE than OLS.