# **Statsketball Analysis**

In [1]:
#import the python scientific suite
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn
import seaborn as sns
import scipy
import pylab
import statsmodels

from sklearn.cross_validation import *
from sklearn import svm, linear_model
from sklearn import preprocessing, cross_validation, neighbors
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree

from textblob import TextBlob

import statsmodels.api as sm
from collections import defaultdict


# This is importing a "future" python version 3 print function.
from __future__ import print_function
from __future__ import division

#plot figures inline in Jupyter notebooks
%matplotlib inline 

# Plot styling: start from seaborn's defaults, then select the 'paper' context
# (fonts scaled by 1.5) and the 'ticks' axes style.
sns.set()
sns.set_context('paper', font_scale = 1.5)
sns.set_style('ticks')
# Switch on matplotlib's 'figure.autolayout' rcParam for all figures.
pylab.rcParams.update({'figure.autolayout': True})

In [2]:
def train_and_evaluate(clf, X_train, y_train, verbose = False, n_folds = 10):
    """Fit ``clf`` and return its training score and mean cross-validated score.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.
    X_train : array-like
        Training features; ``X_train.shape[0]`` is used as the sample count.
    y_train : array-like
        Training targets.
    verbose : bool, optional
        When truthy, print both scores.
    n_folds : int, optional
        Number of shuffled folds for the cross-validation (default 10, the value
        that used to be hard-coded).

    Returns
    -------
    tuple
        ``(train_score, cv_score)`` where ``train_score`` is
        ``clf.score(X_train, y_train)`` and ``cv_score`` is the mean of the
        per-fold scores returned by ``cross_val_score``.
    """
    clf.fit(X_train, y_train) #fit the model

    # Score once and reuse the value for both the printout and the return.
    train_score = clf.score(X_train, y_train)

    # Shuffled k-fold iterator (legacy sklearn.cross_validation signature:
    # number of samples first, then number of folds).
    cv = KFold(X_train.shape[0], n_folds, shuffle = True)
    scores = cross_val_score(clf, X_train, y_train, cv = cv)
    cv_score = np.mean(scores)

    if verbose: #if we want to print status
        print("Coefficient of determination on training set:", train_score)
        print("Average coefficient of determination using %d-fold crossvalidation:" % n_folds, cv_score)

    return train_score, cv_score


# Reading in Data files

In [3]:
# Directory holding one "<year>_master.csv" file per season.
file_dir = '~/Statsketball/Statsketball-Tournament/Master_Data/'

# Seasons 2002..2016 (np.arange excludes the stop value).
num = np.arange(2002,2017)

# Season (as a string key) -> CSV file name.
f = {str(n): str(n)+'_master.csv' for n in num}

# Season key -> DataFrame. Later cells also store 'master' and '2017' in here.
# A defaultdict built without a factory behaves like a plain dict.
d = defaultdict()

for fi in f:
    # 'Unnamed: 0' is presumably the index column written by an earlier to_csv
    # (TODO confirm). The axis is passed by keyword because newer pandas no
    # longer accepts it positionally.
    d[fi] = pd.read_csv(file_dir + f[fi]).drop(['Unnamed: 0'], axis = 1)

In [4]:
# List the season keys of the file map built above.
print (f.keys())

dict_keys(['2013', '2011', '2003', '2012', '2014', '2016', '2005', '2010', '2007', '2015', '2008', '2002', '2009', '2004', '2006'])


In [5]:
# Preview the first rows of the 2013 season table.
d['2013'].head()

Unnamed: 0,team_X,team_Y,AdjEM_x-y,AdjO_x-y,AdjD_x-y,AdjT_x-y,luck_x-y,W_x-y,L_x-y,W_L_ratio_x-y,SRS_x-y,SOS_x-y,TM_pt_x-y,OPPO_pt_x-y,PT_Ratio_x-y,True_S_x-y,seed_x-y,point_diff_x-y,result
0,Indiana,James Madison,30.19,20.6,-9.5,2.6,-0.084,8.0,-8.0,0.223,26.63,10.66,485.0,-90.0,0.257502,0.076,-15,21,X
1,North Carolina State,Temple,,,,,,0.0,1.0,-0.02,4.8,1.97,252.0,149.0,0.038045,0.042,-1,-4,Y
2,UNLV,University of California,,,,,,,,,,,,,,,-7,-3,Y
3,Syracuse,Montana,22.6,8.2,-14.3,0.3,-0.179,5.0,3.0,-0.031,21.41,12.78,578.0,251.0,0.132081,-0.053,-9,47,X
4,Butler,Bucknell,2.67,3.3,0.7,1.7,0.102,-1.0,3.0,-0.074,4.04,8.04,218.0,333.0,-0.072935,0.008,-5,12,X


# Regression/Random Forest/LASSO Predictions

In [60]:
# Stack every season listed in `num` into one table with a single concat call.
# Growing a frame with DataFrame.append inside a loop re-copies all accumulated
# rows on every step, and DataFrame.append is gone from pandas >= 2.0.
d['master'] = pd.concat([d[str(n)] for n in num])

# Keep only rows without missing values, then renumber them 0..n-1 so that
# positional and label-based row access agree in the cells that follow.
d['master'] = d['master'].dropna().reset_index(drop = True)

In [61]:
# Column names of the combined table (iterating a DataFrame yields its columns).
list(d['master'])

['team_X',
 'team_Y',
 'AdjEM_x-y',
 'AdjO_x-y',
 'AdjD_x-y',
 'AdjT_x-y',
 'luck_x-y',
 'W_x-y',
 'L_x-y',
 'W_L_ratio_x-y',
 'SRS_x-y',
 'SOS_x-y',
 'TM_pt_x-y',
 'OPPO_pt_x-y',
 'PT_Ratio_x-y',
 'True_S_x-y',
 'seed_x-y',
 'point_diff_x-y',
 'result']

In [62]:
# Number of rows in the combined table.
print ("Number of data points: ", len(d['master']))

Number of data points:  222


## Regression Models: Using Point Difference as Response Variable

In [9]:
# One-feature prediction: model 'point_diff_x-y' from 'AdjEM_x-y' alone.
# Both selections use a list of column names, so each stays a one-column DataFrame.
c_response = d['master'][['point_diff_x-y']]
c_features = d['master'][['AdjEM_x-y']]

In [10]:
# Total number of missing values over all columns of the combined table.
d['master'].isnull().sum().sum()

0

In [31]:
# Candidate regressors, keyed by a display name.
# Note: alpha = 10E-6 is 1e-5 (ten times 1e-6).
models = {'LASSO': linear_model.Lasso(alpha = 10E-6),\
         'LINEAR': linear_model.LinearRegression(),\
         'RF': RandomForestRegressor()}

# Repeatedly split the data into training and testing sets with sklearn's
# train_test_split and score every model on the training part.

# Only referenced by the commented-out line at the end of this cell.
std_errors = defaultdict(list)

for sim in range(100):
    # NOTE(review): test_size = 0.90 holds out 90% of the rows, so each model is
    # fitted and cross-validated on the remaining 10% only -- confirm that 0.10
    # was not intended.
    X_train, X_test, y_train, y_test = train_test_split(c_features, c_response, test_size = 0.90)
    print(X_train.columns)
    for model in models: # for every model in our model dictionary
        print (model, ': ')
        # The returned (train score, CV score) pair is discarded; results are
        # only visible through the verbose printout. np.ravel flattens the
        # one-column response frame into a 1-D array.
        train_and_evaluate(models[model], X_train, np.ravel(y_train), verbose = True) #train the model on the data
        print('\n')
        
    # NOTE(review): the disabled line below refers to 'W_L_ratio' and
    # y_test['point_diff'], while this cell uses 'AdjEM_x-y' and 'point_diff_x-y'.
    #std_errors['W_L_ratio'].append(np.std(np.array(models['RF'].predict(X_test)) - y_test['point_diff']))

Index(['AdjEM_x-y'], dtype='object')
RF : 
Coefficient of determination on training set: 0.84551505585
Average coefficient of determination using 5-fold crossvalidation: -0.182984197634


LASSO : 
Coefficient of determination on training set: 0.435309553766
Average coefficient of determination using 5-fold crossvalidation: 0.277600997966


LINEAR : 
Coefficient of determination on training set: 0.435309553766
Average coefficient of determination using 5-fold crossvalidation: 0.392700575457


Index(['AdjEM_x-y'], dtype='object')
RF : 
Coefficient of determination on training set: 0.792460531586
Average coefficient of determination using 5-fold crossvalidation: -0.222256199159


LASSO : 
Coefficient of determination on training set: 0.388449899328
Average coefficient of determination using 5-fold crossvalidation: 0.344769916164


LINEAR : 
Coefficient of determination on training set: 0.388449899328
Average coefficient of determination using 5-fold crossvalidation: 0.197727871949


Index

## Binary Classification (SVM): Using Game Result as Response Variable

In [11]:
def powerset(seq):
    """
    Yield every subset of ``seq`` as a list. This is a generator.

    ``seq`` may be any iterable (list, tuple, string, range, ...); it is copied
    into a list first so that slicing and concatenation below always operate on
    lists. Subsets containing the first element are yielded before the matching
    subsets without it, and the empty subset comes last. An empty input yields
    exactly one subset, ``[]``.
    """
    seq = list(seq)
    if not seq:
        # Base case: the only subset of the empty set is the empty set.
        yield []
        return
    for item in powerset(seq[1:]):
        yield [seq[0]]+item
        yield item
            
# All subsets of the columns at positions 2..16 of the combined table, i.e.
# everything between the two team-name columns and 'point_diff_x-y' / 'result'.
# The generator is materialised immediately because the next cell indexes it.
feature_list_pre = list(powerset(d['master'].columns[2:17].tolist()))

In [13]:
# Keep only the subsets made of one, two or three features.
feature_list = [subset for subset in feature_list_pre if 1 <= len(subset) <= 3]

In [15]:
# Number of candidate feature subsets kept by the size filter.
len(feature_list)

575

In [16]:
# Exhaustive search: score every subset in `feature_list` with an SVM and store
# its average cross-validated score.
n_repeats = 100  # random 85/15 train/test splits evaluated per feature subset
n_folds = 10     # folds of the cross-validation run inside each split

# Maps str(feature subset) -> average score. The values are what cross_val_score
# returns for an SVC (its default `score`, i.e. classification accuracy), not
# R^2; the "r_squared" names are kept because other cells refer to them.
feature_r_squared = dict()

# The response does not depend on the feature subset, so build it once.
y = np.array(d['master']['result'])

for feature in feature_list:
    X = np.array(d['master'][feature])

    r_squared_lst = []
    for i in range(n_repeats):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.15)

        clf = svm.SVC()
        clf.fit(X_train, y_train)

        # Legacy KFold signature: number of samples first, then number of folds.
        cv = KFold(X_train.shape[0], n_folds, shuffle = True)
        r_squared = np.mean(cross_val_score(clf, X_train, y_train, cv = cv))
        r_squared_lst.append(r_squared)

    average_score = sum(r_squared_lst)/len(r_squared_lst)
    # The fold count in the message now follows the value actually used
    # (it used to say "5-fold" while running 10 folds).
    print ("Average confidence w/ %d repetitions for " % n_repeats, feature,
           "w/ %d-fold CV :" % n_folds, average_score)
    feature_r_squared[str(feature)] = average_score
    

Average confidence w/ 100 repetitions for  ['PT_Ratio_x-y', 'True_S_x-y', 'seed_x-y'] w/ 5-fold CV : 0.724397660819
Average confidence w/ 100 repetitions for  ['OPPO_pt_x-y', 'True_S_x-y', 'seed_x-y'] w/ 5-fold CV : 0.733078947368


KeyboardInterrupt: 

In [17]:
# Feature subsets (as their string keys) ordered from highest to lowest stored score.
sorted(feature_r_squared, key=feature_r_squared.get, reverse=True)

["['OPPO_pt_x-y', 'True_S_x-y', 'seed_x-y']",
 "['PT_Ratio_x-y', 'True_S_x-y', 'seed_x-y']"]

In [68]:
# Evaluate the SVM on the selected three-feature subset.
n_repeats = 100  # random 85/15 train/test splits
n_folds = 10     # folds of the cross-validation run inside each split

X = np.array(d['master'][['AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y']])  # 0.79051754386
y = np.array(d['master']['result'])

r_squared_lst = []  # per-split mean CV score (classification accuracy for an SVC)
for i in range(n_repeats):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.15)

    # NOTE(review): after the loop `clf` is the model fitted on the *last*
    # random 85% split, and the prediction cells further down call
    # clf.predict on it -- consider refitting on all of X, y before predicting.
    clf = svm.SVC() # class_weight={'X':1., 'Y':1.345}
    clf.fit(X_train, y_train)

    # Legacy KFold signature: number of samples first, then number of folds.
    cv = KFold(X_train.shape[0], n_folds, shuffle = True)
    r_squared = np.mean(cross_val_score(clf, X_train, y_train, cv = cv))
    r_squared_lst.append(r_squared)

# The fold count in the message now follows the value actually used
# (it used to say "5-fold" while running 10 folds).
print ("Average confidence w/ %d repetitions for " % n_repeats, "AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y", "w/ %d-fold CV :" % n_folds, \
       sum(r_squared_lst)/len(r_squared_lst))


Average confidence w/ 100 repetitions for  AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y w/ 5-fold CV : 0.791242690058


In [53]:
#y_rows = d['master'][d['master']['result']=='Y']

#d['master'] = d['master'].append(y_rows)

#sum(d['master']['result'] == 'Y')/sum(d['master']['result'] == 'X')

0.69090909090909092

## Prediction testing on 2016 Data

In [63]:
# Drop the rows of the 2016 table that contain missing values (modifies the
# stored frame in place).
# NOTE(review): none of the later cells visible here read d['2016'] -- the
# prediction cells use d['master'] -- so confirm this cell is still needed.
d['2016'].dropna(inplace= True)

In [69]:
# Model inputs: the same three columns the classifier was trained on, selected
# by name (they sit at positions 5, 7 and 14 of the combined table).
# NOTE(review): the section header mentions 2016 data, but these rows come from
# d['master'], i.e. every season -- confirm which was intended.
example_measures_list = d['master'][['AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y']]
print (example_measures_list)

# Ground truth: the 'result' column (the last one), re-indexed 0..n-1.
result_list = d['master']['result'].reset_index(drop = True)
print ('\n', 'Actual results: ', "\n", result_list)

     AdjT_x-y  W_x-y  PT_Ratio_x-y
0         5.8   15.0      0.184958
1        -4.6   -1.0      0.050474
2        -0.6   -3.0     -0.049257
3         0.3   -5.0     -0.015700
4        14.1   15.0      0.158563
5         2.4   -8.0     -0.093872
6         2.4   -1.0      0.106861
7        -0.3    2.0      0.024520
8        -2.1   -1.0     -0.021740
9         2.8   10.0      0.199554
10        7.9   12.0      0.248749
11        1.8    4.0      0.080976
12        3.2    4.0      0.028312
13       -2.5    8.0      0.108392
14        2.7    9.0      0.245711
15        1.1    7.0      0.126887
16        0.1   -1.0      0.019331
17       -4.6    6.0      0.015268
18        6.3    2.0      0.139870
19        2.2    7.0      0.029577
20        0.3   12.0      0.197785
21       10.5   -2.0      0.007500
22       -3.9    1.0     -0.037167
23       -0.9   -2.0     -0.020919
24        0.9    1.0     -0.067563
25       -1.1    4.0     -0.018838
26        8.4    7.0      0.126998
27       -0.9   -7.0

In [70]:
# Classify every row of `example_measures_list` in one vectorised call.
# The earlier row-by-row loop depended on Series.reshape, which pandas
# deprecated and later removed; handing the whole 2-D value array to
# clf.predict gives the same labels because each row is classified
# independently.
prediction_list = pd.Series(clf.predict(example_measures_list.values))
print (prediction_list)

0      X
1      Y
2      X
3      Y
4      X
5      X
6      X
7      X
8      X
9      X
10     X
11     X
12     X
13     X
14     X
15     X
16     X
17     X
18     X
19     X
20     X
21     Y
22     Y
23     X
24     X
25     X
26     X
27     Y
28     X
29     X
      ..
192    X
193    Y
194    X
195    X
196    X
197    X
198    X
199    X
200    X
201    X
202    X
203    X
204    X
205    X
206    X
207    X
208    X
209    X
210    X
211    Y
212    X
213    X
214    Y
215    X
216    X
217    X
218    X
219    X
220    Y
221    X
dtype: object


In [71]:
# Side-by-side table of predicted and actual labels; 'accuracy' flags the rows
# where the two agree.
compare = pd.DataFrame({'predictions' : prediction_list,'actual_results' : result_list})
compare['accuracy'] = compare['actual_results'].eq(compare['predictions'])
print (compare)
print ('\n')
# The mean of the boolean column is the fraction of correctly predicted rows.
print ('Percentage of correctness: ', compare['accuracy'].mean())

    actual_results predictions accuracy
0                X           X     True
1                Y           Y     True
2                X           X     True
3                Y           Y     True
4                X           X     True
5                X           X     True
6                Y           X    False
7                X           X     True
8                X           X     True
9                X           X     True
10               X           X     True
11               X           X     True
12               X           X     True
13               X           X     True
14               X           X     True
15               Y           X    False
16               X           X     True
17               X           X     True
18               X           X     True
19               X           X     True
20               X           X     True
21               Y           Y     True
22               Y           Y     True
23               X           X     True


In [126]:
# Row labels of the misclassified games, split by the label that was predicted:
# flase_x -> predicted 'X' but wrong, flase_y -> predicted 'Y' but wrong.
# (The variable names are kept as-is because the next cell reads flase_x.)
flase_x = compare.index[~compare['accuracy'] & (compare['predictions'] == 'X')].tolist()
flase_y = compare.index[~compare['accuracy'] & (compare['predictions'] == 'Y')].tolist()
print (len(flase_x))
print (len(flase_y))

28
1


In [127]:
# Print the three model inputs (columns 5, 7 and 14: AdjT_x-y, W_x-y,
# PT_Ratio_x-y) of every game that was wrongly predicted as 'X'.
for r in flase_x:
    print(d['master'].iloc[r,[5,7,14]])


AdjT_x-y             2.4
W_x-y                 -1
PT_Ratio_x-y    0.106861
Name: 6, dtype: object
AdjT_x-y             1.1
W_x-y                  7
PT_Ratio_x-y    0.126887
Name: 15, dtype: object
AdjT_x-y            -3.3
W_x-y                  4
PT_Ratio_x-y    0.132473
Name: 30, dtype: object
AdjT_x-y              3.4
W_x-y                   0
PT_Ratio_x-y    0.0732229
Name: 49, dtype: object
AdjT_x-y               1.8
W_x-y                    2
PT_Ratio_x-y    0.00274268
Name: 56, dtype: object
AdjT_x-y             -0.6
W_x-y                   3
PT_Ratio_x-y    0.0295316
Name: 65, dtype: object
AdjT_x-y            -0.6
W_x-y                  3
PT_Ratio_x-y    0.127066
Name: 75, dtype: object
AdjT_x-y              4.8
W_x-y                   2
PT_Ratio_x-y    0.0412807
Name: 77, dtype: object
AdjT_x-y              1.9
W_x-y                  -2
PT_Ratio_x-y   -0.0274076
Name: 89, dtype: object
AdjT_x-y               -1
W_x-y                   2
PT_Ratio_x-y    0.0643929
Name: 96, dtyp

## Generate predictions for 2017 March Madness!

In [97]:
# Load the 2017 table and discard its 'Unnamed: 0' column.
d['2017'] = (
    pd.read_csv('~/Statsketball/Statsketball-Tournament/Master_Data/2017_master.csv')
    .drop(['Unnamed: 0'], axis = 1)
)

# Model inputs, selected by name (positions 4, 7 and 10 of the 2017 table), in
# the same order the classifier was trained on.
features_list = d['2017'][['AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y']]
print (features_list.head())

   AdjT_x-y  W_x-y  PT_Ratio_x-y
0      -2.8     12      0.240862
1      -5.0      3      0.119829
2     -11.3     -7      0.060613
3      -1.2     -3      0.023809
4      -2.8      9      0.190788


In [98]:
# Classify every 2017 matchup in one vectorised call.
# The earlier row-by-row loop depended on Series.reshape, which pandas
# deprecated and later removed; handing the whole 2-D value array to
# clf.predict gives the same labels because each row is classified
# independently. The resulting Series is indexed 0..n-1, as before.
prediction_list = pd.Series(clf.predict(features_list.values))

In [99]:
# Display the predicted label for each 2017 matchup.
prediction_list

0     X
1     X
2     X
3     X
4     X
5     X
6     X
7     X
8     X
9     X
10    X
11    X
12    X
13    X
14    X
15    X
16    X
17    X
18    X
19    Y
20    X
21    X
22    X
23    X
24    X
25    X
26    Y
27    X
28    X
29    X
30    Y
31    X
dtype: object

In [100]:
# Turn each predicted label into a team name in one vectorised step: 'X' picks
# team_X, anything else picks team_Y. Working on the raw value arrays makes a
# length mismatch (e.g. a stale prediction_list left over from another cell)
# fail loudly, whereas row-wise .loc writes would silently append rows.
d['2017']['predicted_winner'] = np.where(prediction_list.values == 'X',
                                         d['2017']['team_X'].values,
                                         d['2017']['team_Y'].values)

# Remove the per-team columns and keep the x-y differences next to the
# prediction. errors='ignore' lets the cell be re-run after the columns are
# already gone.
d['2017'] = d['2017'].drop(['AdjT_x', 'AdjT_y', 'W_x', 'W_y', 'PT_Ratio_x', 'PT_Ratio_y'],
                           axis = 1, errors = 'ignore')

In [101]:
#d['2017'].drop(['AdjT_x-y', 'W_x-y', 'PT_Ratio_x-y'], axis = 1, inplace = True)

In [106]:
# Display the 2017 table together with the predicted winners.
pd.DataFrame(d['2017'])

Unnamed: 0,team_X,team_Y,AdjT_x-y,W_x-y,PT_Ratio_x-y,predicted_winner
0,Villanova,Mt St.Mary's/New Orleans,-2.8,12,0.240862,Villanova
1,Wisconsin,Virginia Tech,-5.0,3,0.119829,Wisconsin
2,Virginia,UNC Wilmington,-11.3,-7,0.060613,Virginia
3,Florida,East Tenn,-1.2,-3,0.023809,Florida
4,SMU,Providence/USC,-2.8,9,0.190788,SMU
5,Baylor,New Mexico St.,-4.5,-3,-0.01451,Baylor
6,South Carolina,Marquette,-1.8,3,0.017575,South Carolina
7,Duke,Troy,-0.1,6,0.060891,Duke
8,Gonzaga,South Dakota St.,3.0,14,0.326169,Gonzaga
9,Northwestern,Vanderbilt,-0.4,4,0.046927,Northwestern
