In [1]:
#import the python scientific suite
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import sklearn
import seaborn as sns
import scipy
import pylab
import statsmodels

from sklearn.cross_validation import *
from sklearn import svm, linear_model
from sklearn import preprocessing, cross_validation, neighbors
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree

from textblob import TextBlob

import statsmodels.api as sm
from collections import defaultdict


# This is importing a "future" python version 3 print function.
from __future__ import print_function
from __future__ import division

#plot figures inline in Jupyter notebooks
%matplotlib inline 

#use seaborn for plotting
sns.set()
sns.set_context('paper', font_scale = 1.5)
sns.set_style('ticks')
pylab.rcParams.update({'figure.autolayout': True})

In [2]:
def train_and_evaluate(clf, X_train, y_train, verbose = False):
        
    clf.fit(X_train, y_train) #fit the model

    #create a k-fold cross validation iterator of k = 5 folds

    cv = KFold(X_train.shape[0], 5, shuffle = True)
    scores = cross_val_score(clf, X_train, y_train, cv = cv)

    if verbose == True: #if we want to print status
        print("Coefficient of determination on training set:", clf.score(X_train, y_train))
        print("Average coefficient of determination using 5-fold crossvalidation:", np.mean(scores))
    
    return clf.score(X_train, y_train), np.mean(scores)

def OneDimensionalROC(clf, X_train, y_train, X_test):
    
    pylab.rcParams['figure.figsize'] = (6.0, 6.0)
    
    #debate differences between decision function and predict_proba?
    #they are inverse monotone
    
    #y_score = clf.fit(X_train, y_train).decision_function(X_test)
    y_score = clf.fit(X_train, y_train).predict_proba(X_test).T[1]

    n_classes = 1 #this is a single class classification problem

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    # Compute micro-average ROC curve and ROC area
    fpr, tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc = auc(fpr, tpr)
    
    return fpr, tpr, roc_auc, y_score

# Reading in Data files

In [3]:
file_dir = '~/Statsketball/Statsketball-Tournament/Master_Data/'

num = np.arange(2002,2017)
f = dict()
for n in num:
    f.update({str(n): str(n)+'_master.csv'})

d = defaultdict()

for fi in f:
    d[fi] = pd.read_csv(file_dir + f[fi])
    d[fi] = d[fi].drop(['Unnamed: 0'], 1)

In [4]:
print (f.keys())

dict_keys(['2013', '2010', '2003', '2011', '2009', '2005', '2002', '2014', '2007', '2008', '2012', '2016', '2004', '2006', '2015'])


In [5]:
d['2013'].head()

Unnamed: 0,team_X,team_Y,AdjEM_x-y,AdjO_x-y,AdjD_x-y,AdjT_x-y,luck_x-y,W_x-y,L_x-y,W_L_ratio_x-y,SRS_x-y,SOS_x-y,TM_pt_x-y,OPPO_pt_x-y,PT_Ratio_x-y,True_S_x-y,seed_x-y,point_diff_x-y,result
0,Indiana,James Madison,30.19,20.6,-9.5,2.6,-0.084,8.0,-8.0,0.223,26.63,10.66,485.0,-90.0,0.257502,0.076,-15,21,X
1,North Carolina State,Temple,,,,,,0.0,1.0,-0.02,4.8,1.97,252.0,149.0,0.038045,0.042,-1,-4,Y
2,UNLV,University of California,,,,,,,,,,,,,,,-7,-3,Y
3,Syracuse,Montana,22.6,8.2,-14.3,0.3,-0.179,5.0,3.0,-0.031,21.41,12.78,578.0,251.0,0.132081,-0.053,-9,47,X
4,Butler,Bucknell,2.67,3.3,0.7,1.7,0.102,-1.0,3.0,-0.074,4.04,8.04,218.0,333.0,-0.072935,0.008,-5,12,X


# Regression/Random Forset/LASSO Predictions

In [47]:
a = pd.get_dummies(d['2002']['seed'])
print(pd.factorize(d['2002']['seed']))

In [6]:
d['master'] = pd.DataFrame()

for n in num:
    d['master'] = d['master'].append(d[str(n)])

d['master'] = d['master'].dropna()
d['master'].index = range(len(d['master']))

In [7]:
list(d['master'])

['team_X',
 'team_Y',
 'AdjEM_x-y',
 'AdjO_x-y',
 'AdjD_x-y',
 'AdjT_x-y',
 'luck_x-y',
 'W_x-y',
 'L_x-y',
 'W_L_ratio_x-y',
 'SRS_x-y',
 'SOS_x-y',
 'TM_pt_x-y',
 'OPPO_pt_x-y',
 'PT_Ratio_x-y',
 'True_S_x-y',
 'seed_x-y',
 'point_diff_x-y',
 'result']

In [36]:
print ("Number of data points: ", len(d['master']))

Number of data points:  222


## Regression Models: Using Point Difference as Response Variable

In [26]:
# One feature prediciton: on "W_L_ratio"
c_response = d['master'][['point_diff_x-y']]
c_features = d['master'][['AdjEM_x-y']]

In [27]:
d['master'].isnull().sum().sum()

0

In [31]:
models = {'LASSO': linear_model.Lasso(alpha = 10E-6),\
         'LINEAR': linear_model.LinearRegression(),\
         'RF': RandomForestRegressor()}

#now, split the data in to training and testing sets using the sklearn function train_test_split

std_errors = defaultdict(list)

for sim in range(100):
    X_train, X_test, y_train, y_test = train_test_split(c_features, c_response, test_size = 0.50)
    print(X_train.columns)
    for model in models: #for every model in our model dictionary
        print (model, ': ')
        train_and_evaluate(models[model], X_train, np.ravel(y_train), verbose = True) #train the model on the data
        print('\n')
        
    #std_errors['W_L_ratio'].append(np.std(np.array(models['RF'].predict(X_test)) - y_test['point_diff']))

Index(['AdjEM_x-y'], dtype='object')
RF : 
Coefficient of determination on training set: 0.84551505585
Average coefficient of determination using 5-fold crossvalidation: -0.182984197634


LASSO : 
Coefficient of determination on training set: 0.435309553766
Average coefficient of determination using 5-fold crossvalidation: 0.277600997966


LINEAR : 
Coefficient of determination on training set: 0.435309553766
Average coefficient of determination using 5-fold crossvalidation: 0.392700575457


Index(['AdjEM_x-y'], dtype='object')
RF : 
Coefficient of determination on training set: 0.792460531586
Average coefficient of determination using 5-fold crossvalidation: -0.222256199159


LASSO : 
Coefficient of determination on training set: 0.388449899328
Average coefficient of determination using 5-fold crossvalidation: 0.344769916164


LINEAR : 
Coefficient of determination on training set: 0.388449899328
Average coefficient of determination using 5-fold crossvalidation: 0.197727871949


Index

## Binary Classification (SVM): Using Game Result as Response Variable

In [50]:
feature_list = list(d['master'])[2:18]

feature_result = dict()

for feature in feature_list:
    X = np.array(d['master'][[feature]]) 
    y = np.array(d['master']['result'])

    conf = []
    for i in range(1000):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.15)

        clf = svm.SVC()

        clf.fit(X_train, y_train)
        confidence = clf.score(X_test, y_test)
        conf.append(confidence)
    print ("Average confidence w/ 1000 repetitions for ", feature, " :", sum(conf)/len(conf))

    print ('Result for SVM: ')
    _, r_squared = train_and_evaluate(svm.SVC(), X_train, np.ravel(y_train), verbose = True) #train the model on the data
    print('\n')
    
    feature_result[feature] = r_squared
    

Average confidence w/ 1000 repetitions for  AdjEM_x-y  : 0.752705882353
Result for SVM: 
Coefficient of determination on training set: 0.797872340426
Average coefficient of determination using 5-fold crossvalidation: 0.771408250356


Average confidence w/ 1000 repetitions for  AdjO_x-y  : 0.708617647059
Result for SVM: 
Coefficient of determination on training set: 0.776595744681
Average coefficient of determination using 5-fold crossvalidation: 0.686628733997


Average confidence w/ 1000 repetitions for  AdjD_x-y  : 0.749941176471
Result for SVM: 
Coefficient of determination on training set: 0.776595744681
Average coefficient of determination using 5-fold crossvalidation: 0.739260312945


Average confidence w/ 1000 repetitions for  AdjT_x-y  : 0.740117647059
Result for SVM: 
Coefficient of determination on training set: 0.75
Average coefficient of determination using 5-fold crossvalidation: 0.749786628734


Average confidence w/ 1000 repetitions for  luck_x-y  : 0.739235294118
Result

In [51]:
sorted(feature_result, key=feature_result.get, reverse=True)

['point_diff_x-y',
 'AdjEM_x-y',
 'luck_x-y',
 'W_x-y',
 'W_L_ratio_x-y',
 'AdjT_x-y',
 'OPPO_pt_x-y',
 'True_S_x-y',
 'AdjD_x-y',
 'TM_pt_x-y',
 'L_x-y',
 'PT_Ratio_x-y',
 'SOS_x-y',
 'SRS_x-y',
 'seed_x-y',
 'AdjO_x-y']

In [59]:
# X = np.array(d['master'][['W_x-y', 'True_S_x-y', 'W_L_ratio_x-y','SOS_x-y']]) #0.750213371266
X = np.array(d['master'][['luck_x-y', 'AdjEM_x-y', 'W_x-y']])  # 0.807681365576
y = np.array(d['master']['result'])

conf = []
for i in range(1000):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.15)

    clf = svm.SVC()

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    conf.append(confidence)
print (sum(conf)/len(conf))
print ('SVM: ')
train_and_evaluate(svm.SVC(), X_train, np.ravel(y_train), verbose = True) #train the model on the data
print('\n')


0.765117647059
SVM: 
Coefficient of determination on training set: 0.872340425532
Average coefficient of determination using 5-fold crossvalidation: 0.771408250356


