In [1]:
# year (0), games played (1), wins (2), losses (3), 
# runs (4), at bats (5), hits by oposing batters (6), 
# doubles (7), triples (8), homeruns (9), walks (10), 
# strikeouts (11), stolen bases (12), 
# opponents runs scored (13), earned runs allowed (14), 
# earned run average (15), hits allowed (16), 
# homeruns allowed (17), walks allowed (18), 
# strikeouts by pitchers (19), errors (20)

In [2]:
# ----------------------------
# Features 4 through 20
# ----------------------------
# 1) runs 
# 2) at bats 
# 3) hits by oposing batters 
# 4) doubles 
# 5) triples 
# 6) homeruns 
# 7) walks  
# 8) strikeouts
# 9) stolen bases 
# 10) opponents runs scored
# 11) earned runs allowed 
# 12) earned run average 
# 13) hits allowed
# 14) homeruns allowed
# 15) walks allowed
# 16) strikeouts by pitchers
# 17) errors

features = [ "runs", "at bats", "hits by op. batters", 
             "doubles", "triples", "homeruns", 
             "walks", "strikeouts", "stolen bases", "opp. runs scored", 
             "era", "erav", "hits allowed", "homeruns allowed",
             "walks allowed", "strikeouts by pitchers", "errors"]

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import sklearn.svm as svm
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

#### Online documentation for various scikit-learn functions 
1. API reference (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)
2. Generalized linear models (https://scikit-learn.org/stable/modules/linear_model.html#lasso)
3. LASSO and feature selection (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso) - look at regression coefficients (coef_ model attribute)
4. KFold cross validation (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)

In [15]:
VIZ = False
NRM = False

stats = np.genfromtxt( "../data/teams_raw.csv", delimiter=",")
idx = stats[:,0] > 1999
stats_2X = stats[idx,:]

# labels
Y = stats_2X[:,2] >= np.mean( stats_2X[:,2] )
Y = np.int8( Y )

# features 4 through 20
X = stats_2X[:,4:]

if VIZ:
    plt.figure()
    plt.boxplot( X )
    plt.show()

    plt.figure()
    plt.stem( np.linspace( 1, np.size(X,1), np.size(X,1) ), np.max( X, 0 ) )
    plt.show()

if NRM:
    mx = np.max(X,0)
    for i in range(0,np.size(X,1)):
        X[:,i] = X[:,i] / mx[i]

model = lm.Lasso( alpha=1.0 )
model.fit( X, np.transpose( np.matrix( Y ) ) )

w = np.abs( model.coef_ ) * 1e3

idx = np.argsort( w * -1 )

for i in range(0, np.size(w )):
    print( "{0} ( w={1} ) ".format( features[idx[i]], w[idx[i]] ) )
    


opp. runs scored ( w=3.310357754041947 ) 
runs ( w=3.302166416368542 ) 
walks allowed ( w=0.5935288725727443 ) 
strikeouts ( w=0.3587212724778082 ) 
at bats ( w=0.3156851280080856 ) 
strikeouts by pitchers ( w=0.22662720508687517 ) 
homeruns ( w=0.13904747902481066 ) 
hits allowed ( w=0.07600864853838399 ) 
homeruns allowed ( w=0.0 ) 
erav ( w=0.0 ) 
stolen bases ( w=0.0 ) 
walks ( w=0.0 ) 
triples ( w=0.0 ) 
doubles ( w=0.0 ) 
hits by op. batters ( w=0.0 ) 
era ( w=0.0 ) 
errors ( w=0.0 ) 


In [16]:
fs_model = SelectFromModel( lm.Lasso( alpha=0.5 ) )
cf_model = svm.SVC( kernel="linear")

two_stage_pipeline = Pipeline( [ ('features', fs_model ), ('classifier', cf_model ) ], memory=None )
two_stage_pipeline.fit( X, Y )
print( "Accuracy of pipeline is {0:.2f}%".format( two_stage_pipeline.score(X,Y)*100 ) )

w = two_stage_pipeline.named_steps.features.get_support().astype( int )

for i in range(0, np.size(w )):
    print( "{0} ( w={1} ) ".format( features[idx[i]], w[idx[i]] ) )

Accuracy of pipeline is 93.33%
opp. runs scored ( w=1 ) 
runs ( w=1 ) 
walks allowed ( w=1 ) 
strikeouts ( w=1 ) 
at bats ( w=1 ) 
strikeouts by pitchers ( w=1 ) 
homeruns ( w=1 ) 
hits allowed ( w=1 ) 
homeruns allowed ( w=0 ) 
erav ( w=0 ) 
stolen bases ( w=0 ) 
walks ( w=0 ) 
triples ( w=0 ) 
doubles ( w=0 ) 
hits by op. batters ( w=1 ) 
era ( w=0 ) 
errors ( w=0 ) 


In [18]:
kf_eval = KFold( n_splits=10, shuffle=True)

kf_eval.get_n_splits( X )

for test_idx, train_idx in kf_eval.split(X):
    
    print( "--------------------------------------" )
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = Y[train_idx], Y[test_idx]
    
    two_stage_pipeline.fit( X_train, y_train )
    print( "[Train] Accuracy of pipeline is {0:.2f}".format( two_stage_pipeline.score(X,Y)*100 ) )
    test_acc = accuracy_score( two_stage_pipeline.predict( X_test ), y_test )
    print( "[Test] Accuracy of pipeline is {0:0.2f}".format( test_acc*100 ) )


--------------------------------------
[Train] Accuracy of pipeline is 88.96
[Test] Accuracy of pipeline is 87.73
--------------------------------------
[Train] Accuracy of pipeline is 91.25
[Test] Accuracy of pipeline is 90.28
--------------------------------------
[Train] Accuracy of pipeline is 87.50
[Test] Accuracy of pipeline is 86.11
--------------------------------------
[Train] Accuracy of pipeline is 86.88
[Test] Accuracy of pipeline is 85.42
--------------------------------------
[Train] Accuracy of pipeline is 85.42
[Test] Accuracy of pipeline is 83.80
--------------------------------------
[Train] Accuracy of pipeline is 86.46
[Test] Accuracy of pipeline is 84.95
--------------------------------------
[Train] Accuracy of pipeline is 85.21
[Test] Accuracy of pipeline is 83.56
--------------------------------------
[Train] Accuracy of pipeline is 82.92
[Test] Accuracy of pipeline is 81.02
--------------------------------------
[Train] Accuracy of pipeline is 83.75
[Test] Accu

In [1]:
from sklearn.naive_bayes import BernoulliNB
fs_model = SelectFromModel( lm.Lasso( alpha=0.5 ) )
cf_model = BernoulliNB()

two_stage_pipeline = Pipeline( [ ('features', fs_model ), ('classifier', cf_model ) ], memory=None )
two_stage_pipeline.fit( X, Y )
print( "Accuracy of pipeline is {0:.2f}%".format( two_stage_pipeline.score(X,Y)*100 ) )

w = two_stage_pipeline.named_steps.features.get_support().astype( int )

for i in range(0, np.size(w )):
    print( "{0} ( w={1} ) ".format( features[idx[i]], w[idx[i]] ) )

NameError: name 'SelectFromModel' is not defined