In [193]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
%matplotlib inline
sns.set_style('dark')

import warnings
warnings.filterwarnings(action="ignore", module="sklearn")

In [52]:
mlb = pd.read_csv('mlb_elo.csv', low_memory=False)

mlb.head()

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1_pre,elo2_pre,elo_prob1,elo_prob2,...,pitcher1_rgs,pitcher2_rgs,pitcher1_adj,pitcher2_adj,rating_prob1,rating_prob2,rating1_post,rating2_post,score1,score2
0,2018-10-28,2018,0,w,LAD,BOS,1572.026537,1603.192577,0.486253,0.513747,...,55.720196,55.904108,8.777848,11.191392,0.483877,0.516123,1572.395835,1610.086323,1,5
1,2018-10-27,2018,0,w,LAD,BOS,1575.479964,1599.73915,0.499503,0.500497,...,54.474209,53.638152,3.666228,0.386608,0.508342,0.491658,1576.245147,1606.237011,6,9
2,2018-10-26,2018,0,w,LAD,BOS,1573.220427,1601.998687,0.490832,0.509168,...,57.86924,51.146043,20.863039,-11.123666,0.555907,0.444093,1579.775197,1602.706961,3,2
3,2018-10-24,2018,0,w,BOS,LAD,1600.026162,1575.192952,0.59262,0.40738,...,55.360095,51.91669,9.286617,-7.249446,0.619808,0.380192,1604.605197,1577.876961,4,2
4,2018-10-23,2018,0,w,BOS,LAD,1597.035414,1578.1837,0.581491,0.418509,...,64.089475,56.628589,50.165559,14.184438,0.648954,0.351046,1602.81127,1579.670888,8,4


In [53]:
mlb.columns

Index(['date', 'season', 'neutral', 'playoff', 'team1', 'team2', 'elo1_pre',
       'elo2_pre', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
       'rating1_pre', 'rating2_pre', 'pitcher1', 'pitcher2', 'pitcher1_rgs',
       'pitcher2_rgs', 'pitcher1_adj', 'pitcher2_adj', 'rating_prob1',
       'rating_prob2', 'rating1_post', 'rating2_post', 'score1', 'score2'],
      dtype='object')

In [54]:
# turn the names of columns into binary naming
mlb = mlb.rename(columns={'team1':'team0', 'team2':'team1', 'elo1_pre':'elo0_pre', 'elo2_pre':'elo1_pre',
                          'elo_prob1':'elo_prob0', 'elo_prob2':'elo_prob1', 'elo1_post':'elo0_post', 'elo2_post':'elo1_post',
                          'rating1_pre':'rating0_pre', 'rating2_pre':'rating1_pre', 'pitcher1':'pitcher0', 'pitcher2':'pitcher1', 
                          'pitcher1_rgs':'pitcher0_rgs', 'pitcher2_rgs':'pitcher1_rgs', 'pitcher1_adj':'pitcher0_adj', 
                          'pitcher2_adj':'pitcher1_adj', 'rating_prob1':'rating_prob0',
                          'rating_prob2':'rating_prob1', 'rating1_post':'rating0_post', 
                          'rating2_post':'rating1_post', 'score1':'score0', 'score2':'score1'})

In [55]:
# create a binary variable with the winner of each game

mlb['winner'] = np.where(mlb['score0']>mlb['score1'], 0, 1)

mlb.head()

Unnamed: 0,date,season,neutral,playoff,team0,team1,elo0_pre,elo1_pre,elo_prob0,elo_prob1,...,pitcher1_rgs,pitcher0_adj,pitcher1_adj,rating_prob0,rating_prob1,rating0_post,rating1_post,score0,score1,winner
0,2018-10-28,2018,0,w,LAD,BOS,1572.026537,1603.192577,0.486253,0.513747,...,55.904108,8.777848,11.191392,0.483877,0.516123,1572.395835,1610.086323,1,5,1
1,2018-10-27,2018,0,w,LAD,BOS,1575.479964,1599.73915,0.499503,0.500497,...,53.638152,3.666228,0.386608,0.508342,0.491658,1576.245147,1606.237011,6,9,1
2,2018-10-26,2018,0,w,LAD,BOS,1573.220427,1601.998687,0.490832,0.509168,...,51.146043,20.863039,-11.123666,0.555907,0.444093,1579.775197,1602.706961,3,2,0
3,2018-10-24,2018,0,w,BOS,LAD,1600.026162,1575.192952,0.59262,0.40738,...,51.91669,9.286617,-7.249446,0.619808,0.380192,1604.605197,1577.876961,4,2,0
4,2018-10-23,2018,0,w,BOS,LAD,1597.035414,1578.1837,0.581491,0.418509,...,56.628589,50.165559,14.184438,0.648954,0.351046,1602.81127,1579.670888,8,4,0


In [26]:
mlb.columns

Index(['date', 'season', 'neutral', 'playoff', 'team0', 'team1', 'elo0_pre',
       'elo1_pre', 'elo_prob0', 'elo_prob1', 'elo0_post', 'elo1_post',
       'rating0_pre', 'rating1_pre', 'pitcher0', 'pitcher1', 'pitcher0_rgs',
       'pitcher1_rgs', 'pitcher0_adj', 'pitcher1_adj', 'rating_prob0',
       'rating_prob1', 'rating0_post', 'rating1_post', 'score0', 'score1',
       'winner'],
      dtype='object')

In [57]:
features = ['season', 'neutral', 'elo0_pre', 'elo1_pre', 'elo_prob0', 'elo_prob1', 'rating0_pre', 'rating1_pre',
            'pitcher0_rgs', 'pitcher1_rgs', 'pitcher0_adj', 'pitcher1_adj', 'rating_prob0',
            'rating_prob1']

mlb = mlb.dropna()
mlb.describe()

Unnamed: 0,season,neutral,elo0_pre,elo1_pre,elo_prob0,elo_prob1,elo0_post,elo1_post,rating0_pre,rating1_pre,...,pitcher1_rgs,pitcher0_adj,pitcher1_adj,rating_prob0,rating_prob1,rating0_post,rating1_post,score0,score1,winner
count,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,...,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0,1525.0
mean,1986.033443,0.0,1555.752692,1555.416772,0.546008,0.453992,1555.669461,1555.500005,1555.763705,1555.341862,...,54.465072,8.161676,8.026899,0.5471,0.4529,1555.66413,1555.441433,4.107541,3.843279,0.450492
std,27.649395,0.0,20.938475,21.456301,0.053662,0.053662,21.204075,21.750404,21.448402,22.009375,...,4.168144,9.873138,10.053475,0.058319,0.058319,21.696973,22.298615,2.91934,2.915425,0.497706
min,1913.0,0.0,1485.273,1487.336,0.389169,0.306389,1482.773,1485.273,1482.849,1484.9,...,43.509,-27.3493,-27.636,0.352993,0.288651,1480.292,1482.849,0.0,0.0,0.0
25%,1972.0,0.0,1540.32,1540.232,0.509952,0.418509,1540.32,1540.217,1539.796,1539.623,...,51.585,3.368611,3.368611,0.506744,0.413765,1540.286,1539.235,2.0,2.0,0.0
50%,1996.0,0.0,1554.704,1554.599,0.546776,0.453224,1554.72,1554.642,1554.467,1554.464,...,54.096,7.721274,7.721274,0.547697,0.452303,1554.467,1554.255,4.0,3.0,0.0
75%,2007.0,0.0,1570.318,1570.467,0.581491,0.490048,1570.014,1570.727,1571.01,1571.34,...,57.013,11.579111,9.821046,0.586235,0.493256,1571.178,1571.34,6.0,5.0,1.0
max,2018.0,0.0,1618.326,1623.472,0.693611,0.610831,1620.47,1625.935,1623.602,1628.765,...,72.675,71.925024,74.3751,0.711349,0.647007,1626.135,1631.597,23.0,19.0,1.0


In [59]:
from sklearn.preprocessing import StandardScaler

# Separating out the features
x = mlb.loc[:, features].values
# Separating out the target
y = mlb.loc[:,['winner']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)

In [60]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['pc1','pc2', 'pc3'])

In [61]:
mlb_m = pd.concat([principalDf, mlb], axis = 1).dropna()
mlb_m.head().dropna()

Unnamed: 0,pc1,pc2,pc3,date,season,neutral,playoff,team0,team1,elo0_pre,...,pitcher1_rgs,pitcher0_adj,pitcher1_adj,rating_prob0,rating_prob1,rating0_post,rating1_post,score0,score1,winner
0,2.592791,2.526961,0.80702,2018-10-28,2018.0,0.0,w,LAD,BOS,1572.026537,...,55.904108,8.777848,11.191392,0.483877,0.516123,1572.395835,1610.086323,1.0,5.0,1.0
1,1.885953,2.50638,-0.448577,2018-10-27,2018.0,0.0,w,LAD,BOS,1575.479964,...,53.638152,3.666228,0.386608,0.508342,0.491658,1576.245147,1606.237011,6.0,9.0,1.0
2,1.245307,2.580401,-0.134958,2018-10-26,2018.0,0.0,w,LAD,BOS,1573.220427,...,51.146043,20.863039,-11.123666,0.555907,0.444093,1579.775197,1602.706961,3.0,2.0,0.0
3,-2.424855,2.565902,-0.842756,2018-10-24,2018.0,0.0,w,BOS,LAD,1600.026162,...,51.91669,9.286617,-7.249446,0.619808,0.380192,1604.605197,1577.876961,4.0,2.0,0.0
4,-2.880414,2.762066,3.832165,2018-10-23,2018.0,0.0,w,BOS,LAD,1597.035414,...,56.628589,50.165559,14.184438,0.648954,0.351046,1602.81127,1579.670888,8.0,4.0,0.0


In [46]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectKBest, f_classif

features = ['elo0_pre', 'elo1_pre', 'elo_prob0', 'elo_prob1', 'rating0_pre', 'rating1_pre',
            'pitcher0_rgs', 'pitcher1_rgs', 'pitcher0_adj', 'pitcher1_adj', 'rating_prob0',
            'rating_prob1', 'pc1','pc2', 'pc3']

X = mlb_m[features]
y = mlb_m['winner']

# place results into a dataframe
selector=SelectKBest(score_func=f_classif,k=15)
selector.fit(X,y)
scores = pd.DataFrame()
scores["Attribute"] = features
scores["F Score"] = selector.scores_
scores["P Value"] = selector.pvalues_
scores["Support"] = selector.get_support()

scores.sort_values(by=['F Score'], ascending=False)

Unnamed: 0,Attribute,F Score,P Value,Support
3,elo_prob1,4.230868,0.048193,True
2,elo_prob0,4.230868,0.048193,True
1,elo1_pre,4.227969,0.048265,True
5,rating1_pre,4.145176,0.050375,True
12,pc1,3.554158,0.0688,True
11,rating_prob1,2.739089,0.108012,True
10,rating_prob0,2.739089,0.108012,True
13,pc2,0.992561,0.326831,True
0,elo0_pre,0.187533,0.667976,True
4,rating0_pre,0.135272,0.715527,True


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [181]:
from sklearn.preprocessing import StandardScaler

features = ['elo0_pre', 'elo1_pre', 'elo_prob0', 'elo_prob1', 'rating0_pre', 'rating1_pre',
            'pitcher0_rgs', 'pitcher1_rgs', 'pitcher0_adj', 'pitcher1_adj', 'rating_prob0',
            'rating_prob1']

# Separating out the features
X = mlb[features]
# Separating out the target
Y = mlb['winner']
# Standardizing the features
X = StandardScaler().fit_transform(X)

#split out a training and test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [67]:
# Instantiate and fit our model.
#X = mlb_m[features].values
#Y = mlb_m['winner'].values.reshape(-1, 1)
#split out a training and test set
#x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

### Logistic Regression

Here, the model works decently, with a bit of overfitting but not too much. Predicting at an accuracy of 55% isn't all that great but predicting the winner of a game is often fairly random.

In [190]:
lr = LogisticRegression(C=200, penalty='l2')
lr.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lr.coef_)
print('\nIntercept:')
print(lr.intercept_)

print('\nTrain Percentage accuracy:')
print(lr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lr.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(lr, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lr, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lr, x_train, y_train, cv = 4).std())

Coefficients:
[[ 0.42254694 -0.55176097 -0.51886447  0.51886447 -0.18530843  0.27444976
  -0.07919706  0.17262635 -0.20436705  0.05900657  0.24807212 -0.24807212]]

Intercept:
[-0.26540341]

Train Percentage accuracy:
0.5809273840769904

Test Percentage accuracy:
0.5314136125654451

Cross Validation Score:
[0.56293706 0.56993007 0.54195804 0.5754386 ]

Cross Validation Mean:
0.5625659428291008

Cross Validation Standard Deviation:
0.012696051440410786


### Ridge Regression

I can't tell if I am doing something wrong in setting up the ridge regression, but it is obvious here that the logistic regression works much much better. A part of that might be the data and the ultimately random nature of predicting the winner of a game. 

In [191]:
ridgeregr = linear_model.Ridge(alpha=25, fit_intercept=False) 
ridgeregr.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(ridgeregr.coef_)
print('\nIntercept:')
print(ridgeregr.intercept_)

origparams = ridgeregr.coef_[0]
print('\nParameter estimates:')
print(origparams)

print('\nTrain Percentage accuracy:')
print(ridgeregr.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(ridgeregr.score(x_test, y_test))

print('\nCross Validation Score:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(ridgeregr, x_train, y_train, cv = 4).std())

Coefficients:
[-0.02405436  0.02744287 -0.03047858  0.03047858 -0.01388817  0.00642452
 -0.01817649  0.04349298 -0.0261802  -0.00945963  0.02353426 -0.02353426]

Intercept:
0.0

Parameter estimates:
-0.024054361162177948

Train Percentage accuracy:
-0.7468054016162873

Test Percentage accuracy:
-0.9817938858630744

Cross Validation Score:
[-0.73208699 -0.64925474 -0.81161059 -0.88060822]

Cross Validation Mean:
-0.7683901343495456

Cross Validation Standard Deviation:
0.08656221548350644


### Lasso Regression

Again, I can't tell if I am doing something wrong here but this model does not seem to work.

In [192]:
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(x_train, y_train)

# Display.
print('Coefficients:')
print(lassfit.coef_)
print('\nIntercept:')
print(lassfit.intercept_)

print('\nTrain Percentage accuracy:')
print(lassfit.score(x_train, y_train))

print('\nTest Percentage accuracy:')
print(lassfit.score(x_test, y_test))

origparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates:')
print(origparams)

print('\nCross Validation Score:')
print(cross_val_score(lassfit, x_train, y_train, cv = 4))

print('\nCross Validation Mean:')
print(cross_val_score(lassfit, x_train, y_train, cv = 4).mean())

print('\nCross Validation Standard Deviation:')
print(cross_val_score(lassfit, x_train, y_train, cv = 4).std())

Coefficients:
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0.]

Intercept:
0.4374453193350831

Train Percentage accuracy:
0.0

Test Percentage accuracy:
-0.010855515147828008

Parameter estimates:
[-0.          0.         -0.          0.         -0.          0.
 -0.          0.         -0.         -0.         -0.          0.
  0.43744532]

Cross Validation Score:
[-0.00010898 -0.00924452 -0.00132948 -0.00471882]

Cross Validation Mean:
-0.003850450330071531

Cross Validation Standard Deviation:
0.00354273626595941
