In [5424]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [5425]:
AdvancedBattingDf = pd.read_csv('AdvancedBattingClean.csv')
AdvancedBattingDf.drop('index', axis=1, inplace=True)
temp = AdvancedBattingDf.loc[AdvancedBattingDf.Playoff=='Yes',['Tm','Year']]
temp.loc[AdvancedBattingDf.Year==2012,'Tm']                                                      

331          Atlanta Braves
332       Baltimore Orioles
336         Cincinnati Reds
339          Detroit Tigers
348        New York Yankees
349       Oakland Athletics
354    San Francisco Giants
355     St. Louis Cardinals
357           Texas Rangers
359    Washington Nationals
Name: Tm, dtype: object

# Cleaning 2022 Data

In [5426]:
data2022 = pd.read_csv('MLB2022_Data.txt')
standardBatting2022=pd.read_csv('StandardBatting_2022Data.txt')
data2022['G']=standardBatting2022['G']
data2022.info()
data2022.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tm      30 non-null     object 
 1   rOBA    30 non-null     float64
 2   Rbat+   30 non-null     int64  
 3   BAbip   30 non-null     float64
 4   ISO     30 non-null     float64
 5   HR%     30 non-null     object 
 6   SO%     30 non-null     object 
 7   BB%     30 non-null     object 
 8   EV      30 non-null     float64
 9   HardH%  30 non-null     object 
 10  LD%     30 non-null     object 
 11  GB%     30 non-null     object 
 12  FB%     30 non-null     object 
 13  GB/FB   30 non-null     float64
 14  Pull%   30 non-null     object 
 15  Cent%   30 non-null     object 
 16  Oppo%   30 non-null     object 
 17  WPA     30 non-null     float64
 18  cWPA    30 non-null     object 
 19  RE24    30 non-null     float64
 20  RS%     30 non-null     object 
 21  SB%     30 non-null     object 
 22  XBT%

Unnamed: 0,Tm,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,EV,HardH%,...,Pull%,Cent%,Oppo%,WPA,cWPA,RE24,RS%,SB%,XBT%,G
0,Arizona Diamondbacks,0.309,94,0.268,0.159,3.0%,22.2%,9.3%,87.2,36.6%,...,33.2%,50.6%,16.1%,-3.6,-1.7%,-26.3,31%,75%,48%,117
1,Atlanta Braves,0.335,107,0.3,0.194,4.1%,24.5%,7.7%,89.8,43.5%,...,31.2%,52.7%,16.1%,1.6,0.5%,21.9,32%,75%,51%,119
2,Baltimore Orioles,0.313,99,0.286,0.152,2.7%,23.1%,7.6%,88.1,39.3%,...,31.0%,52.0%,17.0%,-4.7,-2.4%,-46.9,31%,76%,48%,117
3,Boston Red Sox,0.319,97,0.305,0.153,2.5%,22.0%,7.5%,89.0,40.8%,...,25.4%,54.7%,19.9%,-6.3,-3.7%,-37.9,32%,72%,38%,118
4,Chicago Cubs,0.321,99,0.3,0.151,2.7%,23.5%,8.5%,88.1,38.4%,...,30.1%,51.0%,18.9%,-11.2,-2.8%,-65.2,29%,74%,43%,116


In [5427]:
def cleanPercent(x):
  x=x[:-1]
  return float(x)/100

data2022['HR%'] = list(map(cleanPercent, data2022['HR%']))
data2022['HardH%'] = list(map(cleanPercent, data2022['HardH%']))
data2022['SO%'] = list(map(cleanPercent, data2022['SO%']))
data2022['BB%'] = list(map(cleanPercent, data2022['BB%']))
data2022['LD%'] = list(map(cleanPercent, data2022['LD%']))
data2022['GB%'] = list(map(cleanPercent, data2022['GB%']))
data2022['FB%'] = list(map(cleanPercent, data2022['FB%']))
data2022['Pull%'] = list(map(cleanPercent, data2022['Pull%']))
data2022['Cent%'] = list(map(cleanPercent, data2022['Cent%']))
data2022['Oppo%'] = list(map(cleanPercent, data2022['Oppo%']))
data2022['cWPA'] = list(map(cleanPercent, data2022['cWPA']))
data2022['RS%'] = list(map(cleanPercent, data2022['RS%']))
data2022['SB%'] = list(map(cleanPercent, data2022['SB%']))
data2022['XBT%'] = list(map(cleanPercent, data2022['XBT%']))

In [5428]:
data2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Tm      30 non-null     object 
 1   rOBA    30 non-null     float64
 2   Rbat+   30 non-null     int64  
 3   BAbip   30 non-null     float64
 4   ISO     30 non-null     float64
 5   HR%     30 non-null     float64
 6   SO%     30 non-null     float64
 7   BB%     30 non-null     float64
 8   EV      30 non-null     float64
 9   HardH%  30 non-null     float64
 10  LD%     30 non-null     float64
 11  GB%     30 non-null     float64
 12  FB%     30 non-null     float64
 13  GB/FB   30 non-null     float64
 14  Pull%   30 non-null     float64
 15  Cent%   30 non-null     float64
 16  Oppo%   30 non-null     float64
 17  WPA     30 non-null     float64
 18  cWPA    30 non-null     float64
 19  RE24    30 non-null     float64
 20  RS%     30 non-null     float64
 21  SB%     30 non-null     float64
 22  XBT%

In [5429]:
for col in data2022.columns:
  if col in ['WPA', 'RE24']:
    data2022[col] = data2022[col] * (162 / data2022['G'])
  else:
    continue

In [5430]:
data2022.head()

Unnamed: 0,Tm,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,EV,HardH%,...,Pull%,Cent%,Oppo%,WPA,cWPA,RE24,RS%,SB%,XBT%,G
0,Arizona Diamondbacks,0.309,94,0.268,0.159,0.03,0.222,0.093,87.2,0.366,...,0.332,0.506,0.161,-4.984615,-0.017,-36.415385,0.31,0.75,0.48,117
1,Atlanta Braves,0.335,107,0.3,0.194,0.041,0.245,0.077,89.8,0.435,...,0.312,0.527,0.161,2.178151,0.005,29.813445,0.32,0.75,0.51,119
2,Baltimore Orioles,0.313,99,0.286,0.152,0.027,0.231,0.076,88.1,0.393,...,0.31,0.52,0.17,-6.507692,-0.024,-64.938462,0.31,0.76,0.48,117
3,Boston Red Sox,0.319,97,0.305,0.153,0.025,0.22,0.075,89.0,0.408,...,0.254,0.547,0.199,-8.649153,-0.037,-52.032203,0.32,0.72,0.38,118
4,Chicago Cubs,0.321,99,0.3,0.151,0.027,0.235,0.085,88.1,0.384,...,0.301,0.51,0.189,-15.641379,-0.028,-91.055172,0.29,0.74,0.43,116


# Logistic Regression

In [5431]:
scaler = MinMaxScaler()
ML_Data = AdvancedBattingDf.drop(['Year','Tm','WPA', 'RE24'], axis = 1)
ML_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   rOBA     600 non-null    float64
 1   Rbat+    600 non-null    int64  
 2   BAbip    600 non-null    float64
 3   ISO      600 non-null    float64
 4   HR%      600 non-null    float64
 5   SO%      600 non-null    float64
 6   BB%      600 non-null    float64
 7   LD%      600 non-null    float64
 8   GB%      600 non-null    float64
 9   FB%      600 non-null    float64
 10  GB/FB    600 non-null    float64
 11  Pull%    600 non-null    float64
 12  Cent%    600 non-null    float64
 13  Oppo%    600 non-null    float64
 14  cWPA     600 non-null    float64
 15  RS%      600 non-null    float64
 16  SB%      600 non-null    float64
 17  XBT%     600 non-null    float64
 18  Playoff  600 non-null    object 
dtypes: float64(17), int64(1), object(1)
memory usage: 89.2+ KB


In [5432]:
#train-test split
X = ML_Data.drop('Playoff', axis = 1)
Y = ML_Data.Playoff

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.15)

In [5433]:
logregModel = LogisticRegression(max_iter = 10000)
logregModel.fit(X_train, Y_train)

LogisticRegression(max_iter=10000)

In [5434]:
predicted = logregModel.predict(X_test)
actual = Y_test

In [5435]:
print(metrics.accuracy_score(predicted, actual))

0.7555555555555555


In [5436]:
ML_2022Data = data2022.drop(['Tm','HardH%','EV','G','RE24','WPA'], axis = 1)
ML_2022Data = scaler.transform(ML_2022Data)

In [5437]:
pred_2022 = logregModel.predict(ML_2022Data)

In [5438]:
ML_2022_Predictions=pd.DataFrame({'2022 Teams': data2022.Tm, 'Playoff Predictions':pred_2022})
ML_2022_Predictions.loc[ML_2022_Predictions['Playoff Predictions']=='Yes']

Unnamed: 0,2022 Teams,Playoff Predictions
1,Atlanta Braves,Yes
13,Los Angeles Dodgers,Yes
18,New York Yankees,Yes
25,St. Louis Cardinals,Yes


# Support Vector Machines

In [5439]:
SVM_Data = AdvancedBattingDf.drop(['Year','Tm'], axis = 1)
SVM_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   rOBA     600 non-null    float64
 1   Rbat+    600 non-null    int64  
 2   BAbip    600 non-null    float64
 3   ISO      600 non-null    float64
 4   HR%      600 non-null    float64
 5   SO%      600 non-null    float64
 6   BB%      600 non-null    float64
 7   LD%      600 non-null    float64
 8   GB%      600 non-null    float64
 9   FB%      600 non-null    float64
 10  GB/FB    600 non-null    float64
 11  Pull%    600 non-null    float64
 12  Cent%    600 non-null    float64
 13  Oppo%    600 non-null    float64
 14  WPA      600 non-null    float64
 15  cWPA     600 non-null    float64
 16  RE24     600 non-null    float64
 17  RS%      600 non-null    float64
 18  SB%      600 non-null    float64
 19  XBT%     600 non-null    float64
 20  Playoff  600 non-null    object 
dtypes: float64(19), 

In [5440]:
X_SVM = SVM_Data.drop('Playoff', axis=1)
Y_SVM = SVM_Data.Playoff

In [5441]:
SVM_ml = svm.NuSVC(gamma='auto')
X_SVM = scaler.fit_transform(X_SVM)

SVM_X_train, SVM_X_test, SVM_Y_train, SVM_Y_test = train_test_split(X_SVM,Y_SVM, test_size=0.5)
SVM_ml.fit(SVM_X_train, SVM_Y_train)

NuSVC(gamma='auto')

In [5442]:
predicted_SVM = SVM_ml.predict(SVM_X_test)
print(metrics.accuracy_score(predicted_SVM, SVM_Y_test))

0.8233333333333334


In [5443]:
SVM_2022_Data = data2022.drop(['Tm','HardH%','EV','G'], axis = 1)
SVM_2022_Data = scaler.transform(SVM_2022_Data)
SVM_preds=SVM_ml.predict(SVM_2022_Data)

In [5444]:
SVM_2022_Predictions = pd.DataFrame({'2022 Teams': data2022.Tm, 'Playoff Predictions':SVM_preds})
SVM_2022_Predictions.loc[SVM_2022_Predictions['Playoff Predictions']=='Yes']

Unnamed: 0,2022 Teams,Playoff Predictions
1,Atlanta Braves,Yes
13,Los Angeles Dodgers,Yes
17,New York Mets,Yes
18,New York Yankees,Yes
25,St. Louis Cardinals,Yes


# Random Forest

In [5445]:
RF_Data = AdvancedBattingDf.drop(['Year','Tm'], axis = 1)
RF_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   rOBA     600 non-null    float64
 1   Rbat+    600 non-null    int64  
 2   BAbip    600 non-null    float64
 3   ISO      600 non-null    float64
 4   HR%      600 non-null    float64
 5   SO%      600 non-null    float64
 6   BB%      600 non-null    float64
 7   LD%      600 non-null    float64
 8   GB%      600 non-null    float64
 9   FB%      600 non-null    float64
 10  GB/FB    600 non-null    float64
 11  Pull%    600 non-null    float64
 12  Cent%    600 non-null    float64
 13  Oppo%    600 non-null    float64
 14  WPA      600 non-null    float64
 15  cWPA     600 non-null    float64
 16  RE24     600 non-null    float64
 17  RS%      600 non-null    float64
 18  SB%      600 non-null    float64
 19  XBT%     600 non-null    float64
 20  Playoff  600 non-null    object 
dtypes: float64(19), 

In [5446]:
X_RF = RF_Data.drop('Playoff', axis=1)
Y_RF = RF_Data.Playoff

In [5447]:
RF_ml = RandomForestClassifier( n_estimators=50, oob_score=True, bootstrap=True, min_samples_split=5)
RF_X_train, RF_X_test, RF_Y_train, RF_Y_test = train_test_split(X_RF,Y_RF, test_size=0.25)
RF_ml.fit(RF_X_train, RF_Y_train)

RandomForestClassifier(min_samples_split=5, n_estimators=50, oob_score=True)

In [5448]:
predicted_RF = RF_ml.predict(RF_X_test)
print(metrics.accuracy_score(predicted_RF, RF_Y_test))

0.7666666666666667


In [5449]:
RF_2022_Data = data2022.drop(['Tm','HardH%','EV','G'], axis = 1)
RF_preds=RF_ml.predict(RF_2022_Data)

In [5450]:
RF_2022_Predictions = pd.DataFrame({'2022 Teams': data2022.Tm, 'Playoff Predictions':RF_preds})
RF_2022_Predictions.loc[RF_2022_Predictions['Playoff Predictions']=='Yes']

Unnamed: 0,2022 Teams,Playoff Predictions
13,Los Angeles Dodgers,Yes
17,New York Mets,Yes
18,New York Yankees,Yes
24,San Francisco Giants,Yes
25,St. Louis Cardinals,Yes
