In [45]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [46]:
# Goal: Predict the World Series (WS) outcome (win/loss) for each playoff team from its regular-season performance.

# Data: Teams.csv - database of every major league team season from 1871-2021, with various stats and a WSWin flag ('Y' or 'N').

# Methods: Testing Logistic Regression, SVM Classifier, Trees, Random Forests

# Results: 

# Teams.csv from the chadwickbureau/baseballdatabank GitHub repo (master branch).
TEAMS_CSV_URL = 'https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/Teams.csv'
teams = pd.read_csv(TEAMS_CSV_URL)

# Display the column index so the available stats are visible below.
teams.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro'],
      dtype='object')

In [47]:

# Build the modelling table: playoff teams from 1970 onward, with a numeric target.
# NOTE: this cell overwrites `teams`, so it must be run exactly once after the load
# cell (re-running it fails because DivWin/WCWin have already been dropped).

# Keep seasons from 1970 onward.
teams = teams[teams['yearID'] >= 1970]

# Keep only teams that made the playoffs (division winner or wild-card winner).
teams = teams[(teams['DivWin'] == "Y") | (teams['WCWin'] == "Y")]

# Columns that are not used downstream.
dropped_cols = ['Rank', 'G', 'Ghome', 'teamIDBR', 'teamIDlahman45', 'teamIDretro',
                'DivWin', 'WCWin', 'LgWin', 'name', 'park']
teams = teams.drop(columns=dropped_cols)

# Encode the target: N -> 0, Y -> 1.
# Series.replace() on an object column only produces an integer dtype through
# implicit downcasting, which pandas deprecated in 2.2. An explicit map + astype
# keeps WSWin numeric regardless of the pandas version.
wswin_encoded = teams['WSWin'].map({'N': 0, 'Y': 1})

# map() yields NaN for anything other than 'N'/'Y'; fail loudly here instead of
# passing NaN labels on to the classifiers.
if wswin_encoded.isna().any():
    raise ValueError("WSWin contains values other than 'Y'/'N'")

teams['WSWin'] = wswin_encoded.astype('int64')

In [48]:
# Replace categorical variables with dummy 0/1 variables/vectors
#teams = pd.get_dummies(teams)

# Remove redundant column with 1/0 for WSWin = N/Y in favor of the opposite column.
#teams.drop('WSWin_N',inplace=True,axis=1)

In [41]:
# Count missing values in each column of the current `teams` frame.
missing_per_column = teams.isna().sum()
missing_per_column

yearID        0
lgID          0
teamID        0
franchID      0
divID         0
W             0
L             0
WSWin         0
R             0
AB            0
H             0
2B            0
3B            0
HR            0
BB            0
SO            0
SB            0
CS            0
HBP           0
SF            0
RA            0
ER            0
ERA           0
CG            0
SHO           0
SV            0
IPouts        0
HA            0
HRA           0
BBA           0
SOA           0
E             0
DP            0
FP            0
attendance    0
BPF           0
PPF           0
dtype: int64

In [49]:
# Categorical identifier columns handled differently in the two model variants.
id_columns = ["lgID", "franchID", "divID"]

# Variant 1: identifier columns removed entirely.
teams_noID = teams.drop(columns=id_columns)

# Variant 2: identifier columns expanded into one indicator column per value.
# This rebinds `teams`, so the cell cannot be re-run without re-running the cells above.
teams = pd.get_dummies(teams, columns=id_columns)

In [50]:
# Preview the first five rows of the one-hot encoded frame.
teams.head(n=5)

Unnamed: 0,yearID,teamID,W,L,WSWin,R,AB,H,2B,3B,...,franchID_SEA,franchID_SFG,franchID_STL,franchID_TBD,franchID_TEX,franchID_TOR,franchID_WSN,divID_C,divID_E,divID_W
1542,1970,BAL,108,54,1,792,5545,1424,213,25,...,0,0,0,0,0,0,0,0,1,0
1547,1970,CIN,102,60,0,775,5540,1498,253,45,...,0,0,0,0,0,0,0,0,0,1
1553,1970,MIN,98,64,0,744,5483,1438,230,41,...,0,0,0,0,0,0,0,0,0,1
1560,1970,PIT,89,73,0,729,5637,1522,235,70,...,0,0,0,0,0,0,0,0,1,0
1566,1971,BAL,101,57,0,742,5303,1382,207,25,...,0,0,0,0,0,0,0,0,1,0


In [53]:
# Feature matrix / target for the variant WITH one-hot team/lg/div indicators.
# No model is trained in this cell; it only builds X and y.

# Target: 1 if the team won the World Series, else 0.
y = teams['WSWin']

# Features: everything except the year, the team code and the target itself.
X = teams.drop(columns=['yearID', 'teamID', 'WSWin'])

# Sanity check, placed last so the notebook actually displays it
# (a bare expression in the middle of a cell is evaluated and discarded).
X.shape, y.shape

In [54]:
# Feature matrix / target for the variant WITHOUT team/lg/div identifier columns.
# No model is trained in this cell; it only builds X_n and y_n.

# Target: 1 if the team won the World Series, else 0.
y_n = teams_noID['WSWin']

# Features: everything except the year, the team code and the target itself.
X_n = teams_noID.drop(columns=['yearID', 'teamID', 'WSWin'])

# Sanity check, placed last so the notebook actually displays it
# (a bare expression in the middle of a cell is evaluated and discarded).
X_n.shape, y_n.shape

In [62]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# 5-fold stratified CV without shuffling (the scikit-learn defaults, spelled out).
# NOTE(review): the rows appear to be in chronological order, so unshuffled folds
# are roughly blocks of consecutive seasons -- confirm that this is intended.
skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Estimators for the feature set that includes the one-hot identifier columns.
# random_state is fixed so the cross-validation scores are reproducible across runs:
# the forest is randomized, and liblinear also uses the seed when shuffling the data.
clf = RandomForestClassifier(max_depth=3, random_state=42)
lreg = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)

In [63]:
# Estimators for the feature set without the identifier columns.
# random_state is fixed so the cross-validation scores are reproducible across runs:
# the forest is randomized, and liblinear also uses the seed when shuffling the data.
clf_n = RandomForestClassifier(max_depth=3, random_state=42)
lreg_n = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)

In [64]:

# Cross-validated average precision on the feature set WITH identifier columns.
# Prints one array of per-fold scores per model: random forest first, then
# logistic regression. (An accuracy-scored variant was tried earlier and is disabled.)
for estimator in (clf, lreg):
    clf_scores = cross_val_score(estimator, X, y, cv=skf, scoring='average_precision')
    print(clf_scores)

[0.21284852 0.12140013 0.19618526 0.26452973 0.28762961]
[0.22514985 0.13366392 0.12888417 0.29329071 0.37745292]


In [65]:
# Cross-validated average precision on the feature set WITHOUT identifier columns.
# Prints one array of per-fold scores per model: random forest first, then
# logistic regression. (An accuracy-scored variant was tried earlier and is disabled.)
for estimator in (clf_n, lreg_n):
    clf_scores = cross_val_score(estimator, X_n, y_n, cv=skf, scoring='average_precision')
    print(clf_scores)

[0.21741861 0.12462398 0.17967909 0.2359369  0.27821974]
[0.20187634 0.13061482 0.1386097  0.32196003 0.45235482]
