In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Goal: Predict WS (World Series) outcome (Win/Loss) given playoff team and regular season performance.

# Data: Teams.csv - db with every major league team from 1871-2016, various stats, and WSWin = Y or N.

# Method: Logistic regression on data with various numerical+categorical predictors

# Results: The model correctly predicts a playoff team's WS outcome for 83.56% of the held-out test teams.

# Read the team-season table; the path is relative to the current working directory.
teams_csv_path = "core/Teams.csv"
teams = pd.read_csv(teams_csv_path)

# Show the column index so the available fields are listed in the cell output.
teams.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B',
       '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA',
       'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP',
       'name', 'park', 'attendance', 'BPF', 'PPF', 'teamIDBR',
       'teamIDlahman45', 'teamIDretro'],
      dtype='object')

In [3]:
# Row mask: seasons from 1903 onward, i.e. once the WS was established.
ws_era = teams['yearID'] >= 1903

# Row mask: teams that reached the playoffs, as a division winner or a wild card.
made_playoffs = (teams['DivWin'] == "Y") | (teams['WCWin'] == "Y")

# Apply both conditions together; row order is unchanged.
# NOTE(review): the champions table shown later in this notebook starts at 1969,
# so earlier seasons appear to be excluded by this filter and/or the later
# dropna - confirm that this is intended.
teams = teams[ws_era & made_playoffs]

# Optional stricter filter (currently disabled): keep only teams that reached
# the World Series by additionally requiring teams['LgWin'] == "Y".

In [4]:
# Columns of `teams` used as model inputs for predicting a WS win.
WS_Predictors = [
    'yearID', 'lgID', 'divID',
    'R', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB',
    'ERA', 'CG',
]

# Target column; WSWin holds "Y" or "N".
WS_Win = ['WSWin']

In [5]:
# Gather the predictor and target columns into one frame and discard every
# team-season that has a missing value in any of them.
data = teams[WS_Predictors + WS_Win].dropna(axis=0)

# One-hot encode the categorical (string) columns exactly once; the original
# cell called get_dummies twice, and the second call had nothing left to encode.
# dtype=int keeps the dummy columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce booleans).
data = pd.get_dummies(data, dtype=int)

# WSWin_N is the exact complement of WSWin_Y, so only WSWin_Y is kept as the target.
data = data.drop(columns='WSWin_N')

# Separate predictors from target.
X = data.drop(columns='WSWin_Y')
y = data['WSWin_Y']

## TODO ##
## 1. Try to make y data be the next year's WS Win
## 2. Separate years in train/test data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
# Logistic-regression classifier, constructed with the library's default settings.
model = LogisticRegression()

In [9]:
# Fit on the training split, then predict WSWin_Y for the held-out teams.
# fit() returns the fitted estimator, so the two steps can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [10]:
# mean_squared_error is no longer used in this cell; the import is kept in case
# later cells rely on it.
from sklearn.metrics import accuracy_score, mean_squared_error

# Fraction of held-out teams whose WS outcome was predicted correctly.
# For 0/1 labels this is the same quantity as the previous `1 - MSE`, but it is
# computed with the classification metric meant for it. It is an accuracy, not
# "odds".
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.8356164383561644

In [13]:
# Rows for the teams that actually won the World Series.
win_data = data.loc[data['WSWin_Y'] == 1]

# Their predictor columns only (target removed), displayed as the cell output.
X_win = win_data.drop(columns='WSWin_Y')
X_win
# Possible next step (currently disabled): score these rows with
# model.predict(X_win) and store the result as win_pred.


Unnamed: 0,yearID,R,H,2B,3B,HR,BB,SO,SB,ERA,CG,lgID_AL,lgID_NL,divID_C,divID_E,divID_W
1532,1969,632,1311,184,41,109,527,1089.0,66.0,2.99,51,0,1,0,1,0
1542,1970,792,1424,213,25,179,717,952.0,84.0,3.15,60,1,0,0,1,0
1584,1971,788,1555,223,61,154,469,919.0,65.0,3.31,43,0,1,0,1,0
1606,1972,604,1248,195,29,134,463,886.0,87.0,2.58,42,1,0,0,0,1
1630,1973,758,1431,216,28,147,595,919.0,128.0,3.29,46,1,0,0,0,1
1654,1974,689,1315,205,37,132,568,876.0,164.0,2.95,49,1,0,0,0,1
1667,1975,840,1515,278,37,124,691,916.0,168.0,3.37,22,0,1,0,0,1
1691,1976,857,1599,271,63,141,681,902.0,210.0,3.51,33,0,1,0,0,1
1724,1977,831,1576,267,47,184,533,681.0,93.0,3.61,52,1,0,0,1,0
1750,1978,735,1489,228,38,125,505,695.0,98.0,3.18,39,1,0,0,1,0
