# Setup

In [293]:
# Common imports
import numpy as np
import pandas as pd

np.random.seed(42)

# Read and Prepare the Data:

In [294]:
#Readind the data from the file
bbgame = pd.read_csv("baseball.csv")
bbgame.head()

Unnamed: 0,attendance_binary,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
0,0,43683,2,6,2,Night Game,Day Game,0,6,6,Wednesday,Monday,55,24,Overcast,2.933333,1
1,0,45785,0,7,2,Night Game,Day Game,0,10,3,Wednesday,Monday,48,7,Unknown,2.8,1
2,0,48282,0,8,4,Night Game,Day Game,2,4,3,Wednesday,Monday,65,10,Cloudy,3.383333,0
3,0,21830,0,9,6,Day Game,Night Game,0,15,11,Wednesday,Tuesday,77,0,In Dome,3.233333,1
4,0,49289,2,4,2,Night Game,Day Game,1,1,3,Tuesday,Monday,81,12,Cloudy,2.633333,1


In [295]:
#Creating a test and train split
from sklearn.model_selection import train_test_split

train, test = train_test_split(bbgame, test_size=0.3)

In [296]:
#Checking for null values
train_set.isna().sum()
test_set.isna().sum()

attendance_binary            0
previous_attendance          0
previous_away_team_errors    0
previous_away_team_hits      0
previous_away_team_runs      0
game_type                    0
previous_game_type           0
previous_home_team_errors    0
previous_home_team_hits      0
previous_home_team_runs      0
game_day                     0
previous_game_day            0
temperature                  0
wind_speed                   0
sky                          0
previous_game_duration       0
previous_homewin             0
dtype: int64

In [297]:
#Imports for data transformation wing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [298]:
#Sepetating the Dependent Variable
train_y = train['attendance_binary']
test_y = test['attendance_binary']

train_inputs = train.drop(['attendance_binary'], axis=1)
test_inputs = test.drop(['attendance_binary'], axis=1)

In [299]:
train_inputs.dtypes

previous_attendance            int64
previous_away_team_errors      int64
previous_away_team_hits        int64
previous_away_team_runs        int64
game_type                     object
previous_game_type            object
previous_home_team_errors      int64
previous_home_team_hits        int64
previous_home_team_runs        int64
game_day                      object
previous_game_day             object
temperature                    int64
wind_speed                     int64
sky                           object
previous_game_duration       float64
previous_homewin               int64
dtype: object

In [300]:
# Identifying the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [301]:
# Identifying the binary columns so we can pass them through without transforming
binary_columns = ['previous_homewin']

In [302]:
#Removing the binary columns from numerical columns.

for col in binary_columns:
    numeric_columns.remove(col)

In [303]:
binary_columns

['previous_homewin']

In [304]:
numeric_columns

['previous_attendance',
 'previous_away_team_errors',
 'previous_away_team_hits',
 'previous_away_team_runs',
 'previous_home_team_errors',
 'previous_home_team_hits',
 'previous_home_team_runs',
 'temperature',
 'wind_speed',
 'previous_game_duration']

In [305]:
categorical_columns

['game_type', 'previous_game_type', 'game_day', 'previous_game_day', 'sky']

# Pipeline

In [306]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [307]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [308]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [309]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

# Transform: fit_transform() for TRAIN

In [310]:
#Fit and Transformation of the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-1.12371621,  0.57666325, -0.20719118, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.05787587, -0.72716391, -0.78017264, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.38394295,  0.57666325, -1.35315411, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.52534761,  0.57666325, -0.78017264, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.94392488, -0.72716391,  0.65228102, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98407336,  1.88049041, -0.49368191, ...,  1.        ,
         0.        ,  1.        ]])

In [311]:
train_x.shape

(1698, 37)

# Tranform: transform() for TEST

In [312]:
# Transformation of the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 0.0814842 ,  1.88049041, -1.06666338, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.17001331, -0.72716391, -1.63964484, ...,  0.        ,
         0.        ,  0.        ],
       [-0.29905768, -0.72716391, -0.20719118, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.49582715, -0.72716391,  1.22526249, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.26445059, -0.72716391, -0.20719118, ...,  0.        ,
         0.        ,  1.        ],
       [-1.71775245, -0.72716391, -1.63964484, ...,  0.        ,
         0.        ,  0.        ]])

In [313]:
test_x.shape

(729, 37)

## Baseline Accuracy

In [314]:
# Finding majority class
train_y.value_counts()

1    873
0    825
Name: attendance_binary, dtype: int64

In [315]:
# Finding the percentages
train_y.value_counts()/len(train_y)

1    0.514134
0    0.485866
Name: attendance_binary, dtype: float64

# SVM Model 1:

In [316]:
from sklearn.svm import LinearSVC 

svm_clf = LinearSVC(C=0.1)

svm_clf.fit(train_x, train_y)

LinearSVC(C=0.1)

## Calculating the Accuracy

In [317]:
from sklearn.metrics import accuracy_score

In [318]:
#Predict the train values
train_y_pred = svm_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8433451118963486

In [319]:
#Predict the test values
test_y_pred = svm_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.813443072702332

In [320]:
#Classification Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(test_y, test_y_pred)

array([[279,  64],
       [ 72, 314]])

In [321]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       343
           1       0.83      0.81      0.82       386

    accuracy                           0.81       729
   macro avg       0.81      0.81      0.81       729
weighted avg       0.81      0.81      0.81       729



# SVM Model 2:

In [322]:
from sklearn.preprocessing import PolynomialFeatures

# Creating second degree terms
poly_features = PolynomialFeatures(degree=2, include_bias=False)

train_x_poly = poly_features.fit_transform(train_x)
test_x_poly = poly_features.transform(test_x)

In [323]:
pol_svm = LinearSVC(C=0.1)

pol_svm.fit(train_x_poly, train_y)



LinearSVC(C=0.1)

In [324]:
#Predicting the train values
train_y_poly_pred = pol_svm.predict(train_x_poly)

#Train accuracy
accuracy_score(train_y, train_y_poly_pred)

0.9069493521790342

In [325]:
#Predicting the test values
test_y_poly_pred = pol_svm.predict(test_x_poly)

#Test accuracy
accuracy_score(test_y, test_y_poly_pred)

0.8120713305898491

# SVM Model 3:

In [326]:
from sklearn.svm import SVC

pol_svm2 = SVC(kernel="poly", degree=3, coef0=1, C=0.1, gamma='scale')

pol_svm2.fit(train_x, train_y)

SVC(C=0.1, coef0=1, kernel='poly')

In [327]:
#Predicting the train values
train_y_pred = pol_svm2.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y)

1.0

In [328]:
#Predicting the test values
test_y_pred = pol_svm2.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8189300411522634

# SVM Model 4:

In [361]:
rbf_svm = SVC(kernel="rbf", C=0.1, gamma='scale')

rbf_svm.fit(train_x, train_y)

SVC(C=0.1)

In [362]:
#Predicting the train values
train_y_pred = rbf_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8445229681978799

In [363]:
#Predicting the test values
test_y_pred = rbf_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8161865569272977

I have constructed 4 SVM models to see how each paramer tuned model performs in contrast to the base model. I have observed that the SVM model with Radial Basis Function Kernel performs slightly better than the standard SVM classification model. In summary SVM RBF returns an accuracy of 84.4 for train and 81.6 for test. The best model performs better than the baseline; the accuracy of base model is 51.4%  and SVM RBF returns an accuracy of 81.6.