In [362]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc.)
# 2) Begin Training Models
#     a) Decision Tree
#     b) Naive Bayes
#     c) Logistic Regression
#     d) SVM
# 3) Testing Models
# 4) New Iterations

In [363]:
# Imports
from Models import ModelUtil
import importlib
from Data import Preprocessing
from sklearn.model_selection import train_test_split
from Logs import logging as logs
import pandas
import numpy
import math
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

In [365]:
# 1) Load all data from preprocessing
# Re-import the project modules (in this order) so that edits made on disk
# after the kernel started are picked up before the pipeline runs.
for _mod in (Preprocessing, ModelUtil, logs):
    importlib.reload(_mod)

# Show only 5 rows when pandas truncates a DataFrame repr.
pandas.set_option('display.min_rows', 5)

# Build the processed frame, then split it into the infield / outfield subsets.
# Each subset is later indexed as subset[0] (frame) and subset[1] (column selector).
dataFrame = Preprocessing.dataProcessing()
infield, outfield = Preprocessing.dataFiltering(dataFrame)


Infield Data: (No Pitcher / Batter IDs)


Unnamed: 0,PitcherThrows_Right,PitcherThrows_Left,BatterSide_Right,BatterSide_Left,TaggedPitchType_ChangeUp,TaggedPitchType_Curveball,TaggedPitchType_Cutter,TaggedPitchType_Fastball,TaggedPitchType_FourSeamFastBall,TaggedPitchType_Sinker,TaggedPitchType_Slider,TaggedPitchType_Splitter,TaggedHitType_GroundBall,ZoneSpeed,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,RelSpeed,Direction,Distance
148,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.618983,0.628555,0.396940,0.740884,0.558973,0.612908,0.676860,0.170838
429,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.460569,0.681070,0.368010,0.513916,0.762501,0.403231,0.978072,0.284395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11037,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.143092,0.542205,0.571930,0.336653,0.688333,0.291923,0.223692,0.096578
11073,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.602029,0.607562,0.799601,0.720504,0.694682,0.615386,0.280237,0.318906



Outfield Data: (No Pitcher / Batter IDs)


Unnamed: 0,PitcherThrows_Right,PitcherThrows_Left,BatterSide_Right,BatterSide_Left,TaggedPitchType_ChangeUp,TaggedPitchType_Curveball,TaggedPitchType_Cutter,TaggedPitchType_Fastball,TaggedPitchType_FourSeamFastBall,TaggedPitchType_Sinker,TaggedPitchType_Slider,TaggedPitchType_Splitter,TaggedHitType_FlyBall,TaggedHitType_LineDrive,ZoneSpeed,PlateLocHeight,PlateLocSide,VertApprAngle,HorzApprAngle,RelSpeed,Direction,Distance
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.678734,0.322793,0.503922,0.623942,0.574357,0.697695,0.539397,0.708019
27,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.731690,0.603275,0.535222,0.719754,0.547786,0.739539,0.519547,0.548674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11035,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.579952,0.496505,0.315830,0.630448,0.555952,0.577488,0.401207,0.584206
11100,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.679112,0.492692,0.624825,0.785695,0.723216,0.701936,0.492214,0.458072


In [366]:
# 2) Training Models
# Targets are the two hit-outcome columns; features are whatever columns
# infield[1] selects. Both are taken from the frame stored in infield[0].
Y = infield[0][['Direction', 'Distance']]
X = infield[0][infield[1]]

# Cross-validation settings.
# The fold count is a small constant on purpose: tying it to X.shape[0] gives
# one fold per row (leave-one-out), which stores a near-full copy of X and Y
# per row and triggers one complete ModelUtil.runRFR run per row downstream.
N_SPLITS = 5
# Fixed seed so the shuffled folds are identical on every Restart & Run All.
CV_SEED = 11

# Per-fold splits, index-aligned: fold i is
# (trainIn[i], trainOut[i], testIn[i], testOut[i]).
trainIn = []
trainOut = []
testIn = []
testOut = []

# KFold:
# n_splits may not exceed the number of rows, hence the min(); KFold itself
# raises ValueError if fewer than 2 splits are requested.
kf = KFold(n_splits=min(N_SPLITS, len(X)), shuffle=True, random_state=CV_SEED)
for train_index, test_index in kf.split(X):
    trainIn.append(X.iloc[train_index, :])
    trainOut.append(Y.iloc[train_index, :])
    testIn.append(X.iloc[test_index, :])
    testOut.append(Y.iloc[test_index, :])

# TODO GroupKFold: (avoids putting data from the same group in the test set -- useful for Pitcher/Batter ID)

# NOTE(review): the classifier cells further down are called with
# xTrain / xTest / yTrain / yTest, which this cell does not create. The old
# hold-out split is kept here for reference (its `_in` / `_out` inputs are not
# defined in any visible cell):
#xTrain, xTest, yTrain, yTest = train_test_split(_in, _out, test_size=0.25, random_state=11)

In [367]:
# z) RandomForestRegressor
# Run ModelUtil.runRFR once per cross-validation fold and keep every fold's
# (direction, distance) pair. Without the list, each iteration overwrote the
# previous one and only the final fold's result survived the loop.
rfrResults = []
for foldTrainIn, foldTrainOut, foldTestIn, foldTestOut in zip(trainIn, trainOut, testIn, testOut):
    direction, distance = ModelUtil.runRFR(foldTrainIn, foldTrainOut, foldTestIn, foldTestOut)
    rfrResults.append((direction, distance))

# WARNING: this takes an extremely long time. A single loop iteration took
# about 200 minutes on a powerful machine; the recorded output shows each call
# fitting 5 folds for each of 15840 candidates (79200 fits).
# The search absolutely needs to be trimmed down for real CV training; this
# run was only an experiment.

Fitting 5 folds for each of 15840 candidates, totalling 79200 fits
Fitting 5 folds for each of 15840 candidates, totalling 79200 fits


KeyboardInterrupt: 

In [151]:
# a) Decision Tree
# Hyperparameters are placeholders; they still need to be tuned for the best case.
max_depth, max_features, max_leaf_nodes = 25, 10, 55

# NOTE(review): the recorded run of this cell failed with
# "ValueError: Unknown label type: continuous", i.e. a classifier was given a
# continuous target. xTrain / yTrain / xTest / yTest are also not defined in
# any visible cell -- confirm where they are supposed to come from.
result = ModelUtil.runDT(
    xTrain, yTrain, xTest, yTest,
    max_depth, max_features, max_leaf_nodes,
)

training decision tree model...


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [64]:
# b) Naive Bayes
# Smoothing term handed straight to ModelUtil.runNB.
var_smoothing = 1e-9
# NOTE(review): `test` is not read by any visible cell -- confirm it is still needed.
test = True

result = ModelUtil.runNB(xTrain, yTrain, xTest, yTest, var_smoothing)

# result[1] is reported as the training accuracy, result[2] as the testing accuracy.
print("Training Accuracy: ", result[1], sep="\n")
print("Testing Accuracy: ", result[2], sep="\n")

training Naive Bayes model...
done!
getting statistics...
done!
Training Accuracy: 
0.39375
Testing Accuracy: 
0.2803738317757009


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


In [65]:
# c) Logistic Regression
# Presumably `lr` is the learning rate and `e` the epoch / iteration budget --
# TODO confirm against ModelUtil.runLogReg.
# NOTE(review): the recorded run emitted a "TOTAL NO. of ITERATIONS REACHED
# LIMIT" convergence warning with these values.
lr, e = 0.8, 100

result = ModelUtil.runLogReg(xTrain, yTrain, xTest, yTest, lr, e)

# result[1] is reported as the training accuracy, result[2] as the testing accuracy.
print("Training Accuracy: ", result[1], sep="\n")
print("Testing Accuracy: ", result[2], sep="\n")

training logistic regression model...
done!
getting statistics...
logging statistics...
done!
Training Accuracy: 
0.340625
Testing Accuracy: 
0.2523364485981308


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


In [66]:
# d) SVM
# Settings forwarded positionally to ModelUtil.runSVM in the order
# (rC, kernel, degree, gamma, coef0).
# NOTE(review): if these are passed on to scikit-learn's SVC, `degree` and
# `coef0` have no effect with a linear kernel -- confirm in ModelUtil.
rC, coef0 = 1, 0.0
kernel, degree, gamma = 'linear', 1, 'scale'

result = ModelUtil.runSVM(
    xTrain, yTrain, xTest, yTest,
    rC, kernel, degree, gamma, coef0,
)

# result[1] is reported as the training accuracy, result[2] as the testing accuracy.
print("Training Accuracy: ", result[1], sep="\n")
print("Testing Accuracy: ", result[2], sep="\n")

training SVM model...
done!
getting statistics...
done!
Training Accuracy: 
0.353125
Testing Accuracy: 
0.2523364485981308


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


In [None]:
# 3) Model Testing:

In [None]:
# 4) Model Iterations and Improvements