In [None]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc)
# 2) Begin Training Models
    #  a) Decision Tree
    #  b) Naive Bayes
    #  c) Logistic Regression
    #  d) SVM
# 3) Testing Models
# 4) New Iterations

In [48]:
# Imports
from Models import ModelUtil
from Data import Preprocessing, DataUtil
from Visualization import VisualUtil
from Logs import logging as logs
from sklearn.ensemble import AdaBoostClassifier

import importlib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
import configparser
import numpy as np

config = configparser.ConfigParser()
config.read('Data//config.ini')

importlib.reload(Preprocessing)
importlib.reload(ModelUtil)
importlib.reload(VisualUtil)
importlib.reload(logs)

import warnings
warnings.filterwarnings("ignore")

In [56]:
# 1) Load all data from preprocessing 
importlib.reload(Preprocessing)

if("False" in config['DATA']['USE_NEW_PREPROCESSING']):
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering([], False)
    print(len(infieldDataFrame))
    specific_columns = ["PitcherThrows", "BatterSide", "TaggedPitchType", "PlateLocHeight", "PlateLocSide", "ZoneSpeed", "SpinRate", "RelSpeed", "HorzBreak", "VertBreak", "HorzBreak", "VertBreak"]
    df = infieldDataFrame.dropna(axis=0, how='any',subset=specific_columns)
    infieldDataFrame = DataUtil.expungeData(infieldDataFrame)

    print(len(infieldDataFrame))
    print(len(df))
else:
    normDataFrame = Preprocessing.dataProcessing()
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering(normDataFrame, True)

56285
1103
56283


In [None]:
infieldcorrmatrix = infieldDataFrame.corr()
infieldcorrmatrix.to_csv('Infield_Correlation_Matrix')
outfieldcorrmatrix = outfieldDataFrame.corr()
outfieldcorrmatrix.to_csv('Outfield_Correlation_Matrix')

In [50]:
importlib.reload(Preprocessing)

if("False" in config['DATA']['USE_NEW_PREPROCESSING']):
    Y = infieldDataFrame["FieldSlice"]
    X = infieldDataFrame[["PitcherThrows", "BatterSide", "TaggedPitchType", "PlateLocHeight", "PlateLocSide", "ZoneSpeed", "SpinRate", "RelSpeed", "HorzBreak", "VertBreak", "HorzBreak", "VertBreak"]] 
    X = DataUtil.normalizeData(X)
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.25, random_state=11)
    # adb = AdaBoostClassifier()
    # adb_model = adb.fit(xTrain, yTrain)

    # calculate split information:
    trainingClassSplit = [0, 0, 0, 0, 0]
    for i in yTrain:
        trainingClassSplit[i-1] += 1

    
    testingClassSplit = [0, 0, 0, 0, 0]
    for i in yTest:
        testingClassSplit[i-1] += 1

    trainingClassPercent = []
    for i in trainingClassSplit:
        trainingClassPercent.append(round(i/len(yTrain), 4))

    testingClassPercent = []
    for i in testingClassSplit:
        testingClassPercent.append(round(i/len(yTest), 4))

    print("Training Class Splits (count, then percentage):")
    print(trainingClassSplit)
    print(trainingClassPercent)
    print("\nTesting Class Splits (count, then percentage):")
    print(testingClassSplit)
    print(testingClassPercent)
else:
    infieldY = infieldDataFrame[0][['Direction','Distance']]
    infieldX = infieldDataFrame[0][infieldDataFrame[1]] 
    if("True" in config['SPLIT']['TTS']):
        xTrain, xTest, yTrain, yTest = train_test_split(infieldX, infieldY, test_size=0.20, random_state=11)
        
    elif("True" in config['SPLIT']['KFold']):
        kf = KFold(n_splits=5, shuffle=True)
        for train_index, test_index in kf.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    elif("True" in config['SPLIT']['LOOCV']):
        loo = LeaveOneOut()
        for train_index, test_index in loo.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    else:
        print("No Splitting Method Selected")
        

# GroupKFold: (avoids putting data from the same group in the test set -- useful for Pitcher/Batter ID when we implement that.)

Training Class Splits (count, then percentage):
[43, 240, 245, 259, 40]
[0.052, 0.2902, 0.2963, 0.3132, 0.0484]

Testing Class Splits (count, then percentage):
[12, 82, 92, 77, 13]
[0.0435, 0.2971, 0.3333, 0.279, 0.0471]


In [None]:
# 2) Training Models

In [51]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# a) Decision Tree
# Need to test these hyperparameters for best case
max_depth = 50
max_features = 30
max_leaf_nodes = 150
dtOutput = ModelUtil.runDT(xTrain, yTrain, xTest, yTest, max_depth, max_features, max_leaf_nodes)


training decision tree model...
done!
getting statistics...

printing statistics...
Model Type: DecisionTree

Training Size = 827
Testing Size = 276

Training Accuracy = 0.7617896009673518
Testing Accuracy = 0.33695652173913043

Training Average Error = 0.3373639661426844
Testing Average Error = 0.9456521739130435

Training Recall = [0.4883720930232558, 0.8458333333333333, 0.6857142857142857, 0.8301158301158301, 0.575]
Testing Recall = [0.0, 0.43902439024390244, 0.21739130434782608, 0.4675324675324675, 0.07692307692307693]

Training f1 (micro, macro, weighted) = [0.7617896009673518, 0.7156231158853819, 0.7637479787151689]
Testing f1 (micro, macro, weighted) = [0.33695652173913043, 0.2296120651037778, 0.3502766326258999]

Training auc (macro, weighted) = [0.9996622937557913, 0.999563637310118]
Testing auc (macro, weighted) = [0.9997586469474807, 0.99978266823396]

Hyper-Parameters: 

Max Tree Depth: 50
Max Tree Features: 30
Max Leaf Nodes: 150

Accuracy Score for Predicting on Training 

In [47]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# b) Naive Bayes

var_smoothing = 1e-9
nbOutput = ModelUtil.runNB(xTrain, yTrain, xTest, yTest, var_smoothing)

training Naive Bayes model...


ValueError: Input X contains NaN.
GaussianNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [9]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# c)Logistic Regression
lr = 0.8
e = 100
logRegOutput = ModelUtil.runLogReg(xTrain, yTrain, xTest, yTest, lr, e)

training logistic regression model...
done!
getting statistics...
logging statistics...
printing statistics...
Model Type: LogisticRegression

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.3314144931656125
Testing Accuracy = 0.3384735645252985

Training Average Error = 0.9623812569587568
Testing Average Error = 0.9481949971574758

Training Recall = [0.20945853324194655, 0.668095194674934, 0.05701584847313491, 0.4630437345278793, 0.1205078125]
Testing Recall = [0.20692399522483088, 0.6771891298380456, 0.06011088415523782, 0.4758764317945158, 0.12375621890547264]

Training f1 (micro, macro, weighted) = [0.3314144931656125, 0.27340808292595487, 0.38027268361718713]
Testing f1 (micro, macro, weighted) = [0.3384735645252985, 0.2789676953666368, 0.3876241066165289]

Training auc (macro, weighted) = [0.9710129774331578, 0.9551868857948491]
Testing auc (macro, weighted) = [0.9695586143919004, 0.9528372922912169]

Hyper-Parameters: 

Learning Rate: 0.8
Epochs: 100

Accuracy 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActua

In [10]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# d) SVM
rC = 1
kernel='linear'
degree= 1
gamma= 'scale'
coef0= 0.0
svmOutput = ModelUtil.runSVM(xTrain, yTrain, xTest, yTest, rC, kernel, degree, gamma, coef0)

training SVM model...
done!
getting statistics...
logging statistics...


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


printing statistics...
Model Type: SVM

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.31333949257337784
Testing Accuracy = 0.32227117680500283

Training Average Error = 0.9562220169142207
Testing Average Error = 0.9404491188175099

Training Recall = [0.0, 0.8033190480532506, 0.0, 0.5206884356949193, 0.0]
Testing Recall = [0.0, 0.8188306340927807, 0.0, 0.5387018396390142, 0.0]

Training f1 (micro, macro, weighted) = [0.31333949257337784, 0.16828958750340162, 0.43098074084009164]
Testing f1 (micro, macro, weighted) = [0.32227117680500283, 0.17354902523824683, 0.44205483936829404]

Training auc (macro, weighted) = Error
Testing auc (macro, weighted) = Error

Hyper-Parameters: 

Regularization Constant: 1
Kernel Type: linear
Kernel Degree1
Kernel Coefficient (gamma): scale
Independent Term in Kernel (coef0): 0.0

Accuracy Score for Predicting on Training Data: 0.9562
Accuracy Score for Predicting on Test Data: 0.3223


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]



Overall Average Probabilities
-------------------------------------
Section 1: 16.61%
Section 2: 26.53%
Section 3: 25.04%
Section 4: 20.27%
Section 5: 11.55%

Field Slice Counts for Training Data
--------------------------------------------------
Section	Truth	Prediction
1		7295		28007.0
2		10967		14206.0
3		10348		nan
4		8483		nan
5		5120		nan
Amount Correct: 13227
Amount Incorrect: 28986

Field Slice Counts for Testing Data
--------------------------------------------------
Section	Truth	Prediction
1		2513		9367.0
2		3643		4705.0
3		3427		nan
4		2881		nan
5		1608		nan
Amount Correct: 4535
Amount Incorrect: 9537
done!


In [None]:
# z) RandomForestRegressor
for i in range(0, len(trainIn)):
    direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])

In [42]:
# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

print("Testing Output: ")
# index of test value:
index = 6432
print(f"Actual Field Slice: \t\t{yTest.iloc[index]}")

print("\nDecision Tree:")
print(f"Predicted Field Slice: \t\t{dt.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{dt.predict_proba([xTest.iloc[index]])[0]}")

print("\nNaive Bayes:")
print(f"Predicted Field Slice: \t\t{nb.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{nb.predict_proba([xTest.iloc[index]])[0]}")

print("\nLogistic Regression:")
print(f"Predicted Field Slice: \t\t{logReg.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{logReg.predict_proba([xTest.iloc[index]])[0]}")

# print("\nSVM:")
# print(f"Predicted Field Slice: \t\t{svm.predict([xTest.iloc[index]])[0]}")
# print(f"Field Slice Probabilities: \t{svm.predict_proba([xTest.iloc[index]])[0]}")

averageProbs = dt.predict_proba([xTest.iloc[index]])[0] + nb.predict_proba([xTest.iloc[index]])[0] + logReg.predict_proba([xTest.iloc[index]])[0] # + svm.predict_proba([xTest.iloc[index]])[0]
averageProbs = averageProbs / 3 

print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
print(f"Field Slice AVG Probabilities: \t{averageProbs}")

VisualUtil.visualizeData(averageProbs, [1])

Testing Output: 
Actual Field Slice: 		2

Decision Tree:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.30976172 0.38431975 0.17063797 0.08378171 0.05149885]

Naive Bayes:
Predicted Field Slice: 		1
Field Slice Probabilities: 	[0.50913324 0.26158801 0.11318565 0.06351724 0.05257585]

Logistic Regression:
Predicted Field Slice: 		1
Field Slice Probabilities: 	[0.37627526 0.30511483 0.17419319 0.09638931 0.0480274 ]


AVG Prediction: 		1
Field Slice AVG Probabilities: 	[0.39839008 0.31700753 0.15267227 0.08122942 0.0507007 ]


In [40]:
# Gather data on average predictions
length = len(xTest)
correct = 0
incorrect = 0
wrongProbs = 0
wrongDistance = 0

dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]
for index in range(length):
    averageProbs = dt.predict_proba([xTest.iloc[index]])[0] + nb.predict_proba([xTest.iloc[index]])[0] + logReg.predict_proba([xTest.iloc[index]])[0] #+ svm.predict_proba([xTest.iloc[index]])[0]
    averageProbs = averageProbs / 3

    actual = yTest.iloc[index]
    predicted = np.argmax(averageProbs)+1

    percentageActual = averageProbs[actual-1]

    if predicted == actual:
        correct += 1
    else:
        incorrect += 1 
        wrongProbs += percentageActual
        wrongDistance += abs(actual-predicted)


# Correct Prediction Count
print(correct)
# Incorrect Prediction Count
print(incorrect)
# Average probability of actual slice when guess is incorrect
print(wrongProbs/incorrect)
# Average distance from actual slice to guess slice when guess is incorrect
print(wrongDistance/incorrect)

4746
9326
0.21237108290236043
1.4551790692687112


In [None]:
# 4) Model Iterations and Improvements

In [307]:
# 5) Data Visualization

# Temporary method of getting percentages for testing purposes
infieldPercentages  = np.random.dirichlet(np.ones(4), size=1)[0]
outfieldPercentages = np.random.dirichlet(np.ones(1), size=1)[0]

VisualUtil.visualizeData(infieldPercentages, outfieldPercentages)
