In [None]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc.)
# 2) Begin Training Models
    #  a) Decision Tree
    #  b) Naive Bayes
    #  c) Logistic Regression
    #  d) SVM
# 3) Testing Models
# 4) New Iterations
# 5) Data Visualization

In [9]:
# Imports
from Models import ModelUtil
from Data import Preprocessing, DataUtil
from Visualization import VisualUtil
from Logs import logging as logs
from sklearn.ensemble import AdaBoostClassifier

import importlib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
import configparser
import numpy as np

# Run-time switches (the DATA and SPLIT sections are read by later cells) come from the ini file.
config = configparser.ConfigParser()
config.read('Data//config.ini')

# Reload the project modules, in this order, so on-disk edits are picked up by the running kernel.
for _projectModule in (Preprocessing, ModelUtil, VisualUtil, logs):
    importlib.reload(_projectModule)

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import warnings
# Suppress every warning raised from this point on.
# NOTE(review): the filter has no category/module restriction, so library warnings
# are hidden too — consider narrowing it to the specific warnings that are noise.
warnings.filterwarnings("ignore")

In [94]:
# 1) Load all data from preprocessing
importlib.reload(Preprocessing)

# The flag is tested by substring: any config value containing "False" selects the
# first branch, everything else selects the new preprocessing pipeline.
if "False" in config['DATA']['USE_NEW_PREPROCESSING']:
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering([], False)

    # Feature columns used for training. This is the single live definition; the two
    # earlier lists that used to precede it were overwritten before ever being read.
    specific_columns = [
        "PitcherThrows", "BatterSide", "TaggedPitchType",
        "RelSpeed", "InducedVertBreak", "HorzBreak",
        "RelHeight", "RelSide", "SpinAxis", "SpinRate",
        "VertApprAngle", "HorzApprAngle",
    ]

    # Drop rows with a NaN in any used column (only the infield frame is filtered).
    infieldDataFrame = infieldDataFrame.dropna(axis=0, how='any', subset=specific_columns)

else:
    # New pipeline: process first, then filter the processed frame.
    # NOTE(review): specific_columns is not defined on this path.
    normDataFrame = Preprocessing.dataProcessing()
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering(normDataFrame, True)

In [None]:
# Column-by-column correlation matrix of each frame, written to the working directory.
# The output file names carry no extension.
# NOTE(review): a later cell indexes infieldDataFrame with [0]/[1] on the new-preprocessing
# path, so .corr() presumably only applies on the legacy path — confirm.
infieldcorrmatrix = infieldDataFrame.corr()
infieldcorrmatrix.to_csv(path_or_buf='Infield_Correlation_Matrix')

outfieldcorrmatrix = outfieldDataFrame.corr()
outfieldcorrmatrix.to_csv(path_or_buf='Outfield_Correlation_Matrix')

In [95]:
importlib.reload(Preprocessing)
importlib.reload(DataUtil)


def _classCounts(labels, numClasses=5):
    """Count occurrences of each 1-based class label (label k goes to slot k-1)."""
    counts = [0] * numClasses
    for label in labels:
        counts[label - 1] += 1
    return counts


def _classShares(counts, total):
    """Express each count as a fraction of total, rounded to 4 decimals."""
    return [round(count / total, 4) for count in counts]


if "False" in config['DATA']['USE_NEW_PREPROCESSING']:
    Y = infieldDataFrame["FieldSlice"]

    # Un-normalized reference frame; a later cell passes it to normalizeData again.
    # It is an independent copy so nothing done to X can alter it.
    originalNotNormX = infieldDataFrame[specific_columns].copy() # pitcher averages
    # normalizeData receives its own copy as well, so the reference stays raw even if
    # the helper works in place.
    # NOTE(review): normalization runs before the split, so test rows take part in
    # whatever statistics normalizeData derives from its second argument — confirm.
    X = DataUtil.normalizeData(originalNotNormX.copy(), originalNotNormX)
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.25, random_state=11)
    # adb = AdaBoostClassifier()
    # adb_model = adb.fit(xTrain, yTrain)

    # calculate split information:
    trainingClassSplit = _classCounts(yTrain)
    testingClassSplit = _classCounts(yTest)
    trainingClassPercent = _classShares(trainingClassSplit, len(yTrain))
    testingClassPercent = _classShares(testingClassSplit, len(yTest))

    print("Training Class Splits (count, then percentage):")
    print(trainingClassSplit)
    print(trainingClassPercent)
    print("\nTesting Class Splits (count, then percentage):")
    print(testingClassSplit)
    print(testingClassPercent)
else:
    # On this path infieldDataFrame is indexed as a pair: element 0 is the frame,
    # element 1 selects its feature columns.
    infieldY = infieldDataFrame[0][['Direction','Distance']]
    infieldX = infieldDataFrame[0][infieldDataFrame[1]]
    if "True" in config['SPLIT']['TTS']:
        xTrain, xTest, yTrain, yTest = train_test_split(infieldX, infieldY, test_size=0.20, random_state=11)

    elif "True" in config['SPLIT']['KFold']:
        # Seeded like train_test_split above so the shuffled folds are reproducible.
        kf = KFold(n_splits=5, shuffle=True, random_state=11)
        # NOTE: each iteration overwrites the previous one, so only the LAST fold is
        # kept in xTrain/xTest/yTrain/yTest after the loop.
        for train_index, test_index in kf.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    elif "True" in config['SPLIT']['LOOCV']:
        loo = LeaveOneOut()
        # NOTE: as with KFold, only the final leave-one-out split survives the loop.
        for train_index, test_index in loo.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    else:
        # Fail here instead of letting later cells train on stale or undefined splits.
        raise ValueError("No Splitting Method Selected")


# GroupKFold: (avoids putting data from the same group in the test set -- useful for Pitcher/Batter ID when we implement that.)

Training Class Splits (count, then percentage):
[7280, 10890, 10257, 8471, 4956]
[0.1739, 0.2602, 0.2451, 0.2024, 0.1184]

Testing Class Splits (count, then percentage):
[2444, 3583, 3391, 2813, 1721]
[0.1752, 0.2568, 0.243, 0.2016, 0.1234]


In [None]:
# 2) Training Models

In [96]:
# a) Decision Tree
importlib.reload(ModelUtil)
importlib.reload(logs)

# Hyper-parameters; these still need to be tested to find the best case.
max_depth, max_features, max_leaf_nodes = 50, 30, 150

dtOutput = ModelUtil.runDT(
    xTrain, yTrain, xTest, yTest,
    max_depth, max_features, max_leaf_nodes,
)


training decision tree model...
done!
getting statistics...

printing statistics...
Model Type: DecisionTree

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.35301285420748313
Testing Accuracy = 0.310134747706422

Training Average Error = 0.9330768863191092
Testing Average Error = 0.9924025229357798

Training Recall = [0.24684065934065935, 0.5972451790633608, 0.26050502096129474, 0.377523314838862, 0.12187247780468119]
Testing Recall = [0.2311783960720131, 0.5428411945297237, 0.22087879681509878, 0.3238535371489513, 0.09122603137710633]

Training f1 (micro, macro, weighted) = [0.35301285420748313, 0.3169975897911577, 0.3711977042646309]
Testing f1 (micro, macro, weighted) = [0.310134747706422, 0.27553187540417035, 0.3283299900671702]

Training auc (macro, weighted) = [0.9831208645285029, 0.9794745302320497]
Testing auc (macro, weighted) = [0.9836491352523176, 0.9797677630983791]

Hyper-Parameters: 

Max Tree Depth: 50
Max Tree Features: 30
Max Leaf Nodes: 150

Accurac

In [97]:
# b) Naive Bayes
importlib.reload(ModelUtil)
importlib.reload(logs)

# Single hyper-parameter handed to runNB after the train/test splits.
var_smoothing = 1e-9

nbOutput = ModelUtil.runNB(
    xTrain, yTrain, xTest, yTest,
    var_smoothing,
)

training Naive Bayes model...
done!
getting statistics...
printing statistics...
Model Type: NaiveBayes

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.29825106321976397
Testing Accuracy = 0.2993836009174312

Training Average Error = 1.07201223300043
Testing Average Error = 1.0707425458715596

Training Recall = [0.3423076923076923, 0.3335169880624426, 0.2043482499756264, 0.3952307873922795, 0.18462469733656175]
Testing Recall = [0.3355155482815057, 0.32989115266536423, 0.21439103509289295, 0.3903306078919303, 0.20337013364323067]

Training f1 (micro, macro, weighted) = [0.29825106321976397, 0.28910707130782465, 0.30332729041691026]
Testing f1 (micro, macro, weighted) = [0.2993836009174312, 0.29335265563622887, 0.303461160114694]

Training auc (macro, weighted) = [0.9817572684001687, 0.9795056791054704]
Testing auc (macro, weighted) = [0.9825934785773457, 0.9801966947889239]

Hyper-Parameters: 

Var Smoothing: 1e-09

Accuracy Score for Predicting on Training Data: 1.0

In [98]:
# c) Logistic Regression
importlib.reload(ModelUtil)
importlib.reload(logs)

# lr and e are passed to runLogReg in that order; the logged run labels them
# "Learning Rate" and "Epochs".
lr, e = 0.8, 100

logRegOutput = ModelUtil.runLogReg(
    xTrain, yTrain, xTest, yTest,
    lr, e,
)

training logistic regression model...
done!
getting statistics...
printing statistics...
Model Type: LogisticRegression

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.3293353084531944
Testing Accuracy = 0.3268348623853211

Training Average Error = 0.9563243656520285
Testing Average Error = 0.9610808486238532

Training Recall = [0.184478021978022, 0.6742883379247016, 0.06532124402846837, 0.48424034942745836, 0.06577885391444714]
Testing Recall = [0.19435351882160393, 0.6673178900362824, 0.07225007372456503, 0.4735158194098827, 0.06798373038930854]

Training f1 (micro, macro, weighted) = [0.3293353084531944, 0.2586348017297306, 0.383197598780785]
Testing f1 (micro, macro, weighted) = [0.3268348623853211, 0.2611152772127467, 0.37754840943466816]

Training auc (macro, weighted) = [0.97406089044567, 0.9642570057914913]
Testing auc (macro, weighted) = [0.9727640219584959, 0.9634473797473708]

Hyper-Parameters: 

Learning Rate: 0.8
Epochs: 100

Accuracy Score for Predictin

In [10]:
# d) SVM
importlib.reload(ModelUtil)
importlib.reload(logs)

# Hyper-parameters, in the order runSVM receives them after the data splits.
rC, kernel, degree, gamma, coef0 = 1, 'linear', 1, 'scale', 0.0

svmOutput = ModelUtil.runSVM(
    xTrain, yTrain, xTest, yTest,
    rC, kernel, degree, gamma, coef0,
)

training SVM model...
done!
getting statistics...
logging statistics...


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


printing statistics...
Model Type: SVM

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.31333949257337784
Testing Accuracy = 0.32227117680500283

Training Average Error = 0.9562220169142207
Testing Average Error = 0.9404491188175099

Training Recall = [0.0, 0.8033190480532506, 0.0, 0.5206884356949193, 0.0]
Testing Recall = [0.0, 0.8188306340927807, 0.0, 0.5387018396390142, 0.0]

Training f1 (micro, macro, weighted) = [0.31333949257337784, 0.16828958750340162, 0.43098074084009164]
Testing f1 (micro, macro, weighted) = [0.32227117680500283, 0.17354902523824683, 0.44205483936829404]

Training auc (macro, weighted) = Error
Testing auc (macro, weighted) = Error

Hyper-Parameters: 

Regularization Constant: 1
Kernel Type: linear
Kernel Degree1
Kernel Coefficient (gamma): scale
Independent Term in Kernel (coef0): 0.0

Accuracy Score for Predicting on Training Data: 0.9562
Accuracy Score for Predicting on Test Data: 0.3223


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]



Overall Average Probabilities
-------------------------------------
Section 1: 16.61%
Section 2: 26.53%
Section 3: 25.04%
Section 4: 20.27%
Section 5: 11.55%

Field Slice Counts for Training Data
--------------------------------------------------
Section	Truth	Prediction
1		7295		28007.0
2		10967		14206.0
3		10348		nan
4		8483		nan
5		5120		nan
Amount Correct: 13227
Amount Incorrect: 28986

Field Slice Counts for Testing Data
--------------------------------------------------
Section	Truth	Prediction
1		2513		9367.0
2		3643		4705.0
3		3427		nan
4		2881		nan
5		1608		nan
Amount Correct: 4535
Amount Incorrect: 9537
done!


In [None]:
# z) RandomForestRegressor
# NOTE(review): trainIn / trainOut / testIn / testOut are not defined in any cell
# visible in this notebook — they must come from elsewhere before this cell can run.
# Every split's (direction, distance) result is kept in rfrResults; the bare
# direction / distance names still hold the last split's result, as before.
rfrResults = []
for i in range(len(trainIn)):
    direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])
    rfrResults.append((direction, distance))

In [34]:
# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

# Models taking part in the averaged prediction, with the heading printed for each.
# To include the SVM, un-comment the line above and add ("SVM", svm) here; the
# average below divides by however many models are listed.
ensembleModels = [
    ("Decision Tree", dt),
    ("Naive Bayes", nb),
    ("Logistic Regression", logReg),
]

print("Testing Output: ")
# index of test value:
index = 4555
print(f"Actual Field Slice: \t\t{yTest.iloc[index]}")

# One-row batch built once and reused for every model call.
sample = [xTest.iloc[index]]

modelProbs = []
for modelName, model in ensembleModels:
    # predict_proba runs once per model; the result is both printed and averaged.
    probabilities = model.predict_proba(sample)[0]
    modelProbs.append(probabilities)

    print(f"\n{modelName}:")
    print(f"Predicted Field Slice: \t\t{model.predict(sample)[0]}")
    print(f"Field Slice Probabilities: \t{probabilities}")

# Unweighted mean of the per-model probability vectors.
averageProbs = sum(modelProbs) / len(modelProbs)

# +1 turns the 0-based position of the largest probability into a slice number.
# NOTE(review): assumes probability column k corresponds to slice k+1 — confirm.
print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
print(f"Field Slice AVG Probabilities: \t{averageProbs}")

# NOTE(review): other cells pass a third (name) argument to visualizeData; here only two are given.
VisualUtil.visualizeData(averageProbs, [1])

Testing Output: 
Actual Field Slice: 		4

Decision Tree:
Predicted Field Slice: 		3
Field Slice Probabilities: 	[0.09431138 0.26047904 0.36227545 0.19610778 0.08682635]

Naive Bayes:
Predicted Field Slice: 		4
Field Slice Probabilities: 	[0.00630868 0.09324774 0.32466916 0.37936206 0.19641235]

Logistic Regression:
Predicted Field Slice: 		4
Field Slice Probabilities: 	[0.0300238  0.1356919  0.26967401 0.33883047 0.22577982]


AVG Prediction: 		3
Field Slice AVG Probabilities: 	[0.04354795 0.16313956 0.31887287 0.30476677 0.16967284]


In [99]:
# Gather data on average predictions
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

# Models whose probabilities are averaged; add svm here to include it.
ensembleModels = [dt, nb, logReg]

length = len(xTest)
# Plain arrays: one batched predict_proba call per model replaces one call per row.
testFeatures = xTest.to_numpy()
actualSlices = yTest.to_numpy()

# Unweighted mean of the models' probability matrices (rows = test samples).
ensembleProbs = sum(model.predict_proba(testFeatures) for model in ensembleModels) / len(ensembleModels)

# +1 turns the 0-based position of the largest probability into a slice number.
predictedSlices = np.argmax(ensembleProbs, axis=1) + 1
# Averaged probability assigned to each sample's true slice.
probOfActual = ensembleProbs[np.arange(length), actualSlices - 1]

missed = predictedSlices != actualSlices
incorrect = int(missed.sum())
correct = length - incorrect
# Totals over the misclassified samples only.
wrongProbs = float(probOfActual[missed].sum())
wrongDistance = int(np.abs(actualSlices - predictedSlices)[missed].sum())


# Correct Prediction Count
print(correct)
# Incorrect Prediction Count
print(incorrect)
# Average probability of actual slice when guess is incorrect
# (nan when there are no incorrect guesses, instead of dividing by zero)
print(wrongProbs/incorrect if incorrect else float("nan"))
# Average distance from actual slice to guess slice when guess is incorrect
print(wrongDistance/incorrect if incorrect else float("nan"))

4392
9560
0.20483854131525492
1.490376569037657


In [None]:
# 4) Model Iterations and Improvements

In [11]:
# 5) Data Visualization
importlib.reload(VisualUtil)

# Temporary method of getting percentages for testing purposes:
# one random Dirichlet draw of length 4 for the infield, then one of length 2 for the
# outfield (drawn in that order, unseeded).
infieldPercentages, outfieldPercentages = (
    np.random.dirichlet(np.ones(sliceCount), size=1)[0] for sliceCount in (4, 2)
)

VisualUtil.visualizeData(infieldPercentages, outfieldPercentages, "FieldTest")


In [101]:
# Average Pitcher Data Processing and Running
importlib.reload(Preprocessing)
importlib.reload(DataUtil)

pitchingAveragesDF = DataUtil.getRawDataFrame('Data/PitchMetricAverages_AsOf_2024-03-11.csv')

# Feature columns fed to the models.
specific_columns = ["PitcherThrows", "BatterSide", "TaggedPitchType", "RelSpeed", "InducedVertBreak", "HorzBreak", "RelHeight", "RelSide", "SpinAxis", "SpinRate", "VertApprAngle", "HorzApprAngle"] # pitcher averages

# Text category -> integer code. Rows whose value is not a key are discarded.
pitcherThrowsCodes = {"Left":1, "Right":2, "Both":3} # 1, 2, 3 (can remove Both)
batterSideCodes = {"Left":1, "Right":2} # 1, 2
pitchTypeCodes = {"Fastball":1, "Sinker":2, "Cutter":3, "Curveball":4, "Slider":5, "Changeup":6, "Splitter":7, "Knuckleball":8} # 1,2,3,4,5,6,7,8

usableRows = (
    pitchingAveragesDF["PitcherThrows"].isin(list(pitcherThrowsCodes))
    & pitchingAveragesDF["BatterSide"].isin(list(batterSideCodes))
    & pitchingAveragesDF["TaggedPitchType"].isin(list(pitchTypeCodes))
)

# Filter the FULL frame (not just the feature slice) and drop nan values from the used
# columns, so row i of the features and row i of the text labels used for the file
# name always describe the same record.
usableAveragesDF = pitchingAveragesDF[usableRows].dropna(axis=0, how='any', subset=specific_columns)

# Explicit copy: the encoded columns are written to a new frame, not to a slice.
averagesX = usableAveragesDF[specific_columns].copy() # pitcher averages
averagesX["PitcherThrows"] = averagesX["PitcherThrows"].map(pitcherThrowsCodes)
averagesX["BatterSide"] = averagesX["BatterSide"].map(batterSideCodes)
averagesX["TaggedPitchType"] = averagesX["TaggedPitchType"].map(pitchTypeCodes)

# normalize this based on min and maxes from training data
averagesX = DataUtil.normalizeData(averagesX, originalNotNormX)
assert len(averagesX) == len(usableAveragesDF), "normalizeData changed the row count; features and labels no longer align"

# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

# Models whose probabilities are averaged; add svm here to include it.
ensembleModels = [dt, nb, logReg]

# Iterate over the rows that survived filtering, not over the raw file's length.
for index in range(len(averagesX)):
    sample = [averagesX.iloc[index]]
    averageProbs = sum(model.predict_proba(sample)[0] for model in ensembleModels) / len(ensembleModels)

    # print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
    # print(f"Field Slice AVG Probabilities: \t{averageProbs}")

    # File name: first column of the source row (commas -> "_", spaces removed),
    # then the pitch type and the batter side.
    sourceRow = usableAveragesDF.iloc[index]
    fileName = sourceRow.iloc[0].replace(",", "_").replace(" ", "") + "_" + sourceRow["TaggedPitchType"] + "_" + sourceRow["BatterSide"] + "Batter"
    VisualUtil.visualizeData(averageProbs, [1], fileName)



DecompressionBombError: Image size (220478360 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.