In [None]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc)
# 2) Begin Training Models
    #  a) Decision Tree
    #  b) Naive Bayes
    #  c) Logistic Regression
    #  d) SVM
# 3) Testing Models
# 4) New Iterations

In [1]:
# Imports
from Models import ModelUtil
from Data import Preprocessing, DataUtil
from Visualization import VisualUtil, batch_image_to_excel
from Logs import logging as logs
from sklearn.ensemble import AdaBoostClassifier

import importlib
import configparser
import numpy as np

config = configparser.ConfigParser()
config.read('Data//config.ini')

importlib.reload(Preprocessing)
importlib.reload(ModelUtil)
importlib.reload(VisualUtil)
importlib.reload(batch_image_to_excel)
importlib.reload(logs)

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 1) Load all data from preprocessing 
importlib.reload(Preprocessing)
newprocessing = 'True' in config['DATA']['USE_NEW_PREPROCESSING']
infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering([], newprocessing)

In [3]:
# All of this is mapping the strings to numbers for both infieldDataFrame and outfieldDataFrame so that the correlation matrix can be computed
# This can most likely be moved to a method in the logging.py file
infieldDF4Matrix = infieldDataFrame.copy()
outfieldDF4Matrix = outfieldDataFrame.copy()
strColumns = [] 
for cName in outfieldDF4Matrix.columns:
    if(str(outfieldDF4Matrix[cName].dtype) in 'object'):
        strColumns.append(cName)
rValueDict = {}
for cName in strColumns:
    i = 0
    infieldUniques = infieldDF4Matrix[cName].unique()
    for rValue in infieldUniques:
        rValueDict.update({rValue:i})
        i+=1
    infieldDF4Matrix[cName] = infieldDF4Matrix[cName].map(rValueDict)
    uniqueVals = [x for x in outfieldDF4Matrix[cName].unique() if x not in infieldUniques]
    for rValue in uniqueVals: 
        rValueDict.update({rValue:i})
        i+=1
    outfieldDF4Matrix[cName] = outfieldDF4Matrix[cName].map(rValueDict)
infieldDF4Matrix = infieldDF4Matrix.replace(np.nan, 0)
infieldDF4Matrix = infieldDF4Matrix.replace('', 0)
outfieldDF4Matrix = outfieldDF4Matrix.replace(np.nan, 0)
outfieldDF4Matrix = outfieldDF4Matrix.replace('', 0)

# Correlation does not imply causation.
# -1 means that the 2 variables have an inverse linear relationship: when X increases, Y decreases
# 0 means no linear correlation between X and Y
# 1 means that the 2 variables have a linear relationship: when X increases, Y increases too.
infieldcorrmatrix = infieldDF4Matrix.corr()
outfieldcorrmatrix = outfieldDF4Matrix.corr()
if (config['LOGGING']['Excel'] == 'True'):
    logs.writeToExcelSheet(infieldcorrmatrix, "Infield Correlation Matrix")
    logs.writeToExcelSheet(outfieldcorrmatrix, "Outfield Correlation Matrix")
if (config['LOGGING']['Debug'] == 'True'):
    print(infieldcorrmatrix)
    print(outfieldcorrmatrix)

                  PitcherThrows  BatterSide  TaggedPitchType  PlateLocHeight  \
PitcherThrows          1.000000   -0.027616         0.043216       -0.011886   
BatterSide            -0.027616    1.000000        -0.045209       -0.017370   
TaggedPitchType        0.043216   -0.045209         1.000000       -0.138346   
PlateLocHeight        -0.011886   -0.017370        -0.138346        1.000000   
PlateLocSide           0.053604    0.280891        -0.065869        0.072434   
ZoneSpeed              0.116216    0.058189        -0.420955        0.133941   
RelSpeed               0.113582    0.058624        -0.439868        0.157745   
VertRelAngle          -0.041096   -0.049072         0.412838        0.184069   
HorzRelAngle          -0.858233    0.051716         0.010592        0.016744   
SpinRate               0.095525   -0.023776         0.146569       -0.046987   
SpinAxis               0.054622    0.116916        -0.211874        0.063631   
RelHeight              0.023386   -0.007

In [4]:
importlib.reload(logs)
# 2) Trains all Models and exports all data to an Excel Sheet
max_depth = 50
max_features = 30
max_leaf_nodes = 150
# could also add ways to change it for these hyperparams below for other models
var_smoothing = 1e-9
lr = 0.8
e = 100
rC = 1
kernel='linear'
degree= 1
gamma= 'scale'
coef0= 0.0

for j in range(int(config['TRAIN']['TimesRun'])):
        xTrain, xTest, yTrain, yTest = ModelUtil.modelDataSplitting(infieldDataFrame, j, 0.25,'InfieldTrainingFilter')
        print(xTrain)
        if("True" in config['MODELS']['DTC']):
            dtOutput = ModelUtil.runDT(xTrain, yTrain, xTest, yTest, max_depth, max_features, max_leaf_nodes)
        if("True" in config['MODELS']['NB']):   
            nbOutput = ModelUtil.runNB(xTrain, yTrain, xTest, yTest, var_smoothing)
        if("True" in config['MODELS']['LR']):
            logRegOutput = ModelUtil.runLogReg(xTrain, yTrain, xTest, yTest, lr, e)
        if("True" in config['MODELS']['SVM']):
            svmOutput = ModelUtil.runSVM(xTrain, yTrain, xTest, yTest, rC, kernel, degree, gamma, coef0)
        if("True" in config['MODELS']['RF']):
            for i in range(0, len(trainIn)):
                direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])

Training Class Splits (count, then percentage):
[7331, 10937, 10141, 8411, 5021]
[0.1752, 0.2614, 0.2424, 0.201, 0.12]

Testing Class Splits (count, then percentage):
[2391, 3528, 3503, 2870, 1655]
[0.1714, 0.253, 0.2512, 0.2058, 0.1187]
         PitcherThrows  BatterSide  TaggedPitchType  RelSpeed  \
14938              0.0         1.0         0.000000  0.726142   
1424169            0.0         1.0         0.142857  0.735032   
368343             0.0         0.0         0.000000  0.792787   
1016364            0.0         0.0         0.000000  0.634941   
1230481            1.0         1.0         0.000000  0.572196   
...                ...         ...              ...       ...   
1252489            1.0         1.0         0.000000  0.547599   
1517388            1.0         1.0         0.000000  0.715020   
1121504            1.0         1.0         0.571429  0.631072   
1159182            1.0         0.0         0.000000  0.838012   
63728              1.0         1.0         0.00

In [5]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# a) Decision Tree
# Need to test these hyperparameters for best case
# Maybe make a way to superset these
max_depth =      [50, 40]
max_features =   [30, 20]
max_leaf_nodes = [150, 100]
hyperparamlist = []
# This just makes the permutations of the hyperparameters above. Lets you test on many hyperparams.
for n in range(len(max_depth)):
    for k in range(len(max_features)):
        for m in range(len(max_leaf_nodes)):
            hyperparamlist.append([max_depth[n], max_features[k], max_leaf_nodes[m]])
            
# for each permutation, it runs a certain amount of time that you specify in the config (30 rn bc of Dozier) and saves the outcome to an excel sheet
# requires to rerun the training set every time because otherwise will give you the same outcome every time
# Also proves that its the models ability, not the luck of the draw for the data
for lst in hyperparamlist:
    for j in range(int(config['TRAIN']['TimesRun'])):
        xTrain, xTest, yTrain, yTest = ModelUtil.modelDataSplitting(infieldDataFrame, j, 0.25,'InfieldTrainingFilter')
        dtOutput = ModelUtil.runDT(xTrain, yTrain, xTest, yTest, lst[0], lst[1], lst[2])


Training Class Splits (count, then percentage):
[7331, 10937, 10141, 8411, 5021]
[0.1752, 0.2614, 0.2424, 0.201, 0.12]

Testing Class Splits (count, then percentage):
[2391, 3528, 3503, 2870, 1655]
[0.1714, 0.253, 0.2512, 0.2058, 0.1187]
training decision tree model...
done!
getting statistics...

logging statistics...
printing statistics...
Model Type: DecisionTree

Training Size = 41841
Testing Size = 13947

Training Accuracy = 0.3545565354556535
Testing Accuracy = 0.31454793145479315

Training Average Error = 0.9579837957983796
Testing Average Error = 1.00516240051624

Training Recall = [0.2870004092211158, 0.5585626771509554, 0.2565821911054137, 0.30781119961954584, 0.28500298745269864]
Testing Recall = [0.2542869092429946, 0.5102040816326531, 0.2092492149586069, 0.27979094076655053, 0.2676737160120846]

Training f1 (micro, macro, weighted) = [0.3545565354556535, 0.3391974749620864, 0.36376931230237974]
Testing f1 (micro, macro, weighted) = [0.31454793145479315, 0.30215935704234387

PermissionError: [Errno 13] Permission denied: 'logs/ModelStatistics_03-2024.xlsx'

In [6]:
# TODO
# This is meant to take all the values from the 30 runs and average them and output them to another sheet of averages for different models
# Then will need to do this for all the models
# Can take this and put it into an excelAverages function
#prob rename this

# could move these column letter names and do something with that so not hardcoded
if("True" in config['LOGGING']['Excel']):
    sColumns = ['Training Accuracy', 'Testing Accuracy', 'Training Average Error', 'Testing Average Error', 'Training F1(micro)', 'Training F1(macro)', 'Training F1(weighted)', 
                'Testing F1(micro)', 'Testing F1(macro)', 'Testing F1(weighted)', 'Training AUC(macro)', 'Training AUC(weighted)', 'Testing AUC(macro)', 'Testing AUC(weighted)', 
                'Section 0 Probability', 'Section 1 Probability', 'Section 2 Probability', 'Section 3 Probability', 'Section 4 Probability']
    if("True" in config['MODELS']['DTC']):
        # columns in excel: I J K L W X Y Z AA AB AC AD AE AF AG AH AI AJ AK   
        sColumnsLetter = ['I','J','K','L','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG','AH','AI','AJ','AK']
        logs.excelAverages('DecisionTree',sColumns,sColumnsLetter)
    if("True" in config['MODELS']['NB']):
        sColumnsLetter = ['D','E','F','G','R','S','T','U','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF']
        logs.excelAverages('NaiveBayes',sColumns,sColumnsLetter)
    if("True" in config['MODELS']['LR']):
        sColumnsLetter = ['E','F','G','H','S','T','U','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG']
        logs.excelAverages('LogisticRegression',sColumns,sColumnsLetter)
    if("True" in config['MODELS']['SVM']):
        sColumnsLetter = ['H','I','J','K','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG','AH','AI','AJ']
        logs.excelAverages('SVM',sColumns,sColumnsLetter)
    if("True" in config['MODELS']['RF']):
        logs.excelAverages('RandomForest',sColumns,sColumnsLetter)


printing statistics...
['DecisionTree', 0.3541924118898294, 0.3170911728855879, 0.9477271800668358, 0.9957275231021638, 0.35419241188982953, 0.33068775415843044, 0.3675521094918608, 0.3170911728855879, 0.2928692149416022, 0.33143188995663475, 0.9847581645604119, 0.979903450662093, 0.9847374514200574, 0.979964033103721, 0.17427365742549086, 0.2591933471236853, 0.24422106836188848, 0.20233619514882606, 0.11997573194010537]
exporting statistics to Excel...
printing statistics...
['NaiveBayes', 0.2964659863132653, 0.2958366195836619, 1.075202950853629, 1.0739227073922708, 0.2964659863132653, 0.2887416417721127, 0.3012824998366026, 0.2958366195836619, 0.28839967642320286, 0.3006147728300677, 0.979867738016271, 0.9765412629813442, 0.9799412274625465, 0.9766571094881374, 0.2083801492914866, 0.22077246678139767, 0.24116619372955317, 0.214315743719245, 0.1153654464783174, 0.0]
exporting statistics to Excel...
printing statistics...
['LogisticRegression', 0.330053456338679, 0.32877321287732125, 

In [48]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# b) Naive Bayes

var_smoothing = 1e-9
for j in range(int(config['TRAIN']['TimesRun'])):
        xTrain, xTest, yTrain, yTest = ModelUtil.modelDataSplitting(infieldDataFrame, j, 0.25,'InfieldTrainingFilter')
        nbOutput = ModelUtil.runNB(xTrain, yTrain, xTest, yTest, var_smoothing)

Training Class Splits (count, then percentage):
[7331, 10937, 10141, 8411, 5021]
[0.1752, 0.2614, 0.2424, 0.201, 0.12]

Testing Class Splits (count, then percentage):
[2391, 3528, 3503, 2870, 1655]
[0.1714, 0.253, 0.2512, 0.2058, 0.1187]
training Naive Bayes model...
done!
getting statistics...
printing statistics...
Model Type: NaiveBayes

Training Size = 41841
Testing Size = 13947

Training Accuracy = 0.29604932960493296
Testing Accuracy = 0.29454362945436297

Training Average Error = 1.0771014077101408
Testing Average Error = 1.0743529074352907

Training Recall = [0.3381530486973128, 0.30849410258754684, 0.2432698944877231, 0.3693972179289026, 0.19119697271459868]
Testing Recall = [0.34002509410288584, 0.29790249433106575, 0.23265772195261206, 0.3867595818815331, 0.19274924471299093]

Training f1 (micro, macro, weighted) = [0.29604932960493296, 0.28975871008268334, 0.29863696657497774]
Testing f1 (micro, macro, weighted) = [0.29454362945436297, 0.2892997211981692, 0.2976738458851368

In [49]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# c)Logistic Regression
lr = 0.8
e = 100
logRegOutput = ModelUtil.runLogReg(xTrain, yTrain, xTest, yTest, lr, e)

training logistic regression model...
done!
getting statistics...
printing statistics...
Model Type: LogisticRegression

Training Size = 41841
Testing Size = 13947

Training Accuracy = 0.3300112330011233
Testing Accuracy = 0.3215028321502832

Training Average Error = 0.9621662962166296
Testing Average Error = 0.9605649960564996

Training Recall = [0.18469513026872186, 0.677882417481942, 0.06419485257864116, 0.4632029485197955, 0.0981876120294762]
Testing Recall = [0.18025930572982016, 0.6655328798185941, 0.057664858692549247, 0.4710801393728223, 0.09123867069486405]

Training f1 (micro, macro, weighted) = [0.3300112330011233, 0.2657490027040982, 0.38071432141815975]
Testing f1 (micro, macro, weighted) = [0.3215028321502832, 0.2586301923488591, 0.37329185006495824]

Training auc (macro, weighted) = [0.9739243934201258, 0.9655939136777918]
Testing auc (macro, weighted) = [0.9752589280958007, 0.9656993219454908]

Hyper-Parameters: 

Learning Rate: 0.8
Epochs: 100

Accuracy Score for Predi

In [50]:
importlib.reload(ModelUtil)
importlib.reload(logs)
# d) SVM
rC = 1
kernel='linear'
degree= 1
gamma= 'scale'
coef0= 0.0
svmOutput = ModelUtil.runSVM(xTrain, yTrain, xTest, yTest, rC, kernel, degree, gamma, coef0)

In [None]:
# z) RandomForestRegressor
for i in range(0, len(trainIn)):
    direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])

In [8]:
# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

print("Testing Output: ")
# index of test value:
index = 4555
print(f"Actual Field Slice: \t\t{yTest.iloc[index]}")

print("\nDecision Tree:")
print(f"Predicted Field Slice: \t\t{dt.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{dt.predict_proba([xTest.iloc[index]])[0]}")

print("\nNaive Bayes:")
print(f"Predicted Field Slice: \t\t{nb.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{nb.predict_proba([xTest.iloc[index]])[0]}")

print("\nLogistic Regression:")
print(f"Predicted Field Slice: \t\t{logReg.predict([xTest.iloc[index]])[0]}")
print(f"Field Slice Probabilities: \t{logReg.predict_proba([xTest.iloc[index]])[0]}")

# print("\nSVM:")
# print(f"Predicted Field Slice: \t\t{svm.predict([xTest.iloc[index]])[0]}")
# print(f"Field Slice Probabilities: \t{svm.predict_proba([xTest.iloc[index]])[0]}")

averageProbs = dt.predict_proba([xTest.iloc[index]])[0] + nb.predict_proba([xTest.iloc[index]])[0] + logReg.predict_proba([xTest.iloc[index]])[0] # + svm.predict_proba([xTest.iloc[index]])[0]
averageProbs = averageProbs / 3 

print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
print(f"Field Slice AVG Probabilities: \t{averageProbs}")

VisualUtil.visualizeData(averageProbs, [1], 'TestPic.png')

Testing Output: 
Actual Field Slice: 		1

Decision Tree:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.29530201 0.36241611 0.19463087 0.09395973 0.05369128]

Naive Bayes:
Predicted Field Slice: 		1
Field Slice Probabilities: 	[0.83235083 0.12339158 0.0147003  0.00654327 0.02301402]

Logistic Regression:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.29888845 0.3241051  0.20229239 0.11440054 0.06031351]


AVG Prediction: 		1
Field Slice AVG Probabilities: 	[0.47551376 0.26997093 0.13720785 0.07163452 0.04567294]


In [15]:
# 5) Data Visualization
importlib.reload(VisualUtil)

# Temporary method of getting percentages for testing purposes
infieldPercentages  = np.random.dirichlet(np.ones(4), size=1)[0]
outfieldPercentages = np.random.dirichlet(np.ones(2), size=1)[0]
outfieldCoordinates = np.random.uniform(low=[-45, 150], high=[45, 400], size=(30,2))

VisualUtil.visualizeData(infieldPercentages, outfieldCoordinates, "FieldTest")


In [7]:
# Average Pitcher Data Processing and Running
importlib.reload(Preprocessing)
importlib.reload(DataUtil)
importlib.reload(VisualUtil)
importlib.reload(batch_image_to_excel)


pitchingAveragesDF = DataUtil.getRawDataFrame('Data/PitchMetricAverages_AsOf_2024-03-11.csv')
# drop nan values from the used columns
specific_columns = ["PitcherThrows", "BatterSide", "TaggedPitchType", "RelSpeed", "InducedVertBreak", "HorzBreak", "RelHeight", "RelSide", "SpinAxis", "SpinRate", "VertApprAngle", "HorzApprAngle"] # pitcher averages
infieldDataFrame = infieldDataFrame[specific_columns] 
averagesX = pitchingAveragesDF[specific_columns] # pitcher averages
#averagesX = averagesX[["PitcherThrows", "BatterSide", "TaggedPitchType", "PlateLocHeight", "PlateLocSide", "ZoneSpeed", "RelSpeed", "SpinRate", "HorzBreak", "VertBreak"]]

averagesX["PitcherThrows"] = averagesX["PitcherThrows"].map({"Left":1, "Right":2, "Both":3})
averagesX["BatterSide"] = averagesX["BatterSide"].map({"Left":1, "Right":2})
averagesX["TaggedPitchType"] = averagesX["TaggedPitchType"].map({"Fastball": 1, "FourSeamFastBall":1, "Sinker":2, "TwoSeamFastBall":2, "Cutter":3, "Curveball":4, "Slider":5, "ChangeUp":6, "Splitter":7, "Knuckleball":8})

# normalize this based on min and maxes from training data
averagesX = DataUtil.normalizeData(averagesX, infieldDataFrame)

# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]
for index in range(pitchingAveragesDF.shape[0]):
    print(index)
    averageProbs= []
    averageProbs = dt.predict_proba([averagesX.iloc[index]])[0] + nb.predict_proba([averagesX.iloc[index]])[0] + logReg.predict_proba([averagesX.iloc[index]])[0]
    averageProbs = averageProbs / 3 

    # print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
    # print(f"Field Slice AVG Probabilities: \t{averageProbs}")
    fileName = pitchingAveragesDF.iloc[index][0].replace(",", "_").replace(" ", "") + "_" + pitchingAveragesDF.iloc[index]["TaggedPitchType"] + "_" + pitchingAveragesDF.iloc[index]["BatterSide"] + "Batter"
    VisualUtil.visualizeData(averageProbs, [1], fileName)   

batch_image_to_excel.create_excel()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
Looking in: c:\Users\Trent\Desktop\Senior Design\shifting_model\Output
Invalid filename format: FieldTest.png
Invalid filename format:

In [11]:
# # This is for putting the right visuals on the correct excel sheets
# # For each player in the pitching averages, have a whole excel page for them
# import os
# importlib.reload(logs)
# print(pitchingAveragesDF)
# picList = []
# fileList = os.listdir("Visualization")
# for x in pitchingAveragesDF["Pitcher"].unique():
#     for y in fileList:
#         if x.replace(",", "_").replace(" ", "") in y:
#             picList.append(y)
#     logs.writeToImageExcelSheet(picList,x)
#     picList = []

           Pitcher PitcherId BatterId PitcherThrows BatterSide  \
0    Allsup, Chase       nan      nan         Right      Right   
1    Allsup, Chase       nan      nan         Right      Right   
2    Allsup, Chase       nan      nan         Right      Right   
3    Allsup, Chase       nan      nan         Right      Right   
4    Allsup, Chase       nan      nan         Right      Right   
..             ...       ...      ...           ...        ...   
239   Watts, Dylan       nan      nan         Right       Left   
240   Watts, Dylan       nan      nan         Right       Left   
241   Watts, Dylan       nan      nan         Right       Left   
242   Watts, Dylan       nan      nan         Right       Left   
243   Watts, Dylan       nan      nan         Right       Left   

      TaggedPitchType AutoPitchType PitchCall TaggedHitType PlayResult  ...  \
0            ChangeUp           nan       nan           nan        nan  ...   
1           Curveball           nan       nan    