In [None]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc.)
# 2) Train models
    #  a) Decision Tree
    #  b) Naive Bayes
    #  c) Logistic Regression
    #  d) SVM
# 3) Test models
# 4) Model iterations and improvements
# 5) Data visualization

In [48]:
# Imports
from Models import ModelUtil
from Data import Preprocessing, DataUtil
from Visualization import VisualUtil
from Logs import logging as logs
from sklearn.ensemble import AdaBoostClassifier

import importlib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
import configparser
import numpy as np

# Notebook-wide settings, read from a path relative to the kernel's working directory.
CONFIG_PATH = 'Data//config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here instead of with a KeyError in a later cell.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read '{CONFIG_PATH}'; start the notebook from the project root."
    )

# Re-import the project modules so edits made since the kernel started are picked up.
importlib.reload(Preprocessing)
importlib.reload(ModelUtil)
importlib.reload(VisualUtil)
importlib.reload(logs)

import warnings
# NOTE(review): this hides every warning for the rest of the session, including
# ones raised by the libraries used below; narrow the filter once the noisy
# warnings have been identified.
warnings.filterwarnings("ignore")

In [58]:
# 1) Load all data from preprocessing
importlib.reload(Preprocessing)

# The flag is matched as raw text: any value containing "False" selects the
# original pipeline, anything else selects the new one.
if "False" not in config['DATA']['USE_NEW_PREPROCESSING']:
    # New pipeline: build the processed frame first, then filter it.
    normDataFrame = Preprocessing.dataProcessing()
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering(normDataFrame, True)

else:
    # Original pipeline.
    # NOTE(review): presumably the False flag makes dataFiltering load its own
    # data, since an empty list is passed in -- confirm against Preprocessing.
    infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering([], False)

    # Keep only the infield rows that have a value in every column used for modelling.
    specific_columns = [
        "PitcherThrows",
        "BatterSide",
        "TaggedPitchType",
        "PlateLocHeight",
        "PlateLocSide",
        "ZoneSpeed",
        "SpinRate",
        "RelSpeed",
        "HorzBreak",
        "VertBreak",
    ]
    infieldDataFrame = infieldDataFrame.dropna(subset=specific_columns, how='any', axis=0)

In [None]:
# Pairwise correlation matrices of the infield and outfield frames, each written
# as CSV text to the working directory (the file names carry no extension).
#
# DataFrame.corr() raises on non-numeric columns under pandas >= 2.0, while older
# versions silently dropped them. Selecting the numeric/bool columns first gives
# the older behaviour on every pandas version.
# NOTE(review): with USE_NEW_PREPROCESSING enabled, a later cell indexes
# infieldDataFrame[0] and infieldDataFrame[1], which suggests it is not a plain
# DataFrame in that mode -- confirm before running this cell there.
infieldcorrmatrix = infieldDataFrame.select_dtypes(include=["number", "bool"]).corr()
infieldcorrmatrix.to_csv('Infield_Correlation_Matrix')
outfieldcorrmatrix = outfieldDataFrame.select_dtypes(include=["number", "bool"]).corr()
outfieldcorrmatrix.to_csv('Outfield_Correlation_Matrix')

In [66]:
importlib.reload(Preprocessing)

# One seed for every seeded splitter below, so a rerun reproduces the same partition.
SPLIT_SEED = 11


def _summarizeClassSplit(labels, numClasses=5):
    """Count the labels per class and express each count as a share of the total.

    Each label minus one is used as the list index, so labels are expected to be
    the integers 1..numClasses.

    Returns:
        (counts, percents): the per-class counts, and each count divided by
        len(labels), rounded to 4 decimals.
    """
    counts = [0] * numClasses
    for label in labels:
        counts[label - 1] += 1
    percents = [round(count / len(labels), 4) for count in counts]
    return counts, percents


def _finalSplit(splitter, features, targets):
    """Return (xTrain, xTest, yTrain, yTest) for the last split `splitter` yields.

    Only the final fold is ever used by the cells below, so the index pairs are
    consumed first and the frames are sliced once, instead of on every fold.
    """
    for trainIndex, testIndex in splitter.split(features):
        pass
    return (
        features.iloc[trainIndex, :], features.iloc[testIndex, :],
        targets.iloc[trainIndex, :], targets.iloc[testIndex, :],
    )


if "False" in config['DATA']['USE_NEW_PREPROCESSING']:
    # Original pipeline: the target is the FieldSlice column and the features are
    # the ten columns listed here, normalized before splitting.
    featureColumns = ["PitcherThrows", "BatterSide", "TaggedPitchType", "PlateLocHeight", "PlateLocSide",
                      "ZoneSpeed", "SpinRate", "RelSpeed", "HorzBreak", "VertBreak"]
    Y = infieldDataFrame["FieldSlice"]
    X = infieldDataFrame[featureColumns]
    X = DataUtil.normalizeData(X)
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.25, random_state=SPLIT_SEED)
    # adb = AdaBoostClassifier()
    # adb_model = adb.fit(xTrain, yTrain)

    # calculate split information:
    trainingClassSplit, trainingClassPercent = _summarizeClassSplit(yTrain)
    testingClassSplit, testingClassPercent = _summarizeClassSplit(yTest)

    print("Training Class Splits (count, then percentage):")
    print(trainingClassSplit)
    print(trainingClassPercent)
    print("\nTesting Class Splits (count, then percentage):")
    print(testingClassSplit)
    print(testingClassPercent)
else:
    # New pipeline: element 0 of infieldDataFrame is indexed as the data frame and
    # element 1 as the selection of feature columns.
    infieldY = infieldDataFrame[0][['Direction','Distance']]
    infieldX = infieldDataFrame[0][infieldDataFrame[1]]
    if "True" in config['SPLIT']['TTS']:
        xTrain, xTest, yTrain, yTest = train_test_split(infieldX, infieldY, test_size=0.20, random_state=SPLIT_SEED)

    elif "True" in config['SPLIT']['KFold']:
        # Seeded so the shuffled folds are reproducible, like the train/test split.
        # NOTE(review): only the final fold is kept; scoring every fold would need
        # the training cells to run inside the loop.
        kf = KFold(n_splits=5, shuffle=True, random_state=SPLIT_SEED)
        xTrain, xTest, yTrain, yTest = _finalSplit(kf, infieldX, infieldY)

    elif "True" in config['SPLIT']['LOOCV']:
        # As with KFold, only the final leave-one-out split is kept.
        loo = LeaveOneOut()
        xTrain, xTest, yTrain, yTest = _finalSplit(loo, infieldX, infieldY)

    else:
        # Nothing is assigned in this case, so xTrain/xTest/yTrain/yTest keep
        # whatever an earlier run left behind (or stay undefined).
        print("No Splitting Method Selected")
        

# TODO: GroupKFold keeps all samples that share a group in the same fold, so a group never appears in both the training and the test set -- useful for Pitcher/Batter ID when we implement that.

Training Class Splits (count, then percentage):
[7280, 10890, 10257, 8471, 4956]
[0.1739, 0.2602, 0.2451, 0.2024, 0.1184]

Testing Class Splits (count, then percentage):
[2444, 3583, 3391, 2813, 1721]
[0.1752, 0.2568, 0.243, 0.2016, 0.1234]


In [None]:
# 2) Training Models

In [67]:
# a) Decision Tree
# Reload the helper modules so edits made since the last run take effect.
importlib.reload(ModelUtil)
importlib.reload(logs)

# TODO: untuned starting values; still need to test these hyperparameters for the best case.
max_depth, max_features, max_leaf_nodes = 50, 30, 150

# The testing cells further down read element 0 of the result as the fitted model.
dtOutput = ModelUtil.runDT(
    xTrain, yTrain, xTest, yTest,
    max_depth,
    max_features,
    max_leaf_nodes,
)


training decision tree model...
done!
getting statistics...

printing statistics...
Model Type: DecisionTree

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.36718115353371245
Testing Accuracy = 0.3353641055045872

Training Average Error = 0.9165910068332775
Testing Average Error = 0.9595040137614679

Training Recall = [0.27225274725274723, 0.5891643709825528, 0.30135517207760554, 0.29229134694841225, 0.283091202582728]
Testing Recall = [0.2295417348608838, 0.5540050237231371, 0.2695370097316426, 0.2694632065410594, 0.26786751888436955]

Training f1 (micro, macro, weighted) = [0.36718115353371245, 0.3498229078263638, 0.377188837360659]
Testing f1 (micro, macro, weighted) = [0.3353641055045872, 0.31898396185209704, 0.3458779663848676]

Training auc (macro, weighted) = [0.9919328682635605, 0.9890802485874709]
Testing auc (macro, weighted) = [0.9919480570866608, 0.9889777955420621]

Hyper-Parameters: 

Max Tree Depth: 50
Max Tree Features: 30
Max Leaf Nodes: 150

Accurac

In [61]:
# b) Naive Bayes
# Reload the helper modules so edits made since the last run take effect.
importlib.reload(ModelUtil)
importlib.reload(logs)

# Smoothing value handed straight through to the Naive Bayes runner.
var_smoothing = 1e-9

# The testing cells further down read element 0 of the result as the fitted model.
nbOutput = ModelUtil.runNB(
    xTrain, yTrain, xTest, yTest,
    var_smoothing,
)

training Naive Bayes model...
done!
getting statistics...
printing statistics...
Model Type: NaiveBayes

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.29932622927318775
Testing Accuracy = 0.30124713302752293

Training Average Error = 1.0603048693075932
Testing Average Error = 1.0596330275229358

Training Recall = [0.3501373626373626, 0.26483011937557394, 0.28565857463195865, 0.3834258056900012, 0.1850282485875706]
Testing Recall = [0.3473813420621931, 0.26960647502093216, 0.28870539663815986, 0.3775328830430146, 0.20162696106914585]

Training f1 (micro, macro, weighted) = [0.29932622927318775, 0.29279114174307147, 0.3021302145504058]
Testing f1 (micro, macro, weighted) = [0.30124713302752293, 0.29704959345347176, 0.303421573090737]

Training auc (macro, weighted) = [0.9796107525553882, 0.9745492787123667]
Testing auc (macro, weighted) = [0.9801851479792166, 0.9750596891616053]

Hyper-Parameters: 

Var Smoothing: 1e-09

Accuracy Score for Predicting on Training Data:

In [62]:
# c) Logistic Regression
# Reload the helper modules so edits made since the last run take effect.
importlib.reload(ModelUtil)
importlib.reload(logs)

# lr is reported as "Learning Rate" and e as "Epochs" in the printed statistics.
lr, e = 0.8, 100

# The testing cells further down read element 0 of the result as the fitted model.
logRegOutput = ModelUtil.runLogReg(
    xTrain, yTrain, xTest, yTest,
    lr,
    e,
)

training logistic regression model...
done!
getting statistics...
printing statistics...
Model Type: LogisticRegression

Training Size = 41854
Testing Size = 13952

Training Accuracy = 0.3336598652458546
Testing Accuracy = 0.3320670871559633

Training Average Error = 0.9501839728580302
Testing Average Error = 0.9519065366972477

Training Recall = [0.19945054945054946, 0.6603305785123967, 0.08092034708004289, 0.4730256168102939, 0.09786117836965294]
Testing Recall = [0.20662847790507366, 0.6502930505163271, 0.08728988498967856, 0.47707074297902596, 0.09296920395119117]

Training f1 (micro, macro, weighted) = [0.3336598652458546, 0.2742870861830683, 0.3793637661152824]
Testing f1 (micro, macro, weighted) = [0.3320670871559633, 0.27532402620013113, 0.37606800677094027]

Training auc (macro, weighted) = [0.9693485792190861, 0.9519703671146772]
Testing auc (macro, weighted) = [0.9696370401980529, 0.953305452126292]

Hyper-Parameters: 

Learning Rate: 0.8
Epochs: 100

Accuracy Score for Pred

In [10]:
# d) SVM
# Reload the helper modules so edits made since the last run take effect.
importlib.reload(ModelUtil)
importlib.reload(logs)

# Hyperparameters, in the order runSVM takes them after the four data arguments.
rC = 1              # reported as "Regularization Constant"
kernel = 'linear'   # reported as "Kernel Type"
degree = 1          # reported as "Kernel Degree"
gamma = 'scale'     # reported as "Kernel Coefficient (gamma)"
coef0 = 0.0         # reported as "Independent Term in Kernel (coef0)"

svmOutput = ModelUtil.runSVM(
    xTrain, yTrain, xTest, yTest,
    rC, kernel, degree, gamma, coef0,
)

training SVM model...
done!
getting statistics...
logging statistics...


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


printing statistics...
Model Type: SVM

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.31333949257337784
Testing Accuracy = 0.32227117680500283

Training Average Error = 0.9562220169142207
Testing Average Error = 0.9404491188175099

Training Recall = [0.0, 0.8033190480532506, 0.0, 0.5206884356949193, 0.0]
Testing Recall = [0.0, 0.8188306340927807, 0.0, 0.5387018396390142, 0.0]

Training f1 (micro, macro, weighted) = [0.31333949257337784, 0.16828958750340162, 0.43098074084009164]
Testing f1 (micro, macro, weighted) = [0.32227117680500283, 0.17354902523824683, 0.44205483936829404]

Training auc (macro, weighted) = Error
Testing auc (macro, weighted) = Error

Hyper-Parameters: 

Regularization Constant: 1
Kernel Type: linear
Kernel Degree1
Kernel Coefficient (gamma): scale
Independent Term in Kernel (coef0): 0.0

Accuracy Score for Predicting on Training Data: 0.9562
Accuracy Score for Predicting on Test Data: 0.3223


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]



Overall Average Probabilities
-------------------------------------
Section 1: 16.61%
Section 2: 26.53%
Section 3: 25.04%
Section 4: 20.27%
Section 5: 11.55%

Field Slice Counts for Training Data
--------------------------------------------------
Section	Truth	Prediction
1		7295		28007.0
2		10967		14206.0
3		10348		nan
4		8483		nan
5		5120		nan
Amount Correct: 13227
Amount Incorrect: 28986

Field Slice Counts for Testing Data
--------------------------------------------------
Section	Truth	Prediction
1		2513		9367.0
2		3643		4705.0
3		3427		nan
4		2881		nan
5		1608		nan
Amount Correct: 4535
Amount Incorrect: 9537
done!


In [None]:
# z) RandomForestRegressor
# Runs the regressor once per (train, test) dataset and keeps every result.
# Previously each iteration overwrote direction/distance, so only the last
# dataset's output survived the loop.
# NOTE(review): trainIn, trainOut, testIn and testOut are not assigned in any
# visible cell -- they must be defined before this cell can run.
rfrResults = []
for i in range(0, len(trainIn)):
    direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])
    rfrResults.append((direction, distance))

In [63]:
# Change the value of index to look at different datapoints
importlib.reload(VisualUtil)
# 3) Model Testing:
# Element 0 of each run* output is used as the fitted model.
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

# Models that take part in the averaged prediction, in print order. The average
# divides by the length of this list, so adding the SVM entry is the only change
# needed to include it.
ensembleModels = [
    ("Decision Tree", dt),
    ("Naive Bayes", nb),
    ("Logistic Regression", logReg),
    # ("SVM", svm),
]

print("Testing Output: ")
# index of test value:
index = 6432
print(f"Actual Field Slice: \t\t{yTest.iloc[index]}")

# One-row batch holding the chosen test sample; every model is queried with it.
sample = [xTest.iloc[index]]
perModelProbs = []
for modelName, model in ensembleModels:
    modelProbs = model.predict_proba(sample)[0]
    perModelProbs.append(modelProbs)

    print(f"\n{modelName}:")
    print(f"Predicted Field Slice: \t\t{model.predict(sample)[0]}")
    print(f"Field Slice Probabilities: \t{modelProbs}")

# Unweighted mean of the per-model class probabilities.
averageProbs = sum(perModelProbs[1:], perModelProbs[0])
averageProbs = averageProbs / len(perModelProbs)

# argmax is a 0-based position; +1 turns it into a slice number, which assumes
# the probability columns are ordered slice 1, 2, 3, ...
print(f"\n\nAVG Prediction: \t\t{np.argmax(averageProbs)+1}")
print(f"Field Slice AVG Probabilities: \t{averageProbs}")

VisualUtil.visualizeData(averageProbs, [1])

Testing Output: 
Actual Field Slice: 		1

Decision Tree:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.23381643 0.35942029 0.22995169 0.12173913 0.05507246]

Naive Bayes:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.14106099 0.31966119 0.30380781 0.18913204 0.04633797]

Logistic Regression:
Predicted Field Slice: 		2
Field Slice Probabilities: 	[0.21911846 0.33834919 0.24991333 0.14666562 0.0459534 ]


AVG Prediction: 		2
Field Slice AVG Probabilities: 	[0.19799862 0.33914356 0.26122428 0.15251227 0.04912128]


In [64]:
# Gather data on average predictions
length = len(xTest)

# Element 0 of each run* output is used as the fitted model.
dt = dtOutput[0]
nb = nbOutput[0]
logReg = logRegOutput[0]
# svm = svmOutput[0]

# Models whose probabilities are averaged; the divisor below follows this list.
ensembleModels = [dt, nb, logReg]  # append svm here to include it

# Score the whole test set with one predict_proba call per model instead of one
# call per row per model. A plain array is passed, matching what the models
# received when rows were passed one at a time as a list.
testMatrix = xTest.to_numpy()
ensembleProbs = sum(np.asarray(model.predict_proba(testMatrix)) for model in ensembleModels)
ensembleProbs = ensembleProbs / len(ensembleModels)

# Slice numbers are 1-based; argmax positions are 0-based, hence the +1 / -1.
actualSlices = yTest.to_numpy().astype(int)
predictedSlices = np.argmax(ensembleProbs, axis=1) + 1
isCorrect = predictedSlices == actualSlices

correct = int(isCorrect.sum())
incorrect = length - correct

# Averaged probability that was assigned to the true slice of every sample.
probOfActual = ensembleProbs[np.arange(length), actualSlices - 1]
# Both totals are accumulated over the misclassified samples only.
wrongProbs = float(probOfActual[~isCorrect].sum())
wrongDistance = int(np.abs(actualSlices - predictedSlices)[~isCorrect].sum())


# Correct Prediction Count
print(correct)
# Incorrect Prediction Count
print(incorrect)
# Average probability of actual slice when guess is incorrect
# (nan when every prediction is correct, instead of dividing by zero)
print(wrongProbs/incorrect if incorrect else float("nan"))
# Average distance from actual slice to guess slice when guess is incorrect
print(wrongDistance/incorrect if incorrect else float("nan"))

4538
9414
0.2064765880034086
1.4736562566390483


In [None]:
# 4) Model Iterations and Improvements

In [307]:
# 5) Data Visualization

# Temporary method of getting percentages for testing purposes.
# A seeded generator is used so the placeholder figure is the same on every run.
# NOTE(review): the model-testing cell passes five infield probabilities to
# visualizeData while this placeholder draws four -- confirm which length
# visualizeData expects.
placeholderRng = np.random.default_rng(11)
infieldPercentages  = placeholderRng.dirichlet(np.ones(4))
outfieldPercentages = placeholderRng.dirichlet(np.ones(1))  # one component, so always [1.0]

VisualUtil.visualizeData(infieldPercentages, outfieldPercentages)
