In [None]:
# Model Training:
# 1) Load all data from preprocessing (training/test splits, etc.)
# 2) Begin Training Models
    #  a) Decision Tree
    #  b) Naive Bayes
    #  c) Logistic Regression
    #  d) SVM
# 3) Testing Models
# 4) New Iterations
# 5) Data Visualization

In [37]:
# Imports
from Models import ModelUtil
from Data import Preprocessing, DataUtil
from Visualization import VisualUtil
from Logs import logging as logs
from sklearn.ensemble import AdaBoostClassifier

import importlib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
import configparser
import numpy as np

# Load the run-time settings. ConfigParser.read() skips files it cannot open
# and returns the list of files it actually parsed, so an empty result means
# the config file was not found. Fail here with a clear message instead of a
# KeyError on the first config[...] lookup in a later cell.
CONFIG_PATH = 'Data//config.ini'
config = configparser.ConfigParser()
if not config.read(CONFIG_PATH):
    raise FileNotFoundError("Could not read config file: " + CONFIG_PATH)

# Re-execute the project modules so the kernel uses their current source.
importlib.reload(Preprocessing)
importlib.reload(ModelUtil)
importlib.reload(VisualUtil)
importlib.reload(logs)

<module 'Logs.logging' from 'c:\\Users\\Trent\\Desktop\\Senior Design\\shifting_model\\Logs\\logging.py'>

In [38]:
# 1) Load all data from preprocessing
importlib.reload(Preprocessing)

# The new pipeline is used unless the config value contains the text "False".
useNewPreprocessing = "False" not in config['DATA']['USE_NEW_PREPROCESSING']

if useNewPreprocessing:
    # New pipeline: process the data first, then filter the processed frame.
    normDataFrame = Preprocessing.dataProcessing()
    filterArgs = (normDataFrame, True)
else:
    # Legacy pipeline: dataFiltering is called with an empty list and False.
    filterArgs = ([], False)

infieldDataFrame, outfieldDataFrame = Preprocessing.dataFiltering(*filterArgs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df = df[df["BatterSide"].isin(["Left","Right"])] # 1, 2


In [None]:
def _saveCorrelation(frame, path):
    """Compute frame.corr(), write it to `path` as CSV, and return the matrix."""
    matrix = frame.corr()
    matrix.to_csv(path)
    return matrix

# NOTE(review): the split cell indexes infieldDataFrame[0] when the new
# preprocessing is enabled, so calling .corr() directly on it presumably only
# works for the legacy preprocessing path -- confirm.
infieldcorrmatrix = _saveCorrelation(infieldDataFrame, 'Infield_Correlation_Matrix')
outfieldcorrmatrix = _saveCorrelation(outfieldDataFrame, 'Outfield_Correlation_Matrix')

In [45]:
importlib.reload(Preprocessing)

# Build the split consumed by the model cells below.
# Defines xTrain, xTest, yTrain, yTest.
SPLIT_SEED = 11  # one seed for every splitter so reruns give the same split

if("False" in config['DATA']['USE_NEW_PREPROCESSING']):
    # Legacy preprocessing: infieldDataFrame is used directly as a frame and
    # the target is the "FieldSlice" column.
    Y = infieldDataFrame["FieldSlice"]
    # Candidate features currently left out: "SpinRate", "RelSpeed", "HorzBreak", "VertBreak"
    X = infieldDataFrame[["PitcherThrows", "BatterSide", "TaggedPitchType", "PlateLocHeight", "PlateLocSide", "ZoneSpeed"]]
    # NOTE(review): normalization is applied to the full frame before the
    # split; if normalizeData derives its scaling from the data, the test rows
    # influence it -- confirm against DataUtil.
    X = DataUtil.normalizeData(X)
    xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.25, random_state=SPLIT_SEED)
else:
    # New preprocessing: element [0] is the frame and element [1] selects the
    # feature columns; the targets are the 'Direction' and 'Distance' columns.
    infieldY = infieldDataFrame[0][['Direction','Distance']]
    infieldX = infieldDataFrame[0][infieldDataFrame[1]]
    if("True" in config['SPLIT']['TTS']):
        xTrain, xTest, yTrain, yTest = train_test_split(infieldX, infieldY, test_size=0.20, random_state=SPLIT_SEED)

    elif("True" in config['SPLIT']['KFold']):
        # shuffle=True without a random_state yields different folds on every
        # run; seed it like the train_test_split calls.
        kf = KFold(n_splits=5, shuffle=True, random_state=SPLIT_SEED)
        # NOTE(review): each iteration overwrites the previous split, so only
        # the final fold is left in xTrain/xTest/yTrain/yTest afterwards.
        for train_index, test_index in kf.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    elif("True" in config['SPLIT']['LOOCV']):
        loo = LeaveOneOut()
        # NOTE(review): as with KFold above, only the final split survives the
        # loop (a single held-out row).
        for train_index, test_index in loo.split(infieldX):
            xTrain, xTest = infieldX.iloc[train_index,:], infieldX.iloc[test_index,:]
            yTrain, yTest = infieldY.iloc[train_index,:], infieldY.iloc[test_index,:]

    else:
        # Without a split the model cells would fail with a NameError, or
        # silently reuse a split left in the kernel by an earlier run.
        raise ValueError("No Splitting Method Selected")

# TODO: consider GroupKFold, which keeps every sample of a group in the same
# fold so a group never appears in both train and test -- useful for
# Pitcher/Batter ID once that is implemented.

In [None]:
# 2) Training Models

In [47]:
# a) Decision Tree
# Re-execute the helper modules so this run uses their current source.
importlib.reload(ModelUtil)
importlib.reload(logs)

# Provisional hyperparameters -- still need to be tested for the best case.
max_depth, max_features, max_leaf_nodes = 50, 30, 150

result = ModelUtil.runDT(
    xTrain, yTrain, xTest, yTest,
    max_depth, max_features, max_leaf_nodes,
)


training decision tree model...
done!
getting statistics...

logging statistics...
printing statistics...
Model Type: DecisionTree

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.36216331461871937
Testing Accuracy = 0.33733655486071634

Training Average Error = 0.943879847440362
Testing Average Error = 0.9712905059693008

Training Recall = [0.3592871830020562, 0.5583112975289505, 0.2243911867027445, 0.34138865967228577, 0.258984375]
Testing Recall = [0.3179466772781536, 0.5317046390337634, 0.21272249781149694, 0.3099618188129122, 0.24191542288557213]

Hyper-Parameters: 

Max Tree Depth: 50
Max Tree Features: 30
Max Leaf Nodes: 150

Accuracy Score for Predicting on Training Data: 0.9439
Accuracy Score for Predicting on Test Data: 0.3373

Overall Average Probabilities
-------------------------------------
Section 1: 17.41%
Section 2: 26.05%
Section 3: 24.47%
Section 4: 19.97%
Section 5: 12.10%

Field Slice Counts for Training Data
--------------------------------------

  dftemp = dftest[dftest["Correct"] == True]
  dfTrainStats = dfTrainStats.rename(columns={"FieldSliceActual":"Field Slice",0:"Count of Actual"})
  dftemp = dftrain[dftrain["Correct"] == True]
  log.append("Accuracy Score for Predicting on Training Data: " + str('{:.4f}'.format(train_stats[0])))
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  
  probs = model.predict_proba(test_x)


In [48]:
# b) Naive Bayes
# Re-execute the helper modules so this run uses their current source.
importlib.reload(ModelUtil)
importlib.reload(logs)

# Smoothing value passed through to runNB.
var_smoothing = 1e-9

result = ModelUtil.runNB(
    xTrain, yTrain, xTest, yTest,
    var_smoothing,
)

         PitcherThrows  BatterSide  TaggedPitchType  PlateLocHeight  \
1368973            0.0         1.0         0.000000        0.364856   
173530             1.0         0.0         0.142857        0.400011   
43641              1.0         1.0         0.000000        0.453915   
1593920            1.0         1.0         0.142857        0.373789   
169437             0.0         0.0         0.428571        0.419880   
...                ...         ...              ...             ...   
683939             1.0         1.0         0.571429        0.285084   
153093             1.0         0.0         0.000000        0.601687   
456840             1.0         1.0         0.000000        0.433127   
857652             1.0         0.0         0.000000        0.508333   
210564             0.0         1.0         0.000000        0.476817   

         PlateLocSide  ZoneSpeed  
1368973      0.554002   0.676092  
173530       0.640059   0.767135  
43641        0.622729   0.864158  
1593920

  dftemp = dftest[dftest["Correct"] == True]
  dfTrainStats = dfTrainStats.rename(columns={"FieldSliceActual":"Field Slice",0:"Count of Actual"})
  dftemp = dftrain[dftrain["Correct"] == True]
  log.append("Accuracy Score for Predicting on Training Data: " + str('{:.4f}'.format(train_stats[0])))
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  
  probs = model.predict_proba(test_x)


In [35]:
# c) Logistic Regression
# Re-execute the helper modules so this run uses their current source.
importlib.reload(ModelUtil)
importlib.reload(logs)

# lr and e are reported by the run as "Learning Rate" and "Epochs".
lr, e = 0.8, 100

result = ModelUtil.runLogReg(
    xTrain, yTrain, xTest, yTest,
    lr, e,
)

training logistic regression model...
done!
getting statistics...
logging statistics...
printing statistics...
Model Type: LogisticRegression

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.3322673110179329
Testing Accuracy = 0.33889994314951677

Hyper-Parameters: 

Learning Rate: 0.8
Epochs: 100

Accuracy Score for Predicting on Training Data: 0.9272
Accuracy Score for Predicting on Test Data: 0.3389

Overall Average Probabilities
-------------------------------------
Section 1: 17.41%
Section 2: 26.06%
Section 3: 24.55%
Section 4: 19.98%
Section 5: 12.00%

Field Slice Counts for Training Data
--------------------------------------------------
Section	Truth	Prediction
1		7295		2500
2		10967		22578
3		10348		4940
4		8483		11500
5		5120		695
Amount Correct: 14026
Amount Incorrect: 28187

Field Slice Counts for Testing Data
--------------------------------------------------
Section	Truth	Prediction
1		2513		805
2		3643		7684
3		3427		1543
4		2881		3847
5		1608		193
Amo

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActua

In [36]:
# d) SVM
# Re-execute the helper modules so this run uses their current source.
importlib.reload(ModelUtil)
importlib.reload(logs)

# Regularization constant, kernel type, kernel degree, gamma and coef0,
# in the order runSVM takes them.
rC, kernel, degree, gamma, coef0 = 1, 'linear', 1, 'scale', 0.0

result = ModelUtil.runSVM(
    xTrain, yTrain, xTest, yTest,
    rC, kernel, degree, gamma, coef0,
)

training SVM model...
done!
getting statistics...
logging statistics...


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]


printing statistics...
Model Type: SVM

Training Size = 42213
Testing Size = 14072

Training Accuracy = 0.31333949257337784
Testing Accuracy = 0.32227117680500283

Hyper-Parameters: 

Regularization Constant: 1
Kernel Type: linear
Kernel Degree1
Kernel Coefficient (gamma): scale
Independent Term in Kernel (coef0): 0.0

Accuracy Score for Predicting on Training Data: 0.9562
Accuracy Score for Predicting on Test Data: 0.3223


  dfTestStats = dftest.groupby(["FieldSliceActual"]).size().reset_index()
  dfTestStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]
  dfTrainStats = dftrain.groupby(["FieldSliceActual"]).size().reset_index()
  dfTrainStats["Correct"] = dftemp.groupby(["FieldSliceActual"]).size().reset_index()[0]



Overall Average Probabilities
-------------------------------------
Section 1: 16.98%
Section 2: 26.34%
Section 3: 24.40%
Section 4: 20.21%
Section 5: 12.07%

Field Slice Counts for Training Data
--------------------------------------------------
Section	Truth	Prediction
1		7295		28007.0
2		10967		14206.0
3		10348		nan
4		8483		nan
5		5120		nan
Amount Correct: 13227
Amount Incorrect: 28986

Field Slice Counts for Testing Data
--------------------------------------------------
Section	Truth	Prediction
1		2513		9367.0
2		3643		4705.0
3		3427		nan
4		2881		nan
5		1608		nan
Amount Correct: 4535
Amount Incorrect: 9537
done!


In [None]:
# z) RandomForestRegressor
# Runs the regressor once per split. trainIn, trainOut, testIn and testOut are
# indexed in parallel, one entry per split.
# NOTE(review): none of the cells visible in this notebook assign these four
# names -- confirm where they are built before running on a fresh kernel.
rfrResults = []  # (direction, distance) for every split, in split order
for i in range(len(trainIn)):
    direction, distance = ModelUtil.runRFR(trainIn[i], trainOut[i], testIn[i], testOut[i])
    # Keep every split's result; a bare assignment would leave only the values
    # from the last iteration once the loop ends.
    rfrResults.append((direction, distance))

In [None]:
# 3) Model Testing:

In [None]:
# 4) Model Iterations and Improvements

In [None]:
# 5) Data Visualization

# Placeholder inputs for testing the plot: one random percentage vector per
# field (infield first, then outfield), each with `slices` entries.
slices = 5
infieldPercentages, outfieldPercentages = (
    np.random.dirichlet(np.ones(slices), size=1)[0] for _ in range(2)
)

VisualUtil.visualizeData(infieldPercentages, outfieldPercentages)
