Inspiration for Neural Net: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Data Pre-Processing

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pitcher
from random import shuffle
torch.manual_seed(1)

<torch._C.Generator at 0x7f575571edf0>

In [53]:
player = 'verlander'
data = pd.read_csv('Data/raw_data/'+player+'.csv')
data = pitcher.clean_data(data)
ABs = pitcher.get_abs(data)
reps = pitcher.get_reps(ABs)
reps = pitcher.drop_nas(reps)

In [54]:
#get train, validate, and test sets (60-20-20)
shuffle(reps)
cutoff1 = int(len(reps)*0.6)
cutoff2 = cutoff1 + int(len(reps)*.2)
train = reps[1:cutoff1]
validate = reps[cutoff1:cutoff2]
test = reps[cutoff2:]


#get train, validate, and test batches
batch_size = 50
train_batches = pitcher.get_batches(train,batch_size)
test_batches = pitcher.get_batches(test,batch_size)
validate_batches = pitcher.get_batches(validate,batch_size)

# Model Construction

In [55]:
def test_accuracy(batches,model):
    length = len(batches)*len(batches[0][0])
    num_right = 0
    ch_count = 0
    predict_ch = 0
    cu_count = 0
    predict_cu = 0
    sl_count = 0
    predict_sl = 0
    ff_count = 0
    predict_ff = 0
    ff_right = 0
    ch_right = 0
    sl_right = 0
    cu_right = 0
    for batch in batches:
        with torch.no_grad():
            prev_pitches,pre_pitch, ptypes = batch
            prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
            rep_in = torch.tensor(pre_pitch, dtype=torch.float)
            #targets = tag_to_ix[ptypes]
            if len(ptypes) > 1:
                targets = [tag_to_ix[ptype] for ptype in ptypes]
            else:
                targets = tag_to_ix[ptypes[0]]
            tag_scores = model((prevs_in,rep_in))
            #print(tag_scores)
            preds = [tag_score.max(0) for tag_score in tag_scores]
            #print(index.item())
            if len(ptypes) > 1:
                for pred,target in zip(preds,targets):
                    _,index = pred
                    if target == 1:
                        ff_count += 1
                    if index.item() == 1:
                        predict_ff += 1
                    if target == 1 and index.item() == 1:
                        ff_right += 1
                    if target == 0:
                        cu_count += 1
                    if index.item() == 0:
                        predict_cu += 1
                    if target == 0 and index.item() == 0:
                        cu_right += 1
                    if index.item() == target:
                        num_right += 1
            else:
                _,index = preds[0]
                if targets == 1:
                    ff_count += 1
                if index.item() == 1:
                    predict_ff += 1
                if targets == 1 and index.item() == 1:
                    ff_right += 1
                if targets == 0:
                    cu_count += 1
                if index.item() == 0:
                    predict_cu += 1
                if targets == 0 and index.item() == 0:
                    cu_right += 1
                if index.item() == targets:
                    num_right += 1
    ff_rate = ff_count/length
    pred_ff_rate = predict_ff/length
    ff_acc = ff_right/ff_count
    cu_rate = cu_count/length
    pred_cu_rate = predict_cu/length
    cu_acc = cu_right/cu_count
    accuracy = num_right/length
    print("______________________________________")
    print("Non-Fastball rate:",cu_rate)
    print("Predicted Non-Fastball rate:",pred_cu_rate)
    print("Non-Fastball accuracy:",cu_acc)
    print("______________________________________")
    print("Fastball rate:",ff_rate)
    print("Predicted Fastball rate:",pred_ff_rate)
    print("Fastball accuracy:",ff_acc)
    print("______________________________________")
    print("Accuracy:",accuracy)
    print("Accuracy above naive guess:",accuracy - ff_rate)
    print("______________________________________")

In [56]:
#change this for different pitcher
'''tag_to_ix = {'CU':0,
             'FF':1,
             'SL':2,
             'CH':3}'''
tag_to_ix = {'NF':0,
             'FF':1}

# -- Input Dimensions -- DO NOT CHANGE
PREV_PITCH_DIM = 25
NUM_PREV_PITCHES = 3
GAME_STATE_DIM = 15 
GAME_OUT_DIM = 15

# -- Hyperparameters -- DO CHANGE 
HIDDEN_DIM = 120
OUT_DIM = 15
lstm_layers = 1
learning_rate = 0.0001

torch.set_default_tensor_type(torch.FloatTensor)
    
class PitchPredict(nn.Module):
    def __init__(self, prev_pitch_dim, hidden_dim, num_prev_pitches,out_dim, game_state_dim, game_out_dim, num_ptypes):
        super(PitchPredict, self).__init__()
        #get constants
        self.hidden_dim = hidden_dim
        self.prev_pitch_dim = prev_pitch_dim
        self.num_prev_pitches = num_prev_pitches
        self.out_dim = out_dim
        self.game_state_dim = game_state_dim
        self.game_out_dim = game_out_dim
        
        ####Define Layers####
        
        ########################## LSTM for past five pitches#######################################
        # The LSTM takes previous pitch vectors as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.RNN(prev_pitch_dim, hidden_dim,num_layers=lstm_layers,batch_first=True)#FIDDLE WITH num_layers

        # The linear layer that maps from hidden state space to a vector
        # with dimensionality OUT_DIM
        self.hidden2out = nn.Linear(hidden_dim, out_dim)
        
        ############## FULLY CONNECTED LAYERS for LSTM OUTPUT + game_state vector #################
        
        # layer to map the game state vector to a different dimension
        # self.l1 = nn.Linear(self.game_state_dim, self.game_out_dim)  
        
        # This Fully connected layer maps from the output of the final hidden layer output from the LSTM,
        # dimension = OUT_DIM, with the game state vector, dimension = GAME_STATE_DIM
        # to a vector of length of the number of ptypes to pass through softmax for probabilities
        self.fc1 = nn.Linear((self.out_dim + self.game_out_dim), 2)

                             
    def forward(self, rep):
        past_pitches,game_state = rep
        lstm_out, _ = self.lstm(past_pitches.view(batch_size,self.num_prev_pitches, -1))
        learned_rep = self.hidden2out(lstm_out.view(batch_size,self.num_prev_pitches, -1))
        game_rep = game_state.view(batch_size,self.game_state_dim)
        #game_rep = self.l1(game_state.view(batch_size,self.game_state_dim))
        #game_rep = F.relu(game_rep)
        encoding = learned_rep[:,self.num_prev_pitches - 1:,:]
        fc_in = torch.cat((encoding.view(batch_size,self.out_dim),game_rep.view(batch_size,self.game_out_dim)),dim=1)
        fc = self.fc1(fc_in.view(batch_size,self.game_out_dim+self.out_dim))
        tag_scores = F.log_softmax(fc,dim=0)
        return tag_scores

# Testing

# Training

In [64]:
model = PitchPredict(PREV_PITCH_DIM, HIDDEN_DIM, NUM_PREV_PITCHES, OUT_DIM, GAME_STATE_DIM, GAME_OUT_DIM, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    print("***************Pre-Training Accuracy*****************")
    test_accuracy(train_batches,model)
print("**************************Training*****************************")
for epoch in range(1000):
    #shuffle(train_batches)
    for batch in train_batches:
        prev_pitches,pre_pitch, ptypes = batch
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        #get input tensors ready
        prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
        game_state_in = torch.tensor(pre_pitch, dtype=torch.float)
        
        #get target value
        #target = tag_to_ix[ptype]
        target = [ tag_to_ix[ptype] for ptype in ptypes]

        # Step 3. Run our forward pass.
        tag_scores = model((prevs_in,game_state_in))
               
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, torch.tensor(target,dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        preds = [tag_score.max(0) for tag_score in tag_scores]

                
    #display post-epoch results
    print('epoch:',epoch+1,"loss:",loss.item())

# See what the scores are after training
with torch.no_grad():
    print("****************Post-Training Accuracy********************")
    test_accuracy(train_batches,model.eval())
    print("**********************************************************")

***************Pre-Training Accuracy*****************
______________________________________
Non-Fastball rate: 0.42080338266384776
Predicted Non-Fastball rate: 0.5016490486257928
Non-Fastball accuracy: 0.46694131832797425
______________________________________
Fastball rate: 0.5791966173361522
Predicted Fastball rate: 0.4983509513742072
Fastball accuracy: 0.4731347641991532
______________________________________
Accuracy: 0.4705285412262156
Accuracy above naive guess: -0.10866807610993662
______________________________________
**************************Training*****************************
epoch: 1 loss: 3.9207334518432617
epoch: 2 loss: 3.9105803966522217
epoch: 3 loss: 3.9061243534088135
epoch: 4 loss: 3.902644395828247
epoch: 5 loss: 3.8998770713806152
epoch: 6 loss: 3.897428512573242
epoch: 7 loss: 3.895267963409424
epoch: 8 loss: 3.8934314250946045
epoch: 9 loss: 3.8919076919555664
epoch: 10 loss: 3.8906466960906982
epoch: 11 loss: 3.8896055221557617
epoch: 12 loss: 3.88873386383

epoch: 218 loss: 3.855623483657837
epoch: 219 loss: 3.85551381111145
epoch: 220 loss: 3.8554046154022217
epoch: 221 loss: 3.8552968502044678
epoch: 222 loss: 3.855191230773926
epoch: 223 loss: 3.8550853729248047
epoch: 224 loss: 3.8549814224243164
epoch: 225 loss: 3.8548784255981445
epoch: 226 loss: 3.8547773361206055
epoch: 227 loss: 3.8546767234802246
epoch: 228 loss: 3.854578971862793
epoch: 229 loss: 3.8544821739196777
epoch: 230 loss: 3.8543875217437744
epoch: 231 loss: 3.854295015335083
epoch: 232 loss: 3.854203224182129
epoch: 233 loss: 3.854114294052124
epoch: 234 loss: 3.8540260791778564
epoch: 235 loss: 3.853940486907959
epoch: 236 loss: 3.853855609893799
epoch: 237 loss: 3.8537728786468506
epoch: 238 loss: 3.853691339492798
epoch: 239 loss: 3.8536124229431152
epoch: 240 loss: 3.853534460067749
epoch: 241 loss: 3.8534581661224365
epoch: 242 loss: 3.853383779525757
epoch: 243 loss: 3.853311538696289
epoch: 244 loss: 3.8532397747039795
epoch: 245 loss: 3.8531689643859863
epoch:

epoch: 449 loss: 3.814781904220581
epoch: 450 loss: 3.8146026134490967
epoch: 451 loss: 3.814424514770508
epoch: 452 loss: 3.814248561859131
epoch: 453 loss: 3.8140740394592285
epoch: 454 loss: 3.813903570175171
epoch: 455 loss: 3.8137335777282715
epoch: 456 loss: 3.813565731048584
epoch: 457 loss: 3.8134000301361084
epoch: 458 loss: 3.813235282897949
epoch: 459 loss: 3.8130743503570557
epoch: 460 loss: 3.8129141330718994
epoch: 461 loss: 3.812755823135376
epoch: 462 loss: 3.8125998973846436
epoch: 463 loss: 3.8124451637268066
epoch: 464 loss: 3.8122928142547607
epoch: 465 loss: 3.812142848968506
epoch: 466 loss: 3.8119940757751465
epoch: 467 loss: 3.81184720993042
epoch: 468 loss: 3.8117024898529053
epoch: 469 loss: 3.8115592002868652
epoch: 470 loss: 3.8114185333251953
epoch: 471 loss: 3.811277389526367
epoch: 472 loss: 3.811140775680542
epoch: 473 loss: 3.8110039234161377
epoch: 474 loss: 3.8108696937561035
epoch: 475 loss: 3.8107383251190186
epoch: 476 loss: 3.810605525970459
epoch

epoch: 680 loss: 3.8024001121520996
epoch: 681 loss: 3.8024075031280518
epoch: 682 loss: 3.8024020195007324
epoch: 683 loss: 3.802417278289795
epoch: 684 loss: 3.802435874938965
epoch: 685 loss: 3.8024258613586426
epoch: 686 loss: 3.8024299144744873
epoch: 687 loss: 3.8024489879608154
epoch: 688 loss: 3.8024446964263916
epoch: 689 loss: 3.802433967590332
epoch: 690 loss: 3.8024489879608154
epoch: 691 loss: 3.8024518489837646
epoch: 692 loss: 3.8024353981018066
epoch: 693 loss: 3.802433490753174
epoch: 694 loss: 3.80244517326355
epoch: 695 loss: 3.802424907684326
epoch: 696 loss: 3.802412509918213
epoch: 697 loss: 3.802415370941162
epoch: 698 loss: 3.802408218383789
epoch: 699 loss: 3.8023765087127686
epoch: 700 loss: 3.802374839782715
epoch: 701 loss: 3.802367925643921
epoch: 702 loss: 3.802335739135742
epoch: 703 loss: 3.802314519882202
epoch: 704 loss: 3.8023135662078857
epoch: 705 loss: 3.802276372909546
epoch: 706 loss: 3.80224609375
epoch: 707 loss: 3.802227735519409
epoch: 708 lo

epoch: 911 loss: 3.7740235328674316
epoch: 912 loss: 3.7739369869232178
epoch: 913 loss: 3.7736949920654297
epoch: 914 loss: 3.773606061935425
epoch: 915 loss: 3.773364305496216
epoch: 916 loss: 3.773268938064575
epoch: 917 loss: 3.7730283737182617
epoch: 918 loss: 3.7729341983795166
epoch: 919 loss: 3.772690534591675
epoch: 920 loss: 3.7725894451141357
epoch: 921 loss: 3.7723464965820312
epoch: 922 loss: 3.7722485065460205
epoch: 923 loss: 3.7719995975494385
epoch: 924 loss: 3.7719011306762695
epoch: 925 loss: 3.7716524600982666
epoch: 926 loss: 3.7715468406677246
epoch: 927 loss: 3.771299123764038
epoch: 928 loss: 3.771193504333496
epoch: 929 loss: 3.770941734313965
epoch: 930 loss: 3.7708401679992676
epoch: 931 loss: 3.770582914352417
epoch: 932 loss: 3.7704784870147705
epoch: 933 loss: 3.770223617553711
epoch: 934 loss: 3.770108699798584
epoch: 935 loss: 3.7698581218719482
epoch: 936 loss: 3.769742727279663
epoch: 937 loss: 3.769488573074341
epoch: 938 loss: 3.7693772315979004
epoc

# Results

In [65]:
with torch.no_grad():
    print("****************Validation Accuracy********************")
    test_accuracy(validate_batches,model)
    print("**********************************************************")

****************Validation Accuracy********************
______________________________________
Non-Fastball rate: 0.41668789808917195
Predicted Non-Fastball rate: 0.36738853503184715
Non-Fastball accuracy: 0.44451238153469885
______________________________________
Fastball rate: 0.583312101910828
Predicted Fastball rate: 0.6326114649681529
Fastball accuracy: 0.6877047390259882
______________________________________
Accuracy: 0.5863694267515923
Accuracy above naive guess: 0.003057324840764264
______________________________________
**********************************************************


In [66]:
with torch.no_grad():
    print("****************Test Accuracy********************")
    test_accuracy(test_batches,model)
    print("**********************************************************")

****************Test Accuracy********************
______________________________________
Non-Fastball rate: 0.4173248407643312
Predicted Non-Fastball rate: 0.36560509554140125
Non-Fastball accuracy: 0.4548229548229548
______________________________________
Fastball rate: 0.5826751592356688
Predicted Fastball rate: 0.6343949044585987
Fastball accuracy: 0.6982947092260603
______________________________________
Accuracy: 0.596687898089172
Accuracy above naive guess: 0.014012738853503182
______________________________________
**********************************************************
