Inspiration for Neural Net: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Data Pre-Processing

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pitcher
from random import shuffle
torch.manual_seed(1)

<torch._C.Generator at 0x7f575571edf0>

In [53]:
player = 'verlander'
data = pd.read_csv('Data/raw_data/'+player+'.csv')
data = pitcher.clean_data(data)
ABs = pitcher.get_abs(data)
reps = pitcher.get_reps(ABs)
reps = pitcher.drop_nas(reps)

In [54]:
#get train, validate, and test sets (60-20-20)
shuffle(reps)
cutoff1 = int(len(reps)*0.6)
cutoff2 = cutoff1 + int(len(reps)*.2)
train = reps[1:cutoff1]
validate = reps[cutoff1:cutoff2]
test = reps[cutoff2:]


#get train, validate, and test batches
batch_size = 50
train_batches = pitcher.get_batches(train,batch_size)
test_batches = pitcher.get_batches(test,batch_size)
validate_batches = pitcher.get_batches(validate,batch_size)

# Model Construction

In [55]:
def test_accuracy(batches,model):
    length = len(batches)*len(batches[0][0])
    num_right = 0
    ch_count = 0
    predict_ch = 0
    cu_count = 0
    predict_cu = 0
    sl_count = 0
    predict_sl = 0
    ff_count = 0
    predict_ff = 0
    ff_right = 0
    ch_right = 0
    sl_right = 0
    cu_right = 0
    for batch in batches:
        with torch.no_grad():
            prev_pitches,pre_pitch, ptypes = batch
            prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
            rep_in = torch.tensor(pre_pitch, dtype=torch.float)
            #targets = tag_to_ix[ptypes]
            if len(ptypes) > 1:
                targets = [tag_to_ix[ptype] for ptype in ptypes]
            else:
                targets = tag_to_ix[ptypes[0]]
            tag_scores = model((prevs_in,rep_in))
            #print(tag_scores)
            preds = [tag_score.max(0) for tag_score in tag_scores]
            #print(index.item())
            if len(ptypes) > 1:
                for pred,target in zip(preds,targets):
                    _,index = pred
                    if target == 1:
                        ff_count += 1
                    if index.item() == 1:
                        predict_ff += 1
                    if target == 1 and index.item() == 1:
                        ff_right += 1
                    if target == 0:
                        cu_count += 1
                    if index.item() == 0:
                        predict_cu += 1
                    if target == 0 and index.item() == 0:
                        cu_right += 1
                    if index.item() == target:
                        num_right += 1
            else:
                _,index = preds[0]
                if targets == 1:
                    ff_count += 1
                if index.item() == 1:
                    predict_ff += 1
                if targets == 1 and index.item() == 1:
                    ff_right += 1
                if targets == 0:
                    cu_count += 1
                if index.item() == 0:
                    predict_cu += 1
                if targets == 0 and index.item() == 0:
                    cu_right += 1
                if index.item() == targets:
                    num_right += 1
    ff_rate = ff_count/length
    pred_ff_rate = predict_ff/length
    ff_acc = ff_right/ff_count
    cu_rate = cu_count/length
    pred_cu_rate = predict_cu/length
    cu_acc = cu_right/cu_count
    accuracy = num_right/length
    print("______________________________________")
    print("Non-Fastball rate:",cu_rate)
    print("Predicted Non-Fastball rate:",pred_cu_rate)
    print("Non-Fastball accuracy:",cu_acc)
    print("______________________________________")
    print("Fastball rate:",ff_rate)
    print("Predicted Fastball rate:",pred_ff_rate)
    print("Fastball accuracy:",ff_acc)
    print("______________________________________")
    print("Accuracy:",accuracy)
    print("Accuracy above naive guess:",accuracy - ff_rate)
    print("______________________________________")

In [56]:
#change this for different pitcher
'''tag_to_ix = {'CU':0,
             'FF':1,
             'SL':2,
             'CH':3}'''
tag_to_ix = {'NF':0,
             'FF':1}

# -- Input Dimensions -- DO NOT CHANGE
PREV_PITCH_DIM = 25
NUM_PREV_PITCHES = 3
GAME_STATE_DIM = 15 
GAME_OUT_DIM = 15

# -- Hyperparameters -- DO CHANGE 
HIDDEN_DIM = 120
OUT_DIM = 15
lstm_layers = 1
learning_rate = 0.0001

torch.set_default_tensor_type(torch.FloatTensor)
    
class PitchPredict(nn.Module):
    def __init__(self, prev_pitch_dim, hidden_dim, num_prev_pitches,out_dim, game_state_dim, game_out_dim, num_ptypes):
        super(PitchPredict, self).__init__()
        #get constants
        self.hidden_dim = hidden_dim
        self.prev_pitch_dim = prev_pitch_dim
        self.num_prev_pitches = num_prev_pitches
        self.out_dim = out_dim
        self.game_state_dim = game_state_dim
        self.game_out_dim = game_out_dim
        
        ####Define Layers####
        
        ########################## LSTM for past five pitches#######################################
        # The LSTM takes previous pitch vectors as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.RNN(prev_pitch_dim, hidden_dim,num_layers=lstm_layers,batch_first=True)#FIDDLE WITH num_layers

        # The linear layer that maps from hidden state space to a vector
        # with dimensionality OUT_DIM
        self.hidden2out = nn.Linear(hidden_dim, out_dim)
        
        ############## FULLY CONNECTED LAYERS for LSTM OUTPUT + game_state vector #################
        
        # This Fully connected layer maps from the output of the final hidden layer output from the LSTM,
        # dimension = OUT_DIM, with the game state vector, dimension = GAME_STATE_DIM
        # to a vector of length of the number of ptypes to pass through softmax for probabilities
        self.fc1 = nn.Linear((self.out_dim + self.game_out_dim), 2)

                             
    def forward(self, rep):
        past_pitches,game_state = rep
        lstm_out, _ = self.lstm(past_pitches.view(batch_size,self.num_prev_pitches, -1))
        learned_rep = self.hidden2out(lstm_out.view(batch_size,self.num_prev_pitches, -1))
        game_rep = game_state.view(batch_size,self.game_state_dim)
        #game_rep = self.l1(game_state.view(batch_size,self.game_state_dim))
        #game_rep = F.relu(game_rep)
        encoding = learned_rep[:,self.num_prev_pitches - 1:,:]
        fc_in = torch.cat((encoding.view(batch_size,self.out_dim),game_rep.view(batch_size,self.game_out_dim)),dim=1)
        fc = self.fc1(fc_in.view(batch_size,self.game_out_dim+self.out_dim))
        tag_scores = F.log_softmax(fc,dim=0)
        return tag_scores

# Testing

# Training

In [79]:
model = PitchPredict(PREV_PITCH_DIM, HIDDEN_DIM, NUM_PREV_PITCHES, OUT_DIM, GAME_STATE_DIM, GAME_OUT_DIM, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    print("***************Pre-Training Accuracy*****************")
    test_accuracy(train_batches,model)
print("**************************Training*****************************")
for epoch in range(200):
    #shuffle(train_batches)
    for batch in train_batches:
        prev_pitches,pre_pitch, ptypes = batch
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        #get input tensors ready
        prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
        game_state_in = torch.tensor(pre_pitch, dtype=torch.float)
        
        #get target value
        #target = tag_to_ix[ptype]
        target = [ tag_to_ix[ptype] for ptype in ptypes]

        # Step 3. Run our forward pass.
        tag_scores = model((prevs_in,game_state_in))
               
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, torch.tensor(target,dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        preds = [tag_score.max(0) for tag_score in tag_scores]

                
    #display post-epoch results
    print('epoch:',epoch+1,"loss:",loss.item())

# See what the scores are after training
with torch.no_grad():
    print("****************Post-Training Accuracy********************")
    test_accuracy(train_batches,model.eval())
    print("**********************************************************")

***************Pre-Training Accuracy*****************
______________________________________
Non-Fastball rate: 0.42080338266384776
Predicted Non-Fastball rate: 0.49192389006342496
Non-Fastball accuracy: 0.5162781350482315
______________________________________
Fastball rate: 0.5791966173361522
Predicted Fastball rate: 0.508076109936575
Fastball accuracy: 0.5257701854285297
______________________________________
Accuracy: 0.5217758985200845
Accuracy above naive guess: -0.057420718816067695
______________________________________
**************************Training*****************************
epoch: 1 loss: 3.8729336261749268
epoch: 2 loss: 3.871647596359253
epoch: 3 loss: 3.872035026550293
epoch: 4 loss: 3.872884511947632
epoch: 5 loss: 3.8739614486694336
epoch: 6 loss: 3.875012159347534
epoch: 7 loss: 3.8759572505950928
epoch: 8 loss: 3.8768155574798584
epoch: 9 loss: 3.8775980472564697
epoch: 10 loss: 3.8782894611358643
epoch: 11 loss: 3.8788743019104004
epoch: 12 loss: 3.879343271255

# Results

In [80]:
with torch.no_grad():
    print("****************Validation Accuracy********************")
    test_accuracy(validate_batches,model)
    print("**********************************************************")

****************Validation Accuracy********************
______________________________________
Non-Fastball rate: 0.41668789808917195
Predicted Non-Fastball rate: 0.4965605095541401
Non-Fastball accuracy: 0.6398654845612962
______________________________________
Fastball rate: 0.583312101910828
Predicted Fastball rate: 0.5034394904458599
Fastball accuracy: 0.6058091286307054
______________________________________
Accuracy: 0.62
Accuracy above naive guess: 0.036687898089171944
______________________________________
**********************************************************


In [81]:
with torch.no_grad():
    print("****************Test Accuracy********************")
    test_accuracy(test_batches,model)
    print("**********************************************************")

****************Test Accuracy********************
______________________________________
Non-Fastball rate: 0.4173248407643312
Predicted Non-Fastball rate: 0.4937579617834395
Non-Fastball accuracy: 0.6416361416361417
______________________________________
Fastball rate: 0.5826751592356688
Predicted Fastball rate: 0.5062420382165606
Fastball accuracy: 0.6121556624398776
______________________________________
Accuracy: 0.6244585987261146
Accuracy above naive guess: 0.04178343949044583
______________________________________
**********************************************************
