Inspiration for Neural Net: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Data Pre-Processing

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pitcher
torch.manual_seed(1)

<torch._C.Generator at 0x7fb67cbccdf0>

In [2]:
player = 'verlander'
data = pd.read_csv('Data/raw_data/'+player+'.csv')
data = pitcher.clean_data(data)
games = pitcher.get_games(data)
reps = pitcher.get_reps(games)
reps = pitcher.drop_nas(reps)
reps = pitcher.drop_pitches(reps)

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [182]:
#run for less fastballs
train = pitcher.drop_ff(reps[1:30000])
test = reps[30000:]

In [219]:
#run for unbiased distribution
train = reps[1:33000]
test = reps[33000:]

In [27]:
# work-in-progress
def get_batches(train,batch_size):   
    out = []
    i = 0
    batch_ptypes = []
    batch_pre_pitch = []
    batch_prev_pitches = []
    for rep in train:
        prev_pitches,prev_types,pre_pitch,ptype = rep
        if i % batch_size == 0:
            out.append([batch_prev_pitches,batch_pre_pitch,batch_ptypes])
            batch_ptypes = [ptype]
            batch_pre_pitch = [pre_pitch]
            batch_prev_pitches = [prev_pitches]
        else:
            batch_ptypes.append(ptype)
            batch_pre_pitch.append(pre_pitch)
            batch_prev_pitches.append(prev_pitches)
        i += 1
    return out[1:]

In [220]:
train_batches = get_batches(train,50)
test_batches = get_batches(test,50)

In [280]:
def test_accuracy(batches,model):
    length = len(batches)*len(batches[0][0])
    num_right = 0
    ch_count = 0
    predict_ch = 0
    cu_count = 0
    predict_cu = 0
    sl_count = 0
    predict_sl = 0
    ff_count = 0
    predict_ff = 0
    for batch in batches:
        with torch.no_grad():
            prev_pitches ,pre_pitch, ptypes = batch
            prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
            rep_in = torch.tensor(pre_pitch, dtype=torch.float)
            targets = [tag_to_ix[ptype] for ptype in ptypes]
            if torch.cuda.is_available():
                tag_scores = model((prevs_in,rep_in)).cuda()
            else:
                tag_scores = model((prevs_in,rep_in))
            preds = [tag_score.max(0) for tag_score in tag_scores]
            #print(index.item())
            for pred,target in zip(preds,targets):
                _,index = pred
                if target == 1:
                    ff_count += 1
                if index.item() == 1:
                    predict_ff += 1
                if target == 0:
                    cu_count += 1
                if index.item() == 0:
                    predict_cu += 1
                if target == 2:
                    sl_count += 1
                if index.item() == 2:
                    predict_sl += 1
                if target == 3:
                    ch_count += 1
                if index.item() == 3:
                    predict_ch += 1
                if index.item() == target:
                    num_right += 1   
    ff_rate = ff_count/length
    pred_ff_rate = predict_ff/length
    cu_rate = cu_count/length
    pred_cu_rate = predict_cu/length
    sl_rate = sl_count/length
    pred_sl_rate = predict_sl/length
    ch_rate = ch_count/length
    pred_ch_rate = predict_ch/length
    accuracy = num_right/length
    print("______________________________________")
    print("curve rate:",cu_rate)
    print("Predicted curve rate:",pred_cu_rate)
    print("______________________________________")
    print("Fourseam Fastball rate:",ff_rate)
    print("Predicted Fourseam Fastball rate:",pred_ff_rate)
    print("______________________________________")
    print("Changeup rate:",ch_rate)
    print("Predicted changeup rate:",pred_ch_rate)
    print("______________________________________")
    print("slider rate:",sl_rate)
    print("Predicted slider rate:",pred_sl_rate)
    print("______________________________________")
    print("Accuracy:",accuracy)
    print("Accuracy above naive guess:",accuracy - ff_rate)
    print("______________________________________")
    

# Model Construction

In [298]:
#change this for different pitcher
tag_to_ix = {'CU':0,
             'FF':1,
             'SL':2,
             'CH':3} 

# -- Input Dimensions -- DO NOT CHANGE
PREV_PITCH_DIM = 21
NUM_PREV_PITCHES = 5 
GAME_STATE_DIM = 15 

# -- Hyperparameters -- DO CHANGE 
HIDDEN_DIM = 21
OUT_DIM = 15
GAME_OUT_DIM = 15
batch_size = 50
lstm_layers = 2
learning_rate = 0.0001

if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    
class PitchPredict(nn.Module):
    def __init__(self, prev_pitch_dim, hidden_dim, num_prev_pitches,out_dim, game_state_dim, game_out_dim, num_ptypes):
        super(PitchPredict, self).__init__()
        #get constants
        self.hidden_dim = hidden_dim
        self.prev_pitch_dim = prev_pitch_dim
        self.num_prev_pitches = num_prev_pitches
        self.out_dim = out_dim
        self.game_state_dim = game_state_dim
        self.game_out_dim = game_out_dim
        
        ####Define Layers####
        
        ########################## LSTM for past five pitches#######################################
        # The LSTM takes previous pitch vectors as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(prev_pitch_dim, hidden_dim,num_layers=lstm_layers,batch_first=True)#FIDDLE WITH num_layers

        # The linear layer that maps from hidden state space to a vector
        # with dimensionality OUT_DIM
        self.hidden2out = nn.Linear(hidden_dim, out_dim)
        
        ############## FULLY CONNECTED LAYERS for LSTM OUTPUT + game_state vector #################
        
        # layer to map the game state vector to a different dimension
        #self.l1 = nn.Linear(self.game_state_dim, self.game_out_dim)  
        
        # This Fully connected layer maps from the output of the final hidden layer output from the LSTM,
        # dimension = OUT_DIM, with the game state vector, dimension = GAME_STATE_DIM
        # to a vector of length of the number of ptypes to pass through softmax for probabilities
        self.fc1 = nn.Linear((self.out_dim + self.game_out_dim), num_ptypes)

                             
    def forward(self, rep):
        past_pitches,game_state = rep
        lstm_out, _ = self.lstm(past_pitches.view(batch_size,self.num_prev_pitches, -1))
        learned_rep = self.hidden2out(lstm_out.view(batch_size,self.num_prev_pitches, -1))
        game_rep = game_state.view(batch_size,self.game_state_dim)
        #game_rep = self.l1(game_state.view(batch_size,self.game_state_dim))
        encoding = F.relu(learned_rep[:,self.num_prev_pitches - 1:,:])
        fc_in = torch.cat((encoding.view(batch_size,self.out_dim),game_rep.view(batch_size,self.game_out_dim)),dim=1)
        fc = self.fc1(fc_in.view(batch_size,self.game_out_dim+self.out_dim))
        fc = F.relu(fc)
        tag_scores = F.log_softmax(fc,dim=0)
        return tag_scores

# Training and Testing

In [301]:
model = PitchPredict(PREV_PITCH_DIM, HIDDEN_DIM, NUM_PREV_PITCHES, OUT_DIM, GAME_STATE_DIM, GAME_OUT_DIM, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#FIDDLE WITH LEARNING RATE (lr)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    print("***************Pre-Training Accuracy*****************")
    test_accuracy(train_batches,model)
print("**************************Training*****************************")
for epoch in range(200):
    length = len(train_batches)*len(train_batches[0][0])
    num_right = 0
    ff_count = 0
    predict_ff = 0
    for batch in train_batches:
        prev_pitches ,pre_pitch, ptypes = batch
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        #get input tensors ready
        prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
        game_state_in = torch.tensor(pre_pitch, dtype=torch.float)
        
        #get target value
        target = [ tag_to_ix[ptype] for ptype in ptypes]

        # Step 3. Run our forward pass.
        if torch.cuda.is_available():
            tag_scores = model((prevs_in,game_state_in)).cuda()
        else:
            tag_scores = model((prevs_in,game_state_in))
               
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores.view(batch_size,-1), torch.tensor(target,dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        preds = [tag_score.max(0) for tag_score in tag_scores]
        #print(index.item())
        for pred,target in zip(preds,targets):
            _,index = pred
            if target == 1:
                ff_count += 1
            if index.item() == 1:
                predict_ff += 1
            if index.item() == target:
                num_right += 1   
                
    #display post-epoch results
    ff_rate = ff_count/length
    pred_ff_rate = predict_ff/length
    accuracy = num_right/length
    print('epoch:',epoch+1,"loss:",loss.item())

# See what the scores are after training
with torch.no_grad():
    print("****************Post-Training Accuracy********************")
    test_accuracy(train_batches,model.eval())
    print("**********************************************************")

***************Pre-Training Accuracy*****************
______________________________________
curve rate: 0.17429438543247344
Predicted curve rate: 0.3362670713201821
______________________________________
Fourseam Fastball rate: 0.5750834597875569
Predicted Fourseam Fastball rate: 0.22355083459787556
______________________________________
Changeup rate: 0.1303793626707132
Predicted changeup rate: 0.2755083459787557
______________________________________
slider rate: 0.12024279210925645
Predicted slider rate: 0.16467374810318663
______________________________________
Accuracy: 0.24063732928679818
Accuracy above naive guess: -0.3344461305007588
______________________________________
**************************Training*****************************
epoch: 1 loss: 3.8987011909484863
epoch: 2 loss: 3.8913588523864746
epoch: 3 loss: 3.887312412261963
epoch: 4 loss: 3.876291275024414
epoch: 5 loss: 3.8633370399475098
epoch: 6 loss: 3.8521759510040283
epoch: 7 loss: 3.8406543731689453
epoch: 8 l

In [302]:
with torch.no_grad():
    print("****************Test Accuracy********************")
    test_accuracy(test_batches,model)
    print("**********************************************************")

****************Test Accuracy********************
______________________________________
curve rate: 0.1596923076923077
Predicted curve rate: 0.23476923076923076
______________________________________
Fourseam Fastball rate: 0.5864615384615385
Predicted Fourseam Fastball rate: 0.23015384615384615
______________________________________
Changeup rate: 0.027692307692307693
Predicted changeup rate: 0.27784615384615385
______________________________________
slider rate: 0.22615384615384615
Predicted slider rate: 0.2572307692307692
______________________________________
Accuracy: 0.3252307692307692
Accuracy above naive guess: -0.26123076923076927
______________________________________
**********************************************************


### high score using balanced set (25% FF)
- **TRAINING ACCURACY HIGH SCORE:** 38 (39% FF)

- **TEST ACCURACY HIGH SCORE:** 43.7 (37% FF) -14%- naive 


- ****HYPERPARAMETERS:****

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 5
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM

### high score using unbalanced set (50+% FF)
- **TRAINING ACCURACY HIGH SCORE:** 57.8 (91% FF)

- **TEST ACCURACY HIGH SCORE:** 60.86 (88% FF) 2.5%+ naive 


- ****HYPERPARAMETERS:****

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 20
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM