Inspiration for Neural Net: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Data Pre-Processing

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pitcher
torch.manual_seed(1)

<torch._C.Generator at 0x7fc77b616db0>

In [2]:
# Read the raw pitch log and run it through the project's `pitcher` helpers.
# The helper names suggest: clean -> group into games -> build reps -> drop
# rows with missing values -> drop unwanted pitches (presumed from the names;
# confirm against pitcher.py).
verlander = pd.read_csv('Data/raw_data/verlander.csv')
ver = pitcher.clean_data(verlander)
games = pitcher.get_games(ver)
reps = pitcher.drop_pitches(pitcher.drop_nas(pitcher.get_reps(games)))

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [4]:

# Split with the training portion passed through pitcher.drop_ff:
# the first 30000 reps (minus index 0) train, the remainder tests.
# NOTE(review): the slice starts at 1, so reps[0] lands in neither split --
# confirm whether that is intentional.
train, test = pitcher.drop_ff(reps[1:30000]), reps[30000:]

In [None]:
# Split without drop_ff: reps 1..32999 train, everything from 33000 on tests.
# Running this cell overrides any train/test assigned by an earlier cell.
# NOTE(review): the slice starts at 1, so reps[0] lands in neither split --
# confirm whether that is intentional.
train, test = reps[1:33000], reps[33000:]

In [None]:
# work-in-progress
def get_batches(train, batch_size):
    """Split ``train`` into consecutive batches of at most ``batch_size`` items.

    Items keep their original order. Every batch holds exactly ``batch_size``
    items except possibly the last one, which holds whatever remains (the
    previous implementation made the first batch one item short and silently
    discarded the trailing partial batch).

    Args:
        train: any iterable of training examples.
        batch_size: positive number of examples per batch.

    Returns:
        A list of lists; empty when ``train`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    out = []
    batch = []
    for rep in train:
        batch.append(rep)
        # Flush as soon as the current batch is full.
        if len(batch) == batch_size:
            out.append(batch)
            batch = []
    # Keep the leftover examples instead of dropping them.
    if batch:
        out.append(batch)
    return out

In [None]:
def test_accuracy(test,model):
    num_right = 0
    ff_count = 0
    predict_ff = 0
    predictions = []
    for rep in test:
        with torch.no_grad():
            prev_pitches,prev_types,pre_pitch,ptype = rep
            prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
            rep_in = torch.tensor(pre_pitch, dtype=torch.float)
            target = tag_to_ix[ptype]
            if torch.cuda.is_available():
                tag_scores = model((prevs_in,rep_in)).cuda()
            else:
                tag_scores = model((prevs_in,rep_in))
            value, index = tag_scores.max(0)
            #print(index.item())
            predictions.append((index.item(),target))
            if target == 1:
                ff_count += 1
            if index.item() == 1:
                predict_ff += 1
            if index.item() == target:
                num_right += 1   
    ff_rate = ff_count/len(test)
    pred_ff_rate = predict_ff/len(test)
    accuracy = num_right/len(test)
    print("Fourseam Fastball rate:",ff_rate)
    print("Predicted Fourseam Fastball rate:",pred_ff_rate)
    print("Accuracy:",accuracy)
    print("Accuracy above naive guess:",accuracy - ff_rate)
    return predictions
    

# Model Construction

In [None]:
# Pitch-type label -> class index. Edit the tuple when modelling a different
# pitcher; the position in the tuple is the class index.
tag_to_ix = {ptype: ix for ix, ptype in enumerate(('CU', 'FF', 'SL', 'CH'))}

# Sizes passed to PitchPredict. The ones marked "tunable" are free
# hyperparameters.
PREV_PITCH_DIM = 21     # length of each previous-pitch vector
HIDDEN_DIM = 10         # LSTM hidden size (tunable)
NUM_PREV_PITCHES = 5    # number of previous pitches fed to the LSTM
OUT_DIM = 5             # size of the LSTM output projection (tunable)
GAME_STATE_DIM = 15     # length of the game-state vector
GAME_OUT_DIM = 5        # size of the game-state projection (tunable)

# When a GPU is present, make newly created float tensors CUDA tensors.
# NOTE(review): torch.set_default_tensor_type is deprecated in recent PyTorch
# releases -- confirm against the installed version before upgrading.
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    
class PitchPredict(nn.Module):
    """Pitch-type classifier combining an LSTM over previous pitches with a
    linear projection of a game-state vector.

    Pipeline (see ``forward``):
      1. ``lstm``        : previous-pitch vectors -> hidden states
      2. ``hidden2out``  : hidden states -> ``out_dim`` features per step
      3. ``l1``          : game-state vector -> ``game_out_dim`` features
      4. ``fc1``         : [last-step features, game features] -> class scores
      5. log-softmax over the class scores
    """

    def __init__(self, prev_pitch_dim, hidden_dim, num_prev_pitches,out_dim, game_state_dim, game_out_dim, num_ptypes):
        """Store the size constants and build the four layers.

        Args:
            prev_pitch_dim: length of one previous-pitch vector (LSTM input size).
            hidden_dim: LSTM hidden size.
            num_prev_pitches: sequence length fed to the LSTM.
            out_dim: output size of the projection applied to LSTM hidden states.
            game_state_dim: length of the game-state vector.
            game_out_dim: output size of the game-state projection.
            num_ptypes: number of pitch-type classes to score.
        """
        super().__init__()

        # Size constants kept on the instance for use in forward().
        self.hidden_dim = hidden_dim
        self.prev_pitch_dim = prev_pitch_dim
        self.num_prev_pitches = num_prev_pitches
        self.out_dim = out_dim
        self.game_state_dim = game_state_dim
        self.game_out_dim = game_out_dim

        # --- Sequence branch: single-layer LSTM over the previous pitches,
        # followed by a linear map from hidden space to out_dim.
        self.lstm = nn.LSTM(prev_pitch_dim, hidden_dim, num_layers=1)  # num_layers is tunable
        self.hidden2out = nn.Linear(hidden_dim, out_dim)

        # --- Game-state branch: linear re-projection of the game-state vector.
        self.l1 = nn.Linear(game_state_dim, game_out_dim)

        # --- Head: concatenated branch outputs -> one score per pitch type.
        self.fc1 = nn.Linear(out_dim + game_out_dim, num_ptypes)

    def forward(self, rep):
        """Score a single example.

        Args:
            rep: pair ``(past_pitches, game_state)``. ``past_pitches`` is
                reshaped to ``(num_prev_pitches, 1, -1)`` before the LSTM;
                ``game_state`` is passed to ``l1`` as-is.

        Returns:
            1-D tensor of log-probabilities over the pitch types
            (log-softmax along dim 0).
        """
        pitch_seq, state_vec = rep
        steps = self.num_prev_pitches

        # LSTM expects (seq_len, batch, features); batch size is 1 here.
        hidden_seq, _ = self.lstm(pitch_seq.view(steps, 1, -1))
        per_step = self.hidden2out(hidden_seq.view(steps, -1))

        # Only the projection of the final time step is used downstream.
        last_step = per_step[-1]
        state_rep = self.l1(state_vec)

        logits = self.fc1(torch.cat((last_step, state_rep)))
        return F.log_softmax(logits, dim=0)

# Training and Testing

In [None]:
model = PitchPredict(PREV_PITCH_DIM, HIDDEN_DIM, NUM_PREV_PITCHES, OUT_DIM, GAME_STATE_DIM, GAME_OUT_DIM, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.002)#FIDDLE WITH LEARNING RATE (lr)

# Baseline: score the untrained model on the training split.
# No parameters are updated here, so gradient tracking is disabled.
with torch.no_grad():
    print("***************Pre-Training Accuracy*****************")
    test_accuracy(train,model)
print("**************************Training*****************************")
for epoch in range(5):
    num_right = 0
    total_loss = 0.0
    for rep in train:
        prev_pitches,prev_types,pre_pitch,ptype = rep

        # Step 1. PyTorch accumulates gradients, so clear them before each
        # example (one optimizer step per example).
        model.zero_grad()

        # Step 2. Build the input tensors and look up the target class index.
        prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
        game_state_in = torch.tensor(pre_pitch, dtype=torch.float)
        target = tag_to_ix[ptype]

        # Step 3. Forward pass; the output is a 1-D vector of log-probabilities.
        tag_scores = model((prevs_in,game_state_in))

        # Step 4. NLLLoss expects (batch, classes), hence the view to (1, -1).
        # Compute the loss, back-propagate and update the parameters.
        loss = loss_function(tag_scores.view(1,-1), torch.tensor([target],dtype=torch.long))
        loss.backward()
        optimizer.step()

        # Running statistics for the end-of-epoch report.
        total_loss += loss.item()
        value, index = tag_scores.max(0)
        if index.item() == target:
            num_right += 1

    # Report epoch-level statistics. Previously only the last example's loss
    # was printed and the computed accuracy was discarded.
    num_seen = max(len(train), 1)  # guard against an empty training set
    accuracy = num_right/num_seen
    mean_loss = total_loss/num_seen
    print('epoch:',epoch+1,"loss:",mean_loss,"accuracy:",accuracy)

# See what the scores are after training
with torch.no_grad():
    print("****************Post-Training Accuracy********************")
    test_accuracy(train,model.eval())
    print("**********************************************************")
with torch.no_grad():
    print("*******************Test Accuracy**********************")
    test_accuracy(test,model.eval())
    print("*****************************************************")

In [None]:
# Collect (predicted, target) pairs for the rate comparison that follows.
# The banner announces test accuracy, so the held-out `test` split is scored
# here (the cell previously scored `train`, contradicting its own banner).
with torch.no_grad():
    print("****************Post-Training Test Accuracy********************")
    preds = test_accuracy(test,model)
    print("**********************************************************")

In [None]:
# Compare how often one class is predicted with how often it actually occurs.
# `preds` holds (predicted_index, target_index) pairs; 3 is 'CH' in tag_to_ix.
pitch = 3
i = sum(1 for guessed, _ in preds if guessed == pitch)   # times predicted
j = sum(1 for _, actual in preds if actual == pitch)     # times it occurred
print("predicated rate:",i/len(preds),"actual rate:",j/len(preds))

### high score using balanced set (25% FF)
- **TRAINING ACCURACY HIGH SCORE:** 38 (39% FF)

- **TEST ACCURACY HIGH SCORE:** 43.7 (37% FF); -14% vs. naive guess


- **HYPERPARAMETERS:**

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 5
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM

### high score using unbalanced set (50+% FF)
- **TRAINING ACCURACY HIGH SCORE:** 57.8 (91% FF)

- **TEST ACCURACY HIGH SCORE:** 60.86 (88% FF); +2.5% vs. naive guess


- **HYPERPARAMETERS:**

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 20
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM