Inspiration for Neural Net: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Data Pre-Processing

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x7f1204d6ce30>

In [2]:
#Verlander:
##DET(2008-2017) HOU(2017-2019)
def verlander_home(verlander):
    if verlander['home_team'] == 'DET':
        if verlander['game_year'] in [2008,2009,2010,2011,2012,2013,2014,2015,2016,2017]:
                return 1
        else: return 0 
    elif verlander['home_team'] == 'HOU':
        if verlander['away_team'] == 'DET':
            if verlander['game_year'] == 2017:
                return 0
            else: return 0 
        elif verlander['game_year'] in [2017,2018,2019]:
            return 1    
        else: return 0   
    else: return 0 
#Lester:
## BOS(2008-2014) OAK(2014) CHC(2015-2019)
def lester_home(lester):
    if lester['home_team'] == 'BOS':
        if lester['game_year'] in [2008,2009,2010,2011,2012,2013,2014]:
            return 1
        else: return 0
    elif lester['home_team'] == 'CHC':
        if lester['game_year'] in [2015,2016,2017,2018,2019]:
            return 1
        else:
            return 0
    elif lester['home_team'] == 'OAK':
        if lester['away_team'] == 'BOS':
            if lester['game_year'] == 2014:
                return 0
            else:
                return 0
        elif lester['game_year'] == 2014:
            return 1
        else: return 0
    else:
        return 0
#Scherzer
## ARI(2008-2009) DET(2010-2014) WAS(2015-2019)
def scherzer_home(scherzer):
    if scherzer['home_team'] == 'ARI':
        if scherzer['game_year'] in [2008,2009]:
            return 1
        else: return 0
    elif scherzer['home_team'] == 'DET':
        if scherzer['game_year'] in [2010,2011,2012,2013,2014]:
            return 1
        else: return 0
    elif scherzer['home_team'] == 'WAS':
        if scherzer['game_year'] in [2015,2016,2017,2018,2019]:
            return 1 
        else: return 0
    else: return 0
#Hamels:
##PHI(2008-2015) TEX(2015-2018) CHC(2018-2019) 
def hamels_home(hamels):
    if hamels['home_team'] == 'PHI':
        if hamels['game_year'] in [2008,2009,2010,2011,2012,2013,2014,2015]:
            return 1
        else: return 0
    elif hamels['home_team'] == 'TEX':
        if hamels['game_year'] in [2015,2016,2017,2018]:
            return 1
        else: return 0
    elif hamels['home_team'] == 'CHC':
        if hamels['game_year'] in [2018,2019]:
            return 1
        else: return 0
    else: return 0

In [3]:
def clean_data(pitcher):
    pitcher['on_1b'] = pitcher['on_1b'].apply(lambda row: 1 if row == row else 0)
    pitcher['on_2b'] = pitcher['on_2b'].apply(lambda row: 1 if row == row else 0)
    pitcher['on_3b'] = pitcher['on_3b'].apply(lambda row: 1 if row == row else 0)
    pitcher['stand'] = pitcher['stand'].apply(lambda row: 1 if row == 'L' else 0)
    pitcher.rename(columns={'stand':'stand_L'},inplace=True)
    if pitcher['player_name'][0] == 'Justin Verlander':
        pitcher['home_team'] = pitcher.apply(lambda row: verlander_home(row),axis=1)
    if pitcher['player_name'][0] == 'Jon Lester':
        pitcher['home_team'] = pitcher.apply(lambda row: lester_home(row),axis=1)
    if pitcher['player_name'][0] == 'Cole Hamels':
        pitcher['home_team'] = pitcher.apply(lambda row: hamels_home(row),axis=1)
    if pitcher['player_name'][0] == 'Max Scherzer':
        pitcher['home_team'] = pitcher.apply(lambda row: scherzer_home(row),axis=1)  
    pitcher_cols = ['game_pk','pitch_type','pfx_x','pfx_z','description','release_speed',
                    'plate_x','plate_z','stand_L', 'home_team', 'balls','strikes',
                    'on_3b', 'on_2b', 'on_1b', 'outs_when_up']
    pitcher = pitcher[pitcher_cols]
    balls_ohe = pd.get_dummies(pitcher['balls'],prefix='balls')
    strikes_ohe = pd.get_dummies(pitcher['strikes'],prefix='strikes')
    outs_ohe = pd.get_dummies(pitcher['outs_when_up'],prefix='outs')
    des_ohe = pd.get_dummies(pitcher['description'])
    pitcher.drop(['balls','strikes','outs_when_up','description'],axis=1,inplace=True)
    pitcher =  pitcher.join(balls_ohe)
    pitcher = pitcher.join(strikes_ohe)
    pitcher = pitcher.join(outs_ohe)
    pitcher = pitcher.join(des_ohe)
    return pitcher

In [4]:
#transform data into a list of games 
def get_games(pitcher):
    game_id = pitcher['game_pk']
    pitcher = pitcher.values
    games = []
    game = []
    for index in reversed(range(len(game_id))):
        game.append(pitcher[index])
        if index != (len(game_id) - 1) and game_id[index] != game_id[index + 1]:
            games.append(game)
            game = []
    return games                   

In [5]:
#transform data into representations for model, i.e. (PREV_PITCHES,PREV_PTYPES,PRE_PITCH,PTYPE) where:
#PREV_PITCHES is a list of 5 vectors, each representing the last five pitches in terms of velocity, plate location,
#horizontal and vertical movement, and outcome.
#PREV_TYPES are the previous 5 pitch types
#PRE_PITCH is a  one_hot vector describing the pre-pitch game state for the pitch to predict, in terms of number of outs,
#BALLS, and strikes, runners on base, if the pitcher is home or not, and whether the batter is left handed
#PTYPE is the target pitch type for prediction 
def get_reps(pitcher_games):
    reps = []
    for game in pitcher_games:
        ptypes = [pitch[1] for pitch in game]
        prev_pitch_cont = [pitch[2:7].tolist() for pitch in game]
        prev_pitch_disc = [pitch[22:].tolist() for pitch in game]
        pitches = zip(prev_pitch_cont,prev_pitch_disc)
        prev_pitches = [cont + disc for cont,disc in pitches]
        pre_pitch = [pitch[7:22].tolist() for pitch in game]
        game_len = len(game)
        for i in range(game_len):
            if i < (game_len - 1) - 6:
                rep = (prev_pitches[i:i+5],ptypes[i:i+5],pre_pitch[i+5],ptypes[i+5])
                reps.append(rep)
    return reps    

In [6]:
#drop any representation that contains a NaN value
def drop_nas(reps):
    good_reps = []
    for rep in reps:
        is_na = False
        prev_pitches,prev_types,pre_pitch,ptype = rep
        if ptype != ptype:
            is_na = True
        else:
            for pitch in prev_pitches:
                for stat in pitch:
                    if stat != stat:
                        is_na = True
            for pitch in prev_types:
                if pitch != pitch:
                    is_na = True
        if not is_na:
            good_reps.append(rep)
            is_na = False
    return good_reps        

In [7]:
#drop half the fastballs in the set of representations, this creates an even distribution of ptypes in the training set
def drop_ff(reps):
    good_reps = []
    i = 0
    for rep in reps:
        prev_pitches,prev_types,pre_pitch,ptype = rep
        if ptype == 'FF':
            i += 1
            if i%4 == 0:
                good_reps.append(rep)
        else:
            good_reps.append(rep)
    return good_reps

In [8]:
#count the number of fourseam-fastballs in the set of representations
def num_ff(reps):
    i = 0
    for rep in reps:
            prev_pitches,prev_types,pre_pitch,ptype = rep
            if ptype == 'FF':
                i += 1
    return i

In [9]:
def drop_pitches(reps):
    good_reps = []
    for rep in reps:
        prev_pitches,prev_types,pre_pitch,ptype = rep
        if ptype in ['CU','FF','SL','CH']:
                good_reps.append(rep)
    return good_reps

In [81]:
verlander = pd.read_csv('Data/raw_data/verlander.csv')
ver = clean_data(verlander)
games = get_games(ver)
reps = get_reps(games)
reps = drop_nas(reps)
reps = drop_pitches(reps)
#train = drop_ff(reps[1:33000])

In [53]:
len(reps)

36291

In [82]:
train = drop_ff(reps[1:30000])
test = reps[30000:]

In [29]:
#get train and test sets
verlander = pd.read_csv('Data/raw_data/verlander.csv')
ver = clean_data(verlander)
games = get_games(ver)
reps = get_reps(games)
reps = drop_nas(reps)
reps = drop_pitches(reps)
#train = drop_ff(reps[1:33000])
train = reps[1:33000]
test = reps[33000:]

# Model Construction

In [10]:
# work-in-progress
def get_batches(train,batch_size):   
    out = []
    i = 0
    batch = []
    for rep in train:
        i += 1
        if i != 1 and ((i) % (batch_size) == 0):
            out.append(batch)
            batch = [rep]
        else:
            batch.append(rep)
    return out

In [17]:
x = torch.tensor([[0,1],[2,3]])
x.view(4)[2:]

tensor([2, 3])

In [110]:
#change this for different pitcher
tag_to_ix = {'CU':0,
             'FF':1,
             'SL':2,
             'CH':3}
             

PREV_PITCH_DIM = 21
HIDDEN_DIM = 10 #FIDDLE WITH THIS
NUM_PREV_PITCHES = 5
OUT_DIM = 15  #FIDDLE WITH THIS
GAME_STATE_DIM = 15
GAME_OUT_DIM = 3  #FIDDLE WITH THIS
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    
class PitchPredict(nn.Module):
    def __init__(self, prev_pitch_dim, hidden_dim, num_prev_pitches,out_dim, game_state_dim, game_out_dim, num_ptypes):
        super(PitchPredict, self).__init__()
        #get constants
        self.hidden_dim = hidden_dim
        self.prev_pitch_dim = prev_pitch_dim
        self.num_prev_pitches = num_prev_pitches
        self.out_dim = out_dim
        self.game_state_dim = game_state_dim
        self.game_out_dim = game_out_dim
        
        ####Define Layers####
        
        ########################## LSTM for past five pitches################################
        # The LSTM takes previous pitch vectors as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(prev_pitch_dim, hidden_dim,num_layers=1)#FIDDLE WITH num_layers

        # The linear layer that maps from hidden state space to a vector
        # with dimensionality OUT_DIM
        self.hidden2out = nn.Linear(hidden_dim, out_dim)
        
        ############## FULLY CONNECTED LAYER for LSTM OUTPUTS + game_state vector #################
        
        #layer to map the game state vector to a different dimension
        #self.l1 = nn.Linear(self.game_state_dim, self.game_out_dim)  
        
        #This Fully connected layer maps from the output of the final hidden layer output from the LSTM,
        # dimension = OUT_DIM, with the game state vector, dimension = GAME_STATE_DIM
        # to a vector of length of the number of ptypes to pass through softmax for probabilities
        self.fc1 = nn.Linear(self.out_dim + self.game_state_dim, num_ptypes)
        #self.fc1 = nn.Linear((self.out_dim*self.num_prev_pitches)+self.game_out_dim,num_ptypes)
                             
    def forward(self, rep):
        past_pitches,game_state = rep
        lstm_out, _ = self.lstm(past_pitches.view(self.num_prev_pitches, 1, -1))
        learned_rep = self.hidden2out(lstm_out.view(self.num_prev_pitches, -1))
        #game_rep = self.l1(game_state)
        fc_in = torch.cat((learned_rep.view(self.num_prev_pitches*self.out_dim)[(self.num_prev_pitches-1)*self.out_dim:],game_state))
        fc = self.fc1(fc_in)
        tag_scores = F.log_softmax(fc,dim=0)
        return tag_scores

In [42]:
def test_accuracy(test,model):
    num_right = 0
    ff_count = 0
    predict_ff = 0
    predictions = []
    for rep in test:
        with torch.no_grad():
            prev_pitches,prev_types,pre_pitch,ptype = rep
            prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
            rep_in = torch.tensor(pre_pitch, dtype=torch.float)
            target = tag_to_ix[ptype]
            if torch.cuda.is_available():
                tag_scores = model((prevs_in,rep_in)).cuda()
            else:
                tag_scores = model((prevs_in,rep_in))
            value, index = tag_scores.max(0)
            #print(index.item())
            predictions.append((index.item(),target))
            if target == 1:
                ff_count += 1
            if index.item() == 1:
                predict_ff += 1
            if index.item() == target:
                num_right += 1   
    ff_rate = ff_count/len(test)
    pred_ff_rate = predict_ff/len(test)
    accuracy = num_right/len(test)
    print("Fourseam Fastball rate:",ff_rate)
    print("Predicted Fourseam Fastball rate:",pred_ff_rate)
    print("Accuracy:",accuracy)
    print("Accuracy above naive guess:",accuracy - ff_rate)
    return predictions
    

# Training and Testing

In [111]:
model = PitchPredict(PREV_PITCH_DIM, HIDDEN_DIM, NUM_PREV_PITCHES, OUT_DIM, GAME_STATE_DIM, GAME_OUT_DIM, len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)#FIDDLE WITH LEARNING RATE (lr)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    print("***************Pre-Training Accuracy*****************")
    test_accuracy(train,model)
print("**************************Training*****************************")
for epoch in range(5):
    num_right = 0
    for rep in train:
        prev_pitches,prev_types,pre_pitch,ptype = rep
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        #get input tensors ready
        prevs_in = torch.tensor(prev_pitches, dtype=torch.float)
        game_state_in = torch.tensor(pre_pitch, dtype=torch.float)
        
        #get target value
        target = tag_to_ix[ptype]

        # Step 3. Run our forward pass.
        if torch.cuda.is_available():
            tag_scores = model((prevs_in,game_state_in)).cuda()
        else:
            tag_scores = model((prevs_in,game_state_in))
               
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores.view(1,-1), torch.tensor([target],dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        #get accuracy
        value, index = tag_scores.max(0)
        if index.item() == target:
            num_right += 1
            
    #display post-epoch results      
    accuracy = num_right/len(train)
    print('epoch:',epoch+1,"loss:",loss.item())

# See what the scores are after training
with torch.no_grad():
    print("****************Post-Training Accuracy********************")
    test_accuracy(train,model.eval())
    print("**********************************************************")
with torch.no_grad():
    print("*******************Test Accuracy**********************")
    test_accuracy(test,model.eval())
    print("*****************************************************")

***************Pre-Training Accuracy*****************
Fourseam Fastball rate: 0.25281359906213363
Predicted Fourseam Fastball rate: 0.06770222743259086
Accuracy: 0.2597303634232122
Accuracy above naive guess: 0.006916764361078576
**************************Training*****************************
epoch: 1 loss: 0.8353438973426819
epoch: 2 loss: 0.8366174697875977
epoch: 3 loss: 0.8378267288208008
epoch: 4 loss: 0.8373398780822754
epoch: 5 loss: 0.8481465578079224
****************Post-Training Accuracy********************
Fourseam Fastball rate: 0.25281359906213363
Predicted Fourseam Fastball rate: 0.3888042203985932
Accuracy: 0.3816529894490035
Accuracy above naive guess: 0.12883939038686987
**********************************************************
*******************Test Accuracy**********************
Fourseam Fastball rate: 0.5801939278334128
Predicted Fourseam Fastball rate: 0.37672865999046257
Accuracy: 0.4369734541408361
Accuracy above naive guess: -0.14322047369257673
**************

In [112]:
with torch.no_grad():
    print("****************Post-Training Test Accuracy********************")
    preds = test_accuracy(test,model)
    print("**********************************************************")

****************Post-Training Test Accuracy********************
Fourseam Fastball rate: 0.5801939278334128
Predicted Fourseam Fastball rate: 0.37672865999046257
Accuracy: 0.4369734541408361
Accuracy above naive guess: -0.14322047369257673
**********************************************************


In [107]:
i = 0
for pred,target in preds:
    if pred == 3:
        i += 1

In [108]:
i/len(preds)

0.023366714353838816

### high score using balanced set (25% FF)
- **TRAINING ACCURACY HIGH SCORE:** 38 (39% FF)

- **TEST ACCURACY HIGH SCORE:** 43.7 (37% FF) -14%- naive 


- ****HYPERPARAMETERS:****

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 5
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM

### high score using unbalanced set (50+% FF)
- **TRAINING ACCURACY HIGH SCORE:** 57.8 (91% FF)

- **TEST ACCURACY HIGH SCORE:** 60.86 (88% FF) 2.5%+ naive 


- ****HYPERPARAMETERS:****

    - LSTM num_layers: 1
    
    - OUT_SIZE: 15
    
    - HIDDEN_DIMENSION: 10
    
    - EPOCHS: 20
   
    - LEARNING RATE: 0.001
    
    - LOSS FUNCTION: NLLLOSS
    
    - OPTIMIZER: ADAM