Predicting pitches using the Pytorch N-Gram Language Modeling example (https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html ).

In [717]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
verlander = pd.read_csv('Data/raw_data/verlander.csv')
scherzer = pd.read_csv('Data/raw_data/scherzer.csv')
lester =  pd.read_csv('Data/raw_data/lester.csv')
hamels =  pd.read_csv('Data/raw_data/hamels.csv')

In [718]:
#different pitch types 
verlander['pitch_type'].unique()

array(['CU', 'FF', 'SL', 'CH', nan, 'FC', 'FT', 'IN', 'PO'], dtype=object)

In [719]:
#get A list of at-bats. Each at-bat is a list of pitches that the pitcher threw to a specific batter
def get_abs(pitcher):
    pitcher_abs = []
    ab_num = pitcher['at_bat_number']
    pitch_types = pitcher['pitch_type']
    ab_pitches = []
    for index in reversed(range(len(pitch_types))):
        if index == 0:
            ab_pitches.append(pitch_types[index])
        elif ab_num[index] != ab_num[index - 1]:
            pitcher_abs.append(ab_pitches)
            ab_pitches = [pitch_types[index]]
        else:
            ab_pitches.append(pitch_types[index])
    return pitcher_abs

In [720]:
#decide what pitcher to run the model with here
ab_list = get_abs(verlander)

In [721]:
#remove any at-bat that has a NaN pitch type
def clear_nan_abs(ab_list):
    nan_abs = 0
    for ab in ab_list:
        is_na = False
        for pitch in ab:
            if pitch != pitch:
                is_na = True
                break
        if is_na:
            nan_abs += 1
            ab_list.remove(ab)
    return nan_abs


In [722]:
#clear nan columns
x = 1
while x != 0:
    x = clear_nan_abs(ab_list)

In [723]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

<torch._C.Generator at 0x7fe54d95f350>

In [724]:
#get train and test sets
train = ab_list[1:7856]
test = ab_list[7856:]

In [725]:
#this creates the ngrams from the at-bats, with the context length as a parameter
def create_ngrams(data,con_len):
    ngrams = []
    for ab in data:
        ab = (["NA"]*con_len) + ab
        for i in range(len(ab) - con_len):
              context = ab[i:i+con_len]
              target = ab[i+con_len]
              ngrams.append((context,target))                        
    return ngrams

In [726]:
#create ngrams for test and train
ngrams_test = create_ngrams(test,5)
ngrams_train = create_ngrams(train,5)

In [727]:
#Example: displayed below are 3 at-bats
train[:3]

[['FT', 'FT'], ['CH'], ['CH', 'FT', 'CH', 'FT', 'CH', 'CH']]

In [728]:
#here is the translation of each of those pitches to an ngram. The first pitch of the at-bat is padded with 'NA' values,
# and the padding continues until the number of prior-pitches in the at-bat equals the context size 
ngrams_train[:9]

[(['NA', 'NA', 'NA', 'NA', 'NA'], 'FT'),
 (['NA', 'NA', 'NA', 'NA', 'FT'], 'FT'),
 (['NA', 'NA', 'NA', 'NA', 'NA'], 'CH'),
 (['NA', 'NA', 'NA', 'NA', 'NA'], 'CH'),
 (['NA', 'NA', 'NA', 'NA', 'CH'], 'FT'),
 (['NA', 'NA', 'NA', 'CH', 'FT'], 'CH'),
 (['NA', 'NA', 'CH', 'FT', 'CH'], 'FT'),
 (['NA', 'CH', 'FT', 'CH', 'FT'], 'CH'),
 (['CH', 'FT', 'CH', 'FT', 'CH'], 'CH')]

In [729]:
#this function gets back the rate of the most frequent pitch (fastball in nearly every case), which will serve as the 
#benchmark
def get_benchmark(test):
    benchmark = 0
    for i in range(len(test)):
        if test[i][1] == 'FF':
            benchmark += 1
    return benchmark/len(test)

In [730]:
#must beat this mark
get_benchmark(ngrams_test)

0.586978255436141

In [732]:
#model I copied from pytorch website lol
CONTEXT_SIZE = 5
EMBEDDING_DIM = 10
torch.set_default_tensor_type(torch.cuda.FloatTensor)

vocab = set(['CU', 'FF', 'SL', 'CH', 'NA', 'FC', 'FT', 'IN', 'PO'])
pitch_to_ix = {pitch: i for i, pitch in enumerate(vocab)}
ix_to_pitch = {i: pitch for i, pitch in enumerate(vocab)}


class NGramModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, vocab_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        out = F.relu(out)
        out = self.linear3(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

losses = []
loss_function = nn.NLLLoss()
model = NGramModel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(5):
    total_loss = 0
    for context, target in ngrams_train:
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([pitch_to_ix[p] for p in context], dtype=torch.long)
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()
        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs).cuda()
        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([pitch_to_ix[target]], dtype=torch.long))
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
    # Get the Python number from a 1-element Tensor by calling tensor.item()
    print("epoch:",epoch+1,"loss:",loss.item())

epoch: 1 loss: 1.3114299774169922
epoch: 2 loss: 1.228050947189331
epoch: 3 loss: 1.2007803916931152
epoch: 4 loss: 1.1942858695983887
epoch: 5 loss: 1.1964590549468994


In [733]:
#calculate accuracy
correct = 0
for context,pitch in ngrams_test:
    values, idx = model(torch.tensor([pitch_to_ix[p] for p in context], dtype=torch.long))[0].max(0)
    #if ix_to_pitch[idx.item()] != 'FF':
        #print(ix_to_pitch[idx.item()])
    if ix_to_pitch[idx.item()] == pitch:
        correct += 1
print("accuracy above naive guess:",(correct/len(ngrams_test))-get_benchmark(ngrams_test),"%")

accuracy above naive guess: -0.0004998750312422695 %
