# Hand and Position Model

This model will be similar to the previous one, but the vectors in the sequences will also include the pitch and position data from the at bat.  

### Data
First, the new example vectors need to be created.  The y tensor is exactly the same, but extra work needs to be done to create X.

In [76]:
import pickle
import numpy as np
 
full_data = [] 
year = 2016
for m in [3,4,5,6,7,8]:
    fn = "../data/pitches_{}_{}.p".format(year, m)
    seqs = pickle.load(open(fn, "rb"))
    full_data += seqs

cleaned_data = [] # no 0 or 1 length sequences. 
longest_seq = 0
empties_or_single = 0
pitch_types = set()
pos_types   = set()

for line in full_data:
    if(len(line[1]) > longest_seq): longest_seq = len(line[1])
    if(len(line[1]) <= 1): 
        empties_or_single += 1
    else:
        cleaned_data.append(line)
        pos_types.add(line[0][2]) 
        for p in line[1]: # the seq is the second element, first is the feature vector
            pitch_types.add(p)

print("longest sequence length: {}\nempties: {}\ntotal (clean): {}\npitch types: {}".format(longest_seq, 
                                                                                            empties_or_single,
                                                                                            len(cleaned_data),
                                                                                            len(pitch_types)))

print("pos types: {}".format(len(pos_types)))
# Saving the cleaned data to a pickle to make it easier to work with the other models. 
pickle.dump(cleaned_data, open("../data/pitches_full_{}.p".format(year), "wb"))

print(pos_types)

longest sequence length: 18
empties: 13757
total (clean): 110216
pitch types: 16
pos types: 125
{'DH-1B', '2B-SS', 'RF', 'CF-3B', 'PR-2B', 'LF-C', 'DH-3B', 'SS-1B', 'SS-3B-SS', 'C', '1B-2B', '1B-LF', '1B-P', 'SS-2B', 'LF-3B', 'PH-1B', 'DH-SS', '1B-3B', 'PR-DH-3B', 'CF-SS', '3B-P', 'LF-CF', '1B-CF', 'LF-2B', 'RF-LF-1B', 'PH-DH', 'DH-2B', '3B-RF', 'PR-RF', 'LF-RF-LF', '1B-RF', '2B-CF', '3B-RF-2B', 'CF-2B', 'C-3B', 'DH-LF', 'PH-3B-1B', '2B-1B', 'SS-LF', 'SS-P', '2B-LF-RF', 'LF-SS', 'LF-P-LF-P', 'PH-LF-RF', 'PH-2B', '2B-RF', 'SS-3B', 'PR-LF-CF', '3B-RF-LF', 'PH-LF-CF', 'PH-DH-RF', 'C-LF-P-2B', 'PR-1B', '3B-2B', 'PH-RF-LF', 'CF-LF-CF', '2B-3B-LF', 'PH-SS', 'RF-CF', '2B', 'RF-LF', '3B-RF-3B', 'RF-LF-3B', 'SS', 'DH', '3B-LF', 'PH-3B', '3B-2B-LF', 'PH-DH-2B', 'PR-RF-CF', 'LF', '3B-SS', 'RF-SS', 'RF-LF-CF', '2B-3B', '1B-C', 'P-LF-P', 'PH-CF', 'C-1B', 'PH', 'P', 'LF-1B-LF', 'PH-C', 'PR-3B-1B', 'PH-2B-1B', 'PR-DH', 'LF-CF-LF', 'LF-1B', 'CF-1B', '1B-SS', 'PH-1B-LF', 'LF-RF', 'PR-1B-LF-1B', 'SS-RF'

Woa! Lot more positions than I was anticipating.  Looks like they allow for multiple positions.  I think I can still handle this, but the positions feature will have to be a one-hot of all the individual positions, and a batters pos vector would contain a value for each position listed.  These could be normalized, even. 

Regardless, the position will be a one-hot vector of length 12.  This will be concatenated with each pitches onehot vector.  In addition, two other values will be added.  Two 0/1 values that represent the handedness of the batter and pitcher.  

In [77]:
simple_poss = set()
for p in pos_types:
    p_split = p.split("-")
    for i in p_split:
        simple_poss.add(i)
# This should just be the regular list of positions. 
print(len(simple_poss), simple_poss)

12 {'DH', 'CF', 'C', 'RF', 'P', 'PH', '3B', 'PR', '2B', 'LF', '1B', 'SS'}


In [78]:
cleaned_data[0]

[['R', 'R', 'RF'], ['FT', 'FF', 'CH']]

In [54]:
# Creating X - padded sequences of one-hots. Need a dictionary of pitch types and positions.
pitch_map = {
    'KC': 0,
    'CH': 1,
    'SL': 2,
    'SI': 3,
    'FO': 4,
    'FS': 5,
    'CU': 6,
    'PO': 7,
    'KN': 8,
    'FF': 9,
    'EP': 10,
    'IN': 11,
    'SC': 12,
    'FT': 13,
    'FC': 14,
    'UN': 15
}

pos_map = {
    '1B': 0,
    '2B': 1,
    '3B': 2,
    'PR': 3,
    'P':  4,
    'C':  5,
    'DH': 6,
    'SS': 7,
    'PH': 8,
    'CF': 9,
    'RF': 10,
    'LF': 11
}

# Need to think about handling the 'both' pitchers and 'switch' hitters. 
hand_map ={
    'L': 0.0,
    'R': 1.0,
    'B': 0.5, # Just doing this for now. 
    'S': 0.5
}

MAX_LENGTH = longest_seq
NUM_EXTRA_FEATURES = len(pos_map)+2 # pitcher hand, batter hand, batter pos

def create_feature_vec(seq):
    # Create positions one-hot
    pos = np.zeros((len(pos_map),), dtype=np.float32)
    for p in seq[0][2].split("-"):
        pos[pos_map[p]] = 1.0
    
    # Handedness features
    hands = np.zeros((2,), dtype=np.float32)
    hands[0] = hand_map[seq[0][0]]
    hands[1] = hand_map[seq[0][1]]
    return np.concatenate((hands, pos))
    
def create_oneshot_seq(seq):
    # Pitch Sequence
    pitches = []
    i = 0
    for p in seq[1]:
        p_oh = np.zeros((len(pitch_map),), dtype=np.float32)
        p_oh[pitch_map[p]] = 1.0
        pitches.append(p_oh)
        i += 1
    for j in range(i, MAX_LENGTH):# Pad to length. 
        pitches.append(np.zeros((len(pitch_map)), dtype=np.float32))
    return np.array(pitches)

def create_target(seq):
    ret = []
    i = 0
    for p in seq[1][1:]:
        ret.append(pitch_map[p])
        i += 1
    for j in range(i, MAX_LENGTH):
        ret.append(0)
    return ret

X_full = [] # Sequences of onehots.
f_full = [] # Feature vectors
y_full = [] # index of correct pitch in the one-hot, starting at X[1]
for line in cleaned_data:
    X_full.append(create_oneshot_seq(line))
    f_full.append(create_feature_vec(line))
    y_full.append(create_target(line))

# Each entry in X_full will be a 2 entry array.  The first element is the feature vector and the
# second entry is the actual pitch sequence.

# The sequence should have the same length as 'max length'
# the feature vector should be 12 + 2 = 14 (num_pos + num_hands)

print("seq_len = {}\nfeature_len = {}".format(len(X_full[0]), len(f_full[0])))

seq_len = 18
feature_len = 14


### Model
I feel like there is an issue with just extending the tensor that goes from cell to cell in the RNN.  If at each iteration, we get an output that represents the logits for each feature in the feature vector, whats to stop the network from just predicting the handedness and position at each step, because it never changes?

Is there a way to restrict the calculation of the logits to just the 16 pitch outputs?  Could I make the input be the 16+2+12 vector, make the internals output 16, and the next input would be output+(2+12features)?

Can I force the prediction to just be the result of a FC from a subset of output tensor?  That seems like the right direction. Mask, or create a new tensor, then change the prediction op to use that.  Then the loss stuff would stay the same.  

In [72]:
import tensorflow as tf
tf.reset_default_graph()

# Assumptions about data:
#  - X Padded to MAX_SIZE, with 0-vectors of size(pitch_types)
#  - X also includes the additional hand and pos feature vector. 
#  - y Padded to MAX_SIZE, with 0's.  (get length off of X, though)

##### Construction Phase ###############
NUM_INPUTS   = 16    # Size of the input vector (the number of possible pitch types)
NUM_OUTPUTS  = 16    # Want a pitch type out, so same size as input.
NUM_NEURONS  = 10    # Number of neurons inside the RNN cell.  
MAX_SIZE     = 18    # the maximum size of a sequence.  Everything gets padded to this, and masked.
FEATURE_SIZE = 14    # Size of the additional feature vector.

BATCH_SIZE = 5
LEARNING_RATE = 0.015

### RNN Graph
X = tf.placeholder( tf.float32, [BATCH_SIZE, MAX_SIZE, NUM_INPUTS] )
F = tf.placeholder( tf.float32, [BATCH_SIZE, FEATURE_SIZE] )

# y is X shifted to the left, but also converted to the *index* of the correct logit - for seq2seq loss.
y = tf.placeholder( tf.int32, [BATCH_SIZE, MAX_SIZE] ) 

# Get a 1D Tensor to hold the 'true' length of each padded sequence in a batch
collapsed_features = tf.sign(tf.reduce_max(tf.abs(X), 2)) # use max+abs to see what elements arent 0-vectors
seq_len  = tf.cast( tf.reduce_sum(collapsed_features, 1), tf.int32 ) # Count the 1's to get length.
seq_mask = tf.sequence_mask(seq_len, maxlen=MAX_SIZE, dtype=tf.float32) # Create a mask from these lengths

basic_cell = tf.contrib.rnn.BasicRNNCell( num_units=NUM_NEURONS )
outputs, states = tf.nn.dynamic_rnn( basic_cell, X, dtype=tf.float32, sequence_length=seq_len ) 

# Here's the difference in implementation.  The logits are computed from a concatenation of the
# outputs and the 'static' feature vector.  

F_expanded = tf.tile(tf.expand_dims(F, 1), [1, MAX_SIZE, 1])

combined_outputs = tf.concat((outputs, F_expanded), 2)

# combined_output = tf.SOMETHING.concatenate( outputs, features )
logits = tf.contrib.layers.fully_connected(combined_outputs, NUM_OUTPUTS)
# logits = tf.contrib.layers.fully_connected(outputs, NUM_OUTPUTS)


### Loss, Optimization, Training.  
loss = tf.contrib.seq2seq.sequence_loss(logits, 
                                        y, 
                                        seq_mask, 
                                        average_across_timesteps=True, 
                                        average_across_batch=True)
tf.summary.scalar('loss', loss)

optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)

# Once we have a loss function, we can just let the optimizer do its job. (hopefully)
training_op = optimizer.minimize( loss )

init = tf.global_variables_initializer()


In [75]:
#### Training Phase ###############
EPOCHS     = 10 # Will need to figure out what this should be. dVC stuff?
ITERATIONS = 10000

# NOTE: Hacky right now, but just want to get data into the model.
# TODO: actually turn the data into tf.Dataset objects? 
# TODO: or use some of the batch operations?
def get_training_batch(X, f, y, batch_size):
    ids = np.random.randint(0, len(X), batch_size) 
    return np.array(X)[ids], np.array(f)[ids], np.array(y)[ids]

merged = tf.summary.merge_all()

# Testing for now, want to see if it actually updates on one batch.
with tf.Session() as sess:
    summary_writer = tf.summary.FileWriter('../data', sess.graph)
    init.run()
    
    # For debugging
    X_batch, F_batch, y_batch = get_training_batch( X_full, f_full, y_full, BATCH_SIZE )
    print("shape(F): ",  sess.run(F, feed_dict={X: X_batch, F: F_batch, y: y_batch}).shape)
    print("shape(tiled_features): ",  sess.run(combined_outputs, feed_dict={X: X_batch, F: F_batch, y: y_batch}).shape)    
  
    for i in range(ITERATIONS):
        X_batch, F_batch, y_batch = get_training_batch( X_full, f_full, y_full, BATCH_SIZE )   
        _, l, summary = sess.run([training_op, loss, merged], feed_dict={X: X_batch, F: F_batch, y: y_batch})
        summary_writer.add_summary(summary, i)
        if i%10 == 0: print("loss at i {}: {}".format(i, l))
            
    summary_writer.close()


shape(F):  (5, 14)
shape(tiled_features):  (5, 18, 24)
loss at i 0: 2.6451919078826904
loss at i 10: 2.42616605758667
loss at i 20: 2.3113667964935303
loss at i 30: 2.2893896102905273
loss at i 40: 1.9962695837020874
loss at i 50: 2.0949418544769287
loss at i 60: 2.1340460777282715
loss at i 70: 1.9927934408187866
loss at i 80: 1.7872798442840576
loss at i 90: 1.6347181797027588
loss at i 100: 1.7173080444335938
loss at i 110: 1.958595871925354
loss at i 120: 1.9544323682785034
loss at i 130: 2.432661771774292
loss at i 140: 1.4367483854293823
loss at i 150: 1.9806965589523315
loss at i 160: 1.9730268716812134
loss at i 170: 1.9884119033813477
loss at i 180: 1.9526020288467407
loss at i 190: 1.9420955181121826
loss at i 200: 2.107860803604126
loss at i 210: 1.5580101013183594
loss at i 220: 1.5647457838058472
loss at i 230: 1.9798862934112549
loss at i 240: 1.6896913051605225
loss at i 250: 1.7287826538085938
loss at i 260: 1.4648513793945312
loss at i 270: 1.7118891477584839
loss at i

loss at i 2380: 1.5496885776519775
loss at i 2390: 1.8234124183654785
loss at i 2400: 1.6629376411437988
loss at i 2410: 1.4765913486480713
loss at i 2420: 1.8025481700897217
loss at i 2430: 1.4280924797058105
loss at i 2440: 1.5513323545455933
loss at i 2450: 1.6918102502822876
loss at i 2460: 1.7705575227737427
loss at i 2470: 1.5357840061187744
loss at i 2480: 1.798229455947876
loss at i 2490: 2.1960113048553467
loss at i 2500: 1.801081895828247
loss at i 2510: 2.2344343662261963
loss at i 2520: 1.4584532976150513
loss at i 2530: 1.6940016746520996
loss at i 2540: 1.450158715248108
loss at i 2550: 1.6697673797607422
loss at i 2560: 2.269953489303589
loss at i 2570: 1.6179018020629883
loss at i 2580: 2.014347791671753
loss at i 2590: 1.7201809883117676
loss at i 2600: 2.14612078666687
loss at i 2610: 1.8518308401107788
loss at i 2620: 1.2271506786346436
loss at i 2630: 1.8853482007980347
loss at i 2640: 2.1139519214630127
loss at i 2650: 1.5687170028686523
loss at i 2660: 1.485453605

loss at i 4740: 1.6993424892425537
loss at i 4750: 1.9289298057556152
loss at i 4760: 2.3035643100738525
loss at i 4770: 1.6932036876678467
loss at i 4780: 1.8010753393173218
loss at i 4790: 1.9341217279434204
loss at i 4800: 1.6575204133987427
loss at i 4810: 1.4792699813842773
loss at i 4820: 1.3333314657211304
loss at i 4830: 1.4460935592651367
loss at i 4840: 1.6035950183868408
loss at i 4850: 1.5122863054275513
loss at i 4860: 2.0259172916412354
loss at i 4870: 1.9948936700820923
loss at i 4880: 1.3695167303085327
loss at i 4890: 1.7331959009170532
loss at i 4900: 1.9805819988250732
loss at i 4910: 1.7371567487716675
loss at i 4920: 1.1094332933425903
loss at i 4930: 1.8216477632522583
loss at i 4940: 1.6491690874099731
loss at i 4950: 1.3422423601150513
loss at i 4960: 1.5432418584823608
loss at i 4970: 2.018401861190796
loss at i 4980: 1.8296537399291992
loss at i 4990: 1.6752114295959473
loss at i 5000: 1.967334270477295
loss at i 5010: 1.6441985368728638
loss at i 5020: 1.4928

loss at i 7100: 1.5216366052627563
loss at i 7110: 2.2214460372924805
loss at i 7120: 1.8268442153930664
loss at i 7130: 2.1025795936584473
loss at i 7140: 1.8093243837356567
loss at i 7150: 1.7985748052597046
loss at i 7160: 2.3780689239501953
loss at i 7170: 2.591331720352173
loss at i 7180: 1.9981609582901
loss at i 7190: 1.816736102104187
loss at i 7200: 1.3454113006591797
loss at i 7210: 2.162644386291504
loss at i 7220: 1.4984389543533325
loss at i 7230: 1.5378327369689941
loss at i 7240: 1.6162934303283691
loss at i 7250: 1.6243751049041748
loss at i 7260: 1.521057367324829
loss at i 7270: 1.9787521362304688
loss at i 7280: 1.9871846437454224
loss at i 7290: 2.6016650199890137
loss at i 7300: 1.6835061311721802
loss at i 7310: 1.3180091381072998
loss at i 7320: 1.8899675607681274
loss at i 7330: 1.6435832977294922
loss at i 7340: 2.1245462894439697
loss at i 7350: 1.9004918336868286
loss at i 7360: 2.2346608638763428
loss at i 7370: 1.4983867406845093
loss at i 7380: 1.390581369

loss at i 9460: 1.374603509902954
loss at i 9470: 1.8527960777282715
loss at i 9480: 2.0412962436676025
loss at i 9490: 1.14285409450531
loss at i 9500: 2.0627753734588623
loss at i 9510: 1.8791292905807495
loss at i 9520: 2.0615267753601074
loss at i 9530: 1.5503530502319336
loss at i 9540: 1.341149091720581
loss at i 9550: 1.9804472923278809
loss at i 9560: 1.395098328590393
loss at i 9570: 1.7393008470535278
loss at i 9580: 1.9806724786758423
loss at i 9590: 1.4780060052871704
loss at i 9600: 1.8604604005813599
loss at i 9610: 2.005167007446289
loss at i 9620: 1.9821746349334717
loss at i 9630: 1.728538155555725
loss at i 9640: 1.513158917427063
loss at i 9650: 1.4065732955932617
loss at i 9660: 1.9304578304290771
loss at i 9670: 1.8239959478378296
loss at i 9680: 1.4215291738510132
loss at i 9690: 2.1249008178710938
loss at i 9700: 1.9312522411346436
loss at i 9710: 1.3415294885635376
loss at i 9720: 1.3063488006591797
loss at i 9730: 1.5919761657714844
loss at i 9740: 1.5957603454