#### Imports 

In [234]:
import tensorflow as tf
import numpy as np
import sys
import re
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential 
from tensorflow.keras import layers
from pprint import pprint

#### Definitions

In [116]:
def load_games_list(filename='corpus.txt', verbose = False):
    '''
    in: filename, the corpus
    out: list of list of events in each game
    '''
    if verbose:
        print('Reading corpus')
    with open(filename,'r') as f:
        return [game.split() for game in f.readlines()]
    
def strip_aZ(games_list):
    '''
    in: list of lists of games
    out: stripped to just ^a-zA-Z
    '''
    print('Stripping games_list of non-[^a-zA-Z] characters')
    regex = re.compile('[^a-zA-Z]')
    games_list = [[regex.sub('',event) for event in game] for game in games_list]
    #print(games_list[:3][:5])
    return games_list
    
    
def make_vocabulary(games_list, verbose=False):
    '''
    in: list of lists of games
    out: 
        vocabulary, the number of distinct events
            and
        event_2_ind, a lookup dictionary for the index of each event
    '''
    if verbose:
        print('Constructing vocabular and event2id dictionary')
    
    #full events list to make vocabulary and ids
    events = flatten_games_to_events(games_list)
    #select distinct
    distinct_events = list(set(events))
    vocabulary = len(distinct_events)
    #make id dictionary
    event_2_id = {}
    for event in distinct_events:
        event_2_id[event] = distinct_events.index(event)
    id_2_event = dict(zip(event_2_id.values(), event_2_id.keys()))
    return vocabulary, event_2_id, id_2_event
   
def games_list_to_ids(games_list, event_2_id, verbose=False):
    '''
    in: games_list, list of lists of events in string format
        event_2_id, id dictionary constructed from full vocabulary
    out: 
        games_list, list of lists of events in id format
    '''
    if verbose:
        print('Encoding a list of games by event id')
    return [[event_2_id[event] for event in game] for game in games_list]

def id_list_to_event(id_list, id_2_event, verbose=False):
    '''
    in: id_list a list of events in id format
        id_2_event: lookup dictionary
    out: 
        event_list: a list of events in event format
    '''
    return [id_2_event[idd] for idd in id_list ]

def train_test_split_games_list(games_list, train_frac = .8, verbose=False):
    if verbose:
        print(f'Making train test split. train_frac = {train_frac}')
    split_ind = int(len(games_list)*train_frac)
    train = games_list[:split_ind]
    test = games_list[split_ind:]
    return train, test

def flatten_games_to_events(games_list): 
    return [event for game in games_list
                     for event in game]

def load_data(filename):
    '''
    in: filename, a .txt file whose lines are nhl games where events
    are represented by strings separated by spaces
    '''
    games_list = load_games_list('corpus.txt', 
                                 verbose=True)[:200]
    
    ###This is to make things fast for now
    games_list = strip_aZ(games_list)
    
    #building word to index dictionary and vocabulary
    vocabulary, event_2_id, id_2_event = make_vocabulary(games_list, 
                                                         verbose=True)
    
    #convert to ids
    games_list = games_list_to_ids(games_list, 
                                   event_2_id, 
                                   verbose=True)
    
    #train test split
    train_data, test_data = train_test_split_games_list(games_list, 
                                                        verbose=True)
    #flatten training (testing) data to list of events
    train_data = flatten_games_to_events(train_data)
    test_data = flatten_games_to_events(test_data)
    valid_data = None
    
    reversed_dictionary = None
    
    return (
            train_data, 
            valid_data, 
            test_data, 
            vocabulary, 
            reversed_dictionary, 
            event_2_id,
            id_2_event
           )

In [94]:
class KerasBatchGenerator(object):
    '''
    generates batches for Keras to train neural networks 
    should I grab the batches randomly?
    '''
    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        self.current_idx = 0
        self.skip_step = skip_step
        
    def generate(self):
        #input is just the number of steps in each in, and the batch size
        x = np.zeros((self.batch_size, 
                      self.num_steps))
        #output will be one-hots of dimension vocabulary
        y = np.zeros((self.batch_size, 
                      self.num_steps, 
                      self.vocabulary))
        while True:#never terminate
            for i in range(self.batch_size):
                #if I would run over the edge, reset idx
                if self.current_idx + self.num_steps >= len(self.data):
                    self.current_idx = 0
                x[i,:] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps+1]
                #make the one-hots for the y training data
                y[i,:,:] = to_categorical(temp_y, 
                                          num_classes=self.vocabulary)
                self.current_idx += self.skip_step
            yield x, y


In [185]:
def make_LSTM_RNN(vocabulary, hidden_size, num_steps, use_dropout=True):
    model = Sequential()
    model.add(layers.Embedding(vocabulary,
                               hidden_size,
                               input_length=num_steps))
    model.add(layers.LSTM(hidden_size, return_sequences=True))
    model.add(layers.LSTM(hidden_size, return_sequences=True))
    if use_dropout:
        model.add(layers.Dropout(0.5))
    model.add(layers.TimeDistributed(layers.Dense(vocabulary,
                                                  activation='softmax')))
    return model
    

#### Script

In [71]:
train_data, valid_data, test_data, vocabulary, reversed_dictionary, event_2_id, id_2_event = load_data('corpus.txt')

Reading corpus
Stripping games_list of non-[^a-zA-Z] characters
Constructing vocabular and event2id dictionary
Encoding a list of games by event id
Making train test split. train_frac = 0.8


In [235]:
import pickle 

In [None]:
def pickle_it(name, obj):
    with open('{}.pkl'.format(name), 'wb') as f:
        pickle.dump()

In [None]:
with open('event_2_id.pkl', 'wb') as f:
   pickle.dump(event_2_id, f, pickle.HIGHEST_PROTOCOL)

In [237]:
event_2_id

{'PeriodOfficialnocoords': 0,
 'Shotnocoords': 1,
 'PeriodEndnocoords': 2,
 'Giveawaynocoords': 3,
 'Hitnocoords': 4,
 'Penalty': 5,
 'ShootoutCompletenocoords': 6,
 'Goal': 7,
 'Faceoff': 8,
 'Penaltynocoords': 9,
 'GameOfficialnocoords': 10,
 'MissedShot': 11,
 'BlockedShotnocoords': 12,
 'Hit': 13,
 'BlockedShot': 14,
 'Takeawaynocoords': 15,
 'Takeaway': 16,
 'MissedShotnocoords': 17,
 'Stoppagenocoords': 18,
 'Goalnocoords': 19,
 'Shot': 20,
 'GameEndnocoords': 21,
 'GameSchedulednocoords': 22,
 'Faceoffnocoords': 23,
 'PeriodStartnocoords': 24,
 'Giveaway': 25,
 'PeriodReadynocoords': 26}

In [103]:
num_steps = 150
batch_size = 10
train_data_generator = KerasBatchGenerator(train_data, 
                                           num_steps, 
                                           batch_size,
                                           vocabulary,
                                           skip_step = num_steps)

test_data_generator = KerasBatchGenerator(test_data, 
                                          num_steps, 
                                          batch_size,
                                          vocabulary,
                                          skip_step = num_steps)

In [97]:
hidden_size = 20
model = make_LSTM_RNN(vocabulary, hidden_size, num_steps)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

In [98]:
#should write a definition, get callbacks, that returns a list of callbacks
checkpointer = keras.callbacks.ModelCheckpoint(filepath='model-{epoch:02d}.hdf5',
                               verbose=1)

In [224]:
num_epochs = 30
model.fit_generator(train_data_generator.generate(), 
                    len(train_data)//(batch_size*num_steps), 
                    num_epochs, 
                    validation_data=test_data_generator.generate(),
                    validation_steps=len(test_data)//(batch_size*num_steps),
                    callbacks = [checkpointer])

Epoch 1/30
Epoch 00001: saving model to model-01.hdf5
Epoch 2/30
Epoch 00002: saving model to model-02.hdf5
Epoch 3/30
Epoch 00003: saving model to model-03.hdf5
Epoch 4/30
Epoch 00004: saving model to model-04.hdf5
Epoch 5/30
Epoch 00005: saving model to model-05.hdf5
Epoch 6/30
Epoch 00006: saving model to model-06.hdf5
Epoch 7/30
Epoch 00007: saving model to model-07.hdf5
Epoch 8/30
Epoch 00008: saving model to model-08.hdf5
Epoch 9/30
Epoch 00009: saving model to model-09.hdf5
Epoch 10/30
Epoch 00010: saving model to model-10.hdf5
Epoch 11/30
Epoch 00011: saving model to model-11.hdf5
Epoch 12/30
Epoch 00012: saving model to model-12.hdf5
Epoch 13/30
Epoch 00013: saving model to model-13.hdf5
Epoch 14/30
Epoch 00014: saving model to model-14.hdf5
Epoch 15/30
Epoch 00015: saving model to model-15.hdf5
Epoch 16/30
Epoch 00016: saving model to model-16.hdf5
Epoch 17/30
Epoch 00017: saving model to model-17.hdf5
Epoch 18/30
Epoch 00018: saving model to model-18.hdf5
Epoch 19/30
Epoch 0

Epoch 28/30
Epoch 00028: saving model to model-28.hdf5
Epoch 29/30
Epoch 00029: saving model to model-29.hdf5
Epoch 30/30
Epoch 00030: saving model to model-30.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f4de25c0a90>

In [188]:
def make_prediction_model(trained_LSTM_RNN, vocabulary, hidden_size):
    model_predicting = make_LSTM_RNN(vocabulary, hidden_size, None)
    model_predicting.set_weights(trained_LSTM_RNN.get_weights())
    return model_predicting

In [189]:
model_predictining = make_prediction_model(model, vocabulary, hidden_size)

In [216]:
def next_probs(seed_list, model_predictining):
    '''
    seed_list is the game so far in event format
    '''
    model_predictining.reset_states()
    for seed in seed_list[:-1]:
        model_predictining.predict([seed,], verbose=0)
    probs_vector = model_predictining.predict([seed_list[-1],],
                                          verbose=1)[0][0]
    probs = {}
    for i, prob in enumerate(probs_vector):
        probs[id_2_event[i]]=prob
    return probs
    

In [219]:
test_seed_list_id = [test_data[5]]

In [206]:
test_seed_list = id_list_to_event(test_seed_list_id,id_2_event)
print(test_seed_list)

['GameSchedulednocoords']


In [220]:
nps = next_probs(test_data[:10], model_predictining)



In [221]:
pprint(nps)

{'BlockedShot': 0.0062115295,
 'BlockedShotnocoords': 0.00041467499,
 'Faceoff': 0.856352,
 'Faceoffnocoords': 0.03678408,
 'GameEndnocoords': 7.033351e-05,
 'GameOfficialnocoords': 0.0004248029,
 'GameSchedulednocoords': 0.00010391814,
 'Giveaway': 0.0023818028,
 'Giveawaynocoords': 0.00043035016,
 'Goal': 0.003245439,
 'Goalnocoords': 0.00042935443,
 'Hit': 0.004769465,
 'Hitnocoords': 0.00051336444,
 'MissedShot': 0.005453769,
 'MissedShotnocoords': 0.00058681925,
 'Penalty': 0.02522075,
 'Penaltynocoords': 0.0081690885,
 'PeriodEndnocoords': 0.0018994793,
 'PeriodOfficialnocoords': 3.5807486e-06,
 'PeriodReadynocoords': 9.251992e-05,
 'PeriodStartnocoords': 6.187619e-05,
 'ShootoutCompletenocoords': 0.0010916325,
 'Shot': 0.018091103,
 'Shotnocoords': 0.00089533883,
 'Stoppagenocoords': 0.023498546,
 'Takeaway': 0.0020847202,
 'Takeawaynocoords': 0.0007197218}


In [223]:
#pred_index = np.random.choice(vocabulary, p=nps[0][0])
id_2_event[test_data[9]]

'Stoppagenocoords'

In [175]:
id_2_event

{0: 'PeriodOfficialnocoords',
 1: 'Shotnocoords',
 2: 'PeriodEndnocoords',
 3: 'Giveawaynocoords',
 4: 'Hitnocoords',
 5: 'Penalty',
 6: 'ShootoutCompletenocoords',
 7: 'Goal',
 8: 'Faceoff',
 9: 'Penaltynocoords',
 10: 'GameOfficialnocoords',
 11: 'MissedShot',
 12: 'BlockedShotnocoords',
 13: 'Hit',
 14: 'BlockedShot',
 15: 'Takeawaynocoords',
 16: 'Takeaway',
 17: 'MissedShotnocoords',
 18: 'Stoppagenocoords',
 19: 'Goalnocoords',
 20: 'Shot',
 21: 'GameEndnocoords',
 22: 'GameSchedulednocoords',
 23: 'Faceoffnocoords',
 24: 'PeriodStartnocoords',
 25: 'Giveaway',
 26: 'PeriodReadynocoords'}

In [233]:
from keras.utils import plot_model
plot_model(model_predictining, to_file='model.png')

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [229]:
!conda install pydot

Collecting package metadata: done
Solving environment: done


  current version: 4.6.14
  latest version: 4.7.8

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - pydot


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |             main           3 KB  defaults
    cairo-1.16.0               |    h18b612c_1001         1.5 MB  conda-forge
    certifi-2019.6.16          |           py37_1         149 KB  conda-forge
    expat-2.2.5                |    he1b5a44_1003         191 KB  conda-forge
    fribidi-1.0.5              |    h516909a_1002         112 KB  conda-forge
    glib-2.58.3                |    h6f030ca_1002         3.3 MB  conda-forge
    graphviz-2.40.1            |       h5933667_1         6.4 MB  conda-forge
    harfbuzz-2.4.0             |     