#### Imports 

In [52]:
import tensorflow as tf
import numpy as np
import sys
import re
from tensorflow.keras.utils import to_categorical

#### Definitions

In [63]:
def load_games_list(filename='corpus.txt', verbose = False):
    '''
    Read the corpus file and tokenise every line into its events.

    in: filename, path of the corpus; one game per line, events 
            separated by whitespace
        verbose, print a progress message when True
    out: list of list of events in each game
    '''
    if verbose:
        print('Reading corpus')
    # the context manager closes the file handle even if reading fails
    with open(filename, 'r') as corpus:
        return [game.split() for game in corpus]
    
def strip_aZ(games_list):
    '''
    Remove every character outside a-z / A-Z from each event string.

    in: list of lists of games (each game is a list of event strings)
    out: new list of lists with the same structure, each event 
         reduced to its ASCII letters
    '''
    print('Stripping games_list of non-[^a-zA-Z] characters')
    non_letters = re.compile('[^a-zA-Z]')
    stripped_games = []
    for game in games_list:
        # an event with no letters at all becomes the empty string
        stripped_games.append([non_letters.sub('', event) for event in game])
    return stripped_games
    
    
def make_vocabulary(games_list, verbose=False):
    '''
    Build the event vocabulary and the lookup tables between events 
    and integer ids.

    in: list of lists of games
        verbose, print a progress message when True
    out: 
        vocabulary, the number of distinct events
            and
        event_2_id, a lookup dictionary for the index of each event
            and
        id_2_event, the inverse lookup, from index back to event
    '''
    if verbose:
        print('Constructing vocabular and event2id dictionary')
    
    #select distinct events, keeping the order of first appearance so
    #that the ids are the same on every run (iteration order of a set
    #of strings changes between interpreter sessions)
    distinct_events = list(dict.fromkeys(event 
                                         for game in games_list 
                                         for event in game))
    vocabulary = len(distinct_events)
    #make id dictionaries in a single pass instead of calling 
    #list.index for every event
    event_2_id = {}
    id_2_event = {}
    for idx, event in enumerate(distinct_events):
        event_2_id[event] = idx
        id_2_event[idx] = event
    return vocabulary, event_2_id, id_2_event
   
def games_list_to_ids(games_list, event_2_id, verbose=False):
    '''
    Replace every event string by its integer id.

    in: games_list, list of lists of events in string format
        event_2_id, dictionary mapping each event to its id
        verbose, print a progress message when True
    out: 
        list of lists of events in id format; an event missing from
        event_2_id raises KeyError
    '''
    if verbose:
        print('Encoding a list of games by event id')
    encoded_games = []
    for game in games_list:
        encoded_games.append([event_2_id[event] for event in game])
    return encoded_games

def train_test_split_games_list(games_list, train_frac = .8, verbose=False):
    '''
    Split the games, in order and without shuffling, into a leading 
    train part and a trailing test part.

    in: games_list, list of games
        train_frac, fraction of the games that go to the train part
        verbose, print a progress message when True
    out: (train, test), two slices of games_list
    '''
    if verbose:
        print(f'Making train test split. train_frac = {train_frac}')
    #int() truncates, so the train part is rounded down
    cut = int(len(games_list)*train_frac)
    return games_list[:cut], games_list[cut:]

def flatten_games_to_events(games_list): 
    '''
    Concatenate the events of all games into one flat list.

    in: list of lists of games
    out: single list holding every event, in game order
    '''
    events = []
    for game in games_list:
        events.extend(game)
    return events

def load_data(filename, max_games=200):
    '''
    Load a corpus and turn it into flat train/test sequences of 
    event ids.

    in: filename, a .txt file whose lines are nhl games where events
            are represented by strings separated by spaces
        max_games, only the first max_games games of the corpus are 
            used (default 200, to make things fast for now); pass 
            None to use every game
    out: tuple of
        train_data, flat list of event ids of the training games
        valid_data, always None (no validation split is made)
        test_data, flat list of event ids of the test games
        vocabulary, the number of distinct events
        reversed_dictionary, always None
        event_2_id, lookup dictionary from event to id
        id_2_event, lookup dictionary from id to event
    '''
    #read the corpus named by the caller; the truncation is there to 
    #make things fast for now
    games_list = load_games_list(filename, 
                                 verbose=True)[:max_games]
    
    #keep only the letters of each event
    games_list = strip_aZ(games_list)
    
    #building word to index dictionary and vocabulary
    vocabulary, event_2_id, id_2_event = make_vocabulary(games_list, 
                                                         verbose=True)
    
    #convert to ids
    games_list = games_list_to_ids(games_list, 
                                   event_2_id, 
                                   verbose=True)
    
    #train test split
    train_data, test_data = train_test_split_games_list(games_list, 
                                                        verbose=True)
    #game boundaries are dropped: each split becomes one long sequence
    train_data = flatten_games_to_events(train_data)
    test_data = flatten_games_to_events(test_data)
    valid_data = None
    
    reversed_dictionary = None
    
    return (
            train_data, 
            valid_data, 
            test_data, 
            vocabulary, 
            reversed_dictionary, 
            event_2_id,
            id_2_event
           )

In [67]:
class KerasBatchGenerator(object):
    '''
    generates batches for Keras to train neural networks 

    Windows of num_steps consecutive ids are read from data; the 
    target of each window is the same window shifted one position to 
    the right, one-hot encoded. Windows are taken sequentially, 
    skip_step positions apart.
    TODO: should I grab the batches randomly?
    '''
    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        '''
        in: data, flat sequence of integer event ids
            num_steps, length of each input window
            batch_size, number of windows per batch
            vocabulary, number of distinct ids (width of the one-hots)
            skip_step, distance between the starts of two consecutive 
                windows
        '''
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        #start of the next window; wraps to 0 at the end of data
        self.current_idx = 0
        self.skip_step = skip_step
        
    def generate(self):
        '''
        Endless generator of (x, y) batches.

        out: x, array of shape (batch_size, num_steps) holding ids
             y, array of shape (batch_size, num_steps, vocabulary) 
                holding the one-hot encoded next id of every position
        raises: ValueError if data is too short to hold one window 
                plus its shifted target
        '''
        #a window needs num_steps inputs and one extra element for the 
        #last target; fail with a clear message instead of a 
        #broadcasting error
        if len(self.data) <= self.num_steps:
            raise ValueError(
                'data must contain more than num_steps elements: '
                f'got {len(self.data)} elements for num_steps = {self.num_steps}')
        while True:#never terminate
            #fresh arrays for every batch, so a batch that the consumer 
            #still holds is never overwritten by the next one
            #input is just the number of steps in each in, and the batch size
            x = np.zeros((self.batch_size, 
                          self.num_steps))
            #output will be one-hots of dimension vocabulary
            y = np.zeros((self.batch_size, 
                          self.num_steps, 
                          self.vocabulary))
            positions = np.arange(self.num_steps)
            for i in range(self.batch_size):
                #if I would run over the edge, reset idx
                if self.current_idx + self.num_steps >= len(self.data):
                    self.current_idx = 0
                start = self.current_idx
                stop = start + self.num_steps
                x[i,:] = self.data[start:stop]
                temp_y = np.asarray(self.data[start + 1:stop + 1], 
                                    dtype='int64')
                #make the one-hots for the y training data: y is already 
                #zero, so only the position of each target id is set
                y[i, positions, temp_y] = 1
                self.current_idx += self.skip_step
            yield x, y


In [64]:
# Load the corpus and unpack the id sequences plus the lookup tables
# returned by load_data.
(
    train_data,
    valid_data,
    test_data,
    vocabulary,
    reversed_dictionary,
    event_2_id,
    id_2_event,
) = load_data('corpus.txt')

Reading corpus
Stripping games_list of non-[^a-zA-Z] characters
Constructing vocabular and event2id dictionary
Encoding a list of games by event id
Making train test split. train_frac = 0.8
