In [1]:
from modules import squad
import numpy as np
import re
import pandas as pd
import tensorflow as tf
from modules.embeddings import load_word_embeddings
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from tqdm import tqdm
from gensim.models import KeyedVectors
import gensim
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Input, LSTM, Dropout, Dense, Activation, TimeDistributed
from tensorflow.keras import Model

In [2]:
processor = squad.SquadProcessor()

In [3]:
processor.read_json('./data/squad_train.json')

In [4]:
data_dict = processor.to_dict()

In [5]:
#word_embeddings = load_word_embeddings('./data/glove.6B.50d.txt')

In [6]:
df = pd.DataFrame(data_dict)

In [7]:
df

Unnamed: 0,context,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s
...,...,...,...
86816,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon
86817,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon
86818,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk
86819,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975


In [8]:
def tokenize_words(sequences):
    return [wordpunct_tokenize(seq) for seq in sequences]

def lowercase_sequences(sequences):
    if isinstance(sequences, pd.Series):
        return sequences.str.lower()
    return [seq.lower() for seq in sequences]

In [9]:
def build_vocabulary(sequences, verbose =  True):
    vocab = {}
    for seq in tqdm(sequences, disable = (not verbose)):
        for word in seq:
            try:
                vocab[word] += 1
            except:
                vocab[word] = 1
    return vocab

In [10]:
def convert_to_word2vec(filename, vocab, dimension):
    vocab_size = len(vocab)
    
    with gensim.utils.open(filename, 'wb') as f:
        dims_utf8 = gensim.utils.to_utf8("{} {}\n".format(vocab_size, dimension))
        f.write(dims_utf8)
        
        for word, row in tqdm(vocab.items()):
            row = row.astype(np.float32)
            f.write(gensim.utils.to_utf8(word) + b" " + row.tostring())

In [11]:
#convert_to_word2vec('./data/glove.6B.50d.bin', word_embeddings, word_embeddings['the'].shape[0])

In [12]:
import operator 

def oov_words(vocabulary, word_embeddings):
    in_vocab = {}
    oov = {}
    in_vocab_count = 0
    out_vocab_count = 0
    for word in tqdm(vocabulary):
        try:
            in_vocab[word] = word_embeddings[word]
            in_vocab_count += vocabulary[word]
        except Exception as e:
            oov[word] = vocabulary[word]
            out_vocab_count += vocabulary[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(in_vocab) / len(vocabulary)))
    print('Found embeddings for  {:.2%} of all text'.format(in_vocab_count / (in_vocab_count + out_vocab_count)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [13]:
def replace_tokens(sequences, to_be_replaced):
    new_seqs = sequences
    for token, new_token in to_be_replaced.items():
        new_seqs = new_seqs.str.replace(token, new_token)
    
    return new_seqs

def delete_tokens(sequences, to_be_deleted):
    to_be_deleted_str = ''.join(to_be_deleted)
    
    f = lambda text: re.sub(r'[{}]'.format(to_be_deleted_str), '', text)
    
    return sequences.apply(f)

In [14]:
def handle_num_and_punc(sequences):
    def split_num_and_dot(seq):
        return re.sub('[0-9]{1,}\.(?![0-9])', lambda m: m.group(0)[:-1] + " " + ".", seq)
    
    def split_num_and_dash(seq):
        return re.sub('[0-9]{4}–[0-9]{2}', lambda m: m.group(0)[:4] + " and " + m.group(0)[:2] + m.group(0)[5:], seq)
    
    def split_num_and_currency(seq):
        curr_to_str = {
            '€': 'euro',
            '£': 'pound',
            '$': 'dollar'
        }
        res = re.sub('[€£$]{1}[0-9]+([\.,]{1}([0-9]{0,})?)?', lambda m: m.group(0)[1:] + " " + curr_to_str[m.group(0)[0]], seq)
        if 'us$' in seq:
            print(seq)
        return res
    
    return sequences.apply(split_num_and_dot).apply(split_num_and_dash).apply(split_num_and_currency)

In [15]:
def split_quoted_text(sequences):
    f = lambda text: re.sub(r' (["\'])(?:(?=(\\?))\2.)*?\1 ', lambda m: m.group(0)[0] + " " + m.group(0)[1:-1] + " " + m.group(0)[-1], text, re.DOTALL)
    
    return sequences.apply(f)

In [16]:
def split_punctuations(sequences):
    f = lambda text: re.sub(r'[\(\)\.\"\'\[\]\,%;:?!-/]{2,}', lambda m: ' '.join(list(m.group(0))), text, flags=re.DOTALL)
    
    return sequences.apply(f)

In [17]:
word_embeddings = KeyedVectors.load_word2vec_format('glove-50d.bin', binary=True)

In [24]:
word_embeddings

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7fab2517eb20>

In [25]:
word_embeddings.most_similar('ayse')

[('bakhtiyar', 0.7318562269210815),
 ('dabbashi', 0.7161852121353149),
 ('soad', 0.7160789370536804),
 ('drolma', 0.7156862616539001),
 ('akhmedov', 0.705880880355835),
 ('farai', 0.7019412517547607),
 ('bucko', 0.7016493678092957),
 ('hamash', 0.6900523900985718),
 ('ozturk', 0.6877020597457886),
 ('muthiya', 0.6861430406570435)]

In [18]:
def preprocess_text(sequences):
    sequences = lowercase_sequences(sequences)
    
    to_be_changed = {
        '°c': 'celsius',
        '°f': 'fahrenheit',
        "qu'ran": 'quran',
        '−': '-',
        'asphalt/bitumen': 'asphalt bitumen',
        'us$': 'us $',
        '°': ' degree ',
        '”': '"',
        '—': '-',
        '⁄': '/',
        '′': "'",
        '″': '"',
        '×': 'times',
        '–': '-'
    }

    to_be_deleted = ['§', '्', '\ufeff']

    sequences = replace_tokens(sequences, to_be_changed)

    sequences = delete_tokens(sequences, to_be_deleted)

    sequences = handle_num_and_punc(sequences)

    sequences = split_punctuations(sequences)
    
    return sequences

In [19]:
train_sequences = preprocess_text(df.context)
question_sequences = preprocess_text(df.question)

in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design of a "plastic music box" in 1981, which he called the ixi. he was unable to secure funding to renew the us$120,000 worldwide patent, so it lapsed and kramer never profited from his idea.
in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design of a "plastic music box" in 1981, which he called the ixi. he was unable to secure funding to renew the us$120,000 worldwide patent, so it lapsed and kramer never profited from his idea.
in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design

indira gandhi international airport, situated to the southwest of delhi, is the main gateway for the city's domestic and international civilian air traffic. in 2012-13, the airport was used by more than 35 million passengers, making it one of the busiest airports in south asia. terminal 3, which cost ₹96.8 billion (us$1.4 billion) to construct between 2007 and 2010, handles an additional 37 million passengers annually.
indira gandhi international airport, situated to the southwest of delhi, is the main gateway for the city's domestic and international civilian air traffic. in 2012-13, the airport was used by more than 35 million passengers, making it one of the busiest airports in south asia. terminal 3, which cost ₹96.8 billion (us$1.4 billion) to construct between 2007 and 2010, handles an additional 37 million passengers annually.
indira gandhi international airport, situated to the southwest of delhi, is the main gateway for the city's domestic and international civilian air traffi

the greater mexico city has a gross domestic product (gdp) of us$411 billion in 2011, making mexico city urban agglomeration one of the economically largest metropolitan areas in the world. the city was responsible for generating 15.8% of mexico's gross domestic product and the metropolitan area accounted for about 22% of total national gdp. as a stand-alone country, in 2013, mexico city would be the fifth-largest economy in latin america-five times as large as costa rica's and about the same size as peru's.
the greater mexico city has a gross domestic product (gdp) of us$411 billion in 2011, making mexico city urban agglomeration one of the economically largest metropolitan areas in the world. the city was responsible for generating 15.8% of mexico's gross domestic product and the metropolitan area accounted for about 22% of total national gdp. as a stand-alone country, in 2013, mexico city would be the fifth-largest economy in latin america-five times as large as costa rica's and abo

on 21 december 2011 the bank instituted a programme of making low-interest loans with a term of three years (36 months) and 1% interest to european banks accepting loans from the portfolio of the banks as collateral. loans totalling €489.2 bn (us$640 bn) were announced. the loans were not offered to european states, but government securities issued by european states would be acceptable collateral as would mortgage-backed securities and other commercial paper that can be demonstrated to be secure. the programme was announced on 8 december 2011 but observers were surprised by the volume of the loans made when it was implemented. under its ltro it loaned €489bn to 523 banks for an exceptionally long period of three years at a rate of just one percent. the by far biggest amount of €325bn was tapped by banks in greece, ireland, italy and spain. this way the ecb tried to make sure that banks have enough cash to pay off €200bn of their own maturing debts in the first three months of 2012, an

the united states provides egypt with annual military assistance, which in 2015 amounted to us$1.3 billion. in 1989, egypt was designated as a major non-nato ally of the united states. nevertheless, ties between the two countries have partially soured since the july 2013 military coup that deposed islamist president mohamed morsi, with the obama administration condemning egypt's violent crackdown on the muslim brotherhood and its supporters, and cancelling future military exercises involving the two countries. there have been recent attempts, however, to normalise relations between the two, with both governments frequently calling for mutual support in the fight against regional and international terrorism.
the united states provides egypt with annual military assistance, which in 2015 amounted to us$1.3 billion. in 1989, egypt was designated as a major non-nato ally of the united states. nevertheless, ties between the two countries have partially soured since the july 2013 military co

with interpublic group, what company has a combined annual revenue of roughly us$21 billion?
how many indymac account holders held funds in excess of the fdic's insured amount of us$100,000?


In [28]:
train_sequences[0]

'beyoncé giselle knowles-carter ( /biːˈjɒnseɪ/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best-selling girl groups of all time. their hiatus saw the release of beyoncé\'s debut album, dangerously in love (2003) , which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles "crazy in love" and "baby boy" .'

In [29]:
"us$1,000,000".replace('us$', 'us $')

'us $1,000,000'

In [20]:
train_tokenized = tokenize_words(train_sequences)
question_tokenized = tokenize_words(question_sequences)

In [31]:
train_tokenized[0]

['beyoncé',
 'giselle',
 'knowles',
 '-',
 'carter',
 '(',
 '/',
 'biːˈjɒnseɪ',
 '/',
 'bee',
 '-',
 'yon',
 '-',
 'say',
 ')',
 '(',
 'born',
 'september',
 '4',
 ',',
 '1981',
 ')',
 'is',
 'an',
 'american',
 'singer',
 ',',
 'songwriter',
 ',',
 'record',
 'producer',
 'and',
 'actress',
 '.',
 'born',
 'and',
 'raised',
 'in',
 'houston',
 ',',
 'texas',
 ',',
 'she',
 'performed',
 'in',
 'various',
 'singing',
 'and',
 'dancing',
 'competitions',
 'as',
 'a',
 'child',
 ',',
 'and',
 'rose',
 'to',
 'fame',
 'in',
 'the',
 'late',
 '1990s',
 'as',
 'lead',
 'singer',
 'of',
 'r',
 '&',
 'b',
 'girl',
 '-',
 'group',
 'destiny',
 "'",
 's',
 'child',
 '.',
 'managed',
 'by',
 'her',
 'father',
 ',',
 'mathew',
 'knowles',
 ',',
 'the',
 'group',
 'became',
 'one',
 'of',
 'the',
 'world',
 "'",
 's',
 'best',
 '-',
 'selling',
 'girl',
 'groups',
 'of',
 'all',
 'time',
 '.',
 'their',
 'hiatus',
 'saw',
 'the',
 'release',
 'of',
 'beyoncé',
 "'",
 's',
 'debut',
 'album',
 ',',

In [21]:
vocab = build_vocabulary(train_tokenized + question_tokenized)
vocab

100%|██████████| 173642/173642 [00:01<00:00, 101791.02it/s]


{'beyoncé': 2692,
 'giselle': 28,
 'knowles': 120,
 '-': 138729,
 'carter': 154,
 '(': 92925,
 '/': 9760,
 'biːˈjɒnseɪ': 15,
 'bee': 77,
 'yon': 15,
 'say': 1020,
 ')': 93008,
 'born': 1784,
 'september': 3423,
 '4': 5673,
 ',': 687140,
 '1981': 485,
 'is': 108198,
 'an': 40107,
 'american': 8673,
 'singer': 583,
 'songwriter': 133,
 'record': 2957,
 'producer': 698,
 'and': 341127,
 'actress': 287,
 '.': 496038,
 'raised': 1034,
 'in': 318244,
 'houston': 1406,
 'texas': 816,
 'she': 5463,
 'performed': 1319,
 'various': 3880,
 'singing': 416,
 'dancing': 147,
 'competitions': 422,
 'as': 105657,
 'a': 203391,
 'child': 1905,
 'rose': 787,
 'to': 247341,
 'fame': 418,
 'the': 879730,
 'late': 4926,
 '1990s': 1164,
 'lead': 1593,
 'of': 443618,
 'r': 1733,
 '&': 2566,
 'b': 2134,
 'girl': 337,
 'group': 6290,
 'destiny': 378,
 "'": 92540,
 's': 82670,
 'managed': 771,
 'by': 85091,
 'her': 10146,
 'father': 2015,
 'mathew': 69,
 'became': 9445,
 'one': 23930,
 'world': 15042,
 'best': 

In [22]:
out_of_vocab = oov_words(vocab, word_embeddings)
out_of_vocab

100%|██████████| 82915/82915 [00:00<00:00, 493527.71it/s]

Found embeddings for 80.70% of vocab
Found embeddings for  99.33% of all text





[('⟨', 269),
 ('midna', 215),
 ('́', 185),
 ('⟩', 155),
 ('utf', 137),
 ('km²', 134),
 ('ganondorf', 115),
 ('deréon', 112),
 ('latewood', 109),
 ('poundm', 102),
 ('yeezy', 100),
 ('aonuma', 92),
 ('kbit', 89),
 ('⟩,', 81),
 ('̇', 75),
 ('َ', 73),
 ('deshin', 71),
 ('♠', 69),
 ('±', 69),
 ('[›]', 67),
 ('shekpa', 65),
 ('maycomb', 64),
 ('ِ', 63),
 ('₹', 60),
 ('gaddafist', 59),
 ('dollarbn', 57),
 ('əs', 56),
 ('woyciechowski', 56),
 ('earlywood', 55),
 ('grzymała', 55),
 ('m²', 54),
 ('ghmc', 53),
 ('̍', 53),
 ('isil', 53),
 ('apizza', 52),
 ('(~', 52),
 ('nohant', 52),
 ('wodzińska', 52),
 ('asphalte', 50),
 ('arsphenamine', 50),
 ('11172', 50),
 ('rossabi', 50),
 ('digivolution', 49),
 ('clitellates', 48),
 ('→', 47),
 ('4512', 46),
 ('nazianzen', 45),
 ('westquay', 45),
 ('hinx', 45),
 ('prehension', 44),
 ('feedpoint', 44),
 ('vinaccia', 43),
 ('żywny', 43),
 ('discovision', 42),
 ('kuhf', 42),
 ('fradiani', 41),
 ('medreses', 40),
 ('vaiśeṣika', 40),
 ('electroforming', 40),
 (

In [38]:
word_tokenizer = Tokenizer(oov_token = '<unk>')
word_tokenizer.fit_on_texts(train_tokenized + question_tokenized)

In [40]:
word_tokenizer.word_index['<unk>']

1

In [24]:
train_input_data = word_tokenizer.texts_to_sequences(train_tokenized)
train_output_data = word_tokenizer.texts_to_sequences(question_tokenized)

In [36]:
def get_max_length(sequences):
    return max([len(seq) for seq in sequences])

In [37]:
max_input_length, max_output_length = get_max_length(train_input_data), get_max_length(train_output_data)

max_input_length, max_output_length

(815, 60)

In [38]:
padded_input_sequences = pad_sequences(train_input_data, maxlen=max_input_length, padding='post')
padded_output_sequences = pad_sequences(train_output_data, maxlen=max_output_length, padding='post')

In [39]:
padded_input_sequences.shape, padded_output_sequences.shape

((86821, 815), (86821, 60))

In [40]:
def pretrained_embedding_layer(word_embeddings, word_to_index, name):
    embedding_size = word_embeddings['the'].shape[0]
    vocab_len = len(word_to_index) + 1
    
    embedding_matrix = np.zeros((vocab_len, embedding_size))
    
    
    for word, idx in word_to_index.items():
        if word in word_embeddings:
            embedding_matrix[idx, :] = word_embeddings[word]
            
    embedding_matrix[0, :] = embedding_matrix.mean(axis = 0)
            
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_size, trainable = False, name = name + '_embedding')

    embedding_layer.build((None,))
    
    embedding_layer.set_weights([embedding_matrix])
    
    return embedding_layer

In [37]:
len(word_tokenizer.word_index)

82915

In [41]:
class Encoder():
    def __call__(self, input_shape, hidden_units, word_embeddings, word_to_index):
        # Define input layer
        raw_input = Input(shape=input_shape, name = 'encoder_input')
        # Define embedding layer and load word embeddings
        embedding_layer = pretrained_embedding_layer(word_embeddings, word_to_index, name = 'encoder')
        # Give inputs to embedding layer
        encoder_inputs = embedding_layer(raw_input)
        # Define LSTM layer
        encoder = LSTM(units=hidden_units, return_state=True, name='encoder_lstm')
        # Run LSTM on word embeddings
        _, state_h, state_c = encoder(encoder_inputs)
        # Return states of LSTM
        return raw_input, [state_h, state_c]

In [42]:
class Decoder():
    def __call__(self, input_shape, hidden_units, word_embeddings, word_to_index, encoder_states):
        # Define input layer
        raw_input = Input(shape=input_shape, name = 'decoder_input')
        # Define embedding layer and load word embeddings
        embedding_layer = pretrained_embedding_layer(word_embeddings, word_to_index, name = 'decoder')
        # Give inputs to embedding layer
        decoder_inputs = embedding_layer(raw_input)
        # Define LSTM layer
        decoder = LSTM(hidden_units, return_sequences=True, return_state=True, name = 'decoder_lstm')
        # Run LSTM on decoder inputs
        lstm_outputs, state_dec_h, state_dec_c = decoder(decoder_inputs, initial_state=encoder_states)
        # Run Dense layer with softmax
        decoder_outputs = TimeDistributed(Dense(len(word_to_index) + 1), name = 'output')(lstm_outputs)
        
        return raw_input, decoder_outputs


In [43]:
def QuestionModel(max_input_length, max_output_length, word_embeddings, word_to_index):
    
    encoder = Encoder()
    decoder = Decoder()
    
    encoder_input, encoder_states = encoder((max_input_length,), 16, word_embeddings, word_to_index)
    
    decoder_input, decoder_outputs= decoder((max_output_length,), 16, word_embeddings, word_to_index, encoder_states)
    
    model = Model(inputs = [encoder_input, decoder_input], outputs=decoder_outputs)
            
    return model, encoder_input, decoder_input

In [44]:
QModel, encoder_input, decoder_input = QuestionModel(max_input_length, max_output_length, word_embeddings, word_tokenizer.word_index)

In [45]:
QModel.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 815)]        0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, 60)]         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 815, 50)      4145800     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 60, 50)       4145800     decoder_input[0][0]              
_______________________________________________________________________________________

In [46]:
if os.path.exists('checkpoints/q_model.h5'):
    print('Weights loaded!')
    QModel.load_weights('checkpoints/q_model.h5')

Weights loaded!


In [47]:
batch_size = 16

In [48]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_input_sequences, padded_output_sequences))
train_dataset = train_dataset.shuffle(buffer_size = 512).batch(batch_size)

In [49]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

In [50]:
class Inferer:
    def __call__(self, q_model):
        hidden_units = q_model.layers[4].units
        
        encoder_inputs = q_model.input[0]
        
        _, state_h_enc, state_c_enc = q_model.layers[4].output
        
        print(state_h_enc.shape)
        
        encoder_states = [state_h_enc, state_c_enc]
        
        encoder_model = Model(encoder_inputs, encoder_states)
        
        decoder_inputs = q_model.input[1]
        
        decoder_embedding = q_model.layers[3]
        
        decoder_lstm_inputs = decoder_embedding(decoder_inputs)
        
        decoder_state_input_h = Input(shape=(hidden_units,), name="input_3")
        
        decoder_state_input_c = Input(shape=(hidden_units,), name="input_4")
        
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        
        decoder_lstm = q_model.layers[5]
        
        decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
            decoder_lstm_inputs, initial_state=decoder_states_inputs
        )
        
        decoder_states = [state_h_dec, state_c_dec]
        
        decoder_dense = q_model.layers[-1]
        
        decoder_outputs = decoder_dense(decoder_outputs)
        
        decoder_model = Model(
            [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
        )
        
        return encoder_model, decoder_model


In [152]:
for step, (batch_x, batch_y) in enumerate(train_dataset):
    target_y = tf.one_hot(batch_y, depth = len(word_tokenizer.word_index) + 1)
    with tf.GradientTape() as tape:
            logits = QModel([batch_x, batch_y], training=True)  # Logits for this minibatch
            # Compute the loss value for this minibatch.
            loss_value = loss_fn(target_y, logits)

    grads = tape.gradient(loss_value, QModel.trainable_weights)
    optimizer.apply_gradients(zip(grads, QModel.trainable_weights))
    QModel.save_weights('checkpoints/q_model.h5')
    
    if step % 10 == 0:
        print(
            "Training loss (for one batch) at step %d: %.8f"
            % (step, float(loss_value))
        )
        print("Seen so far: %s samples" % ((step + 1) * batch_size))
    
    if step == 10:
        break

Training loss (for one batch) at step 0: 1.50543523
Seen so far: 16 samples
Training loss (for one batch) at step 10: 1.61434507
Seen so far: 176 samples


In [51]:
inferer = Inferer()

In [52]:
encoder_inferer, decoder_inferer = inferer(QModel)

(None, 16)


In [53]:
decoder_inferer.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      [(None, 60)]         0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 60, 50)       4145800     decoder_input[0][0]              
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 16)]         0                                            
_______________________________________________________________________________________

In [54]:
QModel.layers[5].input

[<tf.Tensor 'decoder_embedding/embedding_lookup/Identity_1:0' shape=(None, 60, 50) dtype=float32>,
 <tf.Tensor 'encoder_lstm/PartitionedCall:2' shape=(None, 16) dtype=float32>,
 <tf.Tensor 'encoder_lstm/PartitionedCall:3' shape=(None, 16) dtype=float32>]

In [61]:
def generate_sequence(encoder_inferer, decoder_inferer, word_tokenizer, output_length, input_sequence):
    # Encode the input as state vectors.
    states_value = encoder_inferer.predict(input_sequence)
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, output_length))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = word_tokenizer.word_index['what']
    
    states_value = [state for state in states_value]
                
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = 'what '
    
    word_count = 1
    while not stop_condition:
        output_tokens, h, c = decoder_inferer.predict([target_seq] + states_value)
        print(output_tokens.shape)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index == 0:
            sampled_char = '<unk> '
        else:
            sampled_char = word_tokenizer.index_word[sampled_token_index]
            
        decoded_sentence += sampled_char + ' '

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence.split(' ')) > output_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq[0, word_count] = sampled_token_index
        
        word_count += 1

        # Update states
        states_value = [h, c]
        
    return decoded_sentence, target_seq


In [62]:
generated_q, target_seq = generate_sequence(encoder_inferer, decoder_inferer, word_tokenizer, max_output_length, padded_input_sequences[13:14])

(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)
(1, 60, 82916)


In [59]:
target_seq

array([[25.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [60]:
generated_q

'what <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  <unk>  '