In [32]:
from modules import squad
import numpy as np
import re
import pandas as pd
import tensorflow as tf
from modules.embeddings import load_word_embeddings
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from tqdm import tqdm
from gensim.models import KeyedVectors
import gensim
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Input, LSTM, Dropout, Dense, Activation, TimeDistributed
from tensorflow.keras import Model

In [2]:
processor = squad.SquadProcessor()

In [3]:
processor.read_json('./data/squad_train.json')

In [69]:
data_dict = processor.to_dict()

In [5]:
#word_embeddings = load_word_embeddings('./data/glove.6B.50d.txt')

In [6]:
df = pd.DataFrame(data_dict)

In [7]:
df

Unnamed: 0,context,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s
...,...,...,...
86816,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon
86817,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon
86818,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk
86819,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975


In [8]:
def tokenize_words(sequences):
    return [wordpunct_tokenize(seq) for seq in sequences]

def lowercase_sequences(sequences):
    if isinstance(sequences, pd.Series):
        return sequences.str.lower()
    return [seq.lower() for seq in sequences]

In [9]:
def build_vocabulary(sequences, verbose =  True):
    vocab = {}
    for seq in tqdm(sequences, disable = (not verbose)):
        for word in seq:
            try:
                vocab[word] += 1
            except:
                vocab[word] = 1
    return vocab

In [10]:
def convert_to_word2vec(filename, vocab, dimension):
    vocab_size = len(vocab)
    
    with gensim.utils.open(filename, 'wb') as f:
        dims_utf8 = gensim.utils.to_utf8("{} {}\n".format(vocab_size, dimension))
        f.write(dims_utf8)
        
        for word, row in tqdm(vocab.items()):
            row = row.astype(np.float32)
            f.write(gensim.utils.to_utf8(word) + b" " + row.tostring())

In [11]:
#convert_to_word2vec('./data/glove.6B.50d.bin', word_embeddings, word_embeddings['the'].shape[0])

In [12]:
import operator 

def oov_words(vocabulary, word_embeddings):
    in_vocab = {}
    oov = {}
    in_vocab_count = 0
    out_vocab_count = 0
    for word in tqdm(vocabulary):
        try:
            in_vocab[word] = word_embeddings[word]
            in_vocab_count += vocabulary[word]
        except Exception as e:
            oov[word] = vocabulary[word]
            out_vocab_count += vocabulary[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(in_vocab) / len(vocabulary)))
    print('Found embeddings for  {:.2%} of all text'.format(in_vocab_count / (in_vocab_count + out_vocab_count)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [13]:
def replace_tokens(sequences, to_be_replaced):
    new_seqs = sequences
    for token, new_token in to_be_replaced.items():
        new_seqs = new_seqs.str.replace(token, new_token)
    
    return new_seqs

def delete_tokens(sequences, to_be_deleted):
    to_be_deleted_str = ''.join(to_be_deleted)
    
    f = lambda text: re.sub(r'[{}]'.format(to_be_deleted_str), '', text)
    
    return sequences.apply(f)

In [14]:
def handle_num_and_punc(sequences):
    def split_num_and_dot(seq):
        return re.sub('[0-9]{1,}\.(?![0-9])', lambda m: m.group(0)[:-1] + " " + ".", seq)
    
    def split_num_and_dash(seq):
        return re.sub('[0-9]{4}–[0-9]{2}', lambda m: m.group(0)[:4] + " and " + m.group(0)[:2] + m.group(0)[5:], seq)
    
    def split_num_and_currency(seq):
        curr_to_str = {
            '€': 'euro',
            '£': 'pound',
            '$': 'dollar'
        }
        res = re.sub('[€£$]{1}[0-9]+([\.,]{1}([0-9]{0,})?)?', lambda m: m.group(0)[1:] + " " + curr_to_str[m.group(0)[0]], seq)
        if 'us$' in seq:
            print(seq)
        return res
    
    return sequences.apply(split_num_and_dot).apply(split_num_and_dash).apply(split_num_and_currency)

In [15]:
def split_quoted_text(sequences):
    f = lambda text: re.sub(r' (["\'])(?:(?=(\\?))\2.)*?\1 ', lambda m: m.group(0)[0] + " " + m.group(0)[1:-1] + " " + m.group(0)[-1], text, re.DOTALL)
    
    return sequences.apply(f)

In [16]:
def split_punctuations(sequences):
    f = lambda text: re.sub(r'[\(\)\.\"\'\[\]\,%;:?!-/]{2,}', lambda m: ' '.join(list(m.group(0))), text, flags=re.DOTALL)
    
    return sequences.apply(f)

In [17]:
word_embeddings = KeyedVectors.load_word2vec_format('glove-50d.bin', binary=True)

In [20]:
word_embeddings

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f6c704ab460>

In [21]:
word_embeddings.most_similar('ayse')

[('bakhtiyar', 0.7318562269210815),
 ('dabbashi', 0.7161852121353149),
 ('soad', 0.7160789370536804),
 ('drolma', 0.7156862616539001),
 ('akhmedov', 0.705880880355835),
 ('farai', 0.7019412517547607),
 ('bucko', 0.7016493678092957),
 ('hamash', 0.6900523900985718),
 ('ozturk', 0.6877020597457886),
 ('muthiya', 0.6861430406570435)]

In [18]:
def preprocess_text(sequences):
    sequences = lowercase_sequences(sequences)
    
    to_be_changed = {
        '°c': 'celsius',
        '°f': 'fahrenheit',
        "qu'ran": 'quran',
        '−': '-',
        'asphalt/bitumen': 'asphalt bitumen',
        'us$': 'us $',
        '°': ' degree ',
        '”': '"',
        '—': '-',
        '⁄': '/',
        '′': "'",
        '″': '"',
        '×': 'times',
        '–': '-'
    }

    to_be_deleted = ['§', '्', '\ufeff']

    sequences = replace_tokens(sequences, to_be_changed)

    sequences = delete_tokens(sequences, to_be_deleted)

    sequences = handle_num_and_punc(sequences)

    sequences = split_punctuations(sequences)
    
    return sequences

In [19]:
train_sequences = preprocess_text(df.context)
question_sequences = preprocess_text(df.question)

in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design of a "plastic music box" in 1981, which he called the ixi. he was unable to secure funding to renew the us$120,000 worldwide patent, so it lapsed and kramer never profited from his idea.
in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design of a "plastic music box" in 1981, which he called the ixi. he was unable to secure funding to renew the us$120,000 worldwide patent, so it lapsed and kramer never profited from his idea.
in september 2007, during a lawsuit with patent holding company burst.com, apple drew attention to a patent for a similar device that was developed in 1979 . kane kramer applied for a uk patent for his design

at launch, the xbox 360 was available in two configurations: the "xbox 360" package (unofficially known as the 20 gb pro or premium), priced at us$399 or gb£279.99, and the "xbox 360 core", priced at us$299 and gb£209.99 . the original shipment of the xbox 360 version included a cut-down version of the media remote as a promotion. the elite package was launched later at us$479 . the "xbox 360 core" was replaced by the "xbox 360 arcade" in october 2007 and a 60 gb version of the xbox 360 pro was released on august 1, 2008 . the pro package was discontinued and marked down to us$249 on august 28, 2009 to be sold until stock ran out, while the elite was also marked down in price to us$299 .
at launch, the xbox 360 was available in two configurations: the "xbox 360" package (unofficially known as the 20 gb pro or premium), priced at us$399 or gb£279.99, and the "xbox 360 core", priced at us$299 and gb£209.99 . the original shipment of the xbox 360 version included a cut-down version of the

in 2001, comcast announced it would acquire the assets of the largest cable television operator at the time, at&t broadband, for us$44.5 billion. the proposed name for the merged company was "at&t comcast", but the companies ultimately decided to keep only the comcast name. in 2002, comcast acquired all assets of at&t broadband, thus making comcast the largest cable television company in the united states with over 22 million subscribers. this also spurred the start of comcast advertising sales (using at&t's groundwork) which would later be renamed comcast spotlight. as part of this acquisition, comcast also acquired the national digital television center in centennial, colorado as a wholly owned subsidiary, which is today known as the comcast media center.
in 2001, comcast announced it would acquire the assets of the largest cable television operator at the time, at&t broadband, for us$44.5 billion. the proposed name for the merged company was "at&t comcast", but the companies ultimat

iran has the second largest proved gas reserves in the world after russia, with 33.6 trillion cubic metres, and third largest natural gas production in the world after indonesia, and russia. it also ranks fourth in oil reserves with an estimated 153,600,000,000 barrels. it is opec's 2nd largest oil exporter and is an energy superpower. in 2005, iran spent us$4 billion on fuel imports, because of contraband and inefficient domestic use. oil industry output averaged 4 million barrels per day (640,000 m3/d) in 2005, compared with the peak of six million barrels per day reached in 1974 . in the early years of the 2000s (decade), industry infrastructure was increasingly inefficient because of technological lags. few exploratory wells were drilled in 2005 .
iran has the second largest proved gas reserves in the world after russia, with 33.6 trillion cubic metres, and third largest natural gas production in the world after indonesia, and russia. it also ranks fourth in oil reserves with an es

with interpublic group, what company has a combined annual revenue of roughly us$21 billion?
how many indymac account holders held funds in excess of the fdic's insured amount of us$100,000?


In [36]:
train_sequences[0]

'beyoncé giselle knowles-carter ( /biːˈjɒnseɪ/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best-selling girl groups of all time. their hiatus saw the release of beyoncé\'s debut album, dangerously in love (2003) , which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles "crazy in love" and "baby boy" .'

In [23]:
"us$1,000,000".replace('us$', 'us $')

'us $1,000,000'

In [20]:
train_tokenized = tokenize_words(train_sequences)
question_tokenized = tokenize_words(question_sequences)

In [116]:
train_tokenized[0]

['beyoncé',
 'giselle',
 'knowles',
 '-',
 'carter',
 '(',
 '/',
 'biːˈjɒnseɪ',
 '/',
 'bee',
 '-',
 'yon',
 '-',
 'say',
 ')',
 '(',
 'born',
 'september',
 '4',
 ',',
 '1981',
 ')',
 'is',
 'an',
 'american',
 'singer',
 ',',
 'songwriter',
 ',',
 'record',
 'producer',
 'and',
 'actress',
 '.',
 'born',
 'and',
 'raised',
 'in',
 'houston',
 ',',
 'texas',
 ',',
 'she',
 'performed',
 'in',
 'various',
 'singing',
 'and',
 'dancing',
 'competitions',
 'as',
 'a',
 'child',
 ',',
 'and',
 'rose',
 'to',
 'fame',
 'in',
 'the',
 'late',
 '1990s',
 'as',
 'lead',
 'singer',
 'of',
 'r',
 '&',
 'b',
 'girl',
 '-',
 'group',
 'destiny',
 "'",
 's',
 'child',
 '.',
 'managed',
 'by',
 'her',
 'father',
 ',',
 'mathew',
 'knowles',
 ',',
 'the',
 'group',
 'became',
 'one',
 'of',
 'the',
 'world',
 "'",
 's',
 'best',
 '-',
 'selling',
 'girl',
 'groups',
 'of',
 'all',
 'time',
 '.',
 'their',
 'hiatus',
 'saw',
 'the',
 'release',
 'of',
 'beyoncé',
 "'",
 's',
 'debut',
 'album',
 ',',

In [21]:
vocab = build_vocabulary(train_tokenized + question_tokenized)
vocab

100%|██████████| 173642/173642 [00:01<00:00, 120829.18it/s]


{'beyoncé': 2692,
 'giselle': 28,
 'knowles': 120,
 '-': 138729,
 'carter': 154,
 '(': 92925,
 '/': 9760,
 'biːˈjɒnseɪ': 15,
 'bee': 77,
 'yon': 15,
 'say': 1020,
 ')': 93008,
 'born': 1784,
 'september': 3423,
 '4': 5673,
 ',': 687140,
 '1981': 485,
 'is': 108198,
 'an': 40107,
 'american': 8673,
 'singer': 583,
 'songwriter': 133,
 'record': 2957,
 'producer': 698,
 'and': 341127,
 'actress': 287,
 '.': 496038,
 'raised': 1034,
 'in': 318244,
 'houston': 1406,
 'texas': 816,
 'she': 5463,
 'performed': 1319,
 'various': 3880,
 'singing': 416,
 'dancing': 147,
 'competitions': 422,
 'as': 105657,
 'a': 203391,
 'child': 1905,
 'rose': 787,
 'to': 247341,
 'fame': 418,
 'the': 879730,
 'late': 4926,
 '1990s': 1164,
 'lead': 1593,
 'of': 443618,
 'r': 1733,
 '&': 2566,
 'b': 2134,
 'girl': 337,
 'group': 6290,
 'destiny': 378,
 "'": 92540,
 's': 82670,
 'managed': 771,
 'by': 85091,
 'her': 10146,
 'father': 2015,
 'mathew': 69,
 'became': 9445,
 'one': 23930,
 'world': 15042,
 'best': 

In [22]:
oov_words(vocab, word_embeddings)

100%|██████████| 82915/82915 [00:00<00:00, 649197.80it/s]

Found embeddings for 80.70% of vocab
Found embeddings for  99.33% of all text





[('⟨', 269),
 ('midna', 215),
 ('́', 185),
 ('⟩', 155),
 ('utf', 137),
 ('km²', 134),
 ('ganondorf', 115),
 ('deréon', 112),
 ('latewood', 109),
 ('poundm', 102),
 ('yeezy', 100),
 ('aonuma', 92),
 ('kbit', 89),
 ('⟩,', 81),
 ('̇', 75),
 ('َ', 73),
 ('deshin', 71),
 ('♠', 69),
 ('±', 69),
 ('[›]', 67),
 ('shekpa', 65),
 ('maycomb', 64),
 ('ِ', 63),
 ('₹', 60),
 ('gaddafist', 59),
 ('dollarbn', 57),
 ('əs', 56),
 ('woyciechowski', 56),
 ('earlywood', 55),
 ('grzymała', 55),
 ('m²', 54),
 ('ghmc', 53),
 ('̍', 53),
 ('isil', 53),
 ('apizza', 52),
 ('(~', 52),
 ('nohant', 52),
 ('wodzińska', 52),
 ('asphalte', 50),
 ('arsphenamine', 50),
 ('11172', 50),
 ('rossabi', 50),
 ('digivolution', 49),
 ('clitellates', 48),
 ('→', 47),
 ('4512', 46),
 ('nazianzen', 45),
 ('westquay', 45),
 ('hinx', 45),
 ('prehension', 44),
 ('feedpoint', 44),
 ('vinaccia', 43),
 ('żywny', 43),
 ('discovision', 42),
 ('kuhf', 42),
 ('fradiani', 41),
 ('medreses', 40),
 ('vaiśeṣika', 40),
 ('electroforming', 40),
 (

In [23]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_tokenized + question_tokenized)

In [24]:
train_input_data = word_tokenizer.texts_to_sequences(train_tokenized)
train_output_data = word_tokenizer.texts_to_sequences(question_tokenized)

In [25]:
def get_max_length(sequences):
    return max([len(seq) for seq in sequences])

In [26]:
max_input_length, max_output_length = get_max_length(train_input_data), get_max_length(train_output_data)

max_input_length, max_output_length

(815, 60)

In [27]:
padded_input_sequences = pad_sequences(train_input_data, maxlen=max_input_length, padding='post')
padded_output_sequences = pad_sequences(train_output_data, maxlen=max_output_length, padding='post')

In [42]:
padded_input_sequences.shape, padded_output_sequences.shape

((86821, 815), (86821, 60))

In [34]:
embedding_size = word_embeddings['the'].shape[0]
training_size = padded_train_sequences.shape[0]

In [28]:
def pretrained_embedding_layer(word_embeddings, word_to_index):
    embedding_size = word_embeddings['the'].shape[0]
    vocab_len = len(word_to_index) + 1
    
    embedding_matrix = np.zeros((vocab_len, embedding_size))
    
    for word, idx in word_to_index.items():
        if word in word_embeddings:
            embedding_matrix[idx, :] = word_embeddings[word]

    embedding_layer = Embedding(input_dim=vocab_len, output_dim=embedding_size, trainable = False)

    embedding_layer.build((None,))
    
    embedding_layer.set_weights([embedding_matrix])
    
    return embedding_layer

In [29]:
class Encoder():
    def __call__(self, input_shape, hidden_layers, word_embeddings, word_to_index):
        # Define input layer
        raw_input = Input(shape=input_shape)
        # Define embedding layer and load word embeddings
        embedding_layer = pretrained_embedding_layer(word_embeddings, word_to_index)
        # Give inputs to embedding layer
        encoder_inputs = embedding_layer(raw_input)
        # Define LSTM layer
        encoder = LSTM(units=hidden_layers, return_state=True)
        # Run LSTM on word embeddings
        _, state_h, state_c = encoder(encoder_inputs)
        # Return states of LSTM
        return raw_input, [state_h, state_c]

In [36]:
class Decoder():
    def __call__(self, input_shape, hidden_layers, word_embeddings, word_to_index, encoder_states):
        # Define input layer
        raw_input = Input(shape=input_shape)
        # Define embedding layer and load word embeddings
        embedding_layer = pretrained_embedding_layer(word_embeddings, word_to_index)
        # Give inputs to embedding layer
        decoder_inputs = embedding_layer(raw_input)
        # Define LSTM layer
        decoder = LSTM(hidden_layers, return_sequences=True)
        # Run LSTM on decoder inputs
        lstm_outputs = decoder(decoder_inputs, initial_state=encoder_states)
        # Run Dense layer with softmax
        decoder_outputs = TimeDistributed(Dense(len(word_to_index) + 1, activation='softmax'))(lstm_outputs)
        
        return raw_input, decoder_outputs


In [37]:
def QuestionModel(max_input_length, max_output_length, word_embeddings, word_to_index):
    
    encoder = Encoder()
    decoder = Decoder()
    
    encoder_input, encoder_states = encoder((max_input_length,), 16, word_embeddings, word_to_index)
    
    decoder_input, decoder_outputs= decoder((max_output_length,), 16, word_embeddings, word_to_index, encoder_states)
    
    model = Model(inputs = [encoder_input, decoder_input], outputs=decoder_outputs)
        
    return model

In [38]:
QModel = QuestionModel(max_input_length, max_output_length, word_embeddings, word_tokenizer.word_index)

In [39]:
QModel.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 815)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 815, 50)      4145800     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 60, 50)       4145800     input_4[0][0]                    
_______________________________________________________________________________________

In [40]:
if os.path.exists('checkpoints/q_model.h5'):
    print('Weights loaded!')
    QModel.load_weights('checkpoints/q_model.h5')

Weights loaded!


In [41]:
batch_size = 16

In [62]:
train_dataset = tf.data.Dataset.from_tensor_slices((padded_input_sequences, padded_output_sequences))
train_dataset = train_dataset.shuffle(buffer_size = 512).batch(batch_size)

In [66]:
optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-5)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

In [68]:
for step, (batch_x, batch_y) in enumerate(train_dataset):
    target_y = tf.one_hot(batch_y, depth = len(word_tokenizer.word_index) + 1)
    with tf.GradientTape() as tape:
            logits = QModel([batch_x, batch_y], training=True)  # Logits for this minibatch
            # Compute the loss value for this minibatch.
            loss_value = loss_fn(target_y, logits)

    grads = tape.gradient(loss_value, QModel.trainable_weights)
    optimizer.apply_gradients(zip(grads, QModel.trainable_weights))
    QModel.save_weights('checkpoints/q_model.h5')
    
    if step % 10 == 0:
        print(
            "Training loss (for one batch) at step %d: %.8f"
            % (step, float(loss_value))
        )
        print("Seen so far: %s samples" % ((step + 1) * batch_size))
    
    if step == 20:
        break

Training loss (for one batch) at step 0: 10.51101971
Seen so far: 16 samples
Training loss (for one batch) at step 10: 10.52560329
Seen so far: 176 samples
Training loss (for one batch) at step 20: 10.51206112
Seen so far: 336 samples
