In [2]:
import pandas as pd
import numpy as np
import string

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Attention
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

# USAGE: GOAL two methods are best so run code under the *Goal 2* headers. Further cleaning coming soon

# Task definition: 
FIRST GOAL: Create a LSTM model to take context, and question, and generate an answer
INPUT: CQ together like "[CLS]CONTEXT[SEP]QUESTION[STOP]" in a list of strings
OUTPUT: START_POSITION, END_POSITION] with respect to the context
1. Load SQuAD dataset
2. Remove puntutation from CQ input
3. Create input, and output lists
4. Tokenize data
5. Pad data
6. Create a loss function to get the correct start and end


SECOND GOAL: Create a biLSTM model to take context, and question, and generate an answer
INPUT: Two inputs, each into their own biLSTM, one: "CONTEXT", two: "QUESTION"
OUTPUT: [[START_POSITION], [END_POSITION]] one-hot-encoded with respect to the context
1. Load SQuAD dataset
2. Clean Context, and Question input
3. Create input, and output lists
4. Tokenize data
5. Pad data
6. Create one-hot-encoding of ANSWERS
7. Define Model
6. Create a loss function to get the correct start and end


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
raw_data = pd.read_json("../train-v2.0.json")

In [6]:
testing_POC = raw_data["data"]

# Prep data

### First Goal
Extract the questions and context, as well as answer locations in context

In [19]:
def CQA_extraction(data):
    context_questions = []
    answers = []
    for topic in data:
        for id, cq in enumerate(topic["paragraphs"]):
            for x in cq["qas"]:
                if x["is_impossible"]==True:
                    continue
                question_text = x['question']
                context_text = cq['context']

                context_questions.append(f"[CLS] {context_text} [SEP] {question_text} [END]")
                
                answer_text = x["answers"][0]["text"]
                answer_start = x["answers"][0]["answer_start"]
                
                answers.append({"text":answer_text, "start":answer_start, "end":answer_start+len(answer_text), "context":context_text})#[f"{x['question']}"] = 
    return context_questions, answers

In [None]:
context_questions, answers = CQA_extraction(testing_POC.head(20))

### Second Goal
Let's also try to create something that takes two inputs: Context, Question

In [7]:
def CQA_extraction_twoInputs(data):
    context = []
    questions = []
    answers = []
    for topic in data:
        for id, cq in enumerate(topic["paragraphs"]):
            for x in cq["qas"]:
                if x["is_impossible"]==True:
                    continue
                question_text = x['question']
                context_text = cq['context']

                context.append(context_text)

                questions.append(question_text)
                
                answer_text = x["answers"][0]["text"]
                answer_start = x["answers"][0]["answer_start"]
                
                answers.append({"text":answer_text, "start":answer_start, "end":answer_start+len(answer_text), "context":context_text})#[f"{x['question']}"] = 
    return context, questions, answers

In [8]:
context, questions, answers = CQA_extraction_twoInputs(testing_POC.head(20))

### Clean the Questions, and Context

In [9]:
def clean_text(txt):
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

### First Goal

In [74]:
cleaned_context_questions = [clean_text(x) for x in context_questions]

NameError: name 'context_questions' is not defined

### Second Goal

In [10]:
cleaned_context = [clean_text(x) for x in context]
cleaned_questions = [clean_text(x) for x in questions]

Let's also find the max length of the context!

In [11]:
context_length = max([len(answer["context"]) for answer in answers])
context_length

3076

### Tokenize data

In [12]:
#Define tokenizer
tokenizer = Tokenizer()

### First Goal

In [79]:
#Fit tokenizer
tokenizer.fit_on_texts(cleaned_context_questions)
#Define sequences
sequences = tokenizer.texts_to_sequences(cleaned_context_questions)
#Find max length of sequences for padding
max_length = max([len(x) for x in sequences])
#Pad sequences based on mac length
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

NameError: name 'cleaned_context_questions' is not defined

### Second Goal

In [13]:
#Fit Tokenizer
tokenizer.fit_on_texts(cleaned_questions+cleaned_context)
#Create tokenized sequences of the context, and questions
sequences_question = tokenizer.texts_to_sequences(cleaned_questions)
sequences_context = tokenizer.texts_to_sequences(cleaned_context)
#Find max sequence length of questions and context, together, and seperately
#Max's are for padding
max_length = max([len(x) for x in sequences_context+sequences_question])
max_length_questions = max([len(x) for x in sequences_question])
max_length_context = max([len(x) for x in sequences_context])
#Pad sequences!
padded_sequences_context = pad_sequences(sequences_context, maxlen=max_length_context, padding='post')
padded_sequences_questions = pad_sequences(sequences_question, maxlen=max_length_questions, padding='post')


### Train test split

### First Goal

In [78]:
#Add padded sequences to X set
X = []
for x in padded_sequences:
    X.append(x)

NameError: name 'padded_sequences' is not defined

In [77]:
#Create 2d array of starts and ends of answers
y = [[answer["start"], answer["end"]] for answer in answers]
#Normalize data
y = [[arr[0]/context_length, arr[1]/context_length] for arr in y]

### Second Goal

In [14]:
#Add padded sequences to X set
X = []
for id, x in enumerate(padded_sequences_questions):
    X.append([padded_sequences_context[id], x])


In [15]:
#Create a vector of length conttext
y_startPOS = [np.zeros(context_length) for answer in answers]
y_endPOS = [np.zeros(context_length) for answer in answers]

In [16]:
len(y_startPOS[0])

3076

In [17]:
#Find the start and end of each answer
start_ends = [[answer["start"], answer["end"]] for answer in answers]
#Map the start and end of each question to its position in
#it's respective vector of length CONTEXT_LENGTH
for id, vector in enumerate(y_startPOS):
    vector[start_ends[id][0]]=1
for id, vector in enumerate(y_endPOS):
    vector[start_ends[id][0]]=1

In [18]:
output_data = []
for id, vector in enumerate(y_startPOS):
    output_data.append([vector, y_endPOS[id]])

### First Goal

#### Train test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

### Second Goal

#### Train test split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, output_data, test_size=0.25, random_state=1)

In [32]:
np.shape(np.array([np.array(y_train)[:,0], np.array(y_train)[:,0]]))

(2, 5729, 3076)

### Model Creation

### First Goal

### Second Goal

Model Definition

In [64]:
def create_model(context_sequence_length, question_sequence_length, vocab_length):
    input_1 = tf.keras.layers.Input(shape=(context_sequence_length,))  # shape of input CONTEXT
    input_2 = tf.keras.layers.Input(shape=(question_sequence_length,))  # shape of input QUESTION
    #INPUT 1
    #Embedding
    embedding_1 = Embedding(input_dim=vocab_length, output_dim=4)(input_1)
    #LSTM
    lstm_1 = LSTM(units=10)(embedding_1)
    #INPUT 2
    #Embedding
    embedding_2 = Embedding(input_dim=vocab_length, output_dim=4)(input_2)
    #LSTM
    lstm_2 = LSTM(units=10)(embedding_2)
    #concat the layers
    concatenated = tf.keras.layers.concatenate([lstm_1, lstm_2])
    #Dense layer 
    output_start = Dense(units=3076, activation='softmax')(concatenated)
    output_end = Dense(units=3076, activation='softmax')(concatenated)
    #Define Model
    goal2model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=[output_start, output_end])

    #Copile Model
    goal2model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return goal2model

In [65]:
#Create model
model = create_model(max_length_context, max_length_questions, len(tokenizer.word_index)+1)

In [66]:
np.shape([np.array(y_train)[:,0], np.array(y_train)[:,1]])

(2, 5729, 3076)

Fit Model

In [67]:
model.fit([np.array([X[0] for X in X_train]), np.array([X[1] for X in X_train])], 
          [np.array(y_train)[:,0], np.array(y_train)[:,1]],epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fcc2b7d8730>

In [38]:
l = model.predict([np.array([X[0] for X in X_test]), np.array([X[1] for X in X_test])])



In [63]:
np.argmax(l[3][0])

IndexError: list index out of range

#### Loss function should take into account the start and end of the answer wrt the context

In [None]:
def squad_loss(y_true, y_pred):
    euclidean_loss = tf.keras.losses.MeanSquaredError()
    loss = tf.norm(y_pred - y_true, ord='euclidean')
    return loss

In [None]:
print(len(X_train[0][0])) #CONTEXT
print(len(X_train[0][1])) #QUESTION

509
29


In [None]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len
    model = Sequential()
    # ----------Add Input Embedding Layer
    model.add(Embedding(total_words, 30, input_length=input_len))
    # ----------Add Hidden Layer 1 - LSTM Layer
    # model.add(LSTM(64))
    lstm_layer = tf.keras.layers.LSTM(units=10)
    bilstm_layer = tf.keras.layers.Bidirectional(lstm_layer)
    model.add(bilstm_layer)
    # ----------Add Output Layer
    model.add(Dense(1, activation='softmax'))
    model.compile(loss=tf.keras.losses.MSE, optimizer='adam')
    return model
model = create_model(max_length, len(tokenizer.word_index) + 1)
model.summary()


Model: "sequential_43"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_43 (Embedding)    (None, 525, 30)           527940    
                                                                 
 bidirectional_8 (Bidirectio  (None, 20)               3280      
 nal)                                                            
                                                                 
 dense_35 (Dense)            (None, 1)                 21        
                                                                 
Total params: 531,241
Trainable params: 531,241
Non-trainable params: 0
_________________________________________________________________


In [71]:
model.fit([np.array([X[0] for X in X_train]), np.array([X[1] for X in X_train])], np.array(y_train)[:,0],epochs=3)

Epoch 1/3


ValueError: in user code:

    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/losses.py", line 2162, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/home/monty/.local/lib/python3.8/site-packages/keras/backend.py", line 5677, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 7639) vs (None, 3076)).


In [66]:
len(np.array(y_train)[:,0][0])

3076

In [546]:
l = model.predict([np.array([X[0] for X in X_test]), np.array([X[1] for X in X_test])])

11/60 [====>.........................] - ETA: 1s

InvalidArgumentError: Graph execution error:

Detected at node 'model_6/embedding_63/embedding_lookup' defined at (most recent call last):
    File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/home/monty/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
      app.start()
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/home/monty/.local/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
      await result
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/home/monty/.local/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
      result = self._run_cell(
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
      return runner(coro)
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/home/monty/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_22163/4230257143.py", line 1, in <module>
      l = model.predict([np.array([X[0] for X in X_test]), np.array([X[1] for X in X_test])])
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 2253, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 2041, in predict_function
      return step_function(self, iterator)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 2027, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 2015, in run_step
      outputs = model.predict_step(data)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 1983, in predict_step
      return self(x, training=False)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/engine/base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/monty/.local/lib/python3.8/site-packages/keras/layers/core/embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model_6/embedding_63/embedding_lookup'
indices[7,91] = 17595 is not in [0, 17595)
	 [[{{node model_6/embedding_63/embedding_lookup}}]] [Op:__inference_predict_function_187950]