In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Attention
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

2023-04-04 12:59:31.711882: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 12:59:31.871441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-04 12:59:31.871464: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-04 12:59:31.904210: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-04 12:59:33.168897: W tensorflow/stream_executor/platform/de

# Task definition: 
GOAL: Create a biLSTM model to take context, and question, and generate an answer
INPUT: Two inputs, each into their own biLSTM, one: "CONTEXT", two: "QUESTION"
OUTPUT: [[START_POSITION], [END_POSITION]] one-hot-encoded with respect to the context
1. Load SQuAD dataset
2. Clean Context, and Question input
3. Create input, and output lists
4. Tokenize data
5. Pad data
6. Create one-hot-encoding of ANSWERS
7. Define Model which consists of two inputs, two outputs, embedding layers, and bilstm layers
6. Create a loss function to get the correct start and end


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
raw_data = pd.read_json("../Data/SQuAD2/train-v2.0.json")

In [5]:
testing_POC = raw_data["data"]

# Prep data

Let's try to create a model that takes two inputs: Context, Question as padded sequences

In [7]:
def CQA_extraction_twoInputs(data):
    context = []
    questions = []
    answers = []
    for topic in data:
        for id, cq in enumerate(topic["paragraphs"]):
            for x in cq["qas"]:
                if x["is_impossible"]==True:
                    continue
                question_text = x['question']
                context_text = cq['context']

                context.append(context_text)

                questions.append(question_text)
                
                answer_text = x["answers"][0]["text"]
                answer_start = x["answers"][0]["answer_start"]
                
                answers.append({"text":answer_text, "start":answer_start, "end":answer_start+len(answer_text), "context":context_text})#[f"{x['question']}"] = 
    return context, questions, answers

In [8]:
context, questions, answers = CQA_extraction_twoInputs(testing_POC.head(20))

### Clean the Questions, and Context

In [9]:
def clean_text(txt):
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

In [10]:
cleaned_context = [clean_text(x) for x in context]
cleaned_questions = [clean_text(x) for x in questions]

Let's also find the max length of the context!

In [11]:
context_length = max([len(answer["context"]) for answer in answers])
context_length

3076

### Tokenize data

In [12]:
#Define tokenizer
tokenizer = Tokenizer()

In [13]:
#Fit Tokenizer
tokenizer.fit_on_texts(cleaned_questions+cleaned_context)
#Create tokenized sequences of the context, and questions
sequences_question = tokenizer.texts_to_sequences(cleaned_questions)
sequences_context = tokenizer.texts_to_sequences(cleaned_context)
#Find max sequence length of questions and context, together, and seperately
#Max's are for padding
max_length = max([len(x) for x in sequences_context+sequences_question])
max_length_questions = max([len(x) for x in sequences_question])
max_length_context = max([len(x) for x in sequences_context])
#Pad sequences!
padded_sequences_context = pad_sequences(sequences_context, maxlen=max_length_context, padding='post')
padded_sequences_questions = pad_sequences(sequences_question, maxlen=max_length_questions, padding='post')


Train test split prep

In [14]:
#Add padded sequences to X set
X = []
for id, x in enumerate(padded_sequences_questions):
    X.append([padded_sequences_context[id], x])


In [15]:
#Create a vector of length conttext
y_startPOS = [np.zeros(context_length) for answer in answers]
y_endPOS = [np.zeros(context_length) for answer in answers]

In [16]:
#Find the start and end of each answer
start_ends = [[answer["start"], answer["end"]] for answer in answers]
#Map the start and end of each question to its position in
#it's respective vector of length CONTEXT_LENGTH
for id, vector in enumerate(y_startPOS):
    vector[start_ends[id][0]]=1
for id, vector in enumerate(y_endPOS):
    vector[start_ends[id][0]]=1

In [17]:
output_data = []
for id, vector in enumerate(y_startPOS):
    output_data.append([vector, y_endPOS[id]])

#### Train test split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, output_data, test_size=0.25, random_state=1)

In [19]:
np.shape(np.array([np.array(y_train)[:,0], np.array(y_train)[:,0]]))

(2, 5729, 3076)

### Model Creation

Model Definition

In [42]:
def create_model(context_sequence_length, question_sequence_length, vocab_length):
    input_1 = tf.keras.layers.Input(shape=(context_sequence_length,))  # shape of input CONTEXT
    input_2 = tf.keras.layers.Input(shape=(question_sequence_length,))  # shape of input QUESTION
    #INPUT 1
    #Embedding
    embedding_1 = Embedding(input_dim=vocab_length, output_dim=100)(input_1)
    #LSTM
    lstm_1 = tf.keras.layers.Bidirectional(LSTM(units=64))(embedding_1)
    #INPUT 2
    #Embedding
    embedding_2 = Embedding(input_dim=vocab_length, output_dim=100)(input_2)
    #LSTM
    lstm_2 = tf.keras.layers.Bidirectional(LSTM(units=64))(embedding_2)
    #concat the layers
    concatenated = tf.keras.layers.concatenate([lstm_1, lstm_2])
    #Reshape
    reshape_layer = tf.keras.layers.Reshape((256, 1))(concatenated)
    #Dense layer 
    final_bilstm = tf.keras.layers.Bidirectional(LSTM(units=32))(reshape_layer)
    output_start = Dense(units=3076, activation='softmax')(final_bilstm)
    output_end = Dense(units=3076, activation='softmax')(final_bilstm)
    #Define Model
    goal2model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=[output_start, output_end])

    #Copile Model
    goal2model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return goal2model

In [43]:
#Create model
model = create_model(max_length_context, max_length_questions, len(tokenizer.word_index)+1)

Fit Model

In [44]:
model.fit([np.array([X[0] for X in X_train]), np.array([X[1] for X in X_train])], 
          [np.array(y_train)[:,0], np.array(y_train)[:,1]],epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3e06964820>

In [None]:
l = model.predict([np.array([X[0] for X in X_test]), np.array([X[1] for X in X_test])])



In [None]:
np.argmax(l[3][0])

IndexError: list index out of range