# Bidirectional LSTM

In [1]:
import json

from matplotlib import pyplot as plt

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import numpy as np

import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Load data

In [2]:
def load_json(path):
    '''
    Loads the JSON file of the Squad dataset and prints a short summary.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file; it is opened as UTF-8 text.

    Returns
    -------
    The parsed JSON object. It must expose a top-level 'data' list,
    otherwise a KeyError is raised while printing the summary.

    The lines describing the first article ("Data Keys" and "Title") are
    printed only when 'data' is non-empty, so an empty dataset is loaded
    and returned without error.
    '''
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    articles = data['data']
    print("Length of data: ", len(articles))

    # Only peek at the first article when there is one to look at.
    if articles:
        first_article = articles[0]
        print("Data Keys: ", first_article.keys())
        print("Title: ", first_article['title'])

    return data

def parse_data(data:dict)->list:
    '''
    Flattens the nested Squad JSON into one record per answer.

    Walks every article in data['data'], every paragraph of that article,
    every question of that paragraph and every answer of that question.
    Each answer yields a dict with the keys 'id', 'context', 'question',
    'label' and 'answer' (in that order), where 'label' is
    [answer_start, answer_start + len(answer text)].
    A question whose 'answers' list is empty contributes no record.
    '''
    records = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']

            for item in paragraph['qas']:
                question_id = item['id']
                question_text = item['question']

                for reference in item['answers']:
                    answer_text = reference['text']
                    span_start = reference['answer_start']
                    # End offset is derived from the answer length.
                    span_end = span_start + len(answer_text)

                    records.append({
                        'id': question_id,
                        'context': passage,
                        'question': question_text,
                        'label': [span_start, span_end],
                        'answer': answer_text,
                    })

    return records


In [3]:
import json

# Read both SQuAD v2.0 splits from disk (load_json prints a short summary
# of each file as it goes).
train_json_path = '/kaggle/input/new-squaddataset/train-v2.0.json'
valid_json_path = '/kaggle/input/new-squaddataset/dev-v2.0.json'
train_data = load_json(train_json_path)
valid_data = load_json(valid_json_path)

# Flatten the nested JSON into one dict per (question, answer) pair.
train_list = parse_data(train_data)
valid_list = parse_data(valid_data)
print('--------------------------')

print('Train list len: ',len(train_list))
print('Valid list len: ',len(valid_list))

# Build the frames, discard the 'id' and 'label' columns and give the
# three remaining columns their display names.
display_names = {'context': 'Paragraph', 'question': 'Question', 'answer': 'Answer'}

train_ds = (
    pd.DataFrame(train_list)
    .drop(columns=['id', 'label'])
    .rename(columns=display_names)
)
val_ds = (
    pd.DataFrame(valid_list)
    .drop(columns=['id', 'label'])
    .rename(columns=display_names)
)

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Beyoncé
Length of data:  35
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Normans
--------------------------
Train list len:  86821
Valid list len:  20302


In [4]:
# Preview the first rows of the training frame (Paragraph / Question / Answer).
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [5]:
# Preview the first rows of the validation frame; a question appears once
# per reference answer, so consecutive rows can be identical.
val_ds.head()

Unnamed: 0,Paragraph,Question,Answer
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
2,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
3,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
4,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries


In [6]:
# NOTE(review): this repeats the train_ds preview shown two cells earlier;
# train_ds has not been modified in between.
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


# **Loading Saved Model Predictions**

In [7]:
# Read the saved arrays for each split. Later cells index every row of
# these arrays with [0] and [1] and use the two values as the start and
# end offsets of the predicted span inside the context string.
val_npy_path = '/kaggle/input/load-lstm/model3_val.npy'
train_npy_path = '/kaggle/input/load-lstm/model3_train.npy'

with open(val_npy_path, 'rb') as val_file:
    model3_val = np.load(val_file)

with open(train_npy_path, 'rb') as train_file:
    model3_train = np.load(train_file)

In [8]:
# NOTE(review): this reads and parses the same two JSON files as the
# earlier load cell. The difference is that the resulting frames keep all
# five parsed columns ('id', 'context', 'question', 'label', 'answer')
# under their original names, which the scoring cells below rely on.
train_json_path_testing = '/kaggle/input/new-squaddataset/train-v2.0.json'
valid_json_path_testing = '/kaggle/input/new-squaddataset/dev-v2.0.json'
train_data_testing = load_json(train_json_path_testing)
valid_data_testing = load_json(valid_json_path_testing)

# One dict per (question, answer) pair.
train_list_testing = parse_data(train_data_testing)
valid_list_testing = parse_data(valid_data_testing)
print('--------------------------')

print('Train list len: ',len(train_list_testing))
print('Valid list len: ',len(valid_list_testing))

# Columns are intentionally left un-renamed and un-dropped here.
train_ds_testing = pd.DataFrame(train_list_testing)
val_ds_testing = pd.DataFrame(valid_list_testing)

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Beyoncé
Length of data:  35
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Normans
--------------------------
Train list len:  86821
Valid list len:  20302


In [9]:
# Preview the un-renamed validation frame; note that one question id can
# span several rows (one per reference answer).
val_ds_testing.head()

Unnamed: 0,id,context,question,label,answer
0,56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[159, 165]",France
1,56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[159, 165]",France
2,56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[159, 165]",France
3,56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"[159, 165]",France
4,56ddde6b9a695914005b9629,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"[94, 117]",10th and 11th centuries


In [10]:
# Score the saved validation-split spans against the reference answers and
# store one [predicted_text, f1] entry per question id.
#
# The F1 used here is computed on the *sets of distinct characters* of the
# predicted and reference strings (not on tokens), so it ignores character
# order and repetition.
predictions = {}
for index, row in val_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    answer = row['answer']

    # model3_val[index] supplies the predicted span bounds for this row;
    # the end bound is treated as inclusive, hence the +1 in the slice.
    m3_start = model3_val[index][0]
    m3_end = model3_val[index][1]
    m3 = paragraph[m3_start:m3_end+1]

    # Distinct characters of the prediction and of the reference answer.
    predicted_chars = set(m3)
    actual_chars = set(answer)
    common_chars = predicted_chars & actual_chars

    # Character-level precision, recall and F1; empty sets score 0.
    precision = len(common_chars) / len(predicted_chars) if predicted_chars else 0
    recall = len(common_chars) / len(actual_chars) if actual_chars else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    # The same id occurs on several rows when a question has several
    # reference answers. Keep the best-scoring entry for that id rather
    # than letting whichever row comes last overwrite the others.
    if id_val not in predictions or f1_score > predictions[id_val][1]:
        predictions[id_val] = [m3, f1_score]

# Persist the per-question results as JSON text.
prediction_validDataset_BiLSTM = 'prediction_validDataset_BiLSTM.txt'
with open(prediction_validDataset_BiLSTM, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

In [11]:
# Score the saved training-split spans against the reference answers and
# store one [predicted_text, f1] entry per question id.
#
# The F1 used here is computed on the *sets of distinct characters* of the
# predicted and reference strings (not on tokens), so it ignores character
# order and repetition.
#
# NOTE: this rebinds `predictions`, so any later cell that reads
# `predictions` sees the training-split results.
predictions = {}
for index, row in train_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    answer = row['answer']

    # model3_train[index] supplies the predicted span bounds for this row;
    # the end bound is treated as inclusive, hence the +1 in the slice.
    m3_start = model3_train[index][0]
    m3_end = model3_train[index][1]
    m3 = paragraph[m3_start:m3_end+1]

    # Distinct characters of the prediction and of the reference answer.
    predicted_chars = set(m3)
    actual_chars = set(answer)
    common_chars = predicted_chars & actual_chars

    # Character-level precision, recall and F1; empty sets score 0.
    precision = len(common_chars) / len(predicted_chars) if predicted_chars else 0
    recall = len(common_chars) / len(actual_chars) if actual_chars else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    # If an id occurs on more than one row, keep its best-scoring entry
    # rather than letting the last row overwrite the earlier ones.
    if id_val not in predictions or f1_score > predictions[id_val][1]:
        predictions[id_val] = [m3, f1_score]

# Persist the per-question results as JSON text.
prediction_trainDataset_BiLSTM = 'prediction_trainDataset_BiLSTM.txt'
with open(prediction_trainDataset_BiLSTM, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

In [12]:
# predictions

In [13]:
# Mean of the stored F1 values (the second element of every entry).
# When the cells run in order, `predictions` is the dict most recently
# built from train_ds_testing, so this is the training-split average.
total_sum = 0
num_elements = 0

for entry in predictions.values():
    total_sum += entry[1]
    num_elements += 1

# An empty dict yields 0 instead of a division by zero.
average = total_sum / num_elements if num_elements > 0 else 0

# **F1 Score (character-level, training split)**

In [14]:
# Print the mean F1 computed in the previous cell.
print(average)

0.26962483004135157
