In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-squaddataset/train-v2.0.json
/kaggle/input/new-squaddataset/dev-v2.0.json
/kaggle/input/loadbidaf/bidaftrain.pkl
/kaggle/input/loadbidaf/bidafw2id.pickle
/kaggle/input/loadbidaf/bidafc2id.pickle
/kaggle/input/loadbidaf/bidafglove_tv.npy
/kaggle/input/loadbidaf/bidafvalid.pkl


In [2]:
import numpy as np

import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import pipeline

In [3]:
def load_json(path):
    '''
    Loads the JSON file of the Squad dataset.
    Returns the json object of the dataset.
    '''
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        
    print("Length of data: ", len(data['data']))
    print("Data Keys: ", data['data'][0].keys())
    print("Title: ", data['data'][0]['title'])
    
    return data

def parse_data(data:dict)->list:
    '''
    Parses the JSON file of Squad dataset by looping through the
    keys and values and returns a list of dictionaries with
    context, query and label triplets being the keys of each dict.
    '''
    data = data['data']
    qa_list = []

    for paragraphs in data:

        for para in paragraphs['paragraphs']:
            context = para['context']

            for qa in para['qas']:
                
                id = qa['id']
                question = qa['question']
                
                for ans in qa['answers']:
                    answer = ans['text']
                    ans_start = ans['answer_start']
                    ans_end = ans_start + len(answer)
                    
                    qa_dict = {}
                    qa_dict['id'] = id
                    qa_dict['context'] = context
                    qa_dict['question'] = question
                    qa_dict['label'] = [ans_start, ans_end]

                    qa_dict['answer'] = answer
                    qa_list.append(qa_dict)    

    
    return qa_list


In [4]:
import json

train_data = load_json('/kaggle/input/new-squaddataset/train-v2.0.json')
valid_data = load_json('/kaggle/input/new-squaddataset/dev-v2.0.json')

# parse the json structure to return the data as a list of dictionaries

train_list = parse_data(train_data)
valid_list = parse_data(valid_data)
print('--------------------------')

print('Train list len: ',len(train_list))
print('Valid list len: ',len(valid_list))

# converting the lists into dataframes

train_ds = pd.DataFrame(train_list)
val_ds = pd.DataFrame(valid_list)
# train_ds = train_ds.drop('id', axis=1)
# val_ds = val_ds.drop('id', axis=1)
train_ds.columns = ['id','Paragraph', 'Question', 'Answer Start', 'Answer']
val_ds.columns = ['id','Paragraph', 'Question', 'Answer Start', 'Answer']
train_ds['Answer Start'] = train_ds['Answer Start'].apply(lambda x: x[0])
val_ds['Answer Start'] = val_ds['Answer Start'].apply(lambda x: x[0])

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  University_of_Notre_Dame
Length of data:  48
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Super_Bowl_50
--------------------------
Train list len:  87599
Valid list len:  34726


In [5]:
val_ds.head()

Unnamed: 0,id,Paragraph,Question,Answer Start,Answer
0,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,177,Denver Broncos
1,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,177,Denver Broncos
2,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,177,Denver Broncos
3,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,249,Carolina Panthers
4,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,249,Carolina Panthers


In [6]:
train_ds.head()

Unnamed: 0,id,Paragraph,Question,Answer Start,Answer
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,381,a Marian place of prayer and reflection
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,92,a golden statue of the Virgin Mary


In [7]:
# Get start and end character position of answer in paragraph
def get_answer_char_pos(row):
    paragraph, answer, answer_start = row['Paragraph'], row['Answer'], row['Answer Start']
    answer_end = answer_start + len(answer)

    # Check if SQuAD answers are off by one or two characters
    if paragraph[answer_start-1:answer_end-1] == answer:
        return [answer_start-1, answer_end-1]
    elif paragraph[answer_start-2:answer_end-2] == answer:
        return [answer_start-2, answer_end-2]
    else:
        return [answer_start, answer_end]

train_ds['Answer'] = train_ds.apply(get_answer_char_pos, axis=1)
train_ds = train_ds.drop('Answer Start', axis=1)

val_ds['Answer'] = val_ds.apply(get_answer_char_pos, axis=1)
val_ds = val_ds.drop('Answer Start', axis=1)

In [8]:
train_ds.head()

Unnamed: 0,id,Paragraph,Question,Answer
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"[515, 541]"
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"[188, 213]"
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"[279, 296]"
3,5733be284776f41900661181,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,"[381, 420]"
4,5733be284776f4190066117e,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,"[92, 126]"


In [9]:
id_train, paragraph_train, question_train = train_ds['id'].tolist(), train_ds['Paragraph'].tolist(), train_ds['Question'].tolist()
paragraph_train = [text for text in paragraph_train]
question_train = [text for text in question_train]
train_labels = train_ds['Answer'].tolist()

id_val, paragraph_val, question_val = val_ds['id'].tolist(), val_ds['Paragraph'].tolist(), val_ds['Question'].tolist()
paragraph_val = [text for text in paragraph_val]
question_val = [text for text in question_val]
val_labels = val_ds['Answer'].tolist()

In [10]:
# len(id_val)

In [11]:
from tqdm import tqdm
model = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')

train_predictions =[]
for id, paragraph, question in tqdm(zip(id_train, paragraph_train, question_train), "Processing training data", total=len(paragraph_train)):
    prediction = model(question=question, context=paragraph)
    start_pred, end_pred = prediction['start'], prediction['end']
    train_predictions.append([start_pred, end_pred, id])

val_predictions = []
for id, paragraph, question in tqdm(zip(id_val, paragraph_val, question_val), "Processing validation data", total=len(paragraph_val)):
    prediction = model(question=question, context=paragraph)
    start_pred, end_pred = prediction['start'], prediction['end']
    val_predictions.append([start_pred, end_pred, id])


Downloading (…)lve/main/config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Processing training data: 100%|██████████| 87599/87599 [4:11:48<00:00,  5.80it/s]   
Processing validation data: 100%|██████████| 34726/34726 [1:41:59<00:00,  5.67it/s]  


In [12]:
id2f1 = {}

def exact_match(y_true, y_pred):
    start_true, end_true = y_true[0], y_true[1]
    start_pred, end_pred = y_pred[0], y_pred[1]

    if start_pred != start_true or end_pred != end_true:
        return float(0)
    else:
        return float(1)

def f1_score(y_true, y_pred):
    global id2f1
    start_true, end_true = y_true[0], y_true[1]
    start_pred, end_pred = y_pred[0], y_pred[1]
    
    predicted_tokens = set(range(start_pred, end_pred + 1))
    true_tokens = set(range(start_true, end_true + 1))
    common_tokens = predicted_tokens.intersection(true_tokens)
    precision = len(common_tokens) / len(predicted_tokens) if len(predicted_tokens) > 0 else 0
    recall = len(common_tokens) / len(true_tokens) if len(true_tokens) > 0 else 0
    if precision + recall > 0:
        id2f1[y_pred[2]] = 2 * precision * recall / (precision + recall)
    else:
        id2f1[y_pred[2]] = 0
    return float(2 * precision * recall / (precision + recall) if precision + recall > 0 else 0)


In [13]:
exact_match_res = 0
f1_score_res = 0
count = 0

for y_true, y_pred in zip(train_labels, train_predictions):
    exact_match_res += exact_match(y_true, y_pred)
    f1_score_res += f1_score(y_true, y_pred)
    count += 1

exact_match_res /= count
f1_score_res /= count

print('Exact match:', exact_match_res)
print('F1 score:', f1_score_res)

Exact match: 0.7052477768011051
F1 score: 0.8778170577111749


In [14]:
exact_match_res = 0
f1_score_res = 0
count = 0

for y_true, y_pred in zip(val_labels, val_predictions):
    exact_match_res += exact_match(y_true, y_pred)
    f1_score_res += f1_score(y_true, y_pred)
    count += 1

exact_match_res /= count
f1_score_res /= count

print('Exact match:', exact_match_res)
print('F1 score:', f1_score_res)


Exact match: 0.5713874330472845
F1 score: 0.7623009335636085


In [15]:
with open('model4_val.npy', 'wb') as f:
    np.save(f, np.array(val_predictions))

with open('model4_train.npy', 'wb') as f:
    np.save(f, np.array(train_predictions))

# **Saving result**

In [16]:
with open('/kaggle/working/model4_val.npy', 'rb') as f:
    val_predictions = np.load(f)

with open('/kaggle/working/model4_train.npy', 'rb') as f:
    train_predictions = np.load(f)

In [17]:
train_data_testing = load_json('/kaggle/input/new-squaddataset/train-v2.0.json')
valid_data_testing = load_json('/kaggle/input/new-squaddataset/dev-v2.0.json')

# parse the json structure to return the data as a list of dictionaries

train_list_testing = parse_data(train_data_testing)
valid_list_testing = parse_data(valid_data_testing)
print('--------------------------')

print('Train list len: ',len(train_list_testing))
print('Valid list len: ',len(valid_list_testing))

# converting the lists into dataframes

train_ds_testing = pd.DataFrame(train_list_testing)
val_ds_testing = pd.DataFrame(valid_list_testing)
# train_ds_testing.columns = ['Paragraph', 'Question', 'Answer']
# val_ds_testing.columns = ['Paragraph', 'Question', 'Answer']

Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  University_of_Notre_Dame
Length of data:  48
Data Keys:  dict_keys(['title', 'paragraphs'])
Title:  Super_Bowl_50
--------------------------
Train list len:  87599
Valid list len:  34726


In [18]:
val_ds_testing.head()

Unnamed: 0,id,context,question,label,answer
0,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"[177, 191]",Denver Broncos
1,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"[177, 191]",Denver Broncos
2,56be4db0acb8001400a502ec,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"[177, 191]",Denver Broncos
3,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"[249, 266]",Carolina Panthers
4,56be4db0acb8001400a502ed,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"[249, 266]",Carolina Panthers


In [19]:
predictions = {}
chk = 0
for index, row in val_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    question = row['question']
    answer_start = row['label'][0]
    answer = row['answer']

    m4_start = int(val_predictions[index][0])
    m4_end = int(val_predictions[index][1])
    m4 = paragraph[m4_start:m4_end+1]
        

    if id_val in id2f1:
        predictions[id_val] = [m4,id2f1[id_val]]
    else:
        predictions[id_val] = 0
        chk += 1

prediction_validDataset_DistilBERT = 'prediction_validDataset_DistilBERT.txt'
with open(prediction_validDataset_DistilBERT, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

In [20]:
predictions = {}
chk = 0
for index, row in train_ds_testing.iterrows():
    id_val = row['id']
    paragraph = row['context']
    question = row['question']
    answer_start = row['label'][0]
    answer = row['answer']

    m4_start = int(train_predictions[index][0])
    m4_end = int(train_predictions[index][1])
    m4 = paragraph[m4_start:m4_end+1]
        

    if id_val in id2f1:
        predictions[id_val] = [m4,id2f1[id_val]]
    else:
        predictions[id_val] = 0
        chk += 1

prediction_trainDataset_DistilBERT = 'prediction_trainDataset_DistilBERT.txt'
with open(prediction_trainDataset_DistilBERT, 'w', encoding='utf-8') as file:
    json.dump(predictions, file, ensure_ascii=False)

In [21]:
# predictions