In [3]:
!pip install transformers==3.0.2

Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.0/769.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=7a3e1975dd333c395f9e9d65a58042398455ae5e1ebf51113ffa738cfb03e040
  Stored

In [4]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [66]:
# Required imports
import sys
import warnings
warnings.filterwarnings("ignore")
import random
import re
import utils
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from sklearn.feature_extraction.text import CountVectorizer
from newsqa import NewsQaExample, NewsQaModel, create_dataset, get_single_prediction
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.lang.en import English
import en_core_web_md
nlp = spacy.load('en_core_web_md')
en = English()

In [67]:
# Reading in the dataset
data = pd.read_csv('/kaggle/working/news111/newsqa-data-v1.csv')
# Getting a sample from the dataset
data = data.sample(frac = 0.10, random_state = 9)
data = data.reset_index(drop = True)
data.head()

Unnamed: 0,story_id,question,answer_char_ranges,is_answer_absent,is_question_bad,validated_answers
0,./cnn/stories/43a488c2d73b4f34122fc92c8df04670...,When was the plane crash?,"2794:2805,372:382|398:410|372:382",0.0,0.0,"{""none"": 2}"
1,./cnn/stories/91607b09ab00b8dff5d7b98387ed132a...,Where has Musharraf lived in exile?,1018:1035|1008:1035|1018:1035,0.0,0.0,
2,./cnn/stories/c674fdceb339915df2a9de1a0bd3007c...,What is he leaving?,529:543|None|237:252,0.0,0.333333333333,"{""237:252"": 2}"
3,./cnn/stories/a0c2d26c2ebecd53dcecc9ea04ae8c5a...,What should the president understand?,None|4317:4469,0.5,0.5,"{""4317:4469"": 2}"
4,./cnn/stories/ad2af6c1362a3f84000edd71fd752ad6...,Bush commutes whose sentence?,"103:129,366:380,384:397,92:103|None",0.0,0.5,"{""103:129"": 2, ""366:380"": 1}"


In [68]:
# Size of the dataset
len(data)

11963

In [69]:
# Convert to numeric and remove non-numeric rows
data = data[pd.to_numeric(data['is_question_bad'], errors = 'coerce').notnull()]
data['is_question_bad'] = data['is_question_bad'].astype(float)
# Number of bad questions
len(data[data['is_question_bad'] > 0])

1990

In [70]:
# Remove all bad questions/questions that don't make any sense
data = data[data['is_question_bad'] == 0]
data = data.reset_index(drop = True)

In [71]:
# Final data size
len(data)

8769

In [72]:
# Word counts in questions
cv = CountVectorizer(stop_words = ['the', 'is', 'was', 'of', 'to', 'in'])
dtm = cv.fit_transform(data['question']).toarray()

word_counts = dtm.sum(axis = 0)

In [73]:
# Removing the path and keeping just the file name in story_id
data['story_id'] = data['story_id'].apply(lambda x: x.split('/')[-1].split('.')[0])

In [74]:
data.to_csv('/kaggle/working/news111/newsqa-data-formatted.csv', index = False)

## Data Cleaning

In [75]:
# We don't need is_answer_absent and is_question_bad columns
data = data.drop(['is_answer_absent', 'is_question_bad'], axis = 1)

In [76]:
# Reading stories
NEWS_STORIES = utils.open_pickle('/kaggle/working/news111/news_stories.pkl')

In [77]:
def adjust_answer_range(story_id, answer_range):
    '''Checks if answer range starts or ends in the middle of a words
    and returns the correct answer range along with answer text'''
    
    # If answer is not available, denote it as -1
    if answer_range == 'None':
        return [-1, -1]
    
    story = NEWS_STORIES[story_id]
    
    # Check for errors in answer
    if len(answer_range.split(':')) == 1:
        return [-1, -1]
    
    start_idx, end_idx = answer_range.split(':')
    start_idx, end_idx = int(start_idx), int(end_idx)
    
    # Moves back start_idx to the start of a word
    while start_idx != 0 and not utils.is_whitespace(story[start_idx - 1]) and not utils.is_punct(story[start_idx - 1]):
        start_idx = start_idx - 1
    
    # Some ranges end with a punctuation or a whitespace
    if utils.is_whitespace(story[end_idx - 1]) or utils.is_punct(story[end_idx - 1]):
        end_idx = end_idx - 1
    
    # Moves end_idx to the end of a word
    while not utils.is_whitespace(story[end_idx]) and not utils.is_punct(story[end_idx + 1]):
        end_idx = end_idx + 1
        
    # There are some answers with \n at the end followed by a letter
    # The answer will not be in two different paragraphs
    answer_text = story[start_idx:end_idx]
    answer_para = re.split('\n', answer_text)
    
    if len(answer_para[-1]) > len(answer_para[0]):
        start_idx = end_idx - len(answer_para[-1])
        answer_text = answer_para[-1]
    else:
        end_idx = start_idx + len(answer_para[0])
        answer_text = answer_para[0]
    
    return [start_idx, end_idx]

In [78]:
def get_answer(qa_details):
    '''A function that selects an answer for a question
    
    > If validated answers are available, the one with most votes is selected
    > If there's a tie in validated answer votes or if validated answer is not
      available, the most frequent answer is selected
    > If there's a tie here too, a random answer is selected'''
    
    # If validated answers are available, select the one with most votes
    if qa_details['validated_answers'] is not np.nan:
        validated_answers = eval(qa_details['validated_answers'])
        
        # Get the answers with maximum votes
        max_vote_ans = utils.get_max_keys(validated_answers)
        
        # Check for ties
        if len(max_vote_ans) == 1:
            return adjust_answer_range(qa_details['story_id'], max_vote_ans[0])
    
    # If validated answers are not available or if there is a tie in validated answers
    # Get all available answers
    answers = re.split(',|\|', qa_details['answer_char_ranges'])
    
    # If there is just one answer
    if len(answers) == 1:
        return adjust_answer_range(qa_details['story_id'], answers[0])
    
    # Get counts of each answer
    answer_freq = utils.get_frequency(answers)
    max_vote_ans = utils.get_max_keys(answer_freq)
    
    if len(max_vote_ans) == 1:
        return adjust_answer_range(qa_details['story_id'], max_vote_ans[0])
    
    # If there is a tie for multiple answers, return a random answer
    return adjust_answer_range(qa_details['story_id'], random.choice(answers))

In [79]:
# Select one answer range among multiple answers
data[['start_idx', 'end_idx']] = data.apply(get_answer, axis = 1, result_type = 'expand')

In [80]:
data.head()

Unnamed: 0,story_id,question,answer_char_ranges,validated_answers,start_idx,end_idx
0,43a488c2d73b4f34122fc92c8df0467078699156,When was the plane crash?,"2794:2805,372:382|398:410|372:382","{""none"": 2}",-1,-1
1,91607b09ab00b8dff5d7b98387ed132a44d8b61b,Where has Musharraf lived in exile?,1018:1035|1008:1035|1018:1035,,1017,1038
2,d0a418c3fad00479e73f9786b5e745ae6e844972,Who issued a travel health warning?,"615:619|611:619,626:628,650:704,711:713,720:72...","{""none"": 2, ""650:704"": 1}",-1,-1
3,1e7b3db6969c8ed2687e4bcaaa380ff3e93852bf,What has become his highest grossing film sinc...,2592:2615|179:192|502:517,"{""502:517"": 2, ""2592:2615"": 1}",499,522
4,bd2671153b0103b968eaebc3d23435fc20cc999c,Where is the resting place of Spanish poet Fed...,None|272:279|2060:2084,"{""none"": 2}",-1,-1


## Baseline Model

In [81]:
# Total examples in the dataset
total_examples = len(data)

In [82]:
def simple_tokenizer(doc, model=en):
    # a simple tokenizer for individual documents
    parsed = model(doc)
    return([t.lower_ for t in parsed if (t.is_alpha)&(not t.like_url)])

In [83]:
def get_doc_embedding(tokens, model = nlp):
    '''Returns the embedding of a document by averaging the
    GloVe embeddings of all tokens in the document'''
    
    embeddings = []
    for t in tokens:
        embeddings.append(model.vocab[t].vector)
    
    embeddings = np.array(embeddings)
    if embeddings.ndim == 1:
        return embeddings
    else:
        return np.mean(embeddings, axis = 0)

In [84]:
def predict_answer(text, question):
    '''Returns the start and end indices of the sentence that
    has the maximum cosine similarity with the question'''
    
    # Stores the start position of each sentence
    sentence_to_char_idx = [0]
    
    sentences = []
    start_idx = 0
    
    for idx, char in enumerate(text):
        # If the chracter is a punctuation, we append the sentence
        if utils.is_punct(char):
            sentences.append(text[start_idx:idx])
            start_idx = idx + 1
            sentence_to_char_idx.append(start_idx)
    
    # Getting embeddings for each sentence
    sentence_embeddings = []
    for s in sentences:
        tokens = simple_tokenizer(s)
        embd = get_doc_embedding(tokens)
        if embd.shape == (300,):
            sentence_embeddings.append(embd)
    
    sentence_embeddings = np.stack(sentence_embeddings)
    
    # Getting the embedding for the question
    question_embedding = get_doc_embedding(simple_tokenizer(question))
    question_embedding = np.expand_dims(question_embedding, axis = 0)
    
    #print(sentence_embeddings.shape)
    # Get the cosine similarity of each sentence with the question
    similarity = cosine_similarity(sentence_embeddings, question_embedding)
    
    # Get the sentence with the most similarity
    best_idx = np.argmax(similarity)
    
    # Get the sentence start and end index
    pred_start = sentence_to_char_idx[best_idx]
    pred_end = sentence_to_char_idx[best_idx + 1] - 1
    
    return pred_start, pred_end

In [85]:
def calculate_metrics(pred_start, pred_end, true_start, true_end):
    '''Calculates the f1 score and if the predicted answer overlaps 
    with the correct one'''
    
    # Get the overlap
    overlap = set(range(true_start, true_end)).intersection(range(pred_start, pred_end))
    overlap = len(overlap)

    # If either of them have no answer
    if true_end == 0 or pred_end == 0:
        f1_score = int(true_end == pred_end)
        is_correct = int(end_idx == pred_end)
        return f1_score, is_correct
    
    # If they don't overlap at all
    if overlap == 0 or pred_start >= pred_end:
        f1_score = 0
        is_correct = 0
        return f1_score, is_correct

    # If there is an overlap, we consider it correct
    is_correct = 1

    precision = overlap / (pred_end - pred_start)
    recall = overlap / (true_end - true_start)
    f1_score = (2 * precision * recall) / (precision + recall)

    return f1_score, is_correct

In [86]:
# Evaluate the performance of this approach on the data

correct = 0
total_f1 = 0

for idx, row in data.iterrows():
    text = NEWS_STORIES[row['story_id']]
    question = row['question']
    
    # Get the predictions
    pred_start, pred_end = predict_answer(text, question)
    f1, is_correct = calculate_metrics(pred_start, pred_end, row['start_idx'], row['end_idx'])
    
    total_f1 += f1
    correct += is_correct
    
    # Print progress
    utils.drawProgressBar(idx + 1, total_examples)
    
acc = correct/total_examples
f1_score = total_f1/total_examples



In [87]:
print("F1 score: {:.4f}".format(f1_score))
print("Accuracy: {:.4f}".format(acc))

F1 score: 0.0306
Accuracy: 0.0928


## Advanced models

In [88]:
def get_examples():
    '''Return a list of NewsQaExample objects'''
    
    # If a pickle file exists for examples, read the file
    if os.path.isfile('/kaggle/working/news111/examples_sample.pkl'):
        return utils.open_pickle('/kaggle/working/news111/examples_sample.pkl')
    
    examples = []

    for idx, row in data.iterrows():
        ex = NewsQaExample(NEWS_STORIES[row['story_id']], row['question'], row['start_idx'], row['end_idx'])
        examples.append(ex)
        utils.drawProgressBar(idx + 1, total_examples)
    print('\n')
    # Saving examples to a pickle file
    utils.save_pickle('/kaggle/working/news111/examples_sample.pkl', examples)
    
    return examples

In [89]:
def get_datasets(examples, tokenizer_name):
    '''Returns train, val and test datasets from examples'''
    
    model_name = tokenizer_name.split('-')[0]
    
    if os.path.isfile('/kaggle/working/news111/sample_dataset_' + model_name + '.pkl'):
        return utils.open_pickle('/kaggle/working/news111/sample_dataset_' + model_name + '.pkl')
    
    features = []
    labels = []
    
    if tokenizer_name == 'bert-large-uncased-whole-word-masking-finetuned-squad':
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
    
    if tokenizer_name == 'distilbert-base-uncased-distilled-squad':
        tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
    
    print("Getting input features:")
    for idx, ex in enumerate(examples):
        input_features = ex.encode_plus(tokenizer, pad = True)
        features.append(input_features)
        labels.append(ex.get_label())
        utils.drawProgressBar(idx + 1, total_examples)
    
    print('\n')
    # Getting TensorDataset
    train_set, val_set, test_set, feature_idx_map = create_dataset(features, labels, model = model_name)
    # Saving the dataset in a file
    utils.save_pickle('/kaggle/working/news111/sample_dataset_' + model_name + '.pkl', (train_set, val_set, test_set, feature_idx_map))
    
    return (train_set, val_set, test_set, feature_idx_map)

In [90]:
def get_dataloaders(train_set, val_set, test_set, batch_size):
    '''Creates torch dataloaders for train, validation and test sets'''
    
    train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, 
                          sampler = RandomSampler(train_set))

    val_loader = DataLoader(val_set, batch_size = BATCH_SIZE, 
                            sampler = SequentialSampler(val_set))

    test_loader = DataLoader(test_set, batch_size = BATCH_SIZE, 
                             sampler = SequentialSampler(test_set))
    
    return train_loader, val_loader, test_loader

In [91]:
def finetune_model(model_name, train_loader, val_loader, feature_idx_map, device, 
                   epochs = 1, learning_rate = 1e-5):
    '''Fine-tunes a pretrained model'''
    
    if model_name == 'bert-large-uncased-whole-word-masking-finetuned-squad':
        model = BertForQuestionAnswering.from_pretrained(model_name)
        # Freezing bert parameters
        for param in model.bert.parameters():
            param.requires_grad = False
    
    if model_name == 'distilbert-base-uncased-distilled-squad':
        model = DistilBertForQuestionAnswering.from_pretrained(model_name)
        # Freezing distilbert parameters
        for param in model.distilbert.parameters():
            param.requires_grad = False
        
    short_name = model_name.split('-')[0]
    
    newsqa_model = NewsQaModel(model)
    newsqa_model.train(train_loader, val_loader, feature_idx_map, device, 
                       num_epochs = epochs, lr = learning_rate, 
                       filename = '/kaggle/working/news111' + short_name + '_sample.pt')
    
    return newsqa_model

In [92]:
# Get a list of NewsQaExample objects
examples = get_examples()


Variable successfully saved in /kaggle/working/news111/examples_sample.pkl


### BERT

In [93]:
# Defining model name
bert_model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'

In [94]:
# Getting the training, validation and test sets
bert_datasets = get_datasets(examples, bert_model_name)
bert_train_set, bert_val_set, bert_test_set, bert_feature_idx_map = bert_datasets

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Getting input features:

Variable successfully saved in /kaggle/working/news111/sample_dataset_bert.pkl


In [95]:
# Getting data loaders
BATCH_SIZE = 32

bert_loaders = get_dataloaders(bert_train_set, bert_val_set, bert_test_set, batch_size = BATCH_SIZE)
bert_train_loader, bert_val_loader, bert_test_loader = bert_loaders

In [96]:
# -- Still takes nearly 20 minutes to run --
EPOCHS = 3
LEARNING_RATE = 0.001

bert_model = finetune_model(bert_model_name, bert_train_loader, bert_val_loader, bert_feature_idx_map,device, epochs = EPOCHS, learning_rate = LEARNING_RATE)

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Epoch 1/3:
Validation accuracy increased from 0.0000 to 0.4438, saving to /kaggle/working/news111bert_sample.pt



Epoch 2/3:
Validation accuracy increased from 0.4438 to 0.4490, saving to /kaggle/working/news111bert_sample.pt



Epoch 3/3:
Validation accuracy increased from 0.4490 to 0.4602, saving to /kaggle/working/news111bert_sample.pt


In [97]:
# Evaluation the performance on test set
bert_model.load('/kaggle/working/news111bert_sample.pt')
bert_eval_metrics = bert_model.evaluate(bert_test_loader, bert_feature_idx_map, 'cpu')

loss: 3.4567	f1:0.3269	acc:0.5084


In [98]:
# Evalutating performance on the model without fine-tuining
bert_non_finetuned = BertForQuestionAnswering.from_pretrained(bert_model_name)
bert_non_finetuned.to(device)

bert_newsqa_model = NewsQaModel(bert_non_finetuned)

non_finetuned_eval_metrics = bert_newsqa_model.evaluate(bert_test_loader, bert_feature_idx_map, device)

loss: 6.0434	f1:0.2818	acc:0.4625


### DistilBERT

In [99]:
# Defining model name
dbert_model_name = 'distilbert-base-uncased-distilled-squad'

In [100]:
# Getting the training, validation and test sets
dbert_datasets = get_datasets(examples, dbert_model_name)
dbert_train_set, dbert_val_set, dbert_test_set, dbert_feature_idx_map = dbert_datasets

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Getting input features:

Variable successfully saved in /kaggle/working/news111/sample_dataset_distilbert.pkl


In [101]:
# Getting data loaders
BATCH_SIZE = 32

dbert_loaders = get_dataloaders(dbert_train_set, dbert_val_set, dbert_test_set, batch_size = BATCH_SIZE)
dbert_train_loader, dbert_val_loader, dbert_test_loader = dbert_loaders

In [102]:
EPOCHS = 3
LEARNING_RATE = 0.001

dbert_model = finetune_model(dbert_model_name, dbert_train_loader, dbert_val_loader, dbert_feature_idx_map, 
                             device, epochs = EPOCHS, learning_rate = LEARNING_RATE)

Downloading:   0%|          | 0.00/451 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Epoch 1/3:
Validation accuracy increased from 0.0000 to 0.3925, saving to /kaggle/working/news111distilbert_sample.pt



Epoch 2/3:
Validation accuracy increased from 0.3925 to 0.4148, saving to /kaggle/working/news111distilbert_sample.pt



Epoch 3/3:
Validation accuracy increased from 0.4148 to 0.4293, saving to /kaggle/working/news111distilbert_sample.pt


In [103]:
# Evaluation the performance on test set
dbert_model.load('/kaggle/working/news111distilbert_sample.pt')
dbert_eval_metrics = dbert_model.evaluate(dbert_test_loader, dbert_feature_idx_map, 'cpu')

loss: 3.7215	f1:0.3027	acc:0.4302


In [104]:
# Evalutating performance on the model without fine-tuining
dbert_non_finetuned = DistilBertForQuestionAnswering.from_pretrained(dbert_model_name)
dbert_non_finetuned.to(device)

dbert_newsqa_model = NewsQaModel(dbert_non_finetuned)

non_finetuned_eval_metrics = dbert_newsqa_model.evaluate(dbert_test_loader, dbert_feature_idx_map, device)

loss: 6.4390	f1:0.2873	acc:0.4164
