In [None]:
!pip install transformers


In [None]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os

import numpy as np
from numpy import unravel_index
import pandas as pd
import math

import random
import sys
from IPython.display import Image
import time

# for text preprocessing
import re
import string

# For CUDA deterministic behavior. The variable has to live in THIS process's
# environment: a `!VAR=value` line only runs in a throwaway subshell and never
# reaches the kernel, so it is set through os.environ instead.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"


# imports
from transformers import BertTokenizer, BertForQuestionAnswering

# dataloaders
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def set_seed(seed = 1234):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic settings, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators. Defaults to 1234.
    """
    # Environment + pure-Python / NumPy generators
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the current and all CUDA devices
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: force deterministic kernels, no autotuning, backend disabled
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.enabled = False
    cudnn.benchmark = False

# Seed all generators with the default seed before anything stochastic runs.
set_seed()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print('Working on:', device)

Working on: cuda


In [None]:
!pip install datasets
!pip install datasets rouge-score sacrebleu
from datasets import load_dataset


Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Collecting sacrebleu
  Using cached sacrebleu-2.4.2-py3-none-any.whl (106 kB)
Collecting portalocker (from sacrebleu)
  Using cached portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu, rouge-score
Successfully installed colorama-0.4.6 portalocker-2.8.2 rouge-score-0.1.2 sacrebleu-2.4.2


In [None]:
# Work on a slice of SQuAD v2: the first 10,000 training rows and the first
# 1,000 validation rows (loaded in that order).
train_dataset, validation_dataset = (
    load_dataset('squad_v2', split=split_spec)
    for split_spec in ('train[:10000]', 'validation[:1000]')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10000
})

In [None]:
train_dataset[0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [None]:
validation_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})

In [None]:
validation_dataset[0]

{'id': '56ddde6b9a695914005b9628',
 'title': 'Normans',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'question': 'In what country is Normandy located?',
 'answers': {'text': ['France', 'France', 'France', 'France'],
  'answer_start': [159, 159, 159, 159]}}

In [None]:
import pandas as pd
# Copy the training split into a pandas DataFrame so it can be inspected as a table.
df = pd.DataFrame(train_dataset)

In [None]:
df.tail(10)

Unnamed: 0,id,title,context,question,answers
9990,56df51008bc80c19004e4a6d,Lighting,Major reductions in the cost of lighting occur...,When did gas powered street lights became econ...,"{'text': ['early 1800s'], 'answer_start': [186]}"
9991,56df510196943c1400a5d395,Lighting,"Over time, electric lighting became ubiquitous...",Street lights help reduce?,"{'text': ['urban crime.'], 'answer_start': [208]}"
9992,56df51038bc80c19004e4a72,Lighting,Lighting fixtures come in a wide variety of st...,What can come in a wide variety of styles for ...,"{'text': ['Lighting fixtures'], 'answer_start'..."
9993,56df51038bc80c19004e4a73,Lighting,Lighting fixtures come in a wide variety of st...,Functioning as holder a light fixture can prov...,"{'text': ['visual glare'], 'answer_start': [180]}"
9994,56df510696943c1400a5d39c,Lighting,An important property of light fixtures is the...,Luminous efficacy is measure in what unit?,"{'text': ['lumen per watt'], 'answer_start': [..."
9995,56df510996943c1400a5d3a3,Lighting,Color temperature for white light sources also...,How many Kelvins is daylight measured at?,"{'text': ['6400'], 'answer_start': [362]}"
9996,56df510996943c1400a5d3a4,Lighting,Color temperature for white light sources also...,What has a color temperature around 2800 to 30...,"{'text': ['incandescent bulb'], 'answer_start'..."
9997,56df510996943c1400a5d3a5,Lighting,Color temperature for white light sources also...,It can be said that lights with a high color t...,"{'text': ['blue-white'], 'answer_start': [550]}"
9998,56df510996943c1400a5d3a6,Lighting,Color temperature for white light sources also...,A lamp with more energy in the yellow and red ...,"{'text': ['Lower color temperature'], 'answer_..."
9999,56df513396943c1400a5d3ab,Lighting,Lighting is classified by intended use as gene...,"A light is classified by intended purpose, wha...","{'text': ['light produced by the fixture.'], '..."


In [None]:
train_dataset[1]

{'id': '56be85543aeaaa14008c9065',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'What areas did Beyonce compete in when she was growing up?',
 'answers': {'text': ['singing and dancing'], 'answer_start': [207]}}

In [None]:
def find_end(example):
    """Flatten an example's ``answers`` to a single span and add its end offset.

    The ``answers`` dict is modified in place:

    * ``text``         -> first answer string, or ``""`` when there is no answer
    * ``answer_start`` -> character offset of that answer, or ``0``
    * ``answer_end``   -> ``answer_start + len(text)`` (exclusive end), or ``0``

    The function is idempotent: an example that has already been flattened
    (``text`` is a string rather than a list) is returned with the same values
    instead of raising, so re-running the mapping cell is safe.

    Args:
        example: Mapping with an ``'answers'`` entry holding ``'text'`` and
            ``'answer_start'`` (parallel lists, or scalars if already flattened).

    Returns:
        The same ``example`` object, with ``example['answers']`` updated.
    """
    answers = example['answers']
    text = answers['text']
    start_idx = answers['answer_start']

    # Raw layout: parallel lists. Keep only the first answer, or fall back to
    # the empty span when the question is unanswerable.
    if not isinstance(text, str):
        if len(text) != 0:
            text, start_idx = text[0], start_idx[0]
        else:
            text, start_idx = "", 0

    answers['text'] = text  # ['text'] -> text, [] -> ""
    answers['answer_start'] = start_idx  # [num] -> num, [] -> 0
    answers['answer_end'] = start_idx + len(text)  # exclusive end; 0 for the empty span

    return example

# Apply find_end to every training example; `train_dataset` is rebound to the mapped result.
train_dataset = train_dataset.map(find_end)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
train_dataset[1]

{'id': '56be85543aeaaa14008c9065',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'What areas did Beyonce compete in when she was growing up?',
 'answers': {'answer_end': 226,
  'answer_start': 207,
  'text': 'singing and dancing'}}

In [None]:

from transformers import BertTokenizerFast
# The encodings returned by this tokenizer are later queried with
# char_to_token to turn character offsets into token positions.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Encode every (context, question) pair of the training split; context goes first.
tokenized_train = tokenizer(
    train_dataset['context'],
    train_dataset['question'],
    truncation=True,
    padding=True,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def find_token_indexes(tokenized, dataset):
    """Convert character-level answer spans into token-level start/end positions.

    Args:
        tokenized: Batch encoding of the dataset's examples; must provide
            ``char_to_token(batch_index, char_index)``.
        dataset: Object whose ``'answers'`` entry yields, per example, a dict
            with ``'text'``, ``'answer_start'`` and ``'answer_end'``
            (``answer_end`` is exclusive).

    Returns:
        ``(start_token_list, end_token_list)``, one entry per example.
        Examples with an empty answer text get ``0`` for both positions. When
        ``char_to_token`` returns ``None`` for a position, that position is
        set to ``tokenizer.model_max_length``.
    """
    start_token_list = []
    end_token_list = []

    for i, answer in enumerate(dataset['answers']):
        if answer['text'] != '':
            start_token = tokenized.char_to_token(i, answer['answer_start'])
            # answer_end is exclusive, so the last answer character sits at answer_end - 1
            end_token = tokenized.char_to_token(i, answer['answer_end'] - 1)

            # if a token is None, the answer passage has been truncated
            if start_token is None:
                start_token = tokenizer.model_max_length
            if end_token is None:
                end_token = tokenizer.model_max_length
        else:
            start_token = 0
            end_token = 0

        start_token_list.append(start_token)
        end_token_list.append(end_token)

    # Previously the start list was returned twice, so the end positions were lost.
    return start_token_list, end_token_list

# Compute the token-level answer positions and attach them as two new columns.
s, e = find_token_indexes(tokenized_train, train_dataset)
for column_name, column_values in (("start_position", s), ("end_position", e)):
    train_dataset = train_dataset.add_column(column_name, column_values)

In [None]:
train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'start_position', 'end_position'],
    num_rows: 10000
})

In [None]:
batch_size = 8

# Tensor order matches how the training loop unpacks a batch:
# input ids, segment ids, attention mask, start positions, end positions.
train_data = TensorDataset(torch.tensor(tokenized_train['input_ids'], dtype=torch.int64),
                           torch.tensor(tokenized_train['token_type_ids'], dtype=torch.int64),
                           torch.tensor(tokenized_train['attention_mask'], dtype=torch.float),
                           torch.tensor(train_dataset['start_position'], dtype=torch.int64),
                           # Fixed: this slot used 'start_position' a second time, so the
                           # model was trained with end labels equal to the start labels.
                           torch.tensor(train_dataset['end_position'], dtype=torch.int64))

# Batches are drawn in random order for training.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Encode the validation (context, question) pairs. Offset mappings are requested
# so predicted token spans can be mapped back to character positions later.
tokenized_validation = tokenizer(
    validation_dataset['context'],
    validation_dataset['question'],
    truncation=True,
    padding=True,
    return_offsets_mapping=True,
)

In [None]:
batch_size = 8

# Validation tensors, in order: input ids, segment ids, attention mask (no labels).
val_data = TensorDataset(
    *(
        torch.tensor(tokenized_validation[field], dtype=field_dtype)
        for field, field_dtype in (
            ('input_ids', torch.int64),
            ('token_type_ids', torch.int64),
            ('attention_mask', torch.float),
        )
    )
)

# Sequential order keeps batch rows aligned with validation_dataset rows.
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
# Fine-tuning


In [None]:
# NOTE(review): the next cell calls from_pretrained again and rebinds `model`,
# so the instance created here is replaced before training starts.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Number of passes over the training data.
epochs = 3

# BertForQuestionAnswering initialised from the bert-base-uncased checkpoint
# and moved to the selected device.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# AdamW over all model parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm import tqdm

# Per-epoch histories live outside the epoch loop so they accumulate one entry
# per epoch (they used to be reset at the start of every epoch).
epoch_loss = []
validation_loss = []

for epoch in range(epochs):
    total_loss = 0
    model.train()

    progress_bar = tqdm(train_dataloader, leave=True, position=0)
    progress_bar.set_description(f"Epoch {epoch+1}")
    # `step` is 1-based, so it always equals the number of batches already
    # summed into total_loss and the running average below is exact.
    for step, batch in enumerate(progress_bar, start=1):
        input_ids, segment_ids, mask, start, end  = tuple(t.to(device) for t in batch)

        model.zero_grad()
        # return_dict=False -> plain tuple; unpacked as (loss, start_logits, end_logits)
        loss, start_logits, end_logits = model(input_ids = input_ids,
                                                token_type_ids = segment_ids,
                                                attention_mask = mask,
                                                start_positions = start,
                                                end_positions = end,
                                                return_dict = False)

        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Refresh the running average shown in the progress bar every 20 batches.
        if step % 20 == 0:
            progress_bar.set_postfix(Loss=total_loss / step)

    torch.save(model.state_dict(), "./bert2_" + str(epoch) + ".h5") # save for later use
    avg_train_loss = total_loss / len(train_dataloader)
    epoch_loss.append(avg_train_loss)
    print(f"Epoch {epoch} Loss: {avg_train_loss}\n")

Epoch 1: 100%|██████████| 1250/1250 [07:07<00:00,  2.92it/s, Loss=2.27]


Epoch 0 Loss: 2.268073112797737



Epoch 2: 100%|██████████| 1250/1250 [07:06<00:00,  2.93it/s, Loss=1.14]


Epoch 1 Loss: 1.1371774078726768



Epoch 3: 100%|██████████| 1250/1250 [07:06<00:00,  2.93it/s, Loss=0.785]


Epoch 2 Loss: 0.7851844740033149



In [None]:
from tqdm import tqdm
# model.load_state_dict(torch.load("../input/bert-weights/bert2_2.h5"))

# A span is only accepted as an answer when its score beats the score of the
# (0, 0) span by more than this margin; otherwise the prediction is "".
threshold = 1.0
epoch_i = 0
pred_dict = {}     # example id -> predicted answer text ("" when no span is accepted)
na_prob_dict = {}  # example id -> score of the (0, 0) span

model.eval()
correct = 0
batch_val_losses = []
row = 0  # index into validation_dataset; relies on the loader being sequential
for test_batch in tqdm(val_dataloader):
    input_ids, segment_ids, masks = tuple(t.to(device) for t in test_batch)

    with torch.no_grad():
        # prediction logits
        start_logits, end_logits = model(input_ids=input_ids,
                                        token_type_ids=segment_ids,
                                        attention_mask=masks,
                                        return_dict=False)

    # to cpu
    start_logits = start_logits.detach().cpu()
    end_logits = end_logits.detach().cpu()

    # for every sequence in batch
    for bidx in range(len(start_logits)):
        # apply softmax to logits to get scores
        start_scores = F.softmax(start_logits[bidx], dim = 0).numpy()
        end_scores = F.softmax(end_logits[bidx], dim = 0).numpy()

        # scores[i, j] = start_scores[i] + end_scores[j] for start <= end, 0 elsewhere.
        # One vectorized outer sum replaces the former pure-Python double loop
        # over every (i, j) pair; the resulting values are the same.
        scores = np.triu(start_scores[:, None] + end_scores[None, :]).astype(np.float64)

        # find best i and j
        start_pred, end_pred = unravel_index(scores.argmax(), scores.shape)

        # Fetch the dataset row once instead of on every field access.
        example = validation_dataset[row]
        gold_answers = example['answers']['text']

        answer_pred = ""
        if (scores[start_pred, end_pred] > scores[0,0]+threshold):
            # NOTE(review): offsets are used as-is for the predicted tokens; nothing
            # here restricts the span to context tokens — TODO confirm whether spans
            # that start in the question or padding should be rejected.
            offsets = tokenized_validation.offset_mapping[row]
            pred_char_start = offsets[start_pred][0]

            # defensive bound check on the end token before reading its offset
            if end_pred < len(offsets):
                pred_char_end = offsets[end_pred][1]
                answer_pred = example['context'][pred_char_start:pred_char_end]
            else:
                answer_pred = example['context'][pred_char_start:]

            # exact match against any of the gold answers
            if answer_pred in gold_answers:
                correct += 1

        elif len(gold_answers) == 0:
            # no span accepted and the question has no gold answer
            correct += 1

        pred_dict[example['id']] = answer_pred
        na_prob_dict[example['id']] = scores[0,0]

        row+=1


accuracy = correct/validation_dataset.num_rows
print("accuracy is: ", accuracy)

100%|██████████| 125/125 [00:56<00:00,  2.20it/s]

accuracy is:  0.447





In [None]:
from datasets import load_metric
from rouge_score import rouge_scorer
import sacrebleu

# Function to compute F1 score
def compute_f1(pred, actual):
    """Token-overlap F1 between a predicted and a reference token sequence.

    Overlap is counted as a multiset, so repeated tokens count as many times as
    they appear in both sequences. If either sequence is empty the score is
    1.0 when both are empty (a correct "no answer") and 0.0 otherwise.

    Args:
        pred: Iterable of hashable tokens from the prediction.
        actual: Iterable of hashable tokens from the reference.

    Returns:
        F1 score as a float in [0.0, 1.0].
    """
    from collections import Counter

    pred_tokens = list(pred)
    actual_tokens = list(actual)

    # Empty prediction and/or reference: a match only if both are empty.
    if not pred_tokens or not actual_tokens:
        return float(pred_tokens == actual_tokens)

    # Multiset intersection: each token counts min(count in pred, count in actual) times.
    overlap = sum((Counter(pred_tokens) & Counter(actual_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(actual_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# Function to compute BLEU score
def compute_bleu(predictions, references):
    """Return the corpus-level BLEU score of ``predictions`` against ``references``.

    Args:
        predictions: List of predicted strings.
        references: List of reference strings, aligned with ``predictions``.

    Returns:
        The ``score`` attribute of the result of ``sacrebleu.corpus_bleu``.
    """
    # corpus_bleu receives the references wrapped in an outer list.
    reference_streams = [references]
    result = sacrebleu.corpus_bleu(predictions, reference_streams)
    return result.score

# Function to compute ROUGE score
def compute_rouge(predictions, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over aligned text pairs.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        Dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``, each the
        mean F-measure over all pairs. All values are 0.0 when there are no
        pairs (this used to raise ZeroDivisionError).
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    pairs = list(zip(predictions, references))
    if not pairs:
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # RougeScorer.score expects (target, prediction), so the reference goes first.
    scores = [scorer.score(ref, pred) for pred, ref in pairs]
    return {
        name: sum(s[name].fmeasure for s in scores) / len(scores)
        for name in metric_names
    }

# Collect predictions and references
predictions = []
references = []

for example in validation_dataset:
    gold_texts = example['answers']['text']
    predictions.append(pred_dict[example['id']])
    # Reference is the first gold answer, or "" when the question has none.
    references.append(gold_texts[0] if len(gold_texts) != 0 else "")

# Compute F1 score (whitespace-tokenized, averaged over all examples)
f1_scores = []
for pred, ref in zip(predictions, references):
    f1_scores.append(compute_f1(pred.split(), ref.split()))
avg_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {avg_f1}")

# Compute ROUGE score
rouge_scores = compute_rouge(predictions, references)
print(f"ROUGE-1: {rouge_scores['rouge1']}")
print(f"ROUGE-2: {rouge_scores['rouge2']}")
print(f"ROUGE-L: {rouge_scores['rougeL']}")

# Compute BLEU score
bleu_score = compute_bleu(predictions, references)
print(f"BLEU Score: {bleu_score}")


Average F1 Score: 0.08211666666666667
ROUGE-1: 0.08935000000000003
ROUGE-2: 0.0
ROUGE-L: 0.08935000000000003
BLEU Score: 0.0
