# Installing Transformers and Torch

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Installing Datasets

In [None]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collec

# Loading Datasets

In [None]:
from datasets import load_dataset

dataset = load_dataset("squad", "plain_text", split="train")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


# Getting bert-base-uncased Model

In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering
from transformers.data.processors.squad import SquadV2Processor
from transformers.data.metrics.squad_metrics import compute_predictions_logits

In [None]:
import torch

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Generate predictions function

In [None]:
# Function to generate predictions from a model
def generate_predictions(dataset, tokenizer, model):
    predictions = []
    for example in dataset:
        inputs = tokenizer.encode_plus(example["question"], example["context"], add_special_tokens=True, return_tensors="pt")
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

        start_index = torch.argmax(start_logits)
        end_index = torch.argmax(end_logits)

        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index + 1]))
        predictions.append(answer.strip())

    return predictions

# Load the SQuAD validation dataset

In [None]:
# Load the SQuAD validation dataset
datasetv = load_dataset("squad", "plain_text", split="validation[0:500]")



# Generate predictions using roberta-base-squad-2

# true_answers in validation dataset

In [None]:
true_answers=[]
for ans in datasetv:
  true_answers.append(ans["answers"]["text"][0])
  #print(ans["answers"]["text"])

In [None]:
type(roberta_predictions)

In [None]:
type(true_answers)

# getting exact match count

In [None]:
print(true_answers[5:15])

['"golden anniversary"', 'February 7, 2016', 'American Football Conference', '"golden anniversary"', 'American Football Conference', 'February 7, 2016', 'Denver Broncos', "Levi's Stadium", 'Santa Clara', 'Super Bowl L']


# checking our own model exact match score

In [None]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
from torch.optim import AdamW
import numpy as np

In [None]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model.to(device)
model.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), ep

In [None]:
train_features = []

for example in dataset:
    start_positions = []
    end_positions = []

    context = example["context"]
    questions = example["question"]
    answers = example["answers"]
    answer_start=0
    answer_end=0
    tokenized_context = tokenizer.tokenize(context)
    tokenized_question = tokenizer.tokenize(questions)
    tokenized_input = ["[CLS]"] + tokenized_question + ["[SEP]"] + tokenized_context + ["[SEP]"]
    inputs = tokenizer.convert_tokens_to_ids(tokenized_input)
    char_to_token = []
    token_index = 1  # Skip the [CLS] token
    for char_index, char in enumerate(context):
            if char != " " and char != "\n":
                char_to_token.append(token_index)
            if token_index < len(tokenized_input) - 1 and char_index + 1 < len(tokenized_input[token_index]):
                token_index += 1

    for answer in answers["answer_start"]:
        answer_start = answer
        token_start = char_to_token[min(answer_start, len(char_to_token) - 1)]
        token_start = min(token_start, len(inputs) - 1)
        start_positions.append(token_start)



    for text in answers["text"]:
        answer_end = answer_start + len(text) - 1
        token_end = char_to_token[min(answer_end, len(char_to_token) - 1)]
        token_end = min(token_end, len(inputs) - 1)
        end_positions.append(token_end)


In [None]:
train_features.append(
            {
                "input_ids": inputs,
                "attention_mask": [1] * len(inputs),
                "start_positions": start_positions,
                "end_positions": end_positions,
            }
        )

In [None]:
pip install wrapt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from transformers import BertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW
import torch.optim as optim

# Define the batch size and number of training epochs
batch_size = 4
num_epochs = 50

# Convert the train_features list to a PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor([f["input_ids"] for f in train_features], dtype=torch.long),
    torch.tensor([f["attention_mask"] for f in train_features], dtype=torch.long),
    torch.tensor([f["start_positions"] for f in train_features], dtype=torch.long),
    torch.tensor([f["end_positions"] for f in train_features], dtype=torch.long),
)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Load the pre-trained BERT model for question answering
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Set the model to training mode
model.train()

# Define the optimizer and learning rate
# optimizer = AdamW(model.parameters(), lr=3e-6)
optimizer = optim.Adamax(model.parameters(), lr=3e-6)

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, start_positions, end_positions = batch

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
save_path = "/content/Model"

# Save the fine-tuned model
model.save_pretrained(save_path)

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the fine-tuned BERT model
model = BertForQuestionAnswering.from_pretrained(save_path)
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Prepare the input
question = "what are dogs?"
context = """dogs are animals"""
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Process the logits to obtain the predicted answer span
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Decode the predicted answer span
all_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
answer = tokenizer.convert_tokens_to_string(all_tokens[start_index:end_index+1])

# Print the predicted answer
print("Question:", question)
print("Predicted Answer:", answer)


Question: what are dogs?
Predicted Answer: animals


In [None]:
def bert_question_answer(question, passage, max_len=512):
    # Tokenize input question and passage
    # Add special tokens - [CLS] and [SEP]
    input_ids = tokenizer.encode(question, passage, max_length=max_len, truncation=True)

    # Getting number of tokens in 1st sentence (question) and 2nd sentence (passage that contains answer)
    sep_index = input_ids.index(102)
    len_question = sep_index + 1
    len_passage = len(input_ids) - len_question

    # Need to separate question and passage
    # Segment ids will be 0 for question and 1 for passage
    segment_ids = [0] * len_question + [1] * len_passage

    # Converting token ids to tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Getting start and end scores for answer
    # Converting input arrays to torch tensors before passing to the model
    start_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))[0]
    end_token_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))[1]

    # Converting scores tensors to numpy arrays
    start_token_scores = start_token_scores.detach().numpy().flatten()
    end_token_scores = end_token_scores.detach().numpy().flatten()

    # Getting start and end index of answer based on highest scores
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)

    # Getting scores for start and end token of the answer
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)

    # Combining subwords starting with ## and get full words in output.
    # It is because tokenizer breaks words which are not in its vocab.
    answer = tokens[answer_start_index]
    for i in range(answer_start_index + 1, answer_end_index + 1):
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]

    # If the answer didn't find in the passage
    if (answer_start_index == 0) or (start_token_score < 0) or (answer == '[SEP]') or (answer_end_index < answer_start_index):
        answer = "Sorry!, I could not find an answer in the passage."

    return (answer_start_index, answer_end_index, start_token_score, end_token_score, answer)

def split_passage_and_process(question, passage, max_len=10000, chunk_size=500):
    # Split the passage into smaller chunks
    passage_chunks = [passage[i:i+chunk_size] for i in range(0, len(passage), chunk_size)]
    answers = []
    # Process each chunk separately
    for chunk in passage_chunks:
        answer = bert_question_answer(question, chunk, max_len=max_len)
        if answer[-1] != "Sorry!, I could not find an answer in the passage.":
            answers.append(answer[-1])

    # Combine the answers from all chunks
    combined_answer = " ".join(answers)
    return combined_answer

In [None]:
split_passage_and_process("dogs are animals","whata are dogs?")

'animals'

In [None]:
own_predictions = generate_predictions(datasetv, tokenizer, model)

In [None]:
# Assume you have a list of true answers and a list of predicted answers
predicted_answers = own_predictions

# Prepare the examples and predictions for squad_evaluate
exact_match_count = 0
total_examples = len(true_answers)

for true_answer, predicted_answer in zip(true_answers, predicted_answers):
    if true_answer == predicted_answer:
        exact_match_count += 1

exact_match_score = exact_match_count / total_examples

# Print the exact match score
print("Exact match score:", exact_match_score)

Exact match score: 0.26


In [None]:
print(own_predictions[5:15])

['golden anniversary', 'february 7 , 2016', 'american football conference', 'golden anniversary', 'american football conference', 'february 7 , 2016', 'denver broncos', "levi ' s stadium", 'santa clara , california', 'super bowl l']


In [None]:
print(true_answers[5:15])

['"golden anniversary"', 'February 7, 2016', 'American Football Conference', '"golden anniversary"', 'American Football Conference', 'February 7, 2016', 'Denver Broncos', "Levi's Stadium", 'Santa Clara', 'Super Bowl L']
