<a href="https://colab.research.google.com/github/chethana613/qna-ai-chatbot/blob/main/DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

In [5]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Set this before importing PyTorch

from datasets import load_dataset
import torch
from tqdm.auto import tqdm
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge

# Load the dataset
data = load_dataset('squad')

def add_end_idx(answers, contexts):
    new_answers = []
    for answer, context in tqdm(zip(answers, contexts)):
        answer['text'] = answer['text'][0]
        answer['answer_start'] = answer['answer_start'][0]
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
        new_answers.append(answer)
    return new_answers

def prep_data(dataset):
    questions = dataset['question']
    contexts = dataset['context']
    answers = add_end_idx(dataset['answers'], contexts)
    return {
        'question': questions,
        'context': contexts,
        'answers': answers
    }

dataset = prep_data(data['train'].shuffle(seed=123).select(range(1000)))

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train = tokenizer(dataset['context'], dataset['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')

tokenizer.decode(train['input_ids'][0])[:855]

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in tqdm(range(len(answers))):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length - 1

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train, dataset['answers'])

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train)
loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Loading the model
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

device = torch.device('cpu')  # Use CPU
model.to(device)

epochs = 3

# Initialize Rouge
rouge = Rouge()

def normalize_text(text):
    return text.lower().strip()

def exact_match(prediction, truth):
    return normalize_text(prediction) == normalize_text(truth)

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return round(2 * (prec * rec) / (prec + rec), 2)

def evaluate(model, dataloader):
    model.eval()
    exact_match_scores = []
    f1_scores = []
    bleu_scores = []
    rouge_n_scores = []
    rouge_l_scores = []
    accuracy_scores = []

    rouge = Rouge()

    for batch in tqdm(dataloader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            start_pred = torch.argmax(outputs.start_logits, dim=1)
            end_pred = torch.argmax(outputs.end_logits, dim=1)

            for i in range(len(start_true)):
                pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
                true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)

                if not pred_answer.strip():
                    continue

                em_score = exact_match(pred_answer, true_answer)
                f1_score = compute_f1(pred_answer, true_answer)

                exact_match_scores.append(em_score)
                f1_scores.append(f1_score)

                smoothie = SmoothingFunction().method4
                bleu_score = corpus_bleu([[true_answer.split()]], [pred_answer.split()], smoothing_function=smoothie)
                bleu_scores.append(bleu_score)

                start_correct = (start_pred[i] == start_true[i]).item()
                end_correct = (end_pred[i] == end_true[i]).item()
                accuracy = (start_correct and end_correct)
                accuracy_scores.append(accuracy)

                try:
                    rouge_scores = rouge.get_scores(pred_answer, true_answer)
                    rouge_n_scores.append(rouge_scores[0]['rouge-1']['f'])
                    rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])
                except Exception as e:
                    pass

    exact_match_score = sum(exact_match_scores) / len(exact_match_scores) if exact_match_scores else 0.0
    f1_score_avg = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
    bleu_score_avg = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
    rouge_n_avg = sum(rouge_n_scores) / len(rouge_n_scores) if rouge_n_scores else 0.0
    rouge_l_avg = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
    accuracy_avg = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0.0

    print(f'Exact Match Score: {exact_match_score}')
    print(f'Average F1 Score: {f1_score_avg}')
    print(f'Average BLEU Score: {bleu_score_avg}')
    print(f'Average ROUGE-N Score: {rouge_n_avg}')
    print(f'Average ROUGE-L Score: {rouge_l_avg}')
    print(f'Accuracy Score: {accuracy_avg}')

# Training loop and evaluation
for epoch in tqdm(range(1, epochs + 1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'finetuned_distilbert_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(loader)
    tqdm.write(f'Training loss: {loss_train_avg}')

    evaluate(model, loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/500 [00:00<?, ?it/s]


Epoch 1
Training loss: 6.196246120452881


  0%|          | 0/500 [00:00<?, ?it/s]

Exact Match Score: 0.0
Average F1 Score: 0.05209523809523806
Average BLEU Score: 0.013024588787579828
Average ROUGE-N Score: 0.06488299660972846
Average ROUGE-L Score: 0.06257018018635956
Accuracy Score: 0.0


Epoch 2:   0%|          | 0/500 [00:00<?, ?it/s]


Epoch 2
Training loss: 6.19415505695343


  0%|          | 0/500 [00:00<?, ?it/s]

Exact Match Score: 0.0
Average F1 Score: 0.052095238095238035
Average BLEU Score: 0.013024588787579842
Average ROUGE-N Score: 0.06488299660972842
Average ROUGE-L Score: 0.06257018018635951
Accuracy Score: 0.0


Epoch 3:   0%|          | 0/500 [00:00<?, ?it/s]