<a href="https://colab.research.google.com/github/chethana613/qna-ai-chatbot/blob/main/Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers
!pip install datasets
!pip install nltk
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [1]:
!pip install transformers[torch] accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86

In [2]:
"""https://huggingface.co/deepset/roberta-base-squad2"""
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [4]:
res # score == F1 Score: it's computed on the individual words in the prediction vs the true words provided in context

{'score': 0.21171429753303528,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}

In [9]:
cd /content/sample_data

/content/sample_data


In [15]:
from datasets import load_dataset
import torch
from tqdm.auto import tqdm
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge

# Load certain rows of squad dataset
data = load_dataset('squad')

# Function to add the start and end index for answer context pair
def add_end_idx(answers, contexts):
    new_answers = []
    for answer, context in tqdm(zip(answers, contexts)):
        answer['text'] = answer['text'][0]
        answer['answer_start'] = answer['answer_start'][0]
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
        new_answers.append(answer)
    return new_answers

def prep_data(dataset):
    questions = dataset['question']
    contexts = dataset['context']
    answers = add_end_idx(dataset['answers'], contexts)
    return {
        'question': questions,
        'context': contexts,
        'answers': answers
    }

dataset = prep_data(data['train'].shuffle(seed=123).select(range(1000)))

# Tokenization
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

train = tokenizer(dataset['context'], dataset['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in tqdm(range(len(answers))):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train, dataset['answers'])

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train)

loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Load the RoBERTa model
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

optimizer1 = torch.optim.AdamW(model.parameters(), lr=0.01, eps=0.01)

epochs = 2

# Initialize Rouge
rouge = Rouge()

def normalize_text(text):
    # Example normalization: convert to lowercase and remove leading/trailing white spaces
    return text.lower().strip()

def exact_match(prediction, truth):
    return normalize_text(prediction) == normalize_text(truth)

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return round(2 * (prec * rec) / (prec + rec), 2)

def evaluate(model, dataloader):
    model.eval()
    exact_match_scores = []
    f1_scores = []
    bleu_scores = []
    rouge_n_scores = []
    rouge_l_scores = []
    accuracy_scores = []

    rouge = Rouge()

    for batch in tqdm(dataloader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            start_pred = torch.argmax(outputs.start_logits, dim=1)
            end_pred = torch.argmax(outputs.end_logits, dim=1)

            for i in range(len(start_true)):
                pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
                true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)

                # Debugging: Print predictions and true answers
                # print(f'Pred Answer: {pred_answer}')
                # print(f'True Answer: {true_answer}')

                # Skip empty predictions
                if not pred_answer.strip():
                    continue

                em_score = exact_match(pred_answer, true_answer)
                f1_score = compute_f1(pred_answer, true_answer)

                exact_match_scores.append(em_score)
                f1_scores.append(f1_score)

                # Compute BLEU score
                smoothie = SmoothingFunction().method4
                bleu_score = corpus_bleu([[true_answer.split()]], [pred_answer.split()], smoothing_function=smoothie)
                bleu_scores.append(bleu_score)

                # Accuracy calculation
                start_correct = (start_pred[i] == start_true[i]).item()
                end_correct = (end_pred[i] == end_true[i]).item()
                accuracy = (start_correct and end_correct)
                accuracy_scores.append(accuracy)

                # Compute ROUGE scores
                try:
                    rouge_scores = rouge.get_scores(pred_answer, true_answer)
                    rouge_n_scores.append(rouge_scores[0]['rouge-1']['f'])
                    rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])
                except Exception as e:
                    pass

    if len(exact_match_scores) > 0:
        exact_match_score = sum(exact_match_scores) / len(exact_match_scores)
    else:
        exact_match_score = 0.0  # Handle case where no valid scores are available

    if len(f1_scores) > 0:
        f1_score_avg = sum(f1_scores) / len(f1_scores)
    else:
        f1_score_avg = 0.0  # Handle case where no valid scores are available

    if len(bleu_scores) > 0:
        bleu_score_avg = sum(bleu_scores) / len(bleu_scores)
    else:
        bleu_score_avg = 0.0  # Handle case where no valid scores are available

    if len(rouge_n_scores) > 0:
        rouge_n_avg = sum(rouge_n_scores) / len(rouge_n_scores)
    else:
        rouge_n_avg = 0.0  # Handle case where no valid scores are available

    if len(rouge_l_scores) > 0:
        rouge_l_avg = sum(rouge_l_scores) / len(rouge_l_scores)
    else:
        rouge_l_avg = 0.0  # Handle case where no valid scores are available

    if len(accuracy_scores) > 0:
        accuracy_avg = sum(accuracy_scores) / len(accuracy_scores)
    else:
        accuracy_avg = 0.0  # Handle case where no valid scores are available

    print(f'Exact Match Score: {exact_match_score}')
    print(f'Average F1 Score: {f1_score_avg}')
    print(f'Average BLEU Score: {bleu_score_avg}')
    print(f'Average ROUGE-N Score: {rouge_n_avg}')
    print(f'Average ROUGE-L Score: {rouge_l_avg}')
    print(f'Accuracy Score: {accuracy_avg}')

# Training loop and evaluation
for epoch in tqdm(range(1, epochs + 1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer1.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'finetuned_roberta_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(loader)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluate the model after each epoch
    evaluate(model, loader)


0it [00:00, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/500 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Epoch 1
Training loss: 6.128946403503418


  0%|          | 0/500 [00:00<?, ?it/s]

Exact Match Score: 0.0
Average F1 Score: 0.0561029411764706
Average BLEU Score: 0.020257899456116064
Average ROUGE-N Score: 0.062355907701005006
Average ROUGE-L Score: 0.060901132242434486
Accuracy Score: 0.0


Epoch 2:   0%|          | 0/500 [00:00<?, ?it/s]


Epoch 2
Training loss: 6.354054202079773


  0%|          | 0/500 [00:00<?, ?it/s]

Exact Match Score: 0.0
Average F1 Score: 0.04917631917631918
Average BLEU Score: 0.013702554016620284
Average ROUGE-N Score: 0.058381930584899204
Average ROUGE-L Score: 0.057966871586457774
Accuracy Score: 0.0


In [None]:
from datasets import load_dataset
import torch
from tqdm.auto import tqdm
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge

data = load_dataset('squad')

def add_end_idx(answers, contexts):
    new_answers = []
    for answer, context in tqdm(zip(answers, contexts)):
        answer['text'] = answer['text'][0]
        answer['answer_start'] = answer['answer_start'][0]
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
        new_answers.append(answer)
    return new_answers

def prep_data(dataset):
    questions = dataset['question']
    contexts = dataset['context']
    answers = add_end_idx(dataset['answers'], contexts)
    return {
        'question': questions,
        'context': contexts,
        'answers': answers
    }

dataset = prep_data(data['train'].shuffle(seed=123).select(range(10000)))

# Tokenization
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

train = tokenizer(dataset['context'], dataset['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')

def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in tqdm(range(len(answers))):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train, dataset['answers'])

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train)

loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Load the RoBERTa model
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

optimizer1 = torch.optim.AdamW(model.parameters(), lr=0.01, eps=0.01)

epochs = 3

# Initialize Rouge
rouge = Rouge()

def normalize_text(text):
    # Example normalization: convert to lowercase and remove leading/trailing white spaces
    return text.lower().strip()

def exact_match(prediction, truth):
    return normalize_text(prediction) == normalize_text(truth)

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return round(2 * (prec * rec) / (prec + rec), 2)

def evaluate(model, dataloader):
    model.eval()
    exact_match_scores = []
    f1_scores = []
    bleu_scores = []
    rouge_n_scores = []
    rouge_l_scores = []
    accuracy_scores = []

    rouge = Rouge()

    for batch in tqdm(dataloader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            start_pred = torch.argmax(outputs.start_logits, dim=1)
            end_pred = torch.argmax(outputs.end_logits, dim=1)

            for i in range(len(start_true)):
                pred_answer = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
                true_answer = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)

                # Skip empty predictions
                if not pred_answer.strip():
                    continue

                em_score = exact_match(pred_answer, true_answer)
                f1_score = compute_f1(pred_answer, true_answer)

                exact_match_scores.append(em_score)
                f1_scores.append(f1_score)

                # Compute BLEU score
                smoothie = SmoothingFunction().method4
                bleu_score = corpus_bleu([[true_answer.split()]], [pred_answer.split()], smoothing_function=smoothie)
                bleu_scores.append(bleu_score)

                # Accuracy calculation
                start_correct = (start_pred[i] == start_true[i]).item()
                end_correct = (end_pred[i] == end_true[i]).item()
                accuracy = (start_correct and end_correct)
                accuracy_scores.append(accuracy)

                # Compute ROUGE scores
                try:
                    rouge_scores = rouge.get_scores(pred_answer, true_answer)
                    rouge_n_scores.append(rouge_scores[0]['rouge-1']['f'])
                    rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])
                except Exception as e:
                    pass

    if len(exact_match_scores) > 0:
        exact_match_score = sum(exact_match_scores) / len(exact_match_scores)
    else:
        exact_match_score = 0.0  # Handle case where no valid scores are available

    if len(f1_scores) > 0:
        f1_score_avg = sum(f1_scores) / len(f1_scores)
    else:
        f1_score_avg = 0.0  # Handle case where no valid scores are available

    if len(bleu_scores) > 0:
        bleu_score_avg = sum(bleu_scores) / len(bleu_scores)
    else:
        bleu_score_avg = 0.0  # Handle case where no valid scores are available

    if len(rouge_n_scores) > 0:
        rouge_n_avg = sum(rouge_n_scores) / len(rouge_n_scores)
    else:
        rouge_n_avg = 0.0  # Handle case where no valid scores are available

    if len(rouge_l_scores) > 0:
        rouge_l_avg = sum(rouge_l_scores) / len(rouge_l_scores)
    else:
        rouge_l_avg = 0.0  # Handle case where no valid scores are available

    if len(accuracy_scores) > 0:
        accuracy_avg = sum(accuracy_scores) / len(accuracy_scores)
    else:
        accuracy_avg = 0.0  # Handle case where no valid scores are available

    print(f'Exact Match Score: {exact_match_score}')
    print(f'Average F1 Score: {f1_score_avg}')
    print(f'Average BLEU Score: {bleu_score_avg}')
    print(f'Average ROUGE-N Score: {rouge_n_avg}')
    print(f'Average ROUGE-L Score: {rouge_l_avg}')
    print(f'Accuracy Score: {accuracy_avg}')

# Training loop and evaluation
for epoch in tqdm(range(1, epochs + 1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(loader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer1.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'finetuned_roberta_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(loader)
    tqdm.write(f'Training loss: {loss_train_avg}')

    # Evaluate the model after each epoch
    evaluate(model, loader)


0it [00:00, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/5000 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



Epoch 1
Training loss: 6.2841390712738034


  0%|          | 0/5000 [00:00<?, ?it/s]

Exact Match Score: 0.0
Average F1 Score: 0.06295368482376971
Average BLEU Score: 0.018558219638027635
Average ROUGE-N Score: 0.07235464622711403
Average ROUGE-L Score: 0.07100619669906205
Accuracy Score: 0.0


Epoch 2:   0%|          | 0/5000 [00:00<?, ?it/s]


Epoch 2
Training loss: 6.244938175296784


  0%|          | 0/5000 [00:00<?, ?it/s]