In [None]:
!pip install -q -U transformers datasets
!pip install accelerate -q -U
!pip install -q -U peft
!pip install -q -U rouge_score
!pip install -q -U evaluate
!pip install -q -U bitsandbytes
!pip install -q -U trl
!pip install -q -U bert-score
!pip install -q -U tqdm

In [None]:
from datasets import load_dataset

# Work on the first 1,000 training examples only, to keep every run short.
squad = load_dataset("squad", split="train[:1000]")
# 80/20 train/test split. The seed is fixed so that every model variant is
# trained and scored on the same examples across kernel restarts.
squad = squad.train_test_split(test_size=0.2, seed=42)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint shared by the full fine-tuning, LoRA and QLoRA experiments below.
model_ckpt = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# The checkpoint has no question-answering head: the load warning printed by
# this cell reports `qa_outputs.weight` / `qa_outputs.bias` as newly initialized.
base_model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BloomForQuestionAnswering were not initialized from the model checkpoint at bigscience/bloomz-560m and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and attach answer-span token labels.

    Uses the module-level `tokenizer`.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" columns. Each answer is a dict with parallel "text" and
            "answer_start" lists; only the first entry of each is used.

    Returns:
        The tokenizer output (with "offset_mapping" removed) plus two lists,
        "start_positions" and "end_positions", holding the token indices of
        the answer span. An example is labelled (0, 0) when it has no answer,
        has no context tokens, or its answer is not fully contained in the
        (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",  # truncate the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1); everything else is question or padding.
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        has_answer = len(answer["text"]) > 0 and len(answer["answer_start"]) > 0
        has_context = context_start <= context_end
        if not (has_answer and has_context):
            # Nothing to locate: avoid indexing into empty answer lists or
            # past the end of the offsets.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be checked against their own boundary: an answer cut
        # off by truncation would otherwise get a label outside the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Preprocess both splits in batches. All original columns are removed, so the
# result holds only what `preprocess_function` returns.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
from transformers import DefaultDataCollator

# Examples are already padded to a fixed length during preprocessing
# (padding="max_length"), so the collator does not need to pad per batch.
data_collator = DefaultDataCollator()

# Full fine-tuning of the base model

In [None]:
import copy
from transformers import TrainingArguments, Trainer

# Per-device batch size, reused by the LoRA and QLoRA runs below.
batch_size = 8
training_args = TrainingArguments(
    output_dir="fine-tuned-model",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Report the training loss once per epoch as well, so it can be compared
    # with the validation loss instead of showing "No log" until step 500.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
)

finetuned_trainer = Trainer(
    # Train a copy so `base_model` stays untouched and can serve as the
    # starting point for the LoRA run and as the "Base" row in the evaluation.
    model=copy.deepcopy(base_model),
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

finetuned_trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.930354
2,No log,1.933329
3,No log,2.691486
4,No log,3.170162
5,0.859500,3.597589




TrainOutput(global_step=500, training_loss=0.859456787109375, metrics={'train_runtime': 293.9132, 'train_samples_per_second': 13.609, 'train_steps_per_second': 1.701, 'total_flos': 2786139850752000.0, 'train_loss': 0.859456787109375, 'epoch': 5.0})

# LoRA fine-tuning

In [None]:
# Show the module tree; the layer names printed here (e.g. "query_key_value")
# are the names that LoRA's `target_modules` refers to.
base_model

BloomForQuestionAnswering(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
lora_config = LoraConfig(
    lora_alpha=256,
    lora_dropout=0.05,
    # NOTE(review): r=512 is half of the 1024 hidden size, so the adapters are
    # large -- confirm this rank is intended.
    r=512,
    bias="none",
    # Adapt only the fused attention projection of each Bloom block.
    target_modules=["query_key_value"],
    # The QA head (`qa_outputs`) is newly initialized for this checkpoint.
    # It is not a LoRA target, so without this entry it would stay frozen at
    # its random initial values; listing it here trains it alongside the adapters.
    modules_to_save=["qa_outputs"],
)
# Wrap a copy so `base_model` itself is left unmodified.
lora_model = get_peft_model(copy.deepcopy(base_model), lora_config)
# Sanity check: how many parameters will actually receive gradients.
lora_model.print_trainable_parameters()

lora_training_args = TrainingArguments(
    output_dir="lora-model",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Report the training loss at every epoch, not only at the default interval.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    # Name the label columns explicitly: the Trainer may not infer them through
    # the PEFT wrapper, and then the validation loss is reported as "No log".
    label_names=["start_positions", "end_positions"],
)

lora_trainer = Trainer(
    model=lora_model,
    args=lora_training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

lora_trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,2.272200,No log




TrainOutput(global_step=500, training_loss=2.272196044921875, metrics={'train_runtime': 236.6841, 'train_samples_per_second': 16.9, 'train_steps_per_second': 2.113, 'total_flos': 3249996318720000.0, 'train_loss': 2.272196044921875, 'epoch': 5.0})

# QLoRA fine-tuning

In [None]:
from transformers import BitsAndBytesConfig
import torch

# dtype used for matrix multiplications on the de-quantized 4-bit weights.
compute_dtype = torch.float16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

# Bloom is loaded through the built-in `BloomForQuestionAnswering` class (see
# the load message), so `trust_remote_code` is not needed and is left off.
qlora_model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt, quantization_config=quantization_config)
qlora_model.gradient_checkpointing_enable()
qlora_model = prepare_model_for_kbit_training(qlora_model)
# Reuses `lora_config` from the LoRA section, so both runs share the same
# adapter hyperparameters.
qlora_model = get_peft_model(qlora_model, lora_config)

qlora_training_args = TrainingArguments(
    output_dir="qlora-model",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Report the training loss at every epoch, not only at the default interval.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    # Name the label columns explicitly: the Trainer may not infer them through
    # the PEFT wrapper, and then the validation loss is reported as "No log".
    label_names=["start_positions", "end_positions"],
)

qlora_trainer = Trainer(
    model=qlora_model,
    args=qlora_training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

qlora_trainer.train()

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of BloomForQuestionAnswering were not initialized from the model checkpoint at bigscience/bloomz-560m and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,2.401300,No log




TrainOutput(global_step=500, training_loss=2.40133544921875, metrics={'train_runtime': 334.9467, 'train_samples_per_second': 11.942, 'train_steps_per_second': 1.493, 'total_flos': 3249996318720000.0, 'train_loss': 2.40133544921875, 'epoch': 5.0})

# Evaluation

In [None]:
from transformers import pipeline
import pandas as pd
from bert_score import score


def evaluate(model, tokenizer, dataset, name):
    """Score a model's extracted answers against the references with BERTScore.

    Args:
        model: Question-answering model handed to the "question-answering" pipeline.
        tokenizer: Tokenizer handed to the same pipeline.
        dataset: Raw (untokenized) split providing "question", "context" and
            "answers" columns; the first text of each answer is the reference.
        name: Row label for the returned table.

    Returns:
        A one-row DataFrame indexed by `name` with the mean "Recall",
        "Precision" and "F1 Score" over the dataset.
    """
    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    questions = dataset["question"]
    contexts = dataset["context"]
    references = [answer["text"][0] for answer in dataset["answers"]]

    predictions = [
        question_answerer(question=question, context=context)["answer"]
        for question, context in zip(questions, contexts)
    ]

    # bert_score.score takes (candidates, references) and returns
    # (precision, recall, F1) -- keep both orders aligned with that API.
    P, R, F1 = score(predictions, references, lang="en")

    results = {
        "Recall": R.mean().item(),
        "Precision": P.mean().item(),
        "F1 Score": F1.mean().item(),
    }

    return pd.DataFrame(results, index=[name])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              Recall  Precision  F1 Score
Fine-tuned  0.840188   0.834017  0.835862




In [None]:
# Score every variant on the same held-out split. The raw `squad["test"]` split
# is used (not the tokenized one) because `evaluate` reads the "question",
# "context" and "answers" columns.
# "Base" is the checkpoint as loaded, i.e. with its QA head still untrained.
base_model_df = evaluate(base_model, tokenizer, squad["test"], "Base")
fine_tuned_df = evaluate(finetuned_trainer.model, tokenizer, squad["test"], "Full Model")
lora_df = evaluate(lora_trainer.model, tokenizer, squad["test"], "LoRA")
qlora_df = evaluate(qlora_trainer.model, tokenizer, squad["test"], "QLoRA")
# Stack the one-row tables into a single comparison table, one row per variant.
results_df = pd.concat([base_model_df,fine_tuned_df, lora_df, qlora_df], axis=0)
results_df

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModel' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuest

Unnamed: 0,Recall,Precision,F1 Score
Base,0.824581,0.808797,0.815869
Full Model,0.840188,0.834017,0.835862
LoRA,0.821706,0.809451,0.814471
QLoRA,0.822033,0.819954,0.819742


In [None]:
def calculate_model_size(model, debug=False):
    """Add up the storage taken by a model's parameters.

    Args:
        model: Object exposing `named_parameters()`, yielding `(name, param)`
            pairs where each param provides `nelement()` and `element_size()`.
        debug: When true, print one line per parameter with its size in bytes.

    Returns:
        A tuple `(bytes, megabytes, gigabytes)`, using 1024-based units.
    """
    # Size of each parameter: element count times bytes per element.
    sizes = [
        (param_name, tensor.nelement() * tensor.element_size())
        for param_name, tensor in model.named_parameters()
    ]

    if debug:
        for param_name, n_bytes in sizes:
            print(f"Name: {param_name}, Size: {n_bytes} bytes")

    byte_count = sum(n_bytes for _, n_bytes in sizes)

    # Same total expressed in megabytes and gigabytes
    return byte_count, byte_count / (1024 ** 2), byte_count / (1024 ** 3)

In [None]:
# Show the BERTScore comparison again, before the size column is added below.
results_df

Unnamed: 0,Recall,Precision,F1 Score
Base,0.824581,0.808797,0.815869
Full Model,0.840188,0.834017,0.835862
LoRA,0.821706,0.809451,0.814471
QLoRA,0.822033,0.819954,0.819742


In [None]:
# Parameter storage of each variant; only the gigabyte figure (third element
# of the returned tuple) is kept.
_, _, base_size_gigabytes = calculate_model_size(base_model)
_, _, fine_tuned_size_gigabytes = calculate_model_size(finetuned_trainer.model)
_, _, lora_size_gigabytes = calculate_model_size(lora_trainer.model)
_, _, qlora_size_gigabytes = calculate_model_size(qlora_trainer.model)



# The values are assigned by position, so the list order must match the row
# order of `results_df` (Base, Full Model, LoRA, QLoRA).
results_df['Size (GB)'] = [base_size_gigabytes, fine_tuned_size_gigabytes, lora_size_gigabytes, qlora_size_gigabytes]
results_df


Unnamed: 0,Recall,Precision,F1 Score,Size (GB)
Base,0.824581,0.808797,0.815869,2.083244
Full Model,0.840188,0.834017,0.835862,2.083244
LoRA,0.821706,0.809451,0.814471,2.270744
QLoRA,0.822033,0.819954,0.819742,1.286369
