## Setup notebook
**First, set the notebook runtime to T4 GPU.**
Steps derived from this article: https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/244.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K 

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Hugging Face Hub repository id of the base checkpoint that is loaded (quantized) and fine-tuned
model_name = "NousResearch/llama-2-7b-chat-hf"

# Name of the fine-tuned model: used as the save directory and as the zip file stem
new_model = "llama-2-7b-finetuned"  # TODO - change this depending on finetuning task

In [None]:
################################################################################
# QLoRA parameters
# (consumed by the LoraConfig built in the model-loading cell)
################################################################################

# LoRA attention dimension (the rank, passed as `r` to LoraConfig)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [None]:
################################################################################
# bitsandbytes parameters
# (consumed by the BitsAndBytesConfig built in the model-loading cell)
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models.
# Stored as the dtype *name*; it is resolved with getattr(torch, ...) when the config is built.
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): not forwarded to TrainingArguments in the model-loading cell, so it currently has no effect.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): not forwarded to TrainingArguments in the model-loading cell, so it currently has no effect.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs); -1 leaves the epoch count in charge
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 25

# Log every X update steps
logging_steps = 25

In [None]:
################################################################################
# SFT parameters
# (consumed by the SFTTrainer and by from_pretrained in later cells)
################################################################################

# Maximum sequence length to use
# None presumably lets SFTTrainer fall back to its own default — TODO confirm the effective limit
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0 (the empty-string key maps the whole model to device index 0)
device_map = {"": 0}

In [None]:
# Load tokenizer and model with QLoRA configuration
# Resolve the dtype name from the bitsandbytes parameter cell into the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings applied while the base model weights are loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# (informational only: prints a hint and does not change fp16/bf16)
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Disable the key/value cache for training.
# NOTE(review): this config default presumably also applies to model.generate() calls
# unless use_cache is overridden there — confirm, since it makes generation much slower.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
# NOTE(review): `gradient_checkpointing` and `per_device_eval_batch_size` from the
# parameter cell are not passed here, so those two settings are currently unused.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

## Define prompt

In [None]:
# Few-shot system prompt: a two-line instruction header followed by three
# worked (pun, explanation) examples. Examples are separated by a blank line.
prompt = "".join([
    # Instructions
    "Provide an explanation on why this pun is funny.\n",
    "When answering, follow these examples:\n",
    # Example 1
    "Pun: I phoned the zoo but the lion was busy.\n",
    "Explanation: This is a pun on the phrase 'the line was busy' which means that the telephone line was currently engaged in another call. However, 'lion' is used in its place because it sounds similar and lions are an animal found in zoos.\n\n",
    # Example 2
    "Pun: I keep reading 'The Lord of the Rings' over and over. I guess it's just force of hobbit.\n",
    "Explanation: Force of habit means something that has the tendency for something to be done frequently. Lord of the Rings is a series of fantasy novels written by J.R.R. Tolkien. 'The Hobbit' is a fantasy novel that proceeded the Lord of the Rings series which was also written by J.R.R. Tolkien. The joke is centered around the word 'hobbit' since it sounds like 'habit' and its use in the common phrase 'force of habit', turned into 'force of hobbit'.\n\n",
    # Example 3
    "Pun: The fisherman kept bragging about the big fish he caught, but he would not be very pacific about where he caught it.\n",
    "Explanation: Pacific is the name of an ocean. An ocean is where you can catch fish. 'Pacific' sounds close to 'specific'. Since the joke is discussing a fishman catching fish, the word 'specific' was replaced with 'pacific' which relates to an ocean where fish can be found. The joke is playing on the word 'pacific' because it sounds close to 'specific'.\n",
])

print(prompt)

Provide an explanation on why this pun is funny.
When answering, follow these examples:
Pun: I phoned the zoo but the lion was busy.
Explanation: This is a pun on the phrase 'the line was busy' which means that the telephone line was currently engaged in another call. However, 'lion' is used in its place because it sounds similar and lions are an animal found in zoos.

Pun: I keep reading 'The Lord of the Rings' over and over. I guess it's just force of hobbit.
Explanation: Force of habit means something that has the tendency for something to be done frequently. Lord of the Rings is a series of fantasy novels written by J.R.R. Tolkien. 'The Hobbit' is a fantasy novel that proceeded the Lord of the Rings series which was also written by J.R.R. Tolkien. The joke is centered around the word 'hobbit' since it sounds like 'habit' and its use in the common phrase 'force of habit', turned into 'force of hobbit'.

Pun: The fisherman kept bragging about the big fish he caught, but he would not 

## Load data

The Llama 2 chat prompt template (`<s>[INST] <<SYS>> … <</SYS>> … [/INST]`) is used to format both the training and inference inputs. References:
- Training: https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k
- Inference: https://gpus.llm-utils.org/llama-2-prompt-template/

In [None]:
# Load dataset
import pandas as pd

filename_train_dataset = "data/train_explanations.csv"  # each row is an explanation; for training!
filename_test_dataset = "data/test_jokes.csv"  # each row is a joke; for inference!


def _format_training_example(row):
    """Build the full training text for one row: system prompt, pun, then the reference explanation."""
    return (
        "<s>[INST] <<SYS>>\n" + prompt + "\n<</SYS>>\n\n"
        + row["text"] + " [/INST] " + row["Natural language explanation"] + "</s>"
    )


def _format_inference_example(row):
    """Build the inference text for one row: system prompt and pun, leaving the answer open."""
    return "<s>[INST] <<SYS>>\n" + prompt + "\n<</SYS>>\n\n" + row["text"] + " [/INST]"


# Each CSV is read twice: once as a Hugging Face dataset, once as a pandas frame.
train_dataset = load_dataset('csv', data_files=filename_train_dataset, split="train")
test_dataset = load_dataset('csv', data_files=filename_test_dataset, split="train")

df_train = pd.read_csv(filename_train_dataset)
df_test = pd.read_csv(filename_test_dataset)

# Build the formatted text columns on the pandas frames...
df_train["model_train_input_text"] = df_train.apply(_format_training_example, axis=1)
df_test["model_inference_input_text"] = df_test.apply(_format_inference_example, axis=1)

# ...then copy them onto the Hugging Face datasets.
train_dataset = train_dataset.add_column(
    name="model_train_input_text",
    column=df_train["model_train_input_text"].to_list(),
)
test_dataset = test_dataset.add_column(
    name="model_inference_input_text",
    column=df_test["model_inference_input_text"].to_list(),
)

print(train_dataset)  # number of explanations; divide by 5 to get number of jokes
print(test_dataset)  # number of jokes; multiply by 5 to get number of explanations

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['ID', 'Annotator_ID', 'Funniness (1-5)', 'Is a Joke?', 'Joke keywords', 'Natural language explanation', 'Offensive/Inappropriate?', 'Understand the text?', 'text', 'model_train_input_text'],
    num_rows: 2300
})
Dataset({
    features: ['ID', 'text', 'Natural language explanation 1', 'Natural language explanation 2', 'Natural language explanation 3', 'Natural language explanation 4', 'Natural language explanation 5', 'model_inference_input_text'],
    num_rows: 115
})


## Finetuning methods

In [None]:
from tqdm import tqdm

def generate_outputs(dataset, model, tokenizer, batch_size=24, max_new_tokens=200):
    """Generate a completion for every prompt in the dataset.

    Args:
        dataset: Iterable of examples, each providing a
            "model_inference_input_text" entry (the fully formatted prompt).
        model: Causal language model exposing ``generate``, ``device``,
            ``training``, ``eval()`` and ``train()``.
        tokenizer: Tokenizer matching ``model``; it must have a pad token.
        batch_size: Number of prompts generated together.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        List of decoded strings in dataset order. Each string contains the
        prompt followed by the generated continuation, with special tokens
        removed.
    """
    input_texts = [example["model_inference_input_text"] for example in dataset]
    output_texts = []

    # The tokenizer and model are configured for training elsewhere; remember
    # their state so it can be restored once generation is done.
    original_padding_side = tokenizer.padding_side
    was_training = model.training

    # Decoder-only models continue from the last position of each row, so
    # batched prompts must be padded on the left; right padding would make the
    # shorter prompts continue after pad tokens.
    tokenizer.padding_side = "left"
    # Inference mode: disables dropout (including LoRA dropout after fine-tuning).
    model.eval()
    try:
        for start in tqdm(range(0, len(input_texts), batch_size)):  # for each batch
            input_texts_batch = input_texts[start:start + batch_size]

            # Tokenize
            tokenize_result = tokenizer(input_texts_batch, return_tensors="pt", padding=True).to(model.device)

            # Generate outputs. use_cache=True re-enables the key/value cache for
            # this call even when the model config disables it for training;
            # without it every new token recomputes the whole sequence.
            with torch.no_grad():
                outputs_batch = model.generate(**tokenize_result,
                                               max_new_tokens=max_new_tokens,
                                               use_cache=True)

            # Decode outputs
            output_texts.extend(tokenizer.batch_decode(outputs_batch, skip_special_tokens=True))
    finally:
        tokenizer.padding_side = original_padding_side
        if was_training:
            model.train()

    return output_texts

def print_joke_with_output(dataset, outputs, limit):
    """Print the first ``limit`` puns next to their generated outputs.

    Each entry is printed as a "Pun:" line, an "Output:" line and a blank
    line. ``outputs`` is indexed by the position of the example in ``dataset``.
    """
    for position, row in enumerate(dataset):
        if position == limit:
            return
        print("Pun:", row["text"])
        print("Output:", outputs[position])
        print()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from statistics import mean
import numpy as np

def get_sentence_embedding(sentence, model, tokenizer):
    """
    Get the sentence embedding for a given sentence.

    The sentence is tokenized, passed through ``model``, and the first element
    of the model output (presumably the per-token hidden states) is averaged
    over the token dimension.

    Args:
        sentence: Text to embed.
        model: Encoder called as ``model(input_ids)``; its first output must be
            a tensor with the token dimension at index 1.
        tokenizer: Tokenizer exposing ``encode(..., return_tensors='pt')``.

    Returns:
        numpy array holding the mean-pooled embedding.
    """
    # truncation=True caps the input at the tokenizer's maximum length, so an
    # over-long sentence is shortened instead of crashing the encoder.
    input_ids = tokenizer.encode(sentence, return_tensors='pt', truncation=True)

    # Keep the ids on the same device as the model when the model reports one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    with torch.no_grad():
        output = model(input_ids)[0]
    sentence_embedding = torch.mean(output, dim=1).squeeze()
    return sentence_embedding.detach().cpu().numpy()

def _extract_prediction(raw_output, max_chars=512):
    """Isolate the model's answer from a decoded generation (prompt + continuation)."""
    pred = raw_output
    # Keep only what follows the instruction marker, i.e. drop the echoed prompt.
    if "[/INST]" in pred:
        pred = pred[pred.index("[/INST]") + len("[/INST]"):]
    # Drop anything after an explicit end-of-sequence marker.
    if "</s>" in pred:
        pred = pred[:pred.index("</s>")]
    pred = pred[:max_chars]  # manual truncation
    # Remove surrounding whitespace, then a stray leading ']' if one is present.
    # Only these characters are removed, so the first character of the answer is
    # never lost when the output starts directly with text.
    pred = pred.strip()
    if pred.startswith("]"):
        pred = pred[1:].strip()
    return pred


def calculate_cosine_similarity(dataset, outputs):
    """
    Calculate the cosine similarity between the prediction and ground truth for the entire dataset.

    Args:
        dataset: Dataset supporting ``add_column`` whose examples provide
            'Natural language explanation 1' to 'Natural language explanation 5'.
        outputs: Decoded model outputs, one per example, in dataset order.

    Returns:
        Tuple ``(cosine_similarities, preds)``:
        ``cosine_similarities`` maps "1".."5" to the per-example similarity
        against that reference explanation and "mean" to the per-example mean
        over the five references; ``preds`` lists the cleaned predictions.
    """
    model_name = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    cosine_similarities = {"1": [], "2": [], "3": [], "4": [], "5": [], "mean": []}
    preds = []

    # add_column returns a new dataset; the caller's dataset is not modified.
    dataset = dataset.add_column(name="model_output_text", column=outputs)
    for example in tqdm(dataset):
        pred = _extract_prediction(example['model_output_text'])
        preds.append(pred)

        pred_embedding = get_sentence_embedding(pred, model, tokenizer)

        cosine_similarities_example = []
        for i in range(5):
            groundtruth = example[f'Natural language explanation {i + 1}']
            groundtruth_embedding = get_sentence_embedding(groundtruth, model, tokenizer)

            cosine_sim = cosine_similarity([pred_embedding], [groundtruth_embedding])[0][0]
            cosine_similarities_example.append(cosine_sim)
            cosine_similarities[f"{i + 1}"].append(cosine_sim)

        cosine_similarities["mean"].append(mean(cosine_similarities_example))

    return cosine_similarities, preds

In [None]:
def evaluate_cosine_similarity(sims):
    """Print summary statistics of the similarities, each formatted to four decimals.

    Reports mean, standard deviation, minimum, quartiles, median and maximum,
    in that order, one line per statistic.
    """
    report = (
        ("Mean cosine similarity: {:.4f}", np.mean),
        ("Standard deviation of cosine similarities: {:.4f}", np.std),
        ("Minimum cosine similarity: {:.4f}", np.min),
        ("1st quartile cosine similarity: {:.4f}", lambda values: np.percentile(values, 25)),
        ("Median cosine similarity: {:.4f}", np.median),
        ("3rd quartile cosine similarity: {:.4f}", lambda values: np.percentile(values, 75)),
        ("Maximum cosine similarity: {:.4f}", np.max),
    )
    # Each statistic is computed right before it is printed, line by line.
    for template, statistic in report:
        print(template.format(statistic(sims)))

In [None]:
def save_outputs_to_file(outputs, cosine_similarities, filename, preds=None):
    """Write the test frame plus model outputs and similarity scores to a CSV file.

    Adds an "output" column, an optional "output_processed" column (when
    ``preds`` is given) and one "cosine_similarity_<key>" column per entry of
    ``cosine_similarities`` to a copy of the global ``df_test``; the global
    frame itself is left untouched. The file is written without the index.
    """
    new_columns = {"output": outputs}
    if preds is not None:
        new_columns["output_processed"] = preds
    new_columns.update(
        (f"cosine_similarity_{key}", sims) for key, sims in cosine_similarities.items()
    )
    # assign() works on a copy, so df_test keeps its original columns.
    df_test.assign(**new_columns).to_csv(filename, index=False)

## Evaluate model before finetuning

In [None]:
outputs_before = generate_outputs(test_dataset, model, tokenizer)

100%|██████████| 5/5 [2:59:49<00:00, 2157.85s/it]


In [None]:
print_joke_with_output(test_dataset, outputs_before, 10)

Pun: When bottled water is cheap it's called a liquidation sale .
Output: [INST] <<SYS>>
Provide an explanation on why this pun is funny.
When answering, follow these examples:
Pun: I phoned the zoo but the lion was busy.
Explanation: This is a pun on the phrase 'the line was busy' which means that the telephone line was currently engaged in another call. However, 'lion' is used in its place because it sounds similar and lions are an animal found in zoos.

Pun: I keep reading 'The Lord of the Rings' over and over. I guess it's just force of hobbit.
Explanation: Force of habit means something that has the tendency for something to be done frequently. Lord of the Rings is a series of fantasy novels written by J.R.R. Tolkien. 'The Hobbit' is a fantasy novel that proceeded the Lord of the Rings series which was also written by J.R.R. Tolkien. The joke is centered around the word 'hobbit' since it sounds like 'habit' and its use in the common phrase 'force of habit', turned into 'force of h

In [None]:
cosine_similarities_before, _ = calculate_cosine_similarity(test_dataset, outputs_before)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 115/115 [02:06<00:00,  1.10s/it]


In [None]:
evaluate_cosine_similarity(cosine_similarities_before["mean"])

Mean cosine similarity: 0.8087
Standard deviation of cosine similarities: 0.0374
Minimum cosine similarity: 0.6651
1st quartile cosine similarity: 0.7915
Median cosine similarity: 0.8104
3rd quartile cosine similarity: 0.8380
Maximum cosine similarity: 0.8822


In [None]:
save_outputs_to_file(outputs_before, cosine_similarities_before, "before_finetuning.csv")

## Train model

In [None]:
# Set supervised fine-tuning parameters
# The trainer reads its training text from the "model_train_input_text" column
# built in the data-loading cell and receives the LoRA config, tokenizer and
# TrainingArguments created in the model-loading cell.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="model_train_input_text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/2300 [00:00<?, ? examples/s]

In [None]:
# Train model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.4713
50,0.2776
75,0.3213
100,0.195
125,0.3112
150,0.1912
175,0.3063
200,0.1932
225,0.2661
250,0.1699




TrainOutput(global_step=575, training_loss=0.2819382095336914, metrics={'train_runtime': 3445.85, 'train_samples_per_second': 0.667, 'train_steps_per_second': 0.167, 'total_flos': 2.138605396475904e+16, 'train_loss': 0.2819382095336914, 'epoch': 1.0})

In [None]:
trainer.save_model(new_model)

In [None]:
import os
from zipfile import ZipFile

# Directory to be zipped: the folder written by trainer.save_model(new_model).
# Resolved against the current working directory rather than a hard-coded
# Colab path, so the cell also works outside /content.
folder_to_zip = os.path.abspath(new_model)

# Set the output zip file name
zip_filename = f"{new_model}.zip"

# Fail loudly instead of reporting success for an empty archive.
if not os.path.isdir(folder_to_zip):
    raise FileNotFoundError(
        f"Model folder '{folder_to_zip}' does not exist; save the model before zipping it."
    )

# Entries are stored relative to the folder's parent, so the archive unpacks
# to '<new_model>/...' instead of reproducing the absolute path.
archive_root = os.path.dirname(folder_to_zip)

# Create a ZipFile object
with ZipFile(zip_filename, 'w') as zip_file:
    # Walk through the folder and add each file to the zip
    for root, dirs, files in os.walk(folder_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            zip_file.write(file_path, arcname=os.path.relpath(file_path, archive_root))

print(f"Zip file '{zip_filename}' created successfully!")

Zip file 'llama-2-7b-finetuned.zip' created successfully!


## Evaluate model after finetuning

In [None]:
outputs_after = generate_outputs(test_dataset, model, tokenizer)

100%|██████████| 5/5 [2:24:39<00:00, 1735.92s/it]


In [None]:
print_joke_with_output(test_dataset, outputs_after, 10)

Pun: When bottled water is cheap it's called a liquidation sale .
Output: [INST] <<SYS>>
Provide an explanation on why this pun is funny.
When answering, follow these examples:
Pun: I phoned the zoo but the lion was busy.
Explanation: This is a pun on the phrase 'the line was busy' which means that the telephone line was currently engaged in another call. However, 'lion' is used in its place because it sounds similar and lions are an animal found in zoos.

Pun: I keep reading 'The Lord of the Rings' over and over. I guess it's just force of hobbit.
Explanation: Force of habit means something that has the tendency for something to be done frequently. Lord of the Rings is a series of fantasy novels written by J.R.R. Tolkien. 'The Hobbit' is a fantasy novel that proceeded the Lord of the Rings series which was also written by J.R.R. Tolkien. The joke is centered around the word 'hobbit' since it sounds like 'habit' and its use in the common phrase 'force of habit', turned into 'force of h

In [None]:
cosine_similarities_after, preds = calculate_cosine_similarity(test_dataset, outputs_after)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 115/115 [01:51<00:00,  1.04it/s]


In [None]:
evaluate_cosine_similarity(cosine_similarities_after["mean"])

Mean cosine similarity: 0.7780
Standard deviation of cosine similarities: 0.0748
Minimum cosine similarity: 0.3328
1st quartile cosine similarity: 0.7478
Median cosine similarity: 0.7873
3rd quartile cosine similarity: 0.8288
Maximum cosine similarity: 0.8865


In [None]:
save_outputs_to_file(outputs_after, cosine_similarities_after, "after_finetuning.csv", preds=preds)

## Optional: save/load model to/from Huggingface hub

TODO: look at this later https://gist.github.com/younesbelkada/89fd3984a2992fdbb408fa8e3bf44101

In [None]:
import locale

# Force UTF-8 as the reported preferred encoding (presumably a Colab workaround
# for locale errors when running shell commands — confirm).
# The replacement keeps the standard signature with its optional
# `do_setlocale` argument, so callers such as
# locale.getpreferredencoding(False) keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Calling push_to_hub on the 4-bit quantized base model raises
# NotImplementedError because a 4-bit model cannot be serialized with
# save_pretrained. Push the PEFT-wrapped model held by the trainer instead,
# which uploads only the trained LoRA adapter.
trainer.model.push_to_hub("llama2-qlora-finetuned-puns-v0")

NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported