In [1]:
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install datasets scipy protobuf py7zr peft bitsandbytes fire torch_tb_profiler ipywidgets
!pip install transformers==4.32.0 tiktoken einops scipy transformers_stream_generator==0.0.4 peft deepspeed

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scipy
  Downloading scipy-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never commit an access token in a notebook. The token is read from
# the HF_TOKEN environment variable, falling back to an interactive prompt that
# does not echo or store the value in the notebook.
# Any token that was previously hardcoded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### 1. Load Dataset

In [1]:
from datasets import load_dataset

# Fixed seed so the 90/10 train/test split is identical on every
# "Restart Kernel -> Run All"; without it each run evaluates on different rows.
SPLIT_SEED = 42

dataset = load_dataset("quyanh/helm-samsum-dolly-lima", split="train").train_test_split(
    test_size=0.1,
    seed=SPLIT_SEED,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt'],
        num_rows: 19766
    })
    test: Dataset({
        features: ['prompt'],
        num_rows: 2197
    })
})

In [2]:
# Pull the two splits out of the DatasetDict and show a summary of each.
train_dataset, eval_dataset = dataset["train"], dataset["test"]
print(train_dataset, eval_dataset, sep="\n")

Dataset({
    features: ['prompt'],
    num_rows: 19766
})
Dataset({
    features: ['prompt'],
    num_rows: 2197
})


### 2. Load Base Model

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

base_model_id = "Qwen/Qwen-14B"

# Quantization settings: 4-bit NF4 weights with double quantization enabled and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# trust_remote_code=True lets the checkpoint's own Python modeling code run
# locally; only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# NOTE(review): sets a `window` attribute on the model config; whether the Qwen
# modeling code actually reads this attribute is not visible here - confirm.
model.config.window = 2048

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]



### 3. Tokenization

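Set up the tokenizer. Left padding is often recommended for decoder-only models as it [makes training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa). Note that this notebook does not configure a padding side: the samples are packed into fixed-length chunks below, so they need no padding.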


In [4]:
# Load the tokenizer that ships with the base checkpoint and register
# "<|endoftext|>" as its EOS token.
# NOTE(review): the check cell below shows no extra EOS id being appended, so
# `add_eos_token=True` appears to have no effect with this tokenizer; the EOS
# marker is appended to the text explicitly in `get_dataset` - confirm.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    eos_token="<|endoftext|>",
    add_eos_token=True,
)

In [5]:
tokenizer.eos_token_id

151643

In [6]:
tokenizer("Hello <|endoftext|>", add_eos_token=True)

{'input_ids': [9707, 220, 151643], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

Set up the tokenization function so that `labels` and `input_ids` are the same. This is basically what [self-supervised fine-tuning is](https://neptune.ai/blog/self-supervised-learning):

In [7]:
from tqdm import tqdm
from itertools import chain

from torch.utils.data import Dataset

class Concatenator(object):
    """Pack tokenized samples into fixed-length chunks for causal-LM training.

    Intended to be used as the function of a batched ``map`` call: every call
    receives a batch whose ``input_ids`` / ``attention_mask`` columns are lists
    of per-sample token lists, concatenates them (after whatever was left over
    from the previous call) and cuts the stream into chunks of exactly
    ``chunk_size`` tokens. Tokens that do not fill a whole chunk are kept in
    ``self.residual`` and prepended to the next batch. Whatever is still in the
    residual after the last batch is never emitted.

    The instance is stateful, so use a fresh one per dataset.

    Args:
        chunk_size: Number of tokens per emitted chunk. Must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """

    def __init__(self, chunk_size=2048):
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        self.chunk_size = chunk_size
        # Leftover tokens carried over between successive calls.
        self.residual = {"input_ids": [], "attention_mask": []}

    def __call__(self, batch):
        """Return the full chunks obtainable from ``residual + batch``.

        Args:
            batch: Mapping with ``input_ids`` and ``attention_mask`` columns,
                each a list of per-sample lists.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``; each
            value is a list of chunks (possibly empty), and ``labels`` is a
            copy of ``input_ids``.
        """
        # Flatten the batch and put the carried-over tokens in front of it.
        concatenated_samples = {
            k: v + list(chain(*batch[k])) for k, v in self.residual.items()
        }

        total_length = len(concatenated_samples["input_ids"])
        chunk_num = total_length // self.chunk_size
        cut = chunk_num * self.chunk_size

        # When fewer than `chunk_size` tokens are available, `cut` is 0: no
        # chunk is emitted and everything is carried over. (Previously this
        # case returned the flat token lists - not a list of chunks - and
        # cleared the residual, so the output had a different structure from
        # the normal branch and the carried-over tokens were not packed.)
        result = {
            k: [v[i : i + self.chunk_size] for i in range(0, cut, self.chunk_size)]
            for k, v in concatenated_samples.items()
        }
        self.residual = {k: v[cut:] for k, v in concatenated_samples.items()}

        # Causal-LM labels equal the inputs; copy each chunk so the two columns
        # do not share list objects.
        result["labels"] = [list(ids) for ids in result["input_ids"]]

        return result

In [8]:
def get_dataset(dataset):
    """Turn a dataset with a ``prompt`` column into packed training chunks.

    Steps: append the ``<|endoftext|>`` marker to every prompt, tokenize the
    resulting text with the notebook-level ``tokenizer``, drop the
    ``token_type_ids`` column, and pack the token streams with a fresh
    ``Concatenator`` (default chunk size).

    Args:
        dataset: Dataset whose rows have a ``prompt`` field.

    Returns:
        The dataset produced by the final packing ``map`` call.
    """
    template = """{prompt}<|endoftext|>"""

    def apply_prompt_template(sample):
        # Replace the row by a single "text" field holding prompt + EOS marker.
        return {"text": template.format(prompt=sample["prompt"])}

    def tokenize_text(batch):
        return tokenizer(batch["text"])

    with_text = dataset.map(
        apply_prompt_template,
        remove_columns=list(dataset.features),
    )
    tokenized = with_text.map(
        tokenize_text,
        batched=True,
        remove_columns=list(with_text.features),
    )
    tokenized = tokenized.remove_columns(['token_type_ids'])

    # A new Concatenator per call so leftover tokens never leak between datasets.
    return tokenized.map(Concatenator(), batched=True)

In [9]:
tokenized_train_dataset = get_dataset(train_dataset)
tokenized_val_dataset = get_dataset(eval_dataset)

Map:   0%|          | 0/19766 [00:00<?, ? examples/s]

Map:   0%|          | 0/19766 [00:00<?, ? examples/s]

Map:   0%|          | 0/19766 [00:00<?, ? examples/s]

Map:   0%|          | 0/2197 [00:00<?, ? examples/s]

Map:   0%|          | 0/2197 [00:00<?, ? examples/s]

Map:   0%|          | 0/2197 [00:00<?, ? examples/s]

In [10]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2978
})

In [11]:
# tokenizer.decode(tokenized_train_dataset[0]['input_ids'])

In [12]:
print(tokenized_train_dataset[0]['input_ids'][:200])

[14582, 25, 47199, 4246, 702, 311, 8284, 220, 20, 19, 15, 16302, 315, 7640, 429, 525, 19375, 1119, 220, 18, 15, 47630, 81720, 13, 1416, 279, 11601, 2783, 315, 1817, 17717, 374, 400, 16, 13, 20, 11, 1246, 1753, 686, 47199, 4246, 2291, 369, 279, 43754, 5267, 16141, 25, 2619, 525, 220, 20, 19, 15, 16302, 608, 220, 18, 15, 16302, 2899, 7698, 284, 1115, 20, 19, 15, 14, 18, 15, 28, 16, 23, 2452, 16, 23, 81720, 315, 7640, 4362, 13, 31040, 11, 279, 2790, 2783, 369, 279, 43754, 374, 400, 16, 13, 20, 2899, 7698, 856, 220, 16, 23, 81720, 284, 400, 2442, 16, 13, 20, 9, 16, 23, 28, 17, 22, 2452, 17, 22, 13, 576, 4226, 374, 220, 17, 22, 13, 151643, 14582, 25, 2585, 1657, 50436, 525, 2115, 304, 279, 1879, 5267, 32, 13, 2619, 614, 2581, 1012, 50436, 304, 279, 1879, 624, 33, 13, 2619, 374, 825, 25105, 2115, 304, 279, 1879, 624, 34, 13, 2619, 525, 1378, 50436, 2115, 304, 279, 1879, 624, 35, 13, 2619, 374, 264, 40936, 315, 50436, 2115, 304, 279, 1879, 624, 36, 13, 2619, 525, 902, 50436, 2115, 304, 279, 1

In [13]:
print(len(tokenized_train_dataset[0]['input_ids']))

2048


#### How does the base model do?

In [14]:
eval_prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-)
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
"""

In [None]:
# Tokenize the prompt and move the tensors to the device the model is on,
# instead of assuming a device literally named "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    # generate() returns a batch; take the single sequence and decode it
    # without special tokens.
    generated_ids = model.generate(**model_input, max_new_tokens=256)[0]
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))

We can see it doesn't do very well out of the box.

### 4. Set Up LoRA

Now, to start our fine-tuning, we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the model, then let PEFT apply its
# preprocessing for training a quantized model. The order of these two calls is
# kept as in the original notebook.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()`` once and prints the trainable element
    count, the total element count and the trainable share in percent.
    Raises ``ZeroDivisionError`` if the model exposes no parameters.
    """
    # (element count, requires_grad) for every parameter tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, wants_grad in sizes if wants_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

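Let's print the model to examine its layers, as we will apply QLoRA to all the linear layers of the model. Layer names depend on the architecture: `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`, and `lm_head` are the Llama/Mistral names, whereas the original Qwen modeling code names its linear layers `c_attn`, `c_proj`, `w1`, and `w2` (plus `lm_head`). Check the printed model and make sure `target_modules` below matches the names you see.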

In [None]:
# print(model)

Here we define the LoRA config.

`r` is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. A higher rank will allow for more expressivity, but there is a compute tradeoff.

`alpha` is the scaling factor for the learned weights. The weight matrix is scaled by `alpha/r`, and thus a higher value for `alpha` assigns more weight to the LoRA activations.

The values used in the QLoRA paper were `r=64` and `lora_alpha=16`, and these are said to generalize well, but we will use `r=8` and `lora_alpha=32` (a scaling factor of `alpha/r = 4`) so that we have more emphasis on the new fine-tuned data while also reducing computational complexity.

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,            # rank of the adapter matrices
    lora_alpha=32,  # adapter output is scaled by lora_alpha / r = 4
    # Linear layers of the original Qwen architecture. The Llama/Mistral names
    # (q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj) do not
    # exist in this model, so with them only `lm_head` would get an adapter.
    # NOTE(review): confirm these names against `print(model)`.
    target_modules=[
        "c_attn",   # fused query/key/value projection
        "c_proj",   # output projections (attention and MLP)
        "w1",       # MLP
        "w2",       # MLP
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# Optional: wrap the model with an `accelerate.Accelerator`. This notebook never
# creates `accelerator`, so define one before uncommenting the line below.
# model = accelerator.prepare_model(model)

See how the model looks different now, with the LoRA adapters added:


Let's use Weights & Biases to track our training metrics. You'll need to supply an API key when prompted. Feel free to skip this if you'd like, and just comment out the `wandb` parameters in the `Trainer` definition below.

In [None]:
!pip install -q wandb -U

import os
import wandb

# Authenticate with Weights & Biases (prompts for an API key if none is stored).
wandb.login()

# Runs are grouped under this project; an empty name leaves the environment untouched.
wandb_project = "qwen-finetune"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

### 5. Run Training!

The tutorial this notebook is based on started with 500 steps, found that the model had not converged by then, and raised the limit to 1000. The cell below uses `max_steps=500`; increase it if the loss is still falling when training ends.

A note on training. You can set the `max_steps` to be high initially, and examine at what step your model's performance starts to degrade. There is where you'll find a sweet spot for how many steps to perform. For example, say you start with 1000 steps, and find that at around 500 steps the model starts overfitting - the validation loss goes up (bad) while the training loss goes down significantly, meaning the model is learning the training set really well, but is unable to generalize to new datapoints. Therefore, 500 steps would be your sweet spot, so you would use the `checkpoint-500` model repo in your output dir (`qwen-qwen-finetune`) as your final model in step 6 below.

You can interrupt the process via Kernel -> Interrupt Kernel in the top nav bar once you realize you didn't need to train anymore.

In [None]:
import transformers
from datetime import datetime

project = "qwen-finetune"
base_model_name = "qwen"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name  # -> "./qwen-qwen-finetune"

# Kept so that anything that pads later has a pad token. The training batches
# themselves are fixed-length packed chunks and need no padding.
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,   # effective train batch: 2 * 4 = 8 sequences per device
        max_steps=500,
        learning_rate=2e-5,              # Deliberately small (note inherited from the Mistral tutorial: ~10x below the rate used there)
        logging_steps=20,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",            # Directory for storing logs
        save_strategy="steps",           # Save checkpoints on a step schedule
        save_steps=20,                   # Save a checkpoint every 20 steps
        evaluation_strategy="steps",     # Evaluate on a step schedule
        eval_steps=20,                   # Evaluate every 20 steps
        do_eval=True,                    # Enable evaluation on eval_dataset
        report_to="wandb",               # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    # The datasets are already packed into equal-length chunks that carry their
    # own `labels`, so batches only need to be stacked into tensors.
    # DataCollatorForLanguageModeling(mlm=False) would rebuild the labels and
    # replace every pad-token id with -100; because pad_token == eos_token here,
    # that would mask all <|endoftext|> labels and the model would never be
    # trained to end a response.
    data_collator=transformers.default_data_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Optional: upload a finished checkpoint to the Hugging Face Hub.
# The folder must match `output_dir` from the training cell
# ("./qwen-qwen-finetune") and a checkpoint step that was actually saved.
# from huggingface_hub import login, HfApi

# api = HfApi()

# api.upload_folder(
#     folder_path="qwen-qwen-finetune/checkpoint-500",
#     repo_id="quyanh/qwen-14b-neurips-v1",
#     repo_type='model',
# )