# References:

### Model code
- https://www.run.ai/guides/generative-ai/llama-2-fine-tuning
- https://github.com/curiousily/Get-Things-Done-with-Prompt-Engineering-and-LangChain/blob/master/llama-2.ipynb

### Custom dataset
- https://discuss.huggingface.co/t/loading-custom-datasets/6166/4
- https://dassum.medium.com/fine-tune-large-language-model-llm-on-a-custom-dataset-with-qlora-fb60abdeba07

# Training

In [1]:
import os
import deepspeed
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


[2024-03-17 17:31:00,775] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
base_model = "NousResearch/Llama-2-7b-chat-hf"
new_model = "llama-6k"

In [3]:
# Load the NIv2 dataset from the .json file in the directory
dataset = load_dataset('json', data_files='6k_train.json')


In [4]:
def combine_features(examples):
    combined_text = "Instruction: " + examples['instruction'] + " Question: " + examples['question'] + " Answer: " + examples['answer']
    return {"combined_text": combined_text}

# Apply the function to each example in the dataset
dataset['train'] = dataset['train'].map(combine_features)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'instruction', 'combined_text'],
        num_rows: 6248
    })
})

In [6]:
## Train Test split
# dataset = dataset['train'].train_test_split(test_size=0.1)

In [7]:
train_dataset = dataset["train"].map(lambda examples: {"text": examples["combined_text"]}, remove_columns=dataset["train"].column_names)
train_dataset

Dataset({
    features: ['text'],
    num_rows: 6248
})

In [8]:
# dataset = load_dataset(guanaco_dataset, split="train")
'''
data_files = {
    "train": "train.json",
    "validation": "valid.json",
    "test": "test.json"
}

dataset = load_dataset("json", data_files=data_files)
'''

'\ndata_files = {\n    "train": "train.json",\n    "validation": "valid.json",\n    "test": "test.json"\n}\n\ndataset = load_dataset("json", data_files=data_files)\n'

In [9]:
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype, 
    bnb_4bit_use_double_quant=False
)

In [10]:
# DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    base_model, 
    quantization_config=quant_config, 
    device_map={"": 0},
    use_flash_attention_2=False,
    torch_dtype=torch.float16
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.16s/it]


In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Tokenizer config
pad_token = tokenizer.eos_token
padding_side = "right"
max_length=2048
truncation=True
padding='max_length'
return_tensors='pt'


In [15]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        max_length=max_length, 
        padding=padding, 
        truncation=truncation, 
        return_tensors="pt",
        # pad_token = pad_token,
        # padding_side = padding_side
    )

# Apply this function to your dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 6248/6248 [00:04<00:00, 1464.18 examples/s]


In [16]:
peft_params = LoraConfig(
    lora_alpha=16, 
    lora_dropout=0.1, 
    r=64, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [17]:
training_params = TrainingArguments(
    output_dir="./results", 
    num_train_epochs=1, 
    per_device_train_batch_size=4, 
    gradient_accumulation_steps=1, 
    optim="paged_adamw_32bit",
    save_steps=25, 
    logging_steps=25, 
    learning_rate=2e-4, 
    weight_decay=0.001, 
    fp16=False, 
    bf16=True, 
    max_grad_norm=0.3, 
    max_steps=-1, 
    warmup_ratio=0.03, 
    group_by_length=True, 
    lr_scheduler_type="constant", 
    report_to="tensorboard"
)


In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False
)


Map: 100%|██████████| 6248/6248 [00:01<00:00, 6118.04 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [22]:
trainer.train()

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/peft/peft_model.py", line 1083, in forward
    return self.base_model(
           ^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
    return self.model.forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/transformers/models/llama/modeling_llama.py", line 1168, in forward
    outputs = self.model(
              ^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/transformers/models/llama/modeling_llama.py", line 1008, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/transformers/models/llama/modeling_llama.py", line 734, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
                                                          ^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tushar/.conda/envs/finetune/lib/python3.12/site-packages/transformers/models/llama/modeling_llama.py", line 665, in forward
    attn_output = torch.nn.functional.scaled_dot_product_attention(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: c10::BFloat16 and  query.dtype: float instead.


# Save the model

In [None]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

# Memory requirements

In [21]:
token_length = 512  # Tokens
bytes_per_token_id = 2  # bf16 for token IDs
bytes_per_attention_mask = 4  # int32 for attention masks
batch_size = 4

# Memory for token IDs per sample
memory_per_sample_ids = token_length * bytes_per_token_id

# Memory for attention mask per sample
memory_per_sample_attention_mask = token_length * bytes_per_attention_mask

# Total memory per sample for both token IDs and attention masks
total_memory_per_sample = memory_per_sample_ids + memory_per_sample_attention_mask

# Total memory for the batch
total_batch_memory = total_memory_per_sample * batch_size

print(f"Memory per sample for token IDs and attention masks: {total_memory_per_sample} bytes")
print(f"Total memory for batch (including token IDs and attention masks): {total_batch_memory/(1024*1024)} Mbytes")


Memory per sample for token IDs and attention masks: 3072 bytes
Total memory for batch (including token IDs and attention masks): 0.01171875 Mbytes


In [24]:
token_length = 2048
bytes_per_token_id = 4  # int32
bytes_per_attention_mask = 4  # Assuming int32 for simplicity
dataset_size = 10000

# Calculate memory requirements
memory_per_sample_ids = token_length * bytes_per_token_id
memory_per_sample_attention_mask = token_length * bytes_per_attention_mask
total_memory_per_sample = memory_per_sample_ids + memory_per_sample_attention_mask

total_memory_dataset_ids = memory_per_sample_ids * dataset_size
total_memory_dataset_attention_masks = memory_per_sample_attention_mask * dataset_size
total_memory_dataset = total_memory_per_sample * dataset_size

total_memory_dataset_ids_mb = total_memory_dataset_ids / (1024 * 1024)
total_memory_dataset_attention_masks_mb = total_memory_dataset_attention_masks / (1024 * 1024)
total_memory_dataset_mb = total_memory_dataset / (1024 * 1024)

total_memory_dataset_mb

156.25

# Evaluation

In [None]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

In [None]:
logging.set_verbosity(logging.CRITICAL)
prompt = "Who is Leonardo Da Vinci?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])