In [2]:
# https://medium.com/@avishekpaul31/fine-tuning-llama-3-8b-instruct-qlora-using-low-cost-resources-89075e0dfa04

from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import BitsAndBytesConfig
model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     quantization_config=quantization_config,
#     device_map="cuda:1",
# )
tokenizer = AutoTokenizer.from_pretrained(model_id)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# messages = [
#     {"role": "user", "content": "How to make methane by kitchen waste answer in 1 sentence?"},
# ]

# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     return_tensors="pt"
# ).to(model.device)

# terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>")
# ]

# torch.backends.cuda.enable_mem_efficient_sdp(False)
# torch.backends.cuda.enable_flash_sdp(False)
# # https://github.com/Lightning-AI/litgpt/issues/327

# outputs = model.generate(
#     input_ids,
#     max_new_tokens=128,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.6,
#     top_p=0.9,
# )

# response = outputs[0][input_ids.shape[-1]:]

# print(tokenizer.decode(response, skip_special_tokens=True))

In [4]:
# import json
# with open("data_train.json", "r") as f:
#     train = json.load(f)

# with open("data_val.json", "r") as f:
#     valid = json.load(f)

In [5]:
# train = list(map(lambda x: {"text":tokenizer.apply_chat_template(x, tokenize=False)}, train))
# valid = list(map(lambda x: {"text":tokenizer.apply_chat_template(x, tokenize=False)}, valid))

In [6]:
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": "data_train.json", "valid": "data_val.json"})

In [7]:
dataset["train"]

Dataset({
    features: ['0', '1'],
    num_rows: 170
})

In [8]:
from datasets import DatasetDict

dataset_dict = {"train": dataset["train"],
                "val": dataset["valid"]}

raw_datasets = DatasetDict(dataset_dict)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['0', '1'],
        num_rows: 170
    })
    val: Dataset({
        features: ['0', '1'],
        num_rows: 170
    })
})

In [9]:
import re
import random
from multiprocessing import cpu_count

def apply_chat_template(example, tokenizer):
    messages = example
    # print((messages)['0']["role"])
    # We add an empty system message if there is none
    # if messages[0]["role"] != "system":
    #     messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template([messages['0'],messages['1']], tokenize=False)

    return example

In [10]:
column_names = list(raw_datasets["train"].features)
raw_datasets = raw_datasets.map(apply_chat_template,
                                num_proc=1,
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template",)

In [11]:
print(raw_datasets["train"][10]["text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

What steps should be taken if problematic overfitting is observed in the best trials?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

If problematic overfitting is observed, rerun the experiments with additional regularization techniques (e.g., dropout, label smoothing, weight decay) and/or better tune existing regularization parameters to mitigate the overfitting before comparing the scientific hyperparameters.<|eot_id|>


In [12]:
# del trainer 
# torch.cuda.empty_cache()

NameError: name 'trainer' is not defined

In [14]:
model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
trained_model_id = "Llama-3-70B-sft-lora-ultrachat"
output_dir = './' + trained_model_id
from transformers import TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

# based on config
training_args = TrainingArguments(
    fp16=False, # specify bf16=True instead when training on GPUs that support bf16 else fp16
    bf16=False,
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=3,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    # push_to_hub=True,
    # hub_model_id=trained_model_id,
    # hub_strategy="every_save",
    # report_to="tensorboard",
    report_to="none",  # for skipping wandb logging
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

# based on config
peft_config = LoraConfig(
        r=32,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

model_kwargs = dict(
    torch_dtype="auto",
    use_cache=False,
    device_map={'':"cuda"},
    quantization_config=quantization_config,
)

trainer = SFTTrainer(
        model=model_id,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=raw_datasets["train"],
        eval_dataset=raw_datasets["val"],
        dataset_text_field="text",
        tokenizer=tokenizer,
        # packing=True,
        peft_config=peft_config,
        max_seq_length=1024#tokenizer.model_max_length,
    )

# To clear out cache for unsuccessful run
torch.cuda.empty_cache()
tokenizer.pad_token = tokenizer.eos_token
train_result = trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Loading checkpoint shards: 100%|██████████| 30/30 [00:50<00:00,  1.68s/it]
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
***** Running training *****
  Num examples = 170
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 510
  Number of trainable parameters = 131,072,000


Epoch,Training Loss,Validation Loss


In [None]:
#