In [8]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import get_peft_model, LoftQConfig, LoraConfig, PeftModel, PeftConfig
from trl import SFTTrainer
from dotenv import load_dotenv
from accelerate import Accelerator
import os
from nvidia_smi import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

- [x] LoftQ Configure
- [x] Check for LoRA config parameters
- [x] Accelerate for multiple GPUs (ZeRO)
- [x] BitsAndBytes (depending on LoftQ requirements)
- [x] Put training Parameters from paper

In [2]:
accelerator = Accelerator()

device = "cpu" # accelerator.device

In [3]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
train_data[0]

{'Unnamed: 0': 53.0,
 'split': 'llm',
 'query': 'what was the height of the inca empire',
 'prediction': 'The official language of the empire was Quechua , although hundreds of local languages and dialects of Quechua were spoken.',
 'context': 'Prior to its destruction by the Spaniards, the temple had what is believed to be the largest single roof in the Incan Empire, having its peak at the central wall, then stretching over the columns and some 25 metres (82 ft) beyond on each side. The huge proportions of the temple and its prominence on the site explain why the whole archaeological complex is also sometimes referred to as the Temple of Wiracocha.',
 'src': 'wiki_qa',
 'id': 'Q2977_Inca Empire',
 'context_src': 'oscar',
 'context_id': '8985565',
 'original_context': False,
 'task': 'qa',
 'domain': 'openqa',
 'c_len': 73,
 '__index_level_0__': 71990}

In [3]:
max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number
packing = True # Packing multiple examples into one sequence
base_model = "meta-llama/Llama-2-13b-chat-hf"
output_dir = "../models/itrf/7b-4bit-init-causal"

# Load environment variables from .env file
load_dotenv("../.env")

# Access the environment variable
token = os.getenv('HUGGINGFACE_TOKEN')
write_token = os.getenv('HUGGINGFACE__WRITE_TOKEN')

# Load model
model = LlamaForCausalLM.from_pretrained(base_model, attn_implementation="flash_attention_2", device_map=device, torch_dtype=torch.bfloat16, token=token)
tokenizer = AutoTokenizer.from_pretrained(base_model, token=token)
tokenizer.add_special_tokens({"additional_special_tokens": ["<>", "<inst_e>"]})
model.resize_token_embeddings(len(tokenizer))
# For batch tokenization and packing
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Load environment variables from .env file
base_model = "meta-llama/Llama-2-13b-chat-hf"
load_dotenv("../.env")

# Access the environment variable
token = os.getenv('HUGGINGFACE_TOKEN')
write_token = os.getenv('HUGGINGFACE__WRITE_TOKEN')

# Load from pretrained
model = LlamaForCausalLM.from_pretrained(
                base_model, 
                attn_implementation="flash_attention_2",
                device_map=device,
                torch_dtype="auto", # torch.bfloat16,  # you may change it with different models
                quantization_config=BitsAndBytesConfig(
                                load_in_4bit=True,
                                bnb_4bit_compute_dtype=torch.bfloat16,  # bfloat16 is recommended
                                bnb_4bit_use_double_quant=False,
                                bnb_4bit_quant_type='nf4',
                            ),
                token=token)
tokenizer = AutoTokenizer.from_pretrained(base_model, token=token)
tokenizer.add_special_tokens({"additional_special_tokens": ["[INST]", "[/INST]", "[ANS]", "[/ANS]"]})
model.resize_token_embeddings(len(tokenizer))
tokenizer = AutoTokenizer.from_pretrained(base_model, token=token)
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"
pedr_model = PeftModel.from_pretrained(model, "../models/itrf/13b-qlora/checkpoint-1000", device_map=device, torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
pedr_model.push_to_hub("tristanratz/itrf-13b-qlora", token=write_token)

adapter_model.safetensors:   0%|          | 0.00/604M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tristanratz/itrf-7b-lora/commit/601bdab4c0a72cfd1e11d4d91c88b8074dd79a63', commit_message='Upload model', commit_description='', oid='601bdab4c0a72cfd1e11d4d91c88b8074dd79a63', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
loftq_config = LoftQConfig(loftq_bits=4) # set 8bit quantization
lora_config = LoraConfig(
          r=16,
          lora_alpha=32,
          target_modules="all-linear",
          lora_dropout=0.05,
          bias="none",
          init_lora_weights="loftq", 
          loftq_config=loftq_config,
          task_type="CAUSAL_LM",
      )

In [5]:
peft_model = get_peft_model(model, lora_config)

In [6]:
peft_model.print_trainable_parameters()

trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.589770503135875


In [7]:
peft_model.save_pretrained(output_dir)

In [9]:
# df = pd.read_parquet("../data/dataset/itrf_dataset_llm.parquet")
# df = df.drop_duplicates()
# df["c_len"] = df["context"].map(lambda x: len(x.split(" ")))
# df = df[df["c_len"] > 15]
# dataset = Dataset.from_pandas(df)
val_set_size = 0.1

dataset = load_dataset("parquet", data_files="../data/dataset/itrf_dataset_llm_processed2.parquet")
len(dataset["train"])
# open("../data/dataset/itrf_dataset_llm.parquet")

#split thte data to train/val set
train_val = dataset["train"].train_test_split(
    test_size=val_set_size, shuffle=True, seed=2024
)
train_data = train_val["train"].shuffle(seed=2024)
val_data = train_val["test"].shuffle(seed=2024)

In [10]:
# Bring the examples in the form of the prompt
def formatting_func(example):
    text = f"<s>Background: {example['context']}\n\n<inst_a>{example['query']}<inst_e><ans_a>{example['prediction']}<eos>"
    return text

In [3]:
args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size=4, # Batch size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus (if using DataParallel)
    gradient_accumulation_steps=4, # Helps to fit larger batch sizes into memory, slightly increases training time
    gradient_checkpointing=True,
    warmup_steps=100,
    # num_train_epochs=500,
    max_steps=1000,
    learning_rate=1e-5,
    bf16=True, # Use bfloat16 precision if u have the right hardware
    logging_steps=10,
    optim="adafactor", # Adafactor can be less memory intensive than AdamW but may require more steps to converge
    logging_dir="../data/training_output",
    lr_scheduler_type="cosine",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=10,
    load_best_model_at_end=True,
    )    

trainer = accelerator.prepare(SFTTrainer(
        model=peft_model,
        args=args,
        train_dataset=train_data,
        eval_dataset=val_data,      
        packing=True, # Packing multiple examples into one sequence
        formatting_func=formatting_func,
        # data_collator=data_collator,
        max_seq_length=max_seq_length,
    ))

NameError: name 'output_dir' is not defined

In [12]:
# peft_model.merge_and_unload()

# peft_model.save_pretrained(output_dir)

# trainer.save_model(output_dir)
# trainer.model.save_pretrained(output_dir)
# model.save("../models/itrf/7b-4bit-init-2")
# model.save_pretrained("../models/itrf/7b-4bit-init/")
# m = peft_model.merge_and_unload()

m.save_pretrained(output_dir)

NameError: name 'm' is not defined

In [12]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 22.69 MiB is free. Process 2573861 has 23.59 GiB memory in use. Of the allocated memory 21.99 GiB is allocated by PyTorch, and 230.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [15]:
# peft_model_2 = AutoPeftModel.from_pretrained("../models/itrf/7b-4bit-init")
adapter = output_dir
PeftModel.from_pretrained(model, adapter, lora_config, device_map=device)
# PeftModel.load_adapter(model, adapter, lora_config, device_map=device)

ValueError: Can't find 'adapter_config.json' at '../models/rarit/7b-4bit-init-causal'

In [15]:
config = PeftConfig.from_pretrained("../models/itrf/7b-4bit-init-causal")

TypeError: PeftConfig.__init__() got an unexpected keyword argument 'architectures'

## Deprecated

In [None]:
# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")

    return list(lora_module_names)

# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {100 * trainable_params / all_param}"
    )