# Hugging Face Training 

See Docs:

[Doc1](https://wandb.ai/byyoung3/mlnews2/reports/Fine-Tuning-Llama-3-with-LoRA-TorchTune-vs-HuggingFace--Vmlldzo3NjE3NzAz)

[Doc2](https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-tune-an-LLM-Part-3-The-HuggingFace-Trainer--Vmlldzo1OTEyNjMy)

In [1]:
!pip install torch==2.2.0 transformers==4.40.2 datasets==2.19.1 trl==0.8.6 peft==0.11.0 wandb==0.17.0 accelerate==0.27.2 -q


[0m

In [3]:
import os
from dataclasses import dataclass, field
from typing import Optional
from datasets.arrow_dataset import Dataset
import torch
from datasets import load_dataset
from peft import LoraConfig
from peft import AutoPeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)


from trl import SFTTrainer


@dataclass
class ScriptArguments:
    """
    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    """


    local_rank: Optional[int] = field(
        default=-1, 
        metadata={"help": "Used for multi-gpu"},
    )

    per_device_train_batch_size: Optional[int] = field(default=1)
    per_device_eval_batch_size: Optional[int] = field(default=4)
    gradient_accumulation_steps: Optional[int] = field(default=17)
    learning_rate: Optional[float] = field(default=3e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    weight_decay: Optional[int] = field(default=0.01)
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.0)
    lora_r: Optional[int] = field(default=8)
    max_seq_length: Optional[int] = field(default=4096)
    model_name: Optional[str] = field(
        default="meta-llama/Meta-Llama-3-8B",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        }
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "use a different tokenizer than the model_name"
        }
    )
    
    dataset_name: Optional[str] = field(
        default="tatsu-lab/alpaca",
        metadata={"help": "The preference dataset to use."},
    )


    use_4bit: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate 4bit precision base model loading"},
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    fp16: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=True,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_torch",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        # default="cosine_with_warmup",
        default="cosine",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=100_000, metadata={"help": "How many optimizer update steps to take"})
    warmup_steps: int = field(default=100, metadata={"help": "# of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )
    save_steps: int = field(default=200, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=5, metadata={"help": "Log every X updates steps."})
    merge_and_push: Optional[bool] = field(
        default=True,
        metadata={"help": "Merge and push weights after training"},
    )
    output_dir: str = field(
        default="./out",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]


def create_and_prepare_model(args):
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    # commented qlora stuff 
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=args.use_4bit,
    #     bnb_4bit_quant_type=args.bnb_4bit_quant_type,
    #     bnb_4bit_compute_dtype=compute_dtype,
    #     bnb_4bit_use_double_quant=args.use_nested_quant,
    # )


    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)


    # Load the entire model on the GPU 0
    # switch to `device_map = "auto"` for multi-GPU

    # device_map = {"": "auto"}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, 
        # quantization_config=bnb_config, 
        device_map="auto", 
        use_auth_token=True,
    )
    
    peft_config = LoraConfig(
        lora_alpha=script_args.lora_alpha,
        lora_dropout=script_args.lora_dropout,
        # target_modules=["query_key_value"], 
        r=script_args.lora_r,
        bias="none",
        task_type="CAUSAL_LM", 
        # target_modules=['q_proj', 'v_proj'],
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"]
    )

    
    if args.model_name is not None:
        tokenizer_name = script_args.tokenizer_name
    else:
        tokenizer_name = script_args.model_name
        
    tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token


    return model, peft_config, tokenizer


def get_custom_dataset(tokenizer, split='train'):
    
    split = 'train'
    
    def tokenize_dialog(dialog, tokenizer):
        dialog_tokens = tokenizer.apply_chat_template(dialog, tokenize=False)
        idx = dialog_tokens.index('<|start_header_id|>assistant<|end_header_id|>')
        
        input_ids = tokenizer.encode(dialog_tokens)
        labels = tokenizer.encode(dialog_tokens[idx:])[1:]
        
        combined_tokens = {
            "input_ids": input_ids,
            "labels": [-100] * (len(input_ids) - len(labels)) + labels,
        }

        return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
    
    dataset = load_dataset("parquet", data_files={'/root/llama_/qa_data_chat.parquet'}, split=split)
    dataset = dataset.map(lambda x: tokenize_dialog(x["chat"], tokenizer), remove_columns=list(dataset.features))
    # need to add padding

    return dataset



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# def get_custom_dataset(tokenizer, split='train'):
    
#     split = 'train'
    
#     def tokenize_dialog(dialog, tokenizer):
#         dialog_tokens = tokenizer.apply_chat_template(dialog, tokenize=False)
#         idx = dialog_tokens.index('<|start_header_id|>assistant<|end_header_id|>')
        
#         input_ids = tokenizer.encode(dialog_tokens, padding='max_length', max_length=512)
#         labels = tokenizer.encode(dialog_tokens[idx:])[1:]
        
#         combined_tokens = {
#             "input_ids": input_ids,
#             "labels": [-100] * (len(input_ids) - len(labels)) + labels,
#         }

#         return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
    
#     dataset = load_dataset("parquet", data_files={'/root/llama_/qa_data_chat.parquet'}, split=split)
#     dataset = dataset.map(lambda x: tokenize_dialog(x["chat"], tokenizer), remove_columns=list(dataset.features))
#     # need to add padding

#     return dataset

In [5]:
# dataset_ = get_custom_dataset(tokenizer)


In [5]:
# def gen_batches_train():
#     ds = load_dataset(script_args.dataset_name, streaming=True, split="train")

#     for sample in iter(ds):


#         # Extract instruction and input from the sample
#         instruction = str(sample['instruction'])
#         input_text = str(sample['input'])
#         out_text = str(sample['output'])
#         formatted_prompt = None 
            
#         if input_text is None or input_text == "":
#             formatted_prompt = (
#                 f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
#                 f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"
#                 f"<|eot_id|><|start_header_id|>asssitant<|end_header_id|>\n\n",
#                 f"{str(out_text)}"
#                 f"<|eot_id|><|end_of_text|>"
#             )
#         else:
#             formatted_prompt = (
#                 f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
#                 f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
#                 f"<|eot_id|><|start_header_id|>asssitant<|end_header_id|>\n\n"
#                 f"{str(out_text)}"
#                 f"<|eot_id|><|end_of_text|>"
#             )
        
#         formatted_prompt = "".join(formatted_prompt)
#         yield {'text': formatted_prompt}

In [4]:
script_args.fp16 = False
script_args.per_device_train_batch_size = 1
script_args.max_steps = 1600

training_arguments = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    warmup_steps=script_args.warmup_steps,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)

script_args.model_name = '/root/llama/models--meta-llama--Meta-Llama-2-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/'
script_args.tokenizer_name = '/root/llama/models--meta-llama--Meta-Llama-2-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/'
# script_args.tokenizer_name = '/root/llama/outputs/final_merged_checkpoint'
# script_args.model_name = '/root/llama/outputs/final_merged_checkpoint'


model, peft_config, tokenizer = create_and_prepare_model(script_args)
tokenizer.padding_side = "left"


dataset = get_custom_dataset(tokenizer)


Loading checkpoint shards: 100%|██████████| 4/4 [00:44<00:00, 11.08s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 793/793 [00:00<00:00, 1303.69 examples/s]


In [5]:
print(dataset[0]['input_ids'])

[128000, 128000, 128006, 882, 128007, 271, 3923, 374, 279, 1510, 6671, 304, 40677, 1494, 11, 29762, 11, 323, 1268, 374, 433, 74055, 279, 6593, 13077, 323, 6978, 304, 279, 3158, 30, 128009, 128006, 78191, 128007, 271, 1688, 220, 2366, 19, 1474, 352, 12, 2318, 11, 24461, 304, 40677, 1494, 527, 520, 5326, 315, 1694, 43206, 4245, 311, 264, 12330, 315, 14363, 323, 15902, 1274, 11, 323, 279, 6671, 374, 54677, 13, 578, 16286, 13695, 31589, 279, 14373, 3973, 27736, 449, 15212, 304, 264, 39493, 25834, 11, 323, 449, 279, 53250, 315, 40677, 1494, 11, 6921, 1457, 11835, 682, 315, 29762, 753, 87199, 369, 279, 1176, 892, 2533, 433, 62765, 17312, 323, 61107, 505, 279, 18455, 7154, 1403, 11026, 4227, 13, 578, 40677, 1494, 27736, 374, 279, 1193, 2035, 1405, 1274, 649, 3810, 323, 4974, 13, 578, 8738, 11039, 323, 22722, 315, 279, 87199, 1436, 5353, 279, 18678, 315, 12576, 7677, 323, 264, 38748, 65396, 13, 8442, 264, 4948, 315, 29762, 753, 220, 1927, 24461, 323, 6156, 2890, 2512, 36282, 527, 3318, 323, 68

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)


trainer.train()


Detected kernel version 4.14.343, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myaakovtayeb[0m ([33mprivate_[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,2.3572
10,2.6549
15,2.2545
20,2.1382
25,1.7061
30,1.8469
35,1.3917
40,1.7731
45,1.4592
50,1.5773




TrainOutput(global_step=1600, training_loss=0.18069023273004858, metrics={'train_runtime': 7892.2598, 'train_samples_per_second': 3.446, 'train_steps_per_second': 0.203, 'total_flos': 1.422223241408594e+17, 'train_loss': 0.18069023273004858, 'epoch': 34.30012610340479})

In [None]:
script_args.merge_and_push = True
script_args.output_dir = "./outputs"

if script_args.merge_and_push:
    output_dir = os.path.join(script_args.output_dir, "final_checkpoints")
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    torch.cuda.empty_cache()


    model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
    model = model.merge_and_unload()


    output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
    model.save_pretrained(output_merged_dir, safe_serialization=True)


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


In [None]:
print('DONE')

DONE


# inference

In [12]:
# !ls /root/llama/models--meta-llama--Meta-Llama-2-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/

In [2]:
!pip -q install transformers
# diffusers==0.12.1 

[0m

In [None]:
# model_id = "/root/llama/outputs/final_merged_checkpoint"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)


In [8]:
# import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_id = "/root/llama/outputs/final_merged_checkpoint"
model_id = "/root/llama/models/Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", use_auth_token=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:45<00:00, 11.32s/it]
Process ForkProcess-4:
Process ForkProcess-2:
Process ForkProcess-3:
Process ForkProcess-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/concurren

In [9]:
def predict(question, model, tokenizer):

    messages = [
        {"role": "system", "content": "you are a llama AI model who want to answer questions accuratly and nicely"},
        {"role": "user", "content": question},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to('cuda')

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1.0,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(response, skip_special_tokens=True)
    
    return response


In [11]:
answer = predict("how to do inferenece for Llama2 model?", model, tokenizer)
print(answer)


# Inference with LlamaIndex

In [2]:
!pip install -q llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface

[0m

In [3]:
import torch
from transformers import AutoTokenizer
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.llms import ChatMessage
from IPython.display import Markdown, display


base_model = "/root/llama/models/Llama-3-8B-Instruct"
tunned_llama3 = "/root/llama/outputs/final_merged_checkpoint"

tokenizer = AutoTokenizer.from_pretrained(tunned_llama3)

stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

llm = HuggingFaceLLM(
    model_name=tunned_llama3,
    model_kwargs={
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 1.0,
        "top_p": 0.9,
    },
    tokenizer_name=base_model,
    stopping_ids=stopping_ids,
)


  from .autonotebook import tqdm as notebook_tqdm

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model `/root/llama/outputs/final_merged_checkpoint` and tokenizer `/root/llama/models/Llama-3-8B-Instruct` are different, please ensure that they are compatible.


In [6]:
%%time

response = llm.complete("Who are the most important people in Hamas and what their role is? When answering use a list format")

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


.
Here is a list of some of the most important people in Hamas and their roles:
1. Ismail Haniyeh: Haniyeh is the current political leader of Hamas and the head of the organization's political bureau. He has been the leader since 2017 and has been instrumental in shaping the organization's policies and strategies.
2. Khalid Mashal: Mashal is a former leader of Hamas and the head of the organization's political bureau from 2004 to 2017. He played a key role in the organization's military and political activities during the Second Intifada and was known for his strong anti-Israel stance.
3. Mahmoud al-Zahar: al-Zahar is a senior leader of Hamas and has been the head of the organization's military wing since 2006. He has been involved in the organization's military activities, including rocket attacks on Israel, and has been a key figure in the organization's relations with Iran.
4. Fathi Hammad: Hammad is a senior leader of Hamas and has been the head of the organization's interior minis

In [8]:
%%time

response = llm.complete("Today is May 29 2020. who are, currently, the most important people in Gaza? when answering, explain the reason for their importance. Use a list format")

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


, number the reasons, and provide names and titles.

At 2024-may-29, the most important people in Gaza are chiefs of the main armed factions, Hamas and Islamic Jihad, due to their ability to command and control their organisations, as well as any civilian population they may be controlling. The leaders of the main armed factions are the most important people in Gaza, as they are the ones with the military command and control. 

Here is the list of names and titles:

1. Ismail Haniyeh, chief of the Hamas operational command
2. Aziz Awqat, head of the Hamas political bureau
3. Mohammad al-Hadid, commander of the Hamas interior ministry
4. Faiq Mabhouh, head of the Hamas national security
5. Mohammed Khatib, commander of the Islamic Jihad operational command
6. Walid al-Baasa, head of the Islamic Jihad political bureau
7. Naim Abu al-Neyyem, commander of the Islamic Jihad interior ministry
8. Ramadan Shalah, head of the Islamic Jihad national security

Note: The leaders of the main armed 

In [9]:
%%time

response = llm.complete("Today is May 29 2020. who are, currently, the most important people in Gaza? when answering, explain the reason for their importance. Use a list format")

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


, numbered from 1 to 10.

1. Baher Shawqi, the chairman of the Gaza chamber of commerce, who is important for his efforts to revive trade and investment in Gaza.
2. Faiq Al-Qatnani, the director of the Gaza seaport, who is vital for the development of the seaport and the expansion of trade through it.
3. Ahmed Al-Kurd, the owner of the Kuwait Gulf Oil Company, who is important for his plans to rebuild the hotel industry in Gaza.
4. Iyad AbdelAziz, the CEO of Palm Chambers, who is vital for the reconstruction of the housing sector in Gaza.
5. Mohammed Al-Gharabli, the director of the Gaza airport, who is important for the development of air transportation in Gaza.
6. Raafat Al-Barazi, the chairman of the Palestinian federation of labor unions, who is vital for his efforts to organize labor and improve working conditions in Gaza.
7. Yousef Al-Zabaida, the owner of Yousef Al-Zabaida & Sons, who is important for his plans to rebuild the furniture industry in Gaza.
8. Mohammad AbuNada, the 

In [9]:
# model

In [None]:
%%timeit

answer = predict(question="tell about most recent events in Gaza")
print(answer)



# Covert Model to GGUF (ollama)
[Instructions](https://github.com/ggerganov/llama.cpp/discussions/2948)

[publish to ollama](https://www.youtube.com/watch?v=3BnnsQCmgLo)

In [2]:
# !git clone https://github.com/ggerganov/llama.cpp.git
# !pip install -r llama.cpp/requirements.txt

In [8]:
# !python llama.cpp/convert.py

In [9]:
# !python llama.cpp/convert.py /root/llama/outputs/final_merged_checkpoint --outfile /root/llama/outputs/llama3-custome.gguf --vocab-type bpe


In [None]:
!ollama rm llama-news