In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.login(key="2917a6564f5c6c7991cc83796354c93085be91b5")
run = wandb.init(
    project='Fine-tune Llama 3.2 on Aplaca cleaned uzbek Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myakhyo9696[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/yakhyo/.netrc


In [3]:
base_model = "llama-3.2-1b-uz/"
new_model = "llama-3.2-1b-uz-instruct"
dataset_name = "behbudiy/alpaca-cleaned-uz"

In [4]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [5]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Example dtype, can be torch.bfloat16 or torch.float32
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attentio

In [10]:
tokenizer.default_chat_template

AttributeError: 'LlamaTokenizerFast' object has no attribute 'default_chat_template'

In [6]:
# Importing the dataset
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000))  # Only use 1000 samples for quick demo

# Define the instruction (system message)
instruction = """Siz katta bilimga ega sun'iy intellekt modelisiz va foydalanuvchilarning savollariga aniq va tushunarli tarzda javob berishga qodirsiz. 
Muloqot davomida doimo aniq va foydalanuvchi uchun qulay bo'ling, do'stona ohangda yozing va barcha savollarga javob bering. Sizning ismingiz Afandi
"""

# Format the chat template manually
def format_chat_template(row):
    # Manually concatenate the system, user, and assistant roles into a single string
    formatted_input = (
        f"Qo'llanma: {instruction}\n"
        f"Foydalanuvchi: {row['instruction']}\n"
        f"Yordamchi: {row['output']}\n"
    )
    
    # Tokenize the formatted input without applying a chat template method
    row["text"] = formatted_input
    return row

# Apply the formatting to the dataset
dataset = dataset.map(format_chat_template, num_proc=4)

# Check if the 'text' column now has the formatted input
print(dataset[0]["text"])


Qo'llanma: Siz katta bilimga ega sun'iy intellekt modelisiz va foydalanuvchilarning savollariga aniq va tushunarli tarzda javob berishga qodirsiz. 
Muloqot davomida doimo aniq va foydalanuvchi uchun qulay bo'ling, do'stona ohangda yozing va barcha savollarga javob bering. Sizning ismingiz Afandi

Foydalanuvchi: Metall va nometallarning xossalarini solishtiring va taqqoslang.
Yordamchi: Metalllar va metall bo'lmaganlar juda boshqacha xususiyatlarga ega bo'lgan elementlarning ikkita asosiy guruhidir. Ikki guruh o'rtasidagi asosiy farqlardan ba'zilari:

**Tashqi ko'rinishi:** Metallar odatda yaltiroq, yorqin va odatda kumush yoki kulrang rangga ega. Boshqa tomondan, metall bo'lmaganlar turli xil ranglarda bo'lishi mumkin va odatda porloq emas.

**Holat:** Ko'pgina metallar xona haroratida qattiq (simobdan tashqari), ba'zi nometallar esa gaz (kislorod va azot kabi), suyuqlik (brom) yoki qattiq (masalan, uglerod va oltingugurt) shaklida mavjud.

**Egiluvchanlik va egiluvchanlik:** Metallar 

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    # Update to target torch.nn.Linear instead of bnb.nn.Linear4bit
    linear_types = (torch.nn.Linear,)  # Only torch linear layers
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, linear_types):
            names = name.split('.')
            lora_module_names.add(names[-1])
    if 'lm_head' in lora_module_names:  # Typically exclude 'lm_head'
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [14]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [17]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    # eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1405.12 examples/s]


ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [23]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

# Step 1: Set up BitsAndBytesConfig for 4-bit quantization (QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # You can also use torch.bfloat16 or torch.float32
    bnb_4bit_use_double_quant=True,
)

# Step 2: Load the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    "llama-3.2-1b-uz",  # Replace with your base model path
    quantization_config=bnb_config,
    device_map="auto"
)

# Step 3: Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1b", trust_remote_code=True)

# Step 4: Load your instruction-based dataset
dataset_name = "behbudiy/alpaca-cleaned-uz"  # Replace with your actual dataset
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000))  # Use 1000 samples for a quick demo

# Step 5: Define the instruction template
instruction = """Siz katta bilimga ega sun'iy intellekt modelisiz va foydalanuvchilarning savollariga aniq va tushunarli tarzda javob berishga qodirsiz. 
Muloqot davomida doimo aniq va foydalanuvchi uchun qulay bo'ling, do'stona ohangda yozing va barcha savollarga javob bering. Sizning ismingiz Afandi.
"""

# Step 6: Format the instruction-based dataset
def format_chat_template(row):
    # Concatenate the system, user, and assistant roles into a single string
    formatted_input = (
        f"Qo'llanma: {instruction}\n"
        f"Foydalanuvchi: {row['instruction']}\n"
        f"Yordamchi: {row['output']}\n"
    )
    row["text"] = formatted_input
    return row

# Apply the formatting function to the dataset
dataset = dataset.map(format_chat_template, num_proc=4)

# # Step 7: Tokenize the instruction-based dataset
# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 8: Apply LoRA configuration for fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Adapt this to your specific task (e.g., SEQ2SEQ_LM for translation)
    r=8,  # Low-rank adaptation dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout probability for regularization
)

# Add LoRA layers to the quantized model
peft_model = get_peft_model(model, lora_config)

# Step 9: Define TrainingArguments for fine-tuning
training_args = TrainingArguments(
    output_dir="./results",  # Where to save the results
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    per_device_eval_batch_size=2,   # Adjust based on your GPU memory
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if available
    gradient_accumulation_steps=8,  # Use gradient accumulation to simulate larger batches
)

# Step 10: Create the Trainer and start fine-tuning
trainer = Trainer(
    model=peft_model,  # Fine-tune the PEFT (LoRA) model
    args=training_args,
    train_dataset=dataset,  # Use the tokenized dataset
    tokenizer=tokenizer,
)

# Step 11: Fine-tune the model
trainer.train()

# Step 12: Save the fine-tuned model
trainer.save_model("./fine_tuned_llama_3_2_1b")


Map (num_proc=4): 100%|██████████| 1000/1000 [00:00<00:00, 5071.04 examples/s]


ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [13]:
import torch


for param in model.parameters():
    print(param.dtype)
    break


torch.float32


In [14]:
from transformers import AutoModel, AutoTokenizer

# Load your model and tokenizer
model_name = "llama-3.2-1b-uz"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert model to float16 (half precision)
model.half()

# Save the model in float16
model.save_pretrained("model_fp16")
tokenizer.save_pretrained("model_fp16")


('model_fp16/tokenizer_config.json',
 'model_fp16/special_tokens_map.json',
 'model_fp16/tokenizer.json')