In [1]:
# Step 1: Install required packages
!pip install -q bitsandbytes>=0.41.0
!pip install -q datasets accelerate loralib transformers peft

In [2]:
# Step 2: Import necessary libraries
import os
import torch
import json
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from transformers import Trainer, DataCollatorForLanguageModeling
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Step 3: Set up GPU and environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use first GPU
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings

In [5]:
# Step 4: Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,  # Double quantization for more memory efficiency
    bnb_4bit_quant_type="nf4",       # Normal float 4-bit quantization for better accuracy
    bnb_4bit_compute_dtype=torch.float16  # Compute in fp16 for better performance
)

In [6]:
# Step 5: Load the Alif model with 4-bit quantization
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    "large-traversaal/Alif-1.0-8B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",    # Automatically determine device mapping
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("large-traversaal/Alif-1.0-8B-Instruct")

Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [7]:
# Ensure the tokenizer has padding token, EOS token and set left padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [8]:
# Step 6: Prepare model for QLoRA training
print("Preparing model for QLoRA fine-tuning...")
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

Preparing model for QLoRA fine-tuning...


In [9]:
# Step 7: Define LoRA configuration
lora_config = LoraConfig(
    r=64,                   # Rank dimension
    lora_alpha=128,          # Alpha parameter for LoRA scaling
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention modules
        "gate_proj", "up_proj", "down_proj"      # MLP modules
    ],
    lora_dropout=0.1,      # Dropout probability for LoRA layers
    bias="none",            # Don't train bias parameters
    task_type="CAUSAL_LM"   # Task type - causal language modeling
)

In [10]:
# Step 8: Apply LoRA to model
print("Applying LoRA adapters...")
model = get_peft_model(model, lora_config)

Applying LoRA adapters...




In [11]:
# Print trainable parameters percentage
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable params: {trainable_params} || "
        f"All params: {all_param} || "
        f"Trainable%: {100 * trainable_params / all_param:.2f}%"
    )

print_trainable_parameters(model)

Trainable params: 167772160 || All params: 5759045632 || Trainable%: 2.91%


In [12]:
# Step 9: Load and prepare the Urdu QA dataset
from datasets import DatasetDict, Dataset

print("Loading dataset...")
dataset_path = "/content/drive/MyDrive/Translation-Outputs/train_data_qa_full.json"

class Llama3InstructDataset:
    def __init__(self, data):
        self.data = data
        self.prompts = []
        self.create_prompts()

    def create_prompt(self, row):
        instruction = "آپ ایک اردو میں سوال و جواب کرنے والا مددگار ہیں۔ دیے گئے علم کی بنیاد پر سوال کا درست جواب دیں"

        # Format with Llama 3 tokens
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>{instruction}<|eot_id|><|start_header_id|>user<|end_header_id|>علم: {row['knowledge']}\n\nسوال: {row['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{row['right_answer']}<|eot_id|>"""
        return prompt

    def create_prompts(self):
        for row in self.data:
            prompt = self.create_prompt(row)
            self.prompts.append(prompt)

    def get_dataset(self):
        df = pd.DataFrame({'text': self.prompts})
        return df

# Load dataset
with open(dataset_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create formatted dataset
dataset_formatter = Llama3InstructDataset(data)
df = dataset_formatter.get_dataset()
dataset = Dataset.from_pandas(df)

print(f"Dataset size: {len(dataset)} examples")
print("Sample entry:")
print(dataset[0]['text'][:500] + "...")  # Print beginning of first example

Loading dataset...
Dataset size: 8000 examples
Sample entry:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>آپ ایک اردو میں سوال و جواب کرنے والا مددگار ہیں۔ دیے گئے علم کی بنیاد پر سوال کا درست جواب دیں<|eot_id|><|start_header_id|>user<|end_header_id|>علم: کیسل روڈ مغربی کنارے کو شمالی کنارے سے الگ کرتا ہے۔ کنڈا ہائی وے ضلع کی مغربی سرحد کے طور پر کام کرتا ہے ، جبکہ انڈپینڈنسی ایونیو / لبریشن ایونیو مشرقی سرحد ہے۔ بیورو آف نیشنل انویسٹی گیشنز کا ہیڈکوارٹر شمالی کنارے میں واقع ہے۔

سوال: مغربی کنارے اور کنارے جس کی مشرقی سرحد انڈپینڈنسی ایونیو...


In [13]:
class CustomDataCollator:
    def __init__(self, tokenizer, max_length=512):  # Add max_length parameter with a default value
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, examples):
        # Process the text field from examples
        inputs = self.tokenizer([example["text"] for example in examples],
                              padding=True,
                              truncation=True,
                              max_length=self.max_length,
                              return_tensors="pt")

        # Set up labels for causal language modeling
        inputs["labels"] = inputs["input_ids"].clone()

        return inputs

In [14]:
# Step 11: Initialize trainer with your custom collator
training_args = TrainingArguments(
    output_dir="./results/urdu-qa-model_2",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="epoch",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    seed=42,
    push_to_hub=False,
    remove_unused_columns=False,
    optim="adamw_bnb_8bit"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=CustomDataCollator(tokenizer, max_length=512)
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# Step 13: Disable cache for training (to save memory)
model.config.use_cache = False

In [16]:
# Step 14: Train the model
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
1,4.7186
2,4.6859
3,5.0475
4,4.5957
5,4.4306
6,4.6318
7,4.6578
8,3.7181
9,4.7367
10,4.1676


Step,Training Loss
1,4.7186
2,4.6859
3,5.0475
4,4.5957
5,4.4306
6,4.6318
7,4.6578
8,3.7181
9,4.7367
10,4.1676


TrainOutput(global_step=5000, training_loss=1.8936672295928, metrics={'train_runtime': 20117.9979, 'train_samples_per_second': 3.977, 'train_steps_per_second': 0.249, 'total_flos': 1.8488002942178427e+18, 'train_loss': 1.8936672295928, 'epoch': 10.0})

In [17]:
# Step 15: Save the fine-tuned model
output_dir = "/content/drive/MyDrive/fine-tuning-alif/fine_tuned_model-alif"

# Create the directory if it doesn't exist
import os
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to /content/drive/MyDrive/fine-tuning-alif/fine_tuned_model-alif


In [18]:
# Step 16: Optional - Push to Hugging Face Hub

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Push adapter weights and tokenizer to HF Hub
model.push_to_hub("zenu12/alif-finetuned-haluqa")
tokenizer.push_to_hub("zenu12/alif-finetuned-haluqa")

adapter_model.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zenu12/alif-finetuned-haluqa/commit/c4b41b79a3692e70eddf898cd25d64b4245db0d4', commit_message='Upload tokenizer', commit_description='', oid='c4b41b79a3692e70eddf898cd25d64b4245db0d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/zenu12/alif-finetuned-haluqa', endpoint='https://huggingface.co', repo_type='model', repo_id='zenu12/alif-finetuned-haluqa'), pr_revision=None, pr_num=None)