In [None]:
# !pip install flash-attn --no-build-isolation

In [None]:
# !pip install evaluate bitsandbytes

In [None]:
# !pip install --upgrade transformers torch accelerate  # run only if necessary

In [None]:
# !pip install -U datasets fsspec

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import matplotlib.pyplot as plt
from evaluate import load
import os

In [None]:
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# Choose the device placement: "auto" when a CUDA device is visible, otherwise "cpu".
device_map = "auto" if torch.cuda.is_available() else "cpu"
if device_map == "cpu":
    # Warn up front, since the rest of the notebook is written with a GPU in mind.
    print("CUDA not available, using CPU. This will be very slow.")

In [None]:
# 1. Define the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Before : {tokenizer.pad_token}")
# Fall back to EOS only when the checkpoint really ships without a pad token.
# Overwriting an existing pad token unconditionally would silently replace a
# dedicated padding symbol with EOS on checkpoints that define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # IMPORTANT: set padding_side to 'left' BEFORE tokenizing
print(f"After : {tokenizer.pad_token}")

# Before : <|endoftext|>
# After : <|endoftext|>


In [None]:
# 2. Load the dataset
# Only the "train" split of the Hub dataset is requested; the evaluation set
# is carved out of it in a later cell.
dataset_name = "Vishva007/Databricks-Dolly-4k"
dataset = load_dataset(path=dataset_name, split="train")

In [None]:
# 1. Format the dataset first
def format_dolly(sample):
    """Render one Dolly record as a single prompt/response training string.

    Args:
        sample: Mapping with "instruction", "context" and "response" entries.

    Returns:
        A dict ``{"text": prompt}``. The prompt always contains the
        ``### Instruction:`` and ``### Response:`` sections. The
        ``### Context:`` section is included only when the record carries a
        non-blank context, so context-free records do not spend tokens on an
        empty section (or on the literal string "None").
    """
    instruction = sample["instruction"]
    context = sample["context"]
    response = sample["response"]

    # Treat None and whitespace-only context as "no context".
    has_context = context is not None and str(context).strip() != ""
    if has_context:
        prompt = f"### Instruction:\n{instruction}\n\n### Context:\n{context}\n\n### Response:\n{response}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": prompt}

In [None]:
# Add the formatted "text" column to every row, then drop any row that is
# missing or whose "text" value is None.
dataset = (
    dataset
    .map(format_dolly)
    .filter(lambda row: not (row is None or row["text"] is None))
)


In [None]:
# 2. Now tokenize the formatted data
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of formatted prompts to a fixed sequence length.

    Args:
        examples: Batch mapping whose "text" entry holds the prompt strings.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 256, the value this notebook was originally
            hard-coded with, so existing ``dataset.map(tokenize_function, ...)``
            calls behave exactly as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


In [None]:
# Apply tokenization to create input_ids, attention_mask, etc.
# Every pre-existing column (the raw Dolly fields plus "text") is dropped so
# that only tokenizer outputs remain. The list is taken from the dataset itself
# rather than hard-coded: a missing name no longer breaks the call, and no
# stray column can slip through to the collator (TrainingArguments below sets
# remove_unused_columns=False, so nothing else would strip it).
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
# Hold out 10% of the tokenized rows for evaluation; the fixed seed keeps the
# partition identical across runs.
train_test_split = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

In [None]:
# 3. Configure QLoRA
# Adapters are attached to the attention query/key/value projections only.
# "o_proj", "gate_proj", "up_proj" and "down_proj" are further candidates that
# are currently left out of target_modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=16,               # adapter rank
    lora_alpha=32,      # LoRA alpha (twice the rank here)
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# 4. Quantization settings used when loading the base model:
# 4-bit NF4 weights with double quantization, computing in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the base model with the 4-bit quantization settings and device placement
# chosen in the earlier cells.
# `trust_remote_code` is deliberately NOT enabled: it lets code shipped in the
# Hub repository run locally, and this checkpoint is expected to load with the
# model classes built into transformers, so the extra trust is unnecessary.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=device_map,
    # attn_implementation="flash_attention_2",  # FlashAttention only supports Ampere GPUs or newer.
)

In [None]:
# Prepare the model for k-bit training
# Runs PEFT's preparation helper on the quantized base model before the LoRA
# adapters are attached in the next cell.
# NOTE(review): `model` is rebound in place, so re-running this cell applies the
# helper again to the already-prepared model; re-run the load cell first.
model = prepare_model_for_kbit_training(model)

In [None]:

# Wrap the prepared base model with the LoRA adapters described by lora_config,
# then report how many parameters remain trainable.
model = get_peft_model(model, peft_config=lora_config)
model.print_trainable_parameters()

# trainable params: 1,474,560 || all params: 495,507,328 || trainable%: 0.2976


In [None]:
# 5. Set up training arguments
# Single output location reused by later cells: it is passed to
# TrainingArguments, receives the saved LoRA adapters, and holds the loss plot.
output_dir = "./qwen2_5_dolly_qlora"  # Directory to save fine-tuned model

In [None]:
# !pip freeze > requirements.txt

In [None]:

# Load perplexity metric
# perplexity = load("perplexity", module_type="metric")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
#     return perplexity.compute(predictions=predictions, references=labels)
def compute_metrics(eval_pred):
    """Return no additional metrics for an evaluation pass.

    ``eval_pred`` is accepted for signature compatibility and ignored. The
    notebook relies on the ``eval_loss`` that Trainer reports on its own, so
    this hook contributes an empty mapping.
    """
    extra_metrics = {}
    return extra_metrics



In [None]:
# 6. Set up data collator
# mlm=False requests causal (next-token) language-modelling batches instead of
# masked-LM batches.
# NOTE(review): pad_token equals eos_token in this notebook; this collator is
# expected to exclude pad-token positions from the labels, which would also
# exclude any EOS tokens from the loss. Confirm that is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [None]:
# Hyperparameters and runtime switches for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir=output_dir,
    save_strategy="steps",
    save_steps=200,
    push_to_hub=False,
    # --- Batch sizing: 2 sequences x 16 accumulation steps = 32 per device per optimizer update ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    # --- Optimization schedule ---
    num_train_epochs=2,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_grad_norm=1.0,  # gradient clipping threshold
    # --- Precision and memory ---
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    eval_accumulation_steps=1,  # process eval in smaller chunks
    # --- Evaluation and model selection ---
    eval_strategy="steps",
    eval_steps=50,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    load_best_model_at_end=False,  # final-step weights are kept; the best checkpoint is not reloaded
    # --- Data loading ---
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    group_by_length=False,
    length_column_name=None,
    # --- Logging ---
    logging_dir="./logs",
    logging_steps=20,
    report_to="none",
)

In [None]:
# 7. Initialize the Trainer
# No compute_metrics hook is passed; evaluation reports eval_loss on its own.
# Early stopping halts the run after 3 consecutive evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


In [None]:
# 8. Start training
# Optional scaled-dot-product-attention backend toggles, currently left disabled:
# torch.backends.cuda.enable_flash_sdp(True)
# torch.backends.cuda.enable_mem_efficient_sdp(True)
# torch.backends.cuda.enable_math_sdp(False)
# Run the training loop; the result is kept because its `.metrics` is read in a later cell.
train_result = trainer.train()

# Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
# `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
#  [226/226 26:53, Epoch 2/2]
# Step	Training Loss	Validation Loss
# 50	2.437600	2.402970
# 100	2.312700	2.363461
# 150	2.295800	2.277348
# 200	2.216500	2.233537



In [None]:
# 9. Save the fine-tuned LoRA adapters
# `model` is the PEFT-wrapped model returned by get_peft_model; it is written to output_dir.
# NOTE(review): the tokenizer is not saved alongside it; presumably it is
# reloaded from `model_name` at inference time. Confirm.
model.save_pretrained(output_dir)

In [None]:
# Run one more evaluation pass on the held-out split and keep its metrics,
# alongside the summary metrics returned by the training run.
eval_history = trainer.evaluate()
train_history = train_result.metrics

In [None]:
# Perplexity is the exponential of the evaluation loss; it is reported only
# when the evaluation metrics actually contain "eval_loss".
final_eval_loss = eval_history.get("eval_loss")

if final_eval_loss is not None:
    final_perplexity = torch.tensor(final_eval_loss).exp().item()
    print(f"Final Evaluation Loss: {final_eval_loss:.4f}")
    print(f"Final Perplexity: {final_perplexity:.2f}")

# Final Evaluation Loss: 2.2326
# Final Perplexity: 9.32


In [None]:
# --- 11. Extracting Metrics from log_history for Plotting ---
# Training-step records are the ones carrying both "loss" and "learning_rate";
# evaluation records are the ones carrying "eval_loss".
train_losses = [
    {"step": record["step"], "loss": record["loss"]}
    for record in trainer.state.log_history
    if "loss" in record and "learning_rate" in record
]
eval_losses = [
    {"step": record["step"], "loss": record["eval_loss"]}
    for record in trainer.state.log_history
    if "eval_loss" in record
]
eval_steps = []  # Store steps where evaluation occurred (not populated in this cell)

# Split each series into parallel x/y lists for matplotlib.
train_steps_plot = [point["step"] for point in train_losses]
train_values_plot = [point["loss"] for point in train_losses]

eval_steps_plot = [point["step"] for point in eval_losses]
eval_values_plot = [point["loss"] for point in eval_losses]

# --- 12. Plotting the Training and Evaluation Loss ---
# Draw both loss curves against the training step on one figure.
plt.figure(figsize=(12, 6))
plt.plot(train_steps_plot, train_values_plot, label="Training Loss", marker='.')
plt.plot(eval_steps_plot, eval_values_plot, label="Evaluation Loss", marker='o', linestyle='--')

plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.title("Training and Evaluation Loss Over Steps")
plt.legend()
plt.grid(True)
plt.tight_layout()  # Adjust layout to prevent labels from overlapping

# Make sure the target directory exists before saving: savefig does not create
# missing directories, and this cell may be run without the save cell having
# populated output_dir first.
os.makedirs(output_dir, exist_ok=True)
plot_path = os.path.join(output_dir, "loss_plot.png")
plt.savefig(plot_path)
print(f"Loss plot saved to {plot_path}")
plt.show()