In [None]:
!pip install -U transformers accelerate bitsandbytes peft trl datasets huggingface_hub xformers


In [None]:
# Clean environment and pull the exact correct version of `trl`
!pip uninstall -y trl
!pip install git+https://github.com/huggingface/trl.git@main


In [None]:
from trl import DPOTrainer, DPOConfig


In [None]:
from trl import DPOTrainer
# Print the built-in help text of the installed DPOTrainer class.
# NOTE(review): looks like an exploratory step; nothing later in the notebook
# uses its output.
help(DPOTrainer)


In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub (required for the gated Llama 2
# weights). Calling login() without a token opens an interactive prompt, so the
# secret is typed at run time and never stored in the notebook source.
login()


In [None]:
from google.colab import files

# Two interactive upload prompts are shown, in this order:
#   1. the LoRA adapter archive (nightline-lora-adapter.zip)
#   2. the DPO dataset (cleaned_final_dpo_data.jsonl)
# `uploaded` is reassigned on every pass, so afterwards it only holds the
# result of the second (dataset) upload.
for upload_round in range(2):
    uploaded = files.upload()


In [None]:
!unzip nightline-lora-adapter.zip -d nightline-lora-adapter


In [None]:
# List the nested adapter folder (the same path the model-loading cell reads
# the LoRA adapter from) to check what was extracted.
!ls nightline-lora-adapter/nightline-lora-adapter


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Single source of truth for the checkpoint and the adapter location
BASE_MODEL_ID = "meta-llama/Llama-2-7b-hf"
ADAPTER_DIR = "./nightline-lora-adapter/nightline-lora-adapter"

# Load tokenizer.
# `trust_remote_code` is intentionally not enabled: Llama is implemented inside
# transformers itself, so there is no need to allow code from the Hub to run.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Load base model, letting accelerate place it on the available device(s) and
# taking the dtype from the checkpoint
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)

# Load LoRA adapter from nested folder.
# is_trainable=True loads it in training mode; the default (False) freezes the
# adapter for inference only, which is not what a further DPO stage needs.
model = PeftModel.from_pretrained(
    model,
    ADAPTER_DIR,
    is_trainable=True,
)


In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
from pathlib import Path

from datasets import load_dataset

# Expected name of the uploaded DPO dataset
DPO_DATA_FILE = "CLEANED_FINAL_DPO_DATA.jsonl"

# The upload step refers to this file in lower case, and the Colab filesystem
# is case-sensitive. Look the file up case-insensitively in the working
# directory, preferring an exact match. If nothing matches, the original name is
# passed through unchanged so load_dataset reports the missing file itself.
name_matches = sorted(
    entry.name
    for entry in Path(".").iterdir()
    if entry.name.lower() == DPO_DATA_FILE.lower()
)
if DPO_DATA_FILE in name_matches or not name_matches:
    dpo_data_path = DPO_DATA_FILE
else:
    dpo_data_path = name_matches[0]

# Load your DPO dataset
dataset = load_dataset("json", data_files=dpo_data_path, split="train")

# Show the first sample to confirm structure
dataset[0]


In [None]:
import torch
from trl import DPOTrainer, DPOConfig
from peft import PeftModel

# Set pad token (needed for DPOTrainer)
tokenizer.pad_token = tokenizer.eos_token

# Ensure model is in training mode
model.train()

# Train only the LoRA adapter weights: a parameter is trainable exactly when
# its name contains "lora"; every other parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = "lora" in name

# Optional: confirm what's trainable
if isinstance(model, PeftModel):
    model.print_trainable_parameters()

# Pick the mixed-precision mode the current GPU actually supports. Hard-coding
# bf16=True is rejected on GPUs without bfloat16 support, so fall back to fp16
# there (and to full precision when no GPU is present).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

# Define training config
training_args = DPOConfig(
    beta=0.1,
    output_dir="./dpo-tuned-llama",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch of 16 sequences per device
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_strategy="steps",
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none",
    padding_value=tokenizer.pad_token_id,
)

# Initialize DPO trainer.
# NOTE(review): no explicit reference model is passed. For PEFT models TRL is
# documented to derive the reference by disabling the adapter, which would make
# the plain base model (not the uploaded LoRA policy) the reference — confirm
# that this is the intended DPO baseline.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

# Begin training
trainer.train()


In [None]:
# Persist the trained model object and the tokenizer side by side in one
# folder, then report where they were written.
adapter_save_path = "./dpo_lora_adapter"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_save_path)
print(f"LoRA adapter saved to: {adapter_save_path}")


In [None]:
from peft import PeftModel

# Destination folder for the standalone, merged checkpoint
merged_model_path = "./dpo_merged_model"

# Fold the LoRA weights into the base model and strip the adapter wrappers
merged_model = model.merge_and_unload()

# Write the merged weights together with the tokenizer files
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print(f"Merged model saved to: {merged_model_path}")


In [None]:
import shutil

from google.colab import files

# No earlier cell creates dpo_lora_adapter.zip, so build it here from the saved
# adapter folder before asking the browser to download it.
adapter_archive = shutil.make_archive(
    "dpo_lora_adapter",  # archive base name; ".zip" is appended automatically
    "zip",
    root_dir="./dpo_lora_adapter",
)
files.download(adapter_archive)
