In [None]:
!pip install -U unsloth[torch] trl transformers datasets accelerate peft bitsandbytes


In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
from transformers import TrainingArguments
from trl import DPOTrainer
from datasets import load_dataset

# Apply Unsloth's speed-ups to TRL's DPO trainer before any trainer is built.
PatchDPOTrainer()

# Checkpoint to fine-tune; substitute any other model name here.
BASE_MODEL = "mistralai/Mistral-7B-v0.1"

# Load the checkpoint and its tokenizer in 4-bit (memory saver) with a
# 2048-token maximum sequence length.
_load_options = dict(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

# Attach LoRA adapters; no overrides are passed, so the library defaults apply.
model = FastLanguageModel.get_peft_model(model)


In [None]:
raw = load_dataset("trl-lib/lm-human-preferences-sentiment")
# DPO expects columns: prompt / chosen / rejected

# Keep the run small for Colab. The cap is clamped to the split size because
# select() raises on out-of-range indices when a split has fewer rows.
train = raw["train"].select(range(min(3000, len(raw["train"]))))

# The test split is optional: only index it when it exists, otherwise leave
# eval_ as None so later cells can pass it straight through.
eval_ = (
    raw["test"].select(range(min(300, len(raw["test"]))))
    if "test" in raw
    else None
)

def keep_cols(x):
    """Return a new dict holding only the prompt, chosen and rejected fields of ``x``.

    Raises KeyError if any of the three fields is missing from ``x``.
    """
    wanted = ("prompt", "chosen", "rejected")
    return {field: x[field] for field in wanted}

# Drop every training column other than the three DPO fields.
_dpo_fields = {"prompt", "chosen", "rejected"}
_surplus_columns = [name for name in train.column_names if name not in _dpo_fields]
train = train.map(keep_cols, remove_columns=_surplus_columns)

In [None]:
from unsloth import PatchDPOTrainer, is_bfloat16_supported
from trl import DPOTrainer, DPOConfig

# 0) Patch TRL's DPO with Unsloth speedups. This runs before the DPOTrainer
#    is constructed further down in this cell.
PatchDPOTrainer()

# 1) Ensure pad token exists: when the tokenizer defines none, reuse its EOS
#    token so that tokenizer.pad_token_id (read when building the config
#    below) is not None.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2) Use DPOConfig (NOT TrainingArguments). The DPO-specific hyperparameters
#    (beta, length limits, padding value, truncation mode) are all set here on
#    the config rather than being passed to the DPOTrainer constructor.
use_bf16 = is_bfloat16_supported()  # evaluate once; fp16 is the fallback

args = DPOConfig(
    output_dir="dpo-rl-colab3",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=1,                 # or use max_steps
    bf16=use_bf16,
    fp16=not use_bf16,
    lr_scheduler_type="cosine",
    report_to="none",
    beta=0.1,                           # DPO beta
    max_length=1024,                    # cap on prompt + completion tokens
    max_prompt_length=512,              # cap on prompt tokens
    padding_value=tokenizer.pad_token_id,  # <-- important for Unsloth DPO
    truncation_mode="keep_end",            # safe default for the collator
)

# 3) Build the trainer. ref_model=None is passed as in the Unsloth DPO examples.
# NOTE(review): recent TRL releases name the tokenizer argument
# `processing_class`; confirm which spelling the installed version expects.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    train_dataset=train,
    eval_dataset=eval_ if "test" in raw else None,
    tokenizer=tokenizer,
    args=args,
)

# 4) Run training.
trainer.train()


Now that the model is trained, you can use it to generate responses based on new prompts. The following cell shows an example of how to do this.

In [None]:
from transformers import pipeline

# Create a pipeline for text generation around the fine-tuned model.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompts relevant to the sentiment dataset: one positive, one negative.
# Both are generated from; previously the first was overwritten before use.
prompts = [
    "This movie is amazing and I really loved it because",
    "This movie is shit and I really disliked it because",
]

# Generate one continuation per prompt. After the loop, `prompt` and
# `generated_text` hold the values for the last prompt.
for prompt in prompts:
    generated_text = generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
    print(generated_text)