In [4]:
!pip install -U unsloth[torch] trl transformers datasets accelerate peft bitsandbytes


Collecting trl
  Using cached trl-0.25.0-py3-none-any.whl.metadata (11 kB)


In [5]:
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer
from transformers import TrainingArguments
from trl import DPOTrainer
from datasets import load_dataset

# Patch TRL's DPO trainer with Unsloth’s speed-ups
PatchDPOTrainer()  # important for Unsloth RL

BASE_MODEL = "mistralai/Mistral-7B-v0.1"  # or another small instruct model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    load_in_4bit=True,            # memory saver; DPO + LoRA is fine here
)
model = FastLanguageModel.get_peft_model(model)  # add LoRA adapters


==((====))==  Unsloth 2025.11.1: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
raw = load_dataset("trl-lib/lm-human-preferences-sentiment")
# DPO expects columns: prompt / chosen / rejected
train = raw["train"].select(range(3000))  # keep it small for Colab
eval_ = raw["test"].select(range(300))    # optional if present

def keep_cols(x):
    return {"prompt": x["prompt"], "chosen": x["chosen"], "rejected": x["rejected"]}

train = train.map(keep_cols, remove_columns=[c for c in train.column_names if c not in {"prompt","chosen","rejected"}])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/324k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4992 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1272 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
from unsloth import PatchDPOTrainer, is_bfloat16_supported
from trl import DPOTrainer, DPOConfig

# 0) Patch TRL's DPO with Unsloth speedups
PatchDPOTrainer()

# 1) Ensure pad token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2) Use DPOConfig (NOT TrainingArguments), include padding_value
args = DPOConfig(
    output_dir="dpo-rl-colab3",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=1,                 # or use max_steps
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    lr_scheduler_type="cosine",
    report_to="none",
    padding_value=tokenizer.pad_token_id,  # <-- important for Unsloth DPO
    truncation_mode="keep_end",            # safe default for the collator
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    beta=0.1,
    train_dataset=train,
    eval_dataset=eval_ if "test" in raw else None,
    tokenizer=tokenizer,
    max_length=1024,
    max_prompt_length=512,
    args=args,
)

trainer.train()


Extracting prompt in train dataset (num_proc=16):   0%|          | 0/3000 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/3000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/3000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/300 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/300 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,000 | Num Epochs = 1 | Total steps = 188
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 7,283,675,136 (0.58% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.693,-0.000345,-0.000746,0.41875,0.000401,-52.408253,-53.348938,-2.949501,-2.939445,0,0,0
20,0.693,0.000672,0.0004,0.475,0.000272,-53.503761,-53.655609,-2.955189,-2.934483,No Log,No Log,No Log
30,0.6919,0.00014,-0.002555,0.51875,0.002695,-53.810131,-53.053417,-2.982998,-2.968248,No Log,No Log,No Log
40,0.6877,0.005802,-0.005474,0.625,0.011276,-52.071449,-52.842506,-2.963692,-2.948175,No Log,No Log,No Log
50,0.6812,0.005266,-0.020198,0.64375,0.025464,-54.084099,-52.454193,-2.972651,-2.957529,No Log,No Log,No Log
60,0.6787,-0.009808,-0.041627,0.60625,0.031819,-53.527771,-54.741199,-2.976275,-2.958977,No Log,No Log,No Log
70,0.6693,-0.055722,-0.11187,0.63125,0.056148,-52.918346,-55.356953,-2.961649,-2.944667,No Log,No Log,No Log
80,0.6702,-0.114137,-0.176887,0.5875,0.062749,-53.43927,-55.3619,-2.952533,-2.925983,No Log,No Log,No Log
90,0.673,-0.097659,-0.159106,0.60625,0.061447,-54.900978,-55.624573,-2.947849,-2.927872,No Log,No Log,No Log
100,0.6813,-0.079708,-0.124177,0.51875,0.044469,-53.945881,-53.89098,-2.951003,-2.959134,No Log,No Log,No Log


TrainOutput(global_step=188, training_loss=0.6736042524905915, metrics={'train_runtime': 848.4594, 'train_samples_per_second': 3.536, 'train_steps_per_second': 0.222, 'total_flos': 0.0, 'train_loss': 0.6736042524905915, 'epoch': 1.0})

Now that the model is trained, you can use it to generate responses based on new prompts. The following cell shows an example of how to do this.

In [13]:
from transformers import pipeline

# Create a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define a prompt relevant to the sentiment dataset
prompt = "This movie is amazing and I really loved it because"

# Define a prompt relevant to the sentiment dataset
prompt = "This movie is shit and I really disliked it because"

# Generate text
generated_text = generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']

print(generated_text)

Device set to use cuda:0


This movie is shit and I really disliked it because it is full of lies. I've watched it with my own eyes and I am telling you that it is a movie made by people who have never seen a real American and have
