In [2]:
import torch
import json
import re
import math
import random
import os
import numpy as np
from tqdm import tqdm
from torch import Tensor
import torch.nn.functional as F
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TrainingArguments, Trainer, get_cosine_schedule_with_warmup
from peft import LoraConfig, get_peft_model
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu
from peft import PeftModel
from trl import DPOTrainer, DPOConfig
from utils import *
from datasets import Dataset as HFDataset


  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm


[2025-12-05 00:41:30,639] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status




  @autocast_custom_fwd
  @autocast_custom_bwd


In [None]:
model_path = Path("models/ThinkLite-VL-7B")
sft_training_path = Path("data/alpaca_en_mini.json")
sft_adaptor_path = Path("checkpoints/sft_adaptor")
raw_response_path = Path("data/raw_responses.jsonl")
dpo_training_path = Path("data/dpo_training.jsonl")
dpo_model_path = Path("checkpoints/dpo_no_self_instruction")
test_path = Path("data/alpaca_en_test.json")

# Load Base Model

In [7]:
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.75s/it]


# Use LoRA 

In [None]:
base_model.config.use_cache = False
base_model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'
    ],
    lora_dropout=0.1,
    bias='none',
    task_type='CAUSAL_LM'
)

policy_model = get_peft_model(base_model, lora_config)
policy_model.print_trainable_parameters()

## Load Training Dataset

In [6]:
train_json = Path(sft_training_path)
with train_json.open('r', encoding='utf-8') as f:
    raw_samples = json.load(f)

print(f'Loaded {len(raw_samples)} samples from {train_json}')
print('Instruction in sample 0:', raw_samples[0]['instruction'])
print('Input in sample 0:', raw_samples[0]['input'])
print('Output in sample 0:', raw_samples[0]['output'])

test_file = Path(test_path)
with test_file.open('r', encoding='utf-8') as f:
    test_samples = json.load(f)
for i in range(len(test_samples)):
    test_samples[i] = alpaca_to_sharegpt(test_samples[i])
print('num test samples:', len(test_samples))
print('test sample: ',test_samples[3])

Loaded 20 samples from /blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/data/alpaca_en_mini.json
Instruction in sample 0: Describe a process of making crepes.
Input in sample 0: 
Output in sample 0: Making crepes is an easy and delicious process! Here are step-by-step instructions on how to make them:

1. Assemble your ingredients. For basic crepes, you'll need: 1 cup all-purpose flour, 2 eggs, 1/2 cup milk, 1/2 cup water, 1/4 teaspoon salt, and 2 tablespoons melted butter.

2. Mix the batter: In a large mixing bowl, whisk together the flour and the eggs. Gradually add the milk and water, stirring constantly to ensure that there are no lumps. Add salt and melted butter, and mix well.

3. Let the batter rest: If you can, let the batter sit for an hour or so. This will help the flour to absorb the liquid and make the crepes more tender.

4. Heat your pan: Preheat a non-stick pan over medium heat. Lightly butter the pan or use cooking spray to prevent the crepes from sticking.

5. Pou

# SFT The Seed Dataset

In [7]:
class TextSFTDataset(Dataset):
    def __init__(self, data: List[Dict[str, Any]]):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        ex = self.data[idx]
        return {"messages": alpaca_to_sharegpt(ex)}
class TextSFTCollator:
    def __init__(self, processor, max_length: int = 4096):
        self.processor = processor
        self.tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else processor
        self.max_length = max_length
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
    def __call__(self, batch: List[Dict[str, Any]]):
        conversations = [item["messages"] for item in batch]
        texts = [
            self.processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
            for msgs in conversations
        ]
        enc = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=self.max_length is not None,
            max_length=self.max_length,
        )
        labels = enc["input_ids"].clone()
        pad_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100
        enc["labels"] = labels
        return enc
sft_dataset = TextSFTDataset(raw_samples)
sft_collator = TextSFTCollator(processor, max_length=4096)
print(f"Dataset ready with {len(sft_dataset)} samples. First messages example:")
print(sft_dataset[0]["messages"])


Dataset ready with 20 samples. First messages example:
[{'role': 'user', 'content': 'Describe a process of making crepes.'}, {'role': 'assistant', 'content': "Making crepes is an easy and delicious process! Here are step-by-step instructions on how to make them:\n\n1. Assemble your ingredients. For basic crepes, you'll need: 1 cup all-purpose flour, 2 eggs, 1/2 cup milk, 1/2 cup water, 1/4 teaspoon salt, and 2 tablespoons melted butter.\n\n2. Mix the batter: In a large mixing bowl, whisk together the flour and the eggs. Gradually add the milk and water, stirring constantly to ensure that there are no lumps. Add salt and melted butter, and mix well.\n\n3. Let the batter rest: If you can, let the batter sit for an hour or so. This will help the flour to absorb the liquid and make the crepes more tender.\n\n4. Heat your pan: Preheat a non-stick pan over medium heat. Lightly butter the pan or use cooking spray to prevent the crepes from sticking.\n\n5. Pour the batter: Using a ladle or a m

## LoRA SFT 

In [8]:
bf16_flag = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
fp16_flag = torch.cuda.is_available() and not bf16_flag

training_args = TrainingArguments(
    output_dir='outputs/thinklite_vision_lora_sft',
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5.5e-6,
    lr_scheduler_type="cosine", 
    weight_decay=0.01,
    logging_steps=1,
    save_strategy='no',
    report_to=[],
    remove_unused_columns=False,
    bf16=bf16_flag,
    fp16=fp16_flag
)

trainer = Trainer(
    model=policy_model,
    args=training_args,
    train_dataset=sft_dataset,
    data_collator=sft_collator,
    tokenizer=processor.tokenizer
)

print('LoRA Trainer ready.')
trainer.train()



  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


LoRA Trainer ready.


Step,Training Loss
1,1.7342
2,4.5924
3,4.6557
4,2.6788
5,3.0437


TrainOutput(global_step=5, training_loss=3.340950036048889, metrics={'train_runtime': 21.0123, 'train_samples_per_second': 0.952, 'train_steps_per_second': 0.238, 'total_flos': 190119641530368.0, 'train_loss': 3.340950036048889, 'epoch': 1.0})

In [9]:
# Save LoRA adapter after SFT
policy_model.save_pretrained(sft_adaptor_path.as_posix())
processor.tokenizer.save_pretrained(sft_adaptor_path.as_posix())

('/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/tokenizer_config.json',
 '/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/special_tokens_map.json',
 '/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/vocab.json',
 '/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/merges.txt',
 '/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/added_tokens.json',
 '/blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/checkpoints/sft_adaptor/tokenizer.json')

## SFT Test Score

In [None]:
results = evaluate_generation_metrics(
    model=policy_model,      
    processor=processor,
    test_samples=test_samples,
    max_new_tokens=128,
    device=policy_model.device
)
print("Aggregated Metrics:")
print(f"ROUGE-1: {results['rouge1_mean']:.2f}")
print(f"ROUGE-2: {results['rouge2_mean']:.2f}")
print(f"ROUGE-L: {results['rougeL_mean']:.2f}")
print(f"BLEU-4: {results['bleu4']:.2f}")

Evaluating sample 1/5
user input: What are some of the key characteristics of a competitive market?
reference: A competitive market is characterized by the presence of multiple buyers and sellers competing with each other to buy and sell goods and services at the best possible price. Some of the key characteristics of a competitive market are:

1. Large number of buyers and sellers: In a competitive market, there are a large number of buyers and sellers, and no single buyer or seller can influence the market price.

2. Homogeneous products: The products offered by different firms in a competitive market are similar, if not identical. This makes it easy for buyers to compare prices and make a purchase decision.

3. Freedom of entry and exit: Firms can freely enter or exit the market, which ensures that new firms can enter the market if existing firms are making profits, and loss-making firms can exit the market.

4. Perfect information: In a competitive market, buyers and sellers have a

# Generate DPO Samples

In [None]:
policy_model.eval()
device = policy_model.device
print('model_device:', device)


PROMPT_GEN_TEMPLATE = (
    "You are an expert meteorologist creating challenging user instructions that require analyzing weather maps."
    "Below are example instructions (After User:) and responses(After Assistant:). Write a new instruction in a similar style that asks for a structured weather analysis."
    "Return only the new instruction. Do not include 'User:'. The new instruction should be concise and not exceed 30 words. "
    "{few_shot_block}"
    "New instruction:"
)

REWARD_PROMPT_TEMPLATE = (
    "Review the user's question and the corresponding response using the additive 5-point scoring system described below. "
    "Points are accumulated based on the satisfaction of each criterion: - Add 1 point if the response is relevant and provides some information related to the user's inquiry, even if it is incomplete or contains some irrelevant content. "
    "- Add another point if the response addresses a substantial portion of the user's question, but does not completely resolve the query or provide a direct answer. "
    "- Award a third point if the response answers the basic elements of the user's question in a useful way, regardless of whether it seems to have been written by an AI Assistant or if it has elements typically found in blogs or search results. "
    "- Grant a fourth point if the response is clearly written from an AI Assistant's perspective, addressing the user's question directly and comprehensively, and is well-organized and helpful, even if there is slight room for improvement in clarity, conciseness or focus. "
    "- Bestow a fifth point for a response that is impeccably tailored to the user's question by an AI Assistant, without extraneous information, reflecting expert knowledge, and demonstrating a high-quality, engaging, and insightful answer. "
    "User: {instruction}<response>{response}</response>"
    "After examining the user's instruction and the response:"
    "- Briefly justify your total score, up to 100 words."
    "- Conclude with the score using the format: Score: <total points>"
    "Remember to assess from the AI Assistant perspective, utilizing web search knowledge as necessary. To evaluate the response in alignment with this additive scoring model, we will systematically attribute points based on the outlined criteria."
)

reward_score_regex = re.compile(r"Score:\s*([0-5](?:\.\d+)?)")
num_candidates = 4

dpo_records = []
random.seed(42)

for entry in tqdm(sft_dataset, desc='Building self-rewarding DPO data'):
    instruction = entry["messages"][0]["content"]
    print("Instruction:", instruction)
    candidate_responses = generate_candidates(  
        model=policy_model,
        processor=processor,
        instruction=instruction,
        num_candidates=num_candidates,
        generation_kwargs=dict(max_new_tokens=512, temperature=0.7, top_p=0.9, do_sample=True),
        device=device
    )

    scored_candidates = []
    for candidate in candidate_responses:
        score, judgement = judge_response(model=policy_model, tokenizer=processor, instruction=instruction, response=candidate, reward_prompt_template=REWARD_PROMPT_TEMPLATE, reward_score_regex=reward_score_regex, n_votes=3, device=device)
        scored_candidates.append({"response": candidate, "score": score, "judge_output": judgement})

    valid_candidates = [c for c in scored_candidates if not math.isnan(c['score'])]  # drop failed parses
    if len(valid_candidates) < 2:
        continue

    sorted_candidates = sorted(valid_candidates, key=lambda c: c['score'], reverse=True)  # best-to-worst
    best = sorted_candidates[0]
    worst = sorted_candidates[-1]

    dpo_records.append({  # assemble final DPO entry
        'instruction': instruction,
        'candidates': scored_candidates,
        'chosen': best['response'],
        'chosen_score': best['score'],
        'rejected': worst['response'],
        'rejected_score': worst['score']
    })

print(f'Constructed {len(dpo_records)} DPO samples (after filtering).')


hf_device_map: cuda:0


Building self-rewarding DPO data:   0%|          | 0/20 [00:00<?, ?it/s]

Instruction: Describe a process of making crepes.


Building self-rewarding DPO data:   5%|▌         | 1/20 [01:51<35:25, 111.86s/it]

Instruction: Transform the following sentence using a synonym: The car sped quickly.


Building self-rewarding DPO data:  10%|█         | 2/20 [02:16<18:07, 60.41s/it] 

Instruction: Make a persuasive argument to promote recycling.


Building self-rewarding DPO data:  15%|█▌        | 3/20 [03:50<21:31, 75.95s/it]

Instruction: Invent a new word by combining two existing words.


Building self-rewarding DPO data:  20%|██        | 4/20 [04:20<15:23, 57.75s/it]

Instruction: Give an example of a job that a computer can do better than a human being.


Building self-rewarding DPO data:  25%|██▌       | 5/20 [05:04<13:12, 52.84s/it]

Instruction: Given the parameters of a triangle, find out its perimeter.

Side 1 = 4
Side 2 = 6
Side 3 = 8


Building self-rewarding DPO data:  30%|███       | 6/20 [05:54<12:07, 51.96s/it]

Instruction: Create an effective 140 character twitter post


Building self-rewarding DPO data:  35%|███▌      | 7/20 [06:23<09:35, 44.24s/it]

Instruction: Produce a list of the top 5 NHL players in 2021.


Building self-rewarding DPO data:  40%|████      | 8/20 [08:00<12:12, 61.07s/it]

Instruction: Reword this sentence to increase clarity

The idea of her being so brave made me smile


Building self-rewarding DPO data:  45%|████▌     | 9/20 [08:25<09:08, 49.87s/it]

Instruction: Explain the differences between birds and mammals


Building self-rewarding DPO data:  50%|█████     | 10/20 [10:19<11:37, 69.74s/it]

Instruction: Generate a one-sentence title for a creative recipe.


Building self-rewarding DPO data:  55%|█████▌    | 11/20 [10:46<08:30, 56.67s/it]

Instruction: Explain the concept of e-commerce.


Building self-rewarding DPO data:  60%|██████    | 12/20 [12:13<08:45, 65.67s/it]

Instruction: Design an experiment to evaluate the efficacy of the proposed method.

Proposed Method: Neural persistence


Building self-rewarding DPO data:  65%|██████▌   | 13/20 [14:09<09:27, 81.02s/it]

Instruction: Generate a list of five different books about science.


Building self-rewarding DPO data:  70%|███████   | 14/20 [15:10<07:29, 74.97s/it]

Instruction: Brainstorm some activities that could make an in-person work meeting more engaging.


Building self-rewarding DPO data:  75%|███████▌  | 15/20 [16:58<07:05, 85.01s/it]

Instruction: Brainstorm a list of titles for a photo album


Building self-rewarding DPO data:  80%|████████  | 16/20 [17:53<05:03, 75.84s/it]

Instruction: Rewrite the sentence so that it's in the present tense.

She had worked at the company for the past 3 years.


Building self-rewarding DPO data:  85%|████████▌ | 17/20 [18:18<03:01, 60.50s/it]

Instruction: Adapt the provided joke to make it more humorous.

Why did the frog cross the road?


Building self-rewarding DPO data:  90%|█████████ | 18/20 [18:45<01:41, 50.69s/it]

Instruction: Create an AI chatbot


Building self-rewarding DPO data:  95%|█████████▌| 19/20 [20:42<01:10, 70.34s/it]

Instruction: Explain what a circuit breaker is.


Building self-rewarding DPO data: 100%|██████████| 20/20 [21:57<00:00, 65.88s/it]

Constructed 11 DPO samples (after filtering).





## Save and Preview DPO Dataset


In [49]:
with raw_response_path.open('w', encoding='utf-8') as f:
    for item in dpo_records:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f'Saved {len(dpo_records)} pairs to {raw_response_path}')

with raw_response_path.open('r', encoding='utf-8') as f:
    first_line = f.readline().strip()
    if first_line:
        preview = json.loads(first_line)
        print('Sample entry keys:', list(preview.keys()))
        print('Chosen score:', preview.get('chosen_score'), 'Rejected score:', preview.get('rejected_score'))


Saved 11 pairs to /blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/data/raw_responses.jsonl
Sample entry keys: ['instruction', 'candidates', 'chosen', 'chosen_score', 'rejected', 'rejected_score']
Chosen score: 4.5 Rejected score: 4.0


## Clean DPO dataset

In [50]:
dpo_training_path.parent.mkdir(parents=True, exist_ok=True)

with_scores = False
TIE_EPS = 1e-6
total = kept = dropped_nan = dropped_tie = dropped_other = 0

with dpo_training_path.open("w", encoding="utf-8") as g:
    for rec in iter_records(raw_response_path):
        total += 1
        # Count the number of NaN scores in the original candidates (for overview)
        cands = rec.get("candidates", [])
        if isinstance(cands, list) and any(to_float(c.get("score")) is None for c in cands):
            dropped_nan += 1
        dpo = build_dpo_entry(rec, include_scores=with_scores)
        if dpo is None:
            # Determine the reason (as best as possible)
            if not rec.get("candidates") or len(rec.get("candidates")) < 2:
                dropped_other += 1
            else:
                # Re-evaluate if it's a tie
                vals = [to_float(c.get("score")) for c in rec["candidates"]]
                vals = [v for v in vals if v is not None]
                if len(vals) >= 2 and (max(vals) - min(vals) <= TIE_EPS):
                    dropped_tie += 1
                else:
                    dropped_other += 1
            continue
            # Write out the valid DPO entry
        g.write(json.dumps(dpo, ensure_ascii=False) + "\n")
        kept += 1

print(f"[done] total={total}, kept={kept}, "
      f"had_nan_candidates={dropped_nan}, dropped_tie={dropped_tie}, dropped_other={dropped_other}")
print(f"[out] {dpo_training_path}")


[done] total=11, kept=10, had_nan_candidates=4, dropped_tie=1, dropped_other=0
[out] /blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/data/dpo_training.jsonl


# DPO Training (Reference model is replaced by a different Lora adapter)

In [None]:
# Use the previous LoRA as the reference model, then update the current LoRA as the policy model
previous_adaptor_path = sft_adaptor_path  
current_adaptor_path = sft_adaptor_path  
policy_model = load_policy_with_lora(base_model, current_adaptor_path)
ref_model = load_policy_with_lora(base_model, previous_adaptor_path)
ref_model.eval()
for p in ref_model.parameters():
    p.requires_grad_(False)
del base_model
torch.cuda.empty_cache()



In [None]:
print(f"Loaded {len(raw_dpo_pairs)} raw pairs from {dpo_training_path}")
rows = []
for rec in raw_dpo_pairs:
    prompt = rec.get("prompt") or rec.get("instruction") or rec.get("query")
    chosen, rejected = rec.get("chosen"), rec.get("rejected")
    if not prompt or not chosen or not rejected:
        continue
    rows.append({"prompt": to_chat_prompt(prompt), "chosen": chosen, "rejected": rejected})
dpo_dataset = HFDataset.from_list(rows)
print("First formatted row:", {k: v[:120] for k, v in dpo_dataset[0].items()})
# safety: set precision flags if not already defined
if "bf16_flag" not in globals():
    bf16_flag = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
if "fp16_flag" not in globals():
    fp16_flag = torch.cuda.is_available() and not bf16_flag
dpo_args = DPOConfig(
    output_dir=dpo_model_path.as_posix(),
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-6,
    logging_steps=1,
    save_strategy="no",
    report_to=[],
    remove_unused_columns=False,
    bf16=bf16_flag,
    fp16=fp16_flag,
    disable_tqdm=False
)
dpo_trainer = DPOTrainer(
    model=policy_model,      
    ref_model=ref_model,    
    args=dpo_args,
    beta=0.1,
    train_dataset=dpo_dataset,
    tokenizer=processor,
    max_prompt_length=512,
    max_length=1024,
)
print("Starting DPO training from SFT LoRA...")
dpo_trainer.train()
policy_model.save_pretrained(dpo_model_path.as_posix())
processor.save_pretrained(dpo_model_path.as_posix())
print(f"Saved DPO-tuned adapter to {dpo_model_path}")



Deprecated positional argument(s) used in DPOTrainer, please use the DPOConfig to set these arguments instead.
Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Loaded 10 raw pairs from /blue/yixin.wen/minghan.yu/self-rewarding-lm-pytorch/data/dpo_training.jsonl
First formatted row: {'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe a process of making crepes.<|im_end|', 'chosen': "Making crepes is a delightful culinary experience that requires a bit of patience and practice. Here's a step-by-step gu", 'rejected': "Making crepes is a delightful culinary process that can be both fun and satisfying. Here's a step-by-step guide to makin"}


: 

# Test Acc again Again


In [None]:
results = evaluate_generation_metrics(
    model=policy_model,      
    processor=processor,
    test_samples=test_samples,
    max_new_tokens=128,
    device=policy_model.device
)
print("Aggregated Metrics:")
print(f"ROUGE-1: {results['rouge1_mean']:.2f}")
print(f"ROUGE-2: {results['rouge2_mean']:.2f}")
print(f"ROUGE-L: {results['rougeL_mean']:.2f}")
print(f"BLEU-4: {results['bleu4']:.2f}")

Aggregated Metrics:
ROUGE-1: 4.71
ROUGE-2: 0.61
ROUGE-L: 3.46
BLEU-4: 0.40
