In [2]:
%%capture
# Installs Unsloth, Xformers, and TRL
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes datasets

In [None]:
# ==========================================
# 0. Environment Setup
# ==========================================
import os
try:
    import unsloth
except ImportError:
    print("Installing Unsloth...")
    os.system("pip install -q 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'")
    os.system("pip install -q --no-deps xformers trl peft accelerate bitsandbytes datasets")

# ==========================================
# 1. Configuration
# ==========================================
import torch
import json
import glob
import gc
import re
from unsloth import FastLanguageModel, PatchDPOTrainer
from trl import SFTTrainer, DPOTrainer, DPOConfig
from transformers import TrainingArguments
from datasets import Dataset
from unsloth.chat_templates import get_chat_template

# Paths
DATA_DIR = "/content/drive/MyDrive/ETSP"
OUTPUT_DIR_SFT = "/content/drive/MyDrive/ETSP/qwen_sft_v2"
OUTPUT_DIR_DPO = "/content/drive/MyDrive/ETSP/qwen_dpo_v2"

# Model Configuration
MODEL_NAME = "unsloth/Qwen2.5-3B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 8192  # Extended context for 30 reviews + prompt
DTYPE = None
LOAD_IN_4BIT = True

# Mount Google Drive
from google.colab import drive
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# ==========================================
# 2. Data Processing
# ==========================================
def clean_text(text: str) -> str:
    """Remove HTML tags and normalize whitespace."""
    if not isinstance(text, str):
        return ""
    if '<' in text and '>' in text:
        text = re.sub(r'<[^>]+>', '', text)
    return re.sub(r"\s+", " ", text).strip()

def format_review_context(reviews_list):
    """
    Reconstruct review context matching generation pipeline:
    - Clean text with clean_text()
    - Filter by length: 20-1500 chars
    - Format: 'Review N: {text}' (no title)
    - Max 30 reviews per product
    """
    texts = []
    count = 0

    for r in reviews_list:
        if count >= 30:
            break

        text = clean_text(r.get('text', ''))

        if len(text) < 20 or len(text) > 1500:
            continue

        texts.append(f'Review {count+1}: {text}')
        count += 1

    return '\n\n'.join(texts)

def load_datasets(data_dir):
    """
    Load *_dpo_v2.jsonl files and construct SFT/DPO datasets.

    SFT: Learns readability control (A1/C1) and format adherence
    DPO: Learns hallucination rejection via preference optimization
    """
    files = glob.glob(os.path.join(data_dir, "**/*_dpo_v2.jsonl"), recursive=True)
    print(f"Found {len(files)} data files.")

    sft_data = []
    dpo_data = []

    for file_path in files:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    item = json.loads(line)
                    reviews = item.get('reviews', [])
                    if not reviews: continue

                    context = format_review_context(reviews)
                    if not context.strip(): continue

                    simple = item.get('summary_simple')
                    complex_ = item.get('summary_complex')
                    hallucinated = item.get('summary_hallucinated')

                    if not (simple and complex_ and hallucinated): continue

                    # A1 (Simple) Prompt - matches generation exactly
                    a1_instruction = """Summarize these reviews for a beginner (CEFR A1).

Requirements:
- Use simple present tense, basic vocabulary.
- Write 1 paragraph of 3-4 short sentences.
- NO bullet points.
- Structure: [Overall] + [Feature] + [Conclusion].
- Reflect what most people say, but mention important issues if some people have them.

Reviews:
{context}

Output ONLY the summary.""".format(context=context)

                    # C1 (Complex) Prompt - matches generation exactly
                    c1_instruction = """Summarize these reviews in a professional, analytical style (CEFR C1).

Requirements:
1. **Style**: Use sophisticated vocabulary and phrasing, identical to a high-quality expert review.

2. **Format**: Use a bulleted list with EXACTLY 3-6 points total.
   - **CRITICAL**: EVERY point MUST start with `(+)`, `(-)`, or `(~)`.
   - Use `(+)` for consensus strengths.
   - Use `(-)` for consensus weaknesses.
   - Use `(~)` for mixed/controversial opinions (CRITICAL).
   - **Order**: List all `(+)` first, then `(-)`, then `(~)`. Do NOT mix them randomly.
   - **Compact output**: No blank lines between points. Each point on a new line immediately after the previous one.

3. **Handling Contradictions**:
   - If User A says "great battery" but User B says "battery died", you MUST report this as an inconsistency.
   - Use phrases like "Polarized feedback regarding...", "Inconsistent reports on...", or "While most praise X, some users note Y...".

4. **Length**: Total summary under 180 words. Each point 18-30 words.

Reviews:
{context}

Output ONLY the structured summary.""".format(context=context)

                    # SFT samples (both readability levels)
                    sft_data.append({
                        "instruction": a1_instruction,
                        "output": simple
                    })
                    sft_data.append({
                        "instruction": c1_instruction,
                        "output": complex_
                    })

                    # DPO samples (C1 only - hallucination detection)
                    dpo_data.append({
                        "question": c1_instruction,
                        "chosen": complex_,
                        "rejected": hallucinated
                    })

                except Exception:
                    continue

    print(f"Loaded: {len(sft_data)} SFT samples | {len(dpo_data)} DPO pairs")

    # 90/10 train/val split
    ds_sft = Dataset.from_list(sft_data).train_test_split(test_size=0.1, seed=42)
    ds_dpo = Dataset.from_list(dpo_data).train_test_split(test_size=0.1, seed=42)

    return ds_sft, ds_dpo

# Load datasets
dataset_sft, dataset_dpo = load_datasets(DATA_DIR)
print(f"SFT Train: {len(dataset_sft['train'])}, Val: {len(dataset_sft['test'])}")
print(f"DPO Train: {len(dataset_dpo['train'])}, Val: {len(dataset_dpo['test'])}")

# ==========================================
# 3. Stage 1: Supervised Fine-Tuning (SFT)
# ==========================================
print("\nüöÄ [Stage 1] SFT Training")

# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# Apply Qwen 2.5 chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
    mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}
)

def format_sft_func(examples):
    """Apply chat template to SFT samples."""
    texts = []
    for inst, out in zip(examples["instruction"], examples["output"]):
        conv = [
            {"role": "user", "content": inst},
            {"role": "assistant", "content": out}
        ]
        texts.append(tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=False))
    return {"text": texts}

dataset_sft_fmt = dataset_sft.map(format_sft_func, batched=True)

# SFT Trainer
trainer_sft = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_sft_fmt['train'],
    eval_dataset = dataset_sft_fmt['test'],
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 2,
        learning_rate = 5e-5,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        eval_strategy = "steps",
        eval_steps = 50,
        save_strategy = "steps",
        save_steps = 100,
        save_total_limit = 3,
        load_best_model_at_end = True,
        metric_for_best_model = "eval_loss",
        optim = "adamw_8bit",
        output_dir = OUTPUT_DIR_SFT,
        seed = 3407,
    ),
)

trainer_sft.train()
print("‚úÖ SFT complete")

model.save_pretrained(OUTPUT_DIR_SFT)
tokenizer.save_pretrained(OUTPUT_DIR_SFT)

# ==========================================
# 4. Memory Cleanup
# ==========================================
del model, tokenizer, trainer_sft
gc.collect()
torch.cuda.empty_cache()
print("üßπ Memory cleared")

# ==========================================
# 5. Stage 2: Direct Preference Optimization (DPO)
# ==========================================
print("\nüöÄ [Stage 2] DPO Training")

# Reload model with SFT weights
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = OUTPUT_DIR_SFT,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT,
)

# Enable DPO training
PatchDPOTrainer()

def format_dpo_func(example):
    """Format DPO data for preference learning."""
    return {
        "prompt": tokenizer.apply_chat_template([{"role": "user", "content": example["question"]}], tokenize=False, add_generation_prompt=True),
        "chosen": example["chosen"],
        "rejected": example["rejected"],
    }

dataset_dpo_fmt = dataset_dpo.map(format_dpo_func)

# DPO Trainer
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,  # Unsloth handles reference model internally
    tokenizer = tokenizer,
    train_dataset = dataset_dpo_fmt['train'],
    eval_dataset = dataset_dpo_fmt['test'],
    max_length = MAX_SEQ_LENGTH,
    max_prompt_length = MAX_SEQ_LENGTH - 512,  # Reserve 512 tokens for output
    beta = 0.1,  # Strong preference signal for hallucination rejection
    args = DPOConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 5e-6,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 10,
        eval_strategy = "steps",
        eval_steps = 20,
        save_strategy = "no",
        optim = "adamw_8bit",
        output_dir = OUTPUT_DIR_DPO,
        seed = 3407,
    ),
)

dpo_trainer.train()
print("‚úÖ DPO complete")

# ==========================================
# 6. Final Export
# ==========================================
print(f"üíæ Saving to {OUTPUT_DIR_DPO}")
model.save_pretrained(OUTPUT_DIR_DPO)
tokenizer.save_pretrained(OUTPUT_DIR_DPO)
print("üéâ Training complete!")

Installing Unsloth...
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
Found 9 data files.
Loaded: 3554 SFT samples | 1777 DPO pairs
SFT Train: 3198, Val: 356
DPO Train: 1599, Val: 178

üöÄ [Stage 1] SFT Training
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.12.5 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Map:   0%|          | 0/3198 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/3198 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/356 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,198 | Num Epochs = 2 | Total steps = 800
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,2.5754,2.519796
100,2.026,1.907364
150,1.8008,1.881023
200,1.8816,1.867517
250,1.9119,1.858518
300,1.8324,1.851551
350,1.8278,1.846871
400,1.8051,1.841844
450,1.9178,1.83786
500,1.8241,1.835466


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


0,1
eval/loss,‚ñà‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñà‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ
eval/samples_per_second,‚ñÅ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñá‚ñá‚ñá‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá
eval/steps_per_second,‚ñÅ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñá‚ñá‚ñá‚ñà‚ñá‚ñá‚ñÜ‚ñá‚ñá
train/epoch,‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñà‚ñÜ‚ñÖ‚ñÑ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ
train/learning_rate,‚ñÑ‚ñÑ‚ñÖ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñá‚ñá‚ñÜ‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
eval/loss,1.8255
eval/runtime,58.7558
eval/samples_per_second,6.059
eval/steps_per_second,3.029
total_flos,9.902415246178714e+16
train/epoch,2
train/global_step,800
train/grad_norm,0.28802
train/learning_rate,0.0
train/loss,1.8445


‚úÖ SFT complete
üßπ Memory cleared

üöÄ [Stage 2] DPO Training
==((====))==  Unsloth 2025.12.5: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Extracting prompt in train dataset (num_proc=16):   0%|          | 0/1599 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/1599 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/1599 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/178 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/178 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/178 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,599 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
20,0.8522,0.592342,13.255346,11.899546,0.711111,1.355799,-379.436462,-404.503113,-0.622687,-0.590356,0,0,0
40,0.5304,0.323342,12.162534,9.678,0.844444,2.484532,-390.364594,-426.718628,-0.771685,-0.724624,No Log,No Log,No Log
60,0.2797,0.203208,10.942546,7.57139,0.922222,3.371155,-402.564484,-447.784668,-0.927748,-0.868945,No Log,No Log,No Log
80,0.155,0.137324,9.493854,5.340331,0.938889,4.153523,-417.051392,-470.095276,-1.059625,-0.990322,No Log,No Log,No Log
100,0.2151,0.106404,8.069264,3.231753,0.955556,4.837512,-431.297302,-491.181122,-1.188316,-1.110057,No Log,No Log,No Log
120,0.1757,0.089143,6.999538,1.613979,0.961111,5.38556,-441.994537,-507.358826,-1.277033,-1.191818,No Log,No Log,No Log
140,0.1297,0.078113,6.081824,0.248831,0.966667,5.832993,-451.171661,-521.010254,-1.358885,-1.266958,No Log,No Log,No Log
160,0.0378,0.072495,5.693827,-0.391511,0.966667,6.085338,-455.051697,-527.413696,-1.390653,-1.296496,No Log,No Log,No Log
180,0.0689,0.070677,5.400839,-0.835406,0.966667,6.236245,-457.981567,-531.852661,-1.414287,-1.317963,No Log,No Log,No Log
200,0.1203,0.069946,5.35225,-0.913489,0.966667,6.265739,-458.467438,-532.633484,-1.418352,-1.321566,No Log,No Log,No Log


0,1
eval/logits/chosen,‚ñà‚ñá‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/logits/rejected,‚ñà‚ñá‚ñÖ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/logps/chosen,‚ñà‚ñá‚ñÜ‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/logps/rejected,‚ñà‚ñá‚ñÜ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/loss,‚ñà‚ñÑ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/rewards/accuracies,‚ñÅ‚ñÖ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
eval/rewards/chosen,‚ñà‚ñá‚ñÜ‚ñÖ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/rewards/margins,‚ñÅ‚ñÉ‚ñÑ‚ñÖ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà
eval/rewards/rejected,‚ñà‚ñá‚ñÜ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñÜ‚ñÅ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñà‚ñÖ

0,1
eval/logits/chosen,-1.41835
eval/logits/rejected,-1.32157
eval/logps/chosen,-458.46744
eval/logps/rejected,-532.63348
eval/loss,0.06995
eval/rewards/accuracies,0.96667
eval/rewards/chosen,5.35225
eval/rewards/margins,6.26574
eval/rewards/rejected,-0.91349
eval/runtime,93.6106


‚úÖ DPO complete
üíæ Saving to /content/drive/MyDrive/ETSP/qwen_dpo_v2
üéâ Training complete!


In [3]:
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
import json
import re
import random
import os
import glob

class Colors:
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    CYAN = '\033[96m'
    BOLD = '\033[1m'
    RESET = '\033[0m'
    GRAY = '\033[90m'

MODEL_PATH = "/content/drive/MyDrive/ETSP/qwen_dpo_v2"
DATA_DIR = "/content/drive/MyDrive/ETSP"

print(f"{Colors.CYAN}Ê≠£Âú®Âä†ËΩΩËÆ≠ÁªÉÂ•ΩÁöÑ DPO Ê®°Âûã: {MODEL_PATH}...{Colors.RESET}")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_PATH,
        max_seq_length = 6144,
        dtype = None,
        load_in_4bit = True,
    )
    FastLanguageModel.for_inference(model)
    print(f"{Colors.GREEN}Ê®°ÂûãÂä†ËΩΩÊàêÂäü!{Colors.RESET}\n")
except Exception as e:
    print(f"{Colors.RED}Ê®°ÂûãÂä†ËΩΩÂ§±Ë¥•: {e}{Colors.RESET}")
    exit()

def clean_text(text: str) -> str:
    if not isinstance(text, str): return ""
    if '<' in text and '>' in text:
        text = re.sub(r'<[^>]+>', '', text)
    return re.sub(r"\s+", " ", text).strip()

def format_review_context(reviews_list):
    texts = []
    count = 0
    for r in reviews_list:
        if count >= 30: break
        text = clean_text(r.get('text', ''))
        if len(text) < 20 or len(text) > 1500: continue
        texts.append(f'Review {count+1}: {text}')
        count += 1
    return '\n\n'.join(texts)

def load_random_sample():
    files = glob.glob(os.path.join(DATA_DIR, "**/*_dpo_v2.jsonl"), recursive=True)
    if not files:
        print(f"{Colors.RED}Êâæ‰∏çÂà∞ÊµãËØïÊï∞ÊçÆÊñá‰ª∂ (*_dpo_v2.jsonl){Colors.RESET}")
        return None, None

    target_file = random.choice(files)
    print(f"‰ΩøÁî®Êï∞ÊçÆÊñá‰ª∂: {target_file}")

    samples = []
    with open(target_file, 'r', encoding='utf-8') as f:
        for _ in range(500):
            line = f.readline()
            if not line: break
            samples.append(json.loads(line))

    return random.choice(samples), target_file

sample_data, filename = load_random_sample()
if not sample_data: exit()

context_input = format_review_context(sample_data['reviews'])
asin = sample_data.get('parent_asin', 'Unknown')
category = sample_data.get('category', 'Unknown')

print(f"{'='*60}")
print(f"‰∫ßÂìÅ: {category} | ASIN: {asin}")
print(f"ÂéüÂßãËØÑËÆ∫ËæìÂÖ• (ÂÖ± {len(sample_data['reviews'])} Êù°, Êà™ÂèñÂâç30Êù°)")
print(f"{'='*60}")
print(f"{Colors.GRAY}{context_input[:1000]}... [Áî±‰∫éÂ§™ÈïøÂè™ÊòæÁ§∫Ââç1000Â≠óÁ¨¶]{Colors.RESET}")
print(f"{'='*60}\n")

def get_prompt(style, context):
    if style == "A1":
        return """Summarize these reviews for a beginner (CEFR A1).

Requirements:
- Use simple present tense, basic vocabulary.
- Write 1 paragraph of 3-4 short sentences.
- NO bullet points.
- Structure: [Overall] + [Feature] + [Conclusion].
- Reflect what most people say, but mention important issues if some people have them.

Reviews:
{context}

Output ONLY the summary.""".format(context=context)

    elif style == "C1":
        return """Summarize these reviews in a professional, analytical style (CEFR C1).

Requirements:
1. **Style**: Use sophisticated vocabulary and phrasing, identical to a high-quality expert review.

2. **Format**: Use a bulleted list with EXACTLY 3-6 points total.
   - **CRITICAL**: EVERY point MUST start with `(+)`, `(-)`, or `(~)`.
   - Use `(+)` for consensus strengths.
   - Use `(-)` for consensus weaknesses.
   - Use `(~)` for mixed/controversial opinions (CRITICAL).
   - **Order**: List all `(+)` first, then `(-)`, then `(~)`. Do NOT mix them randomly.
   - **Compact output**: No blank lines between points. Each point on a new line immediately after the previous one.

3. **Handling Contradictions**:
   - If User A says "great battery" but User B says "battery died", you MUST report this as an inconsistency.
   - Use phrases like "Polarized feedback regarding...", "Inconsistent reports on...", or "While most praise X, some users note Y...".

4. **Length**: Total summary under 180 words. Each point 18-30 words.

Reviews:
{context}

Output ONLY the structured summary.""".format(context=context)

def run_test(style):
    print(f"\n>>> ÊµãËØïÊ®°Âºè: {Colors.BOLD}{style} (SFT+DPO ÊïàÊûúÈ™åËØÅ){Colors.RESET}")

    user_prompt = get_prompt(style, context_input)

    messages = [{"role": "user", "content": user_prompt}]
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

    streamer = TextStreamer(tokenizer, skip_prompt=True)

    print(f"{Colors.CYAN}Ê®°ÂûãÁîüÊàê‰∏≠...{Colors.RESET}")
    print(f"{Colors.YELLOW}{'-'*40}{Colors.RESET}")

    _ = model.generate(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.1,
        use_cache=True
    )
    print(f"{Colors.YELLOW}{'-'*40}{Colors.RESET}")

run_test("A1")
run_test("C1")
print(f"\n{Colors.GREEN}ÊµãËØïÂÆåÊàê{Colors.RESET}")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
[96mÊ≠£Âú®Âä†ËΩΩËÆ≠ÁªÉÂ•ΩÁöÑ DPO Ê®°Âûã: /content/drive/MyDrive/ETSP/qwen_dpo_v2...[0m
==((====))==  Unsloth 2025.12.7: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Unsloth 2025.12.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


[92mÊ®°ÂûãÂä†ËΩΩÊàêÂäü![0m

‰ΩøÁî®Êï∞ÊçÆÊñá‰ª∂: /content/drive/MyDrive/ETSP/Pet_Supplies/Pet_Supplies_dpo_v2.jsonl


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


‰∫ßÂìÅ: Pet_Supplies | ASIN: B01MD0MV9M
ÂéüÂßãËØÑËÆ∫ËæìÂÖ• (ÂÖ± 5 Êù°, Êà™ÂèñÂâç30Êù°)
[90mReview 1: We have been a fan of this carrier for years. It is comfortable to carry, the cats love them. They are sturdy and a bonus is that they look great.Only down side is that they are not the easiest to clean. We have put them through the washer a couple of times (the big one at the laundry mat) when a cat had an accident, even though you are not supposed to and they survived just fine but I would not do so regularly.They work great on planes (used on the flight back from Germany with our 15# cat) and for trips to the vet. It is actually possible AND comfortable to carry and stack multiple carriers!!They are expensive so we have slowly relpaced our cheaper ones and now everyone has a sleepy pod so they can stop fighting over them. Heavier cats will fit in this carrier since they can turn around, but I would not make them travel for long in it (just to the vet in our case, the 19# cat gets a 

In [None]:
import os
import json
import random
import re
from tqdm.auto import tqdm
from unsloth import FastLanguageModel
from datasets import Dataset
from openai import OpenAI

MODEL_PATH = "/content/drive/MyDrive/ETSP/qwen_dpo_v2"
DATA_ROOT = "/content/drive/MyDrive/ETSP"

CATEGORIES = [
    "Electronics", "Books", "Home_and_Kitchen", "Beauty_and_Personal_Care",
    "Clothing_Shoes_and_Jewelry", "Toys_and_Games", "Sports_and_Outdoors",
    "Pet_Supplies", "Automotive", "Office_Products"
]

SAMPLES_PER_CATEGORY = 5

API_KEY = "sk-xxxxxxxxxxxxxxxxxxx"
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
JUDGE_MODEL = "qwen-plus"

print(f"Ê≠£Âú®Âä†ËΩΩÂæÆË∞ÉÊ®°Âûã: {MODEL_PATH}...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_PATH,
        max_seq_length = 6144,
        dtype = None,
        load_in_4bit = True,
    )
    FastLanguageModel.for_inference(model)
    print("Student Ê®°ÂûãÂä†ËΩΩÊàêÂäü")
except Exception as e:
    print(f"Ê®°ÂûãÂä†ËΩΩÂ§±Ë¥•: {e}")
    exit()

def clean_text(text: str) -> str:
    if not isinstance(text, str): return ""
    if '<' in text and '>' in text: text = re.sub(r'<[^>]+>', '', text)
    return re.sub(r"\s+", " ", text).strip()

def format_context(reviews):
    texts = []
    count = 0
    for r in reviews:
        if count >= 30: break
        text = clean_text(r.get('text', ''))
        if len(text) < 20 or len(text) > 1500: continue
        texts.append(f'Review {count+1}: {text}')
        count += 1
    return '\n\n'.join(texts)

def get_unseen_samples(category, count=2):
    file_path = os.path.join(DATA_ROOT, category, f"{category}_dpo_v2.jsonl")
    if not os.path.exists(file_path):
        file_path = os.path.join(DATA_ROOT, category, f"{category}_dpo.jsonl")
        if not os.path.exists(file_path):
            print(f"Ë∑≥Ëøá {category}: Êñá‰ª∂‰∏çÂ≠òÂú®")
            return []

    clean_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
                safe_item = {
                    "parent_asin": item.get("parent_asin", "unknown"),
                    "reviews": item.get("reviews", []),
                }
                clean_data.append(safe_item)
            except:
                continue

    if len(clean_data) < 10: return []

    try:
        ds = Dataset.from_list(clean_data)
        ds_split = ds.train_test_split(test_size=0.1, seed=42)
        test_set = ds_split['test']

        if len(test_set) > count:
            indices = random.sample(range(len(test_set)), count)
            samples = [test_set[i] for i in indices]
        else:
            samples = [item for item in test_set]

        results = []
        for item in samples:
            context = format_context(item.get('reviews', []))
            if len(context) > 200:
                item['formatted_context'] = context
                item['category'] = category
                results.append(item)

        return results

    except Exception as e:
        print(f"{category} Â§ÑÁêÜÂá∫Èîô: {e}")
        return []

client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

def evaluate_factuality(original_reviews, generated_summary, style):
    if style == "A1":
        style_requirements = """
[A1 Ê†ºÂºèË¶ÅÊ±Ç]
- Â∫îËØ•ÊòØ 1 ÊÆµËØùÔºå3-4 ‰∏™ÁÆÄÂçïÁü≠Âè•
- ‰ΩøÁî®ÁÆÄÂçïÁé∞Âú®Êó∂ÂíåÂü∫Á°ÄËØçÊ±á
- ‰∏çÂ∫îËØ•Êúâ bullet points
- ÁªìÊûÑÔºö[ÊÄª‰ΩìËØÑ‰ª∑] + [ÁâπÁÇπ] + [ÁªìËÆ∫]
"""
    else:  # C1
        style_requirements = """
[C1 Ê†ºÂºèË¶ÅÊ±Ç]
- Â∫îËØ•ÊòØ 3-6 ‰∏™ bullet points
- ÊØè‰∏™ point ÂøÖÈ°ª‰ª• (+) / (-) / (~) ÂºÄÂ§¥
- È°∫Â∫èÔºöÊâÄÊúâ (+) Âú®ÂâçÔºåÁÑ∂Âêé (-)ÔºåÊúÄÂêé (~)
- ‰ΩøÁî®Â§çÊùÇËØçÊ±áÂíå‰∏ì‰∏öË°®Ëææ
"""

    judge_prompt = f"""
‰Ω†ÊòØ‰∏Ä‰ΩçÂÖ¨Âπ≥„ÄÅÂÆ¢ËßÇÁöÑ‰∫ãÂÆûÊ†∏Êü•Âëò„ÄÇËØ∑Âü∫‰∫é‰∏ãÊñπÁöÑËØÑËÆ∫ÂéüÊñáÔºåÊ£ÄÊü• AI ÁîüÊàêÁöÑÊëòË¶ÅÂÜÖÂÆπÊòØÂê¶Â±ûÂÆû„ÄÇ

[ËæìÂÖ•Êï∞ÊçÆ]
ËØÑËÆ∫ÂéüÊñá (Source):
{original_reviews}

AI ÁîüÊàêÊëòË¶Å (Summary):
{generated_summary}

{style_requirements}

[Âà§ÁΩöÊ†áÂáÜ]

PASS (ÂêàÊ†º):
1. **ÊúâÊçÆÂèØ‰æù**: ÊëòË¶Å‰∏≠ÊèêÂà∞ÁöÑËßÇÁÇπÔºàÊó†ËÆ∫ÊòØ‰ºòÁÇπËøòÊòØÁº∫ÁÇπÔºâÔºåÂè™Ë¶ÅÂú®ËØÑËÆ∫ÂéüÊñá‰∏≠ÁúüÂÆûÂá∫Áé∞ËøáÔºàÂì™ÊÄïÂè™ÊúâÂ∞ëÊï∞‰∫∫ÊèêÂà∞ÔºâÔºåÂ∞±ÁÆóÊ≠£Á°Æ„ÄÇ
2. **ÂêàÁêÜÂΩíÁ∫≥**: ÂÖÅËÆ∏Ê®°ÂûãÂØπÁõ∏‰ººÁöÑËßÇÁÇπËøõË°åÊÄªÁªìÊ¶ÇÊã¨„ÄÇ
3. **Ê†ºÂºèÊ≠£Á°Æ**: Á¨¶Âêà {style} ÁöÑÊ†ºÂºèË¶ÅÊ±Ç„ÄÇ

FAIL (‰∏çÂêàÊ†º):
1. **Êó†‰∏≠ÁîüÊúâ (Fabrication)**: ÊèêÂà∞‰∫ÜËØÑËÆ∫‰∏≠Ê†πÊú¨‰∏çÂ≠òÂú®ÁöÑÂèÇÊï∞„ÄÅÂäüËÉΩ„ÄÅÈÖç‰ª∂ÊàñÂÖ∑‰ΩìÂú∫ÊôØ„ÄÇ
2. **ÊÉÖÊÑüÂèçËΩ¨ (Sentiment Error)**: ËØÑËÆ∫ÊòéÊòéËØ¥ÊòØ"Áº∫ÁÇπ"ÔºåÊëòË¶ÅÂç¥ÂÜôÊàê‰∫Ü"‰ºòÁÇπ"ÔºàÊàñÂèç‰πãÔºâ„ÄÇ
3. **Êé©ÁõñÁüõÁõæ (Misrepresentation)**: Â¶ÇÊûúËØÑËÆ∫‰∏≠ÂØπÊüê‰∏ÄÁÇπÂ≠òÂú®ÊòéÊòæÂàÜÊ≠ßÔºàÊúâ‰∫∫ËØ¥Â•ΩÔºåÊúâ‰∫∫ËØ¥ÂùèÔºâÔºå‰ΩÜÊëòË¶ÅÂç¥Âè™ÂçïÊñπÈù¢ËØ¥"Â§ßÂÆ∂ÈÉΩÂñúÊ¨¢"ÔºåÂøΩÁï•‰∫ÜÂèçÈù¢Â£∞Èü≥„ÄÇ
4. **Ê†ºÂºèÈîôËØØ**: ‰∏çÁ¨¶Âêà {style} ÁöÑÊ†ºÂºèË¶ÅÊ±Ç„ÄÇ

[ËæìÂá∫Ê†ºÂºè - JSON]
{{
    "score": <0-10>,
    "is_factual": <boolean>,
    "format_correct": <boolean>,
    "hallucinations": [
        "ÊëòË¶ÅÂéüÂè• -> ÈîôËØØÂéüÂõ†"
    ],
    "reasoning": "ÁÆÄÁü≠ËØÑ‰ª∑„ÄÇ"
}}
"""
    try:
        response = client.chat.completions.create(
            model=JUDGE_MODEL,
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.0,
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"Judge API Error: {e}")
        return {"score": 0, "is_factual": False, "format_correct": False, "hallucinations": ["API Error"], "reasoning": str(e)}

def generate_student_summary(context, style="C1"):
    if style == "A1":
        user_prompt = """Summarize these reviews for a beginner (CEFR A1).

Requirements:
- Use simple present tense, basic vocabulary.
- Write 1 paragraph of 3-4 short sentences.
- NO bullet points.
- Structure: [Overall] + [Feature] + [Conclusion].
- Reflect what most people say, but mention important issues if some people have them.

Reviews:
{context}

Output ONLY the summary.""".format(context=context)

    else:  # C1
        user_prompt = """Summarize these reviews in a professional, analytical style (CEFR C1).

Requirements:
1. **Style**: Use sophisticated vocabulary and phrasing.
2. **Format**: Use a bulleted list with EXACTLY 3-6 points total.
   - **CRITICAL**: EVERY point MUST start with `(+)`, `(-)`, or `(~)`.
   - Use `(+)` for consensus strengths.
   - Use `(-)` for consensus weaknesses.
   - Use `(~)` for mixed/controversial opinions.
   - **Order**: List all `(+)` first, then `(-)`, then `(~)`.

3. **Handling Contradictions**:
   - Report inconsistencies using phrases like "Polarized feedback...".

4. **Length**: Total summary under 180 words.

Reviews:
{context}

Output ONLY the structured summary.""".format(context=context)

    messages = [{"role": "user", "content": user_prompt}]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        temperature=0.1,
        use_cache=True
    )

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "assistant" in text:
        return text.split("assistant")[-1].strip()
    return text

def main():
    all_test_samples = []
    print(f"\nÊ≠£Âú®‰ªé {len(CATEGORIES)} ‰∏™ÂìÅÁ±ª‰∏≠ÊäΩÂèñÊú™ËßÅÈ™åËØÅÈõÜ (seed=42)...")

    for cat in tqdm(CATEGORIES, desc="Sampling"):
        samples = get_unseen_samples(cat, count=SAMPLES_PER_CATEGORY)
        all_test_samples.extend(samples)

    if not all_test_samples:
        print("Êú™ÊâæÂà∞‰ªª‰ΩïÊµãËØïÊ†∑Êú¨ÔºåËØ∑Ê£ÄÊü•Êï∞ÊçÆË∑ØÂæÑ")
        return

    print(f"\nÂáÜÂ§áÂ∞±Áª™ÔºåÂÖ± {len(all_test_samples)} ‰∏™Ê†∑Êú¨„ÄÇÂºÄÂßãÂÖ®ÂìÅÁ±ªÁúüÂÆûÊÄßÊµãËØï\n")

    results_a1 = []
    results_c1 = []

    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    CYAN = '\033[96m'
    RESET = '\033[0m'
    BOLD = '\033[1m'

    for i, item in enumerate(all_test_samples):
        cat = item['category']
        asin = item['parent_asin']
        context = item['formatted_context']

        print(f"\n{'='*60}")
        print(f"[{i+1}/{len(all_test_samples)}] ÊµãËØï‰∫ßÂìÅ: {cat} (ASIN: {asin})")
        print(f"{'='*60}")

        # ÊµãËØï A1 Ê®°Âºè
        print(f"\n{CYAN}{BOLD}>>> A1 Ê®°ÂºèÊµãËØï{RESET}")
        summary_a1 = generate_student_summary(context, style="A1")
        eval_a1 = evaluate_factuality(context, summary_a1, style="A1")

        res_a1 = {
            "category": cat,
            "asin": asin,
            "style": "A1",
            "summary": summary_a1,
            "score": eval_a1['score'],
            "is_factual": eval_a1['is_factual'],
            "format_correct": eval_a1.get('format_correct', True),
            "hallucinations": eval_a1['hallucinations'],
            "reasoning": eval_a1.get('reasoning', '')
        }
        results_a1.append(res_a1)

        score_a1 = eval_a1['score']
        color_a1 = GREEN if score_a1 >= 8 else (YELLOW if score_a1 >= 5 else RED)

        print(f"   ÁîüÊàêÊëòË¶Å: {summary_a1[:100].replace(chr(10), ' ')}...")
        print(f"   Ë£ÅÂà§ÊâìÂàÜ: {color_a1}{score_a1}/10{RESET}")
        if eval_a1['hallucinations']:
            print(f"   ÂπªËßâ/ÈîôËØØ: {RED}{eval_a1['hallucinations']}{RESET}")
        else:
            print(f"   ‰∫ãÂÆûÊ†∏Êü•ÈÄöËøá")

        # ÊµãËØï C1 Ê®°Âºè
        print(f"\n{CYAN}{BOLD}>>> C1 Ê®°ÂºèÊµãËØï{RESET}")
        summary_c1 = generate_student_summary(context, style="C1")
        eval_c1 = evaluate_factuality(context, summary_c1, style="C1")

        res_c1 = {
            "category": cat,
            "asin": asin,
            "style": "C1",
            "summary": summary_c1,
            "score": eval_c1['score'],
            "is_factual": eval_c1['is_factual'],
            "format_correct": eval_c1.get('format_correct', True),
            "hallucinations": eval_c1['hallucinations'],
            "reasoning": eval_c1.get('reasoning', '')
        }
        results_c1.append(res_c1)

        score_c1 = eval_c1['score']
        color_c1 = GREEN if score_c1 >= 8 else (YELLOW if score_c1 >= 5 else RED)

        print(f"   ÁîüÊàêÊëòË¶Å: {summary_c1[:100].replace(chr(10), ' ')}...")
        print(f"   Ë£ÅÂà§ÊâìÂàÜ: {color_c1}{score_c1}/10{RESET}")
        if eval_c1['hallucinations']:
            print(f"   ÂπªËßâ/ÈîôËØØ: {RED}{eval_c1['hallucinations']}{RESET}")
        else:
            print(f"   ‰∫ãÂÆûÊ†∏Êü•ÈÄöËøá")

        print("-" * 60)

    if not results_a1 and not results_c1: return

    # ÊúÄÁªàÁªüËÆ°Êä•Âëä
    print("\n" + "="*70)
    print("ÂπªËßâÊ£ÄÊµãÊúÄÁªàÊä•Âëä (Final Factuality Report)")
    print("="*70)

    def print_stats(results, style_name):
        print(f"\n{BOLD}„Äê{style_name} Ê®°ÂºèÁªüËÆ°„Äë{RESET}")

        cat_stats = {}
        for r in results:
            c = r['category']
            if c not in cat_stats: cat_stats[c] = []
            cat_stats[c].append(r['score'])

        print(f"{'Category':<30} | {'Avg Score':<10}")
        print("-" * 45)
        for cat, scores in cat_stats.items():
            avg = sum(scores)/len(scores)
            print(f"{cat:<30} | {avg:.1f}")

        total_avg = sum(r['score'] for r in results) / len(results)
        pass_count = sum(1 for r in results if r['is_factual'])
        format_count = sum(1 for r in results if r.get('format_correct', True))
        pass_rate = (pass_count / len(results)) * 100
        format_rate = (format_count / len(results)) * 100

        print("-" * 45)
        print(f"ÊÄªÂπ≥ÂùáÂàÜ: {total_avg:.2f} / 10")
        print(f"Êó†ÂπªËßâÈÄöËøáÁéá: {pass_rate:.1f}%")
        print(f"Ê†ºÂºèÊ≠£Á°ÆÁéá: {format_rate:.1f}%")

    print_stats(results_a1, "A1")
    print_stats(results_c1, "C1")

    print("\n" + "="*70)

    # ‰øùÂ≠òÁªìÊûú
    all_results = {
        "A1": results_a1,
        "C1": results_c1
    }

    with open("hallucination_test_results_full.json", "w", encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print("ËØ¶ÁªÜÁªìÊûúÂ∑≤‰øùÂ≠òËá≥ hallucination_test_results_full.json")

if __name__ == "__main__":
    main()


Ê≠£Âú®Âä†ËΩΩÂæÆË∞ÉÊ®°Âûã: /content/drive/MyDrive/ETSP/qwen_dpo_v2...
==((====))==  Unsloth 2025.12.7: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Student Ê®°ÂûãÂä†ËΩΩÊàêÂäü

Ê≠£Âú®‰ªé 10 ‰∏™ÂìÅÁ±ª‰∏≠ÊäΩÂèñÊú™ËßÅÈ™åËØÅÈõÜ (seed=42)...


Sampling:   0%|          | 0/10 [00:00<?, ?it/s]


ÂáÜÂ§áÂ∞±Áª™ÔºåÂÖ± 50 ‰∏™Ê†∑Êú¨„ÄÇÂºÄÂßãÂÖ®ÂìÅÁ±ªÁúüÂÆûÊÄßÊµãËØï


[1/50] ÊµãËØï‰∫ßÂìÅ: Electronics (ASIN: B089Q5MJ2K)

[96m[1m>>> A1 Ê®°ÂºèÊµãËØï[0m
   ÁîüÊàêÊëòË¶Å: Most people like the laptop because it is fast, thin, and good for work or hobbies. Many say it work...
   Ë£ÅÂà§ÊâìÂàÜ: [92m10/10[0m
   ‰∫ãÂÆûÊ†∏Êü•ÈÄöËøá

[96m[1m>>> C1 Ê®°ÂºèÊµãËØï[0m
   ÁîüÊàêÊëòË¶Å: (+) Exceptional performance, rapid startup, lightweight design, strong value for money, and high rel...
   Ë£ÅÂà§ÊâìÂàÜ: [92m9/10[0m
   ‰∫ãÂÆûÊ†∏Êü•ÈÄöËøá
------------------------------------------------------------

[2/50] ÊµãËØï‰∫ßÂìÅ: Electronics (ASIN: B0B2CX4CKD)

[96m[1m>>> A1 Ê®°ÂºèÊµãËØï[0m
   ÁîüÊàêÊëòË¶Å: Most people like this microphone because it has good sound, easy setup, and cool features like light...
   Ë£ÅÂà§ÊâìÂàÜ: [92m10/10[0m
   ‰∫ãÂÆûÊ†∏Êü•ÈÄöËøá

[96m[1m>>> C1 Ê®°ÂºèÊµãËØï[0m
   ÁîüÊàêÊëòË¶Å: **Strengths:** - **Sound Quality:** Consistently praised for clear, crisp audio, particul