In [None]:
%%capture
# Dependency setup. %%capture (which must stay on the first line of the cell)
# hides the pip output of everything below.
import os, re
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Reduce the installed torch version to "major.minor" and pick the xformers
    # release pinned for it (2.9 / 2.8 / anything else).
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps: install exactly these packages without letting pip pull in their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions, installed in both branches after the steps above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:

# --- MOUNT GOOGLE DRIVE ---
# Colab-only: mounts Google Drive at /content/drive so that the
# /content/drive/MyDrive/... dataset paths used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import sys
import re
import json
import time
import torch
from unsloth import FastLanguageModel
from tqdm import tqdm
from IPython.display import display, Markdown

# --- CONFIGURATION ---
# Source JSONL; each row is read through its 'input' and 'output' keys.
INPUT_FILE = "/content/drive/MyDrive/mipd_train_16k.jsonl"
# Destination JSONL; opened in append mode and re-read at startup to skip rows
# that were already written (resume support).
OUTPUT_FILE = "/content/drive/MyDrive/mipd_train_cot_reasoning_only.jsonl"

# Model name handed to FastLanguageModel.from_pretrained.
TEACHER_MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
# Passed as max_seq_length when loading the model.
MAX_SEQ_LENGTH = 4096
# Upper bound on generated tokens per attempt (max_new_tokens).
MAX_NEW_TOKENS = 512 # Increased slightly for better reasoning
# Generation attempts per sample before the canned fallback text is used.
MAX_RETRIES = 3

# Set to None to process EVERYTHING. Set to 5 for a quick test.
# NOTE: any truthy value truncates the input to its first SAMPLE_LIMIT lines
# and enables the per-sample preview printing.
SAMPLE_LIMIT = 5

# --- 1. LOAD MODEL ---
# Loads the teacher model and its tokenizer through Unsloth, then switches the
# model to Unsloth's inference mode. Everything below uses `model` and
# `tokenizer` as created here.
print(f"Loading Teacher Model: {TEACHER_MODEL}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = TEACHER_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    # dtype=None leaves the dtype choice to Unsloth (presumably auto-detected
    # from the GPU — confirm against the Unsloth docs).
    dtype = None,
    load_in_4bit = True,
)
# Must run before generate(); the model is only used for generation in this notebook.
FastLanguageModel.for_inference(model)

# --- 2. HELPERS ---

def clean_json_string(text):
    """Strip Markdown code-fence markers and surrounding whitespace.

    Every "```json" marker and every remaining "```" marker is removed
    (wherever it occurs in the string), then leading/trailing whitespace is
    trimmed. The result is the bare JSON text; it is not parsed or validated.
    """
    # Order matters: the longer "```json" marker has to go first, otherwise
    # stripping "```" would leave a stray "json" behind.
    for fence in ("```json", "```"):
        text = text.replace(fence, "")
    return text.strip()

def contains_chinese(text):
    """Return True when `text` has at least one character in U+4E00..U+9FFF.

    That range is the CJK Unified Ideographs block; characters outside it
    (including Latin letters with diacritics) do not count.
    """
    first_hit = re.search(r'[\u4e00-\u9fff]', text)
    return first_hit is not None

# --- 3. RESUME LOGIC ---
# Collect the 'input' text of every row already present in OUTPUT_FILE so a
# restarted run skips those samples instead of generating them again.
processed_inputs = set()
malformed_rows = 0
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                processed_inputs.add(json.loads(line)['input'])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Best-effort: a row cut short by an interrupted run, or one
                # without a usable 'input' key, is ignored (its sample will be
                # regenerated). Only these data errors are tolerated — anything
                # else, including KeyboardInterrupt, now propagates.
                malformed_rows += 1
    print(f"RESUMING: Skipping {len(processed_inputs)} samples.")
    if malformed_rows:
        print(f"WARNING: ignored {malformed_rows} unreadable rows in {OUTPUT_FILE}.")

# --- 4. MAIN LOOP ---
# For every input row, ask the teacher model for a short Polish rationale that
# explains the labels already stored in the row, then append
# "<rationale> + fenced original JSON" to OUTPUT_FILE. The labels themselves
# are copied from the input, never generated.
print("Starting Logic-Only Generation...")

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    # Drop blank lines (e.g. a trailing newline) so json.loads below never
    # receives an empty record.
    all_lines = [row for row in f if row.strip()]

if SAMPLE_LIMIT:
    all_lines = all_lines[:SAMPLE_LIMIT]
    print(f"DEBUG MODE: Processing {SAMPLE_LIMIT} rows only.")

# SYSTEM: Define the persona. Identical for every row, so it is built once.
# (Polish: "You are an expert in information verification and logic. Your task
# is to explain why the text was tagged with specific manipulation techniques.
# If the text has no techniques, explain why it is neutral.")
system_prompt = (
    "Jesteś ekspertem od weryfikacji informacji i logiki. "
    "Twoim zadaniem jest wyjaśnienie, dlaczego dany tekst został oznaczony konkretnymi technikami manipulacji. "
    "Jeśli tekst nie ma technik, wyjaśnij dlaczego jest neutralny."
)

# Canned rationale written when every generation attempt is rejected or fails.
# (Polish: "The analysis of manipulation techniques confirmed agreement with
# the assigned labels.")
fallback_reasoning = "Analiza technik manipulacji wykazała zgodność z przypisanymi etykietami."
fallback_count = 0

with open(OUTPUT_FILE, 'a', encoding='utf-8') as f_out:

    for i, line in tqdm(enumerate(all_lines), total=len(all_lines), desc="Generating"):
        data = json.loads(line)
        user_input = data['input']

        # Already written by a previous run, or earlier in this run.
        if user_input in processed_inputs:
            continue

        # Get raw JSON string for the prompt
        raw_json_str = clean_json_string(data['output'])

        # USER: Give text + labels
        # (Polish: "Write a concise justification (at most 3 sentences). Explain
        # which fragments of the text correspond to the assigned labels. Do not
        # repeat the labels, explain the mechanism of their use.")
        user_prompt = (
            f"TEKST:\n\"{user_input}\"\n\n"
            f"PRZYPISANE ETYKIETY:\n{raw_json_str}\n\n"
            "Napisz zwięzłe uzasadnienie (maksymalnie 3 zdania). "
            "Wyjaśnij, które fragmenty tekstu odpowiadają przypisanym etykietom. "
            "Nie powtarzaj treści etykiet, wyjaśnij mechanizm ich użycia."
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to("cuda")
        # A single, unpadded prompt: every position is a real token. Passing the
        # mask explicitly is required because pad_token_id == eos_token_id, so
        # generate() cannot infer it (it warns and may misbehave otherwise).
        attention_mask = torch.ones_like(input_ids)

        success = False
        final_reasoning = ""

        for attempt in range(MAX_RETRIES):
            try:
                with torch.no_grad():
                    outputs = model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=MAX_NEW_TOKENS,
                        use_cache=True,
                        # temperature only has an effect when sampling; request
                        # it explicitly so a retry can yield a different text
                        # regardless of the checkpoint's default generation config.
                        do_sample=True,
                        temperature=0.7,
                        pad_token_id=tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens (everything after the prompt).
                generated_text = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

                # Simple Guardrails
                if contains_chinese(generated_text):
                    continue
                if len(generated_text) < 10: # Too short
                    continue

                final_reasoning = generated_text
                success = True
                break

            except Exception as e:
                print(f"Error (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
                continue

        # Fallback if generation failed
        if not success:
            final_reasoning = fallback_reasoning
            fallback_count += 1

        # CONSTRUCT FINAL OUTPUT MANUALLY
        # Format: [Reasoning] \n ```json {Original JSON} ```
        final_output_string = f"{final_reasoning}\n\n```json\n{raw_json_str}\n```"

        # Save
        new_entry = data.copy()
        new_entry['output'] = final_output_string
        new_entry['original_output'] = raw_json_str # Safe copy

        f_out.write(json.dumps(new_entry, ensure_ascii=False) + "\n")
        # Flush per row so an interrupted session loses at most the current sample.
        f_out.flush()
        # Keep the in-memory set in step with the file, so a duplicated input is
        # skipped within this run exactly as it would be after a resume.
        processed_inputs.add(user_input)

        # Pretty Print for Review
        if SAMPLE_LIMIT and i < SAMPLE_LIMIT:
            print(f"\n--- Sample {i+1} ---")
            display(Markdown(f"**LABELS:** `{raw_json_str}`"))
            display(Markdown(f"**REASONING:** {final_reasoning}"))
            print("-" * 40)

if fallback_count:
    # These rows carry the canned rationale, not a model-written one; review
    # or filter them before using the file for training.
    print(f"WARNING: {fallback_count} samples used the fallback reasoning.")
print(f"Done! Saved to {OUTPUT_FILE}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Teacher Model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit...
==((====))==  Unsloth 2026.1.2: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Starting Robust Generation...


Generating:   0%|          | 0/5 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating: 100%|██████████| 5/5 [04:04<00:00, 49.00s/it]

Done! Saved to /content/drive/MyDrive/mipd_train_cot_synthetic_robust.jsonl





In [None]:
import json
import re

# Update path if necessary (e.g., if you are running this locally or in colab)
# NOTE(review): this path differs from OUTPUT_FILE in the generation cell
# (".../mipd_train_cot_reasoning_only.jsonl") — confirm which file is meant to
# be inspected. The assignment also rebinds the INPUT_FILE name that the
# generation cell uses for its source dataset.
INPUT_FILE = "/content/drive/MyDrive/mipd_train_cot_synthetic_robust.jsonl"

def extract_json(text):
    """Pull the JSON payload out of a "reasoning + JSON" string.

    Two patterns are tried in order and the first hit wins:
      1. the contents of the first ```json ... ``` fenced block;
      2. a brace-delimited span that runs from the first "{" to a "}" sitting
         at the very end of the text.

    Returns the captured text with surrounding whitespace stripped, or None
    when neither pattern matches. The result is not parsed or validated.
    """
    candidate_patterns = (
        r'```json(.*?)```',   # fenced block (non-greedy: stops at the first closing fence)
        r'(\{.*\})$',         # fallback: braces anchored to the end of the text
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, text, re.DOTALL)
        if found is not None:
            return found.group(1).strip()
    return None

# Sanity-check the generated file: for each row, re-extract the JSON embedded
# in 'output' and compare its 'discovered_techniques' with the labels kept in
# 'original_output'. The first few rows are printed for manual review and a
# summary follows.
print(f"Inspecting {INPUT_FILE}...\n")

valid_count = 0      # rows whose embedded JSON (and ground truth) parsed
total_count = 0      # non-blank rows read
mismatch_count = 0   # parsed rows whose tag set differs from the ground truth
match_count = 0      # parsed rows whose tag set equals the ground truth
PREVIEW_ROWS = 5     # how many rows to print in full
SNIPPET_CHARS = 300  # how much of each input to show in the preview

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines instead of crashing in json.loads.
        if not line.strip():
            continue
        row_index = total_count
        total_count += 1
        data = json.loads(line)

        user_input = data['input']
        ground_truth = data['original_output'] # The tags we wanted to keep
        generated_output = data['output']      # The Reasoning + JSON

        # 1. PARSE GENERATED JSON
        extracted_json_str = extract_json(generated_output)

        # 2. VALIDITY CHECK
        is_valid_json = False
        tags_match = False
        gen_tags = gt_tags = None

        if extracted_json_str:
            try:
                gen_json = json.loads(extracted_json_str)
                gt_json = json.loads(ground_truth)

                # Check if tags are identical (ignoring order)
                # Handle cases where keys might be slightly different or missing
                gen_tags = set(gen_json.get('discovered_techniques', []))
                gt_tags = set(gt_json.get('discovered_techniques', []))
            except (ValueError, AttributeError, TypeError):
                # ValueError: not parseable JSON; AttributeError: top level is
                # not an object; TypeError: tags are not a list of hashables.
                # The row is simply counted as invalid.
                pass
            else:
                is_valid_json = True
                valid_count += 1
                tags_match = gen_tags == gt_tags
                if tags_match:
                    match_count += 1
                else:
                    mismatch_count += 1

        # 3. PRINT FIRST 5 SAMPLES FOR MANUAL REVIEW
        if row_index < PREVIEW_ROWS:
            print(f"--- SAMPLE {row_index+1} ---")
            print(f"INPUT (Snippet): {user_input[:SNIPPET_CHARS]}...")
            print(f"GROUND TRUTH: {ground_truth}")
            print(f"\nGENERATED OUTPUT:\n{generated_output}")
            print("-" * 20)
            print(f"JSON Valid? {'✅' if is_valid_json else '❌'}")
            print(f"Tags Preserved? {'✅' if tags_match else '⚠️ (Model changed tags!)'}")
            if not tags_match and is_valid_json:
                print(f"   Expected: {gt_tags}")
                print(f"   Got:      {gen_tags}")
            print("="*60 + "\n")

print(f"\nSummary:")
print(f"Total Processed: {total_count}")
if total_count:
    print(f"Valid JSON Format: {valid_count}/{total_count} ({(valid_count/total_count)*100:.1f}%)")
    # Only rows that parsed AND matched are counted; rows with invalid JSON are
    # no longer reported as consistent.
    print(f"Tag Consistency: {match_count}/{total_count} tags matched exactly.")
else:
    print("No samples found - nothing to validate.")

Inspecting /content/drive/MyDrive/mipd_train_cot_synthetic_robust.jsonl...

--- SAMPLE 1 ---
INPUT (Snippet): "Terapia homoseksualizmu ‚Äì szansa czy oszustwo? - Strona ≈ªycia" "Osoby te sƒÖ przypuszczalnie sterowane przez ≈õrodowiska gejowskie, ale te≈º przez dziennikarzy, kt√≥rzy szukajƒÖ gorƒÖcego medialnego tematu. Np. redaktor Jan J√≥zefowicz z telewizji WTK jako gej szukajƒÖcy pomocy przyszed≈Ç do poradni z ukrytym nadajnikiem, kt√≥ry obs≈Çugiwa≈Ça bƒôdƒÖca w pobli≈ºu jego redakcyjna kole≈ºanka. Jednocze≈õnie jest przecie≈º spora grupa os√≥b, kt√≥re naprawdƒô szukajƒÖ pomocy w przezwyciƒô≈ºeniu w≈Çasnego homoseksualizmu. Jak odr√≥≈ºniƒá jednych od drugich, jak chroniƒÖc siƒô przed prowokatorami ‚Äì nie odm√≥wiƒá pomocy potrzebujƒÖcym? Homoseksualizm egodystoniczny PotrzebujƒÖcƒÖ jest osoba, kt√≥ra odczuwajƒÖc popƒôd p≈Çciowy do os√≥b tej samej p≈Çci (SSA ‚Äì same sex attractions) tego pociƒÖgu w sobie nie akceptuje, nie chce. Sama przecie≈º tak nie wybra≈Ça, nie spowodowa≈Ça, ma 