In [None]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [1]:

# --- CONFIGURATION ---
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys
import json
import re
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm
from IPython.display import display, Markdown

# --- CONFIGURATION ---
INPUT_FILE = "/content/drive/MyDrive/mipd_train_16k.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/mipd_train_cot.jsonl"
DENIED_SAMPLES_FILE = "/content/drive/MyDrive/denied_samples.txt" # New logging file
CHECKPOINT_FILE = "/content/drive/MyDrive/checkpoint.txt"  # NEW: Stores the current index

TEACHER_MODEL = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 16384 # 16 k to process full article length
MAX_NEW_TOKENS = 512
MAX_RETRIES = 3
SAMPLE_LIMIT = None # Set to None for full processing

# --- LOAD MODEL ---
print(f"Loading Teacher Model: {TEACHER_MODEL}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = TEACHER_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

# --- RESUME LOGIC ---
start_index = 0

if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, 'r') as f:
        try:
            content = f.read().strip()
            if content:
                start_index = int(content) + 1 # Start from the NEXT item
        except ValueError:
            start_index = 0

print(f"RESUME: Starting from index {start_index}")

# --- HELPERS ---
def clean_json_string(text):
    return text.replace("```json", "").replace("```", "").strip()

def validate_reasoning(reasoning, techniques):
    for t in techniques:
        # Check if technique name (case-insensitive) is in the reasoning
        if t.lower() not in reasoning.lower():
            return False
    return True

# --- MAIN LOOP ---
print("Starting Synthetic Dataset Generation...")

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    all_lines = f.readlines()

if SAMPLE_LIMIT:
    all_lines = all_lines[:SAMPLE_LIMIT]

lines_to_process = all_lines[start_index:]
print(f"Processing {len(lines_to_process)} remaining samples...")

with open(OUTPUT_FILE, 'a', encoding='utf-8') as f_out:

    for i, line in tqdm(enumerate(lines_to_process, start=start_index), total=len(lines_to_process)):
        data = json.loads(line)
        user_input = data['input']
        raw_json_str = clean_json_string(data['output'])

        # Extract techniques list upfront
        try:
            techniques_list = json.loads(raw_json_str).get("discovered_techniques", [])
        except:
            techniques_list = []

        success = False
        reasoning_text = ""

        if not techniques_list:
            # Hardcode the response if no techniques are found
            reasoning_text = "Tekst ma charakter informacyjny i nie zawiera cech manipulacji."
            success = True
        else:
            # --- HARDENED PROMPT ---
            system_prompt = (
                "JesteÅ› surowym sÄ™dziÄ… i ekspertem od dezinformacji. "
                "Twoim zadaniem jest uzasadnienie werdyktu dotyczÄ…cego manipulacji w tekÅ›cie."
            )

            user_prompt = (
                f"TEKST:\n\"{user_input}\"\n\n"
                f"WERDYKT (Techniki): {raw_json_str}\n\n"
                "Napisz uzasadnienie dla tego werdyktu. "
                "Zasady:\n"
                "1. Pisz w trybie orzekajÄ…cym (np. 'Autor stosuje...', 'Tekst zawiera...'). "
                "2. NIE uÅ¼ywaj sÅ‚Ã³w niepewnoÅ›ci ('wydaje siÄ™', 'byÄ‡ moÅ¼e', 'prawdopodobnie').\n"
                "3. Traktuj podane techniki jako FAKT. WyjaÅ›nij GDZIE i DLACZEGO wystÄ™pujÄ….\n"
                "4. Wygeneruj TYLKO treÅ›Ä‡ uzasadnienia (bez cudzysÅ‚owÃ³w i formatowania).\n"
                "5. KaÅ¼da technika z listy MUSI byÄ‡ wymieniona z nazwy dokÅ‚adnie tak jak w werdykcie.\n"
                "6. Nie wolno omawiaÄ‡ technik, ktÃ³rych nie ma na liÅ›cie.\n"
            )

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]

            input_ids = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to("cuda")

            for attempt in range(MAX_RETRIES):
                try:
                    with torch.no_grad():
                        outputs = model.generate(
                            input_ids=input_ids,
                            max_new_tokens=MAX_NEW_TOKENS,
                            attention_mask=input_ids.ne(tokenizer.pad_token_id),
                            use_cache=True,
                            temperature=0.2,
                            top_p=0.9,
                            pad_token_id=tokenizer.eos_token_id
                        )

                    generated_text = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True).strip()
                    if not validate_reasoning(generated_text, techniques_list):
                        with open(DENIED_SAMPLES_FILE, 'a', encoding='utf-8') as f_denied:
                            f_denied.write(f"--- Denied Sample {i+1} ---\n")
                            f_denied.write(f"User Prompt:\n{user_prompt}\n")
                            f_denied.write(f"Generated Text:\n{generated_text}\n")
                            f_denied.write(f"Techniques List: {techniques_list}\n\n")
                        continue  # skip sample if reasoning doesn't mention techniques

                    if len(generated_text) > 10:
                        reasoning_text = generated_text
                        success = True
                        break
                except Exception as e:
                    print(f"Error generating text for sample {i+1}: {e}")
                    continue

        if not success:
            continue

        # --- CONSTRUCT FINAL JSON OBJECT ---
        # We perform the merge here in Python to guarantee valid JSON structure for the UI
        final_structure = {
            "reasoning": reasoning_text,
            "discovered_techniques": techniques_list
        }

        # Save dataset entry
        new_entry = data.copy()
        # This 'output' is now a perfect stringified JSON ready for SFT
        new_entry['output'] = json.dumps(final_structure, ensure_ascii=False)
        new_entry['original_output'] = raw_json_str

        f_out.write(json.dumps(new_entry, ensure_ascii=False) + "\n")
        f_out.flush()

        if SAMPLE_LIMIT and i < SAMPLE_LIMIT:
            print(f"\n--- Sample {i+1} ---")
            print(json.dumps(final_structure, indent=2, ensure_ascii=False))

        with open(CHECKPOINT_FILE, 'w') as f_ckpt:
            f_ckpt.write(str(i))
print("Done")

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Loading Teacher Model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit...
==((====))==  Unsloth 2026.1.3: Fast Qwen2 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
RESUME: Found 2176 already processed samples.
Starting Synthetic Dataset Generation...


Generating:   0%|          | 0/10733 [00:00<?, ?it/s]

In [2]:
OUTPUT_FILE = "/content/drive/MyDrive/mipd_train_cot.jsonl"

record_count = 0
try:
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            record_count += 1
    print(f"Total records in '{OUTPUT_FILE}': {record_count}")
except FileNotFoundError:
    print(f"Error: Output file '{OUTPUT_FILE}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Total records in '/content/drive/MyDrive/mipd_train_cot.jsonl': 2180


In [7]:

INPUT_FILE = "/content/drive/MyDrive/mipd_train_16k.jsonl"
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    all_lines = f.readlines()
print(record_count)
print(len(all_lines))
skipped_count = len(all_lines) - record_count
print(f"Done. Skipped {skipped_count} problematic samples.")

2180
10733
Done. Skipped 8553 problematic samples.
