### References

*   [https://www.kaggle.com/code/abdmental01/jigsaw-mpnet-base-v2-inference-cv-0-876](https://www.kaggle.com/code/abdmental01/jigsaw-mpnet-base-v2-inference-cv-0-876)
*   [https://www.kaggle.com/code/aerdem4/jigsaw-acrc-qwen7b-finetune-logits-processor-zoo](https://www.kaggle.com/code/aerdem4/jigsaw-acrc-qwen7b-finetune-logits-processor-zoo)
*   [https://www.guruguru.science/competitions/24/discussions/21027ff1-2074-4e21-a249-b2d4170bd516/](https://www.guruguru.science/competitions/24/discussions/21027ff1-2074-4e21-a249-b2d4170bd516/)
*   https://www.kaggle.com/code/mks2192/jigsaw-llama3-1-8b-instruct-training-one-epoch
*   [https://www.kaggle.com/code/fuumin621/qwen2-5-lora-finetune-baseline-inference](https://www.kaggle.com/code/fuumin621/qwen2-5-lora-finetune-baseline-inference)
*   https://www.kaggle.com/code/neibyr/30-min-just-use-semantic-search-qwen3-emb-0-6b
*   https://www.kaggle.com/code/datafan07/jigsaw-speed-run-10-min-triplet-and-faiss
*   https://www.kaggle.com/code/nahidhossainredom/deberta-v3-base-3-epochs-lb-0-906
*   https://www.kaggle.com/code/wasupandceacar/jigsaw-pseudo-training-llama-3-2-3b-instruct

In [None]:
# Offline installs: --no-index with --find-links resolves every package from the
# wheel directory of the attached dataset instead of PyPI.
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'trl==0.21.0' 'optimum==1.27.0' 'auto-gptq==0.7.1' 'bitsandbytes==0.46.1' 'logits-processor-zoo==0.2.1' 'vllm==0.10.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'deepspeed==0.17.4' -q
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'triton==3.2.0'
!uv pip install --system --no-index --find-links='/kaggle/input/jigsaw-packages2/whls/' 'clean-text'
# -U --no-deps: upgrade these three packages only, without touching their dependencies.
!uv pip install --system --no-index -U --no-deps --find-links='/kaggle/input/jigsaw-packages2/whls/' 'peft' 'accelerate' 'datasets'

In [None]:
%%writefile constants.py

# Seed handed to deterministic.init_all() at the bottom of this module and reused
# as random_state when utils.get_df() samples test.csv.
seed = 0

# Base checkpoint that train.py fine-tunes and inference.py serves.
base_model_path = "/kaggle/input/jigsaw-pretrain-public/pytorch/llama-3.2-3b-instruct/1"
# Optional LoRA adapter merged into the base model before training; a falsy value skips the merge.
pretrain_lora_path = None
# Directory train.py saves the adapter to and inference.py loads it from.
lora_path = "/kaggle/working/pseudo_lora"
# GPTQ checkpoints are recognised purely by the substring "gptq" in the path.
use_gptq = "gptq" in base_model_path

# Completion strings for the two classes, and the cue that precedes the answer in the prompt.
positive = "Yes"
negative = "No"
judge_words = "Violation:"
system_prompt = '''You are given a comment from reddit and a rule. 
Your task is to classify whether the comment violates the rule. 
Only respond Yes/No.'''

# Fraction of test.csv rows sampled per rule in utils.get_df().
frac = 0.05
# Whether utils.get_df() also includes the labelled rows from train.csv.
use_train = True

import kagglehub

# These two statements run when this module is first imported in a process.
# NOTE(review): init_all presumably seeds the Python/NumPy/torch RNGs — confirm against the package.
deterministic = kagglehub.package_import('wasupandceacar/deterministic').deterministic
deterministic.init_all(seed)

In [None]:
%%writefile utils.py

import numpy as np
import pandas as pd
from datasets import Dataset
from constants import *

def build_prompt(row):
    """Render the few-shot classification prompt for one example.

    ``row`` must provide ``subreddit``, ``rule``, ``positive_example``,
    ``negative_example`` and ``body``. The prompt ends with ``judge_words`` so
    the model's next token is the answer.
    """
    # The in-prompt example answers use the same ``positive``/``negative``
    # constants as the training completions and the inference choices, so
    # changing the label strings in constants.py cannot desynchronise them.
    return f"""{system_prompt}
Subreddit: r/{row["subreddit"]}
Rule: {row["rule"]}
Examples:
1) {row["positive_example"]}
{judge_words} {positive}
2) {row["negative_example"]}
{judge_words} {negative}
Comment: {row["body"]}
{judge_words}"""

def get_df():
    """Assemble the fine-tuning frame from train.csv and pseudo-labelled test examples.

    When ``use_train`` is set, the labelled train.csv rows are included with
    one randomly chosen positive and negative example each. A per-rule sample
    (``frac``) of test.csv is then expanded: each of its four example columns
    becomes a ``body`` labelled 1 (positive examples) or 0 (negative examples),
    paired with the sibling example of the same kind and a randomly chosen
    example of the opposite kind. Exact duplicate rows are dropped.
    """
    example_cols = ["positive_example_1", "positive_example_2",
                    "negative_example_1", "negative_example_2"]

    def coin_flip_pick(frame, kind):
        # Per row, take example 1 when the uniform draw is below 0.5, else example 2.
        use_first = np.random.rand(len(frame)) < 0.5
        return np.where(use_first, frame[f"{kind}_example_1"], frame[f"{kind}_example_2"])

    frames = []
    if use_train:
        raw_train = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/train.csv")
        labelled = raw_train[["body", "rule", "subreddit", "rule_violation"] + example_cols].copy()
        # Positive is drawn before negative; the draw order is part of the RNG stream.
        labelled["positive_example"] = coin_flip_pick(labelled, "positive")
        labelled["negative_example"] = coin_flip_pick(labelled, "negative")
        frames.append(labelled.drop(columns=example_cols))

    sampled = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")
    sampled = (
        sampled.groupby('rule', group_keys=False)
        .apply(lambda group: group.sample(frac=frac, random_state=seed))
        .reset_index(drop=True)
    )
    print(f"Select {len(sampled)} test data")

    opposite = {"positive": "negative", "negative": "positive"}
    for kind in ("positive", "negative"):
        label = 1 if kind == "positive" else 0
        for idx in (1, 2):
            pseudo = sampled[["rule", "subreddit"] + example_cols].copy()
            # Example `idx` becomes the comment; its sibling (3 - idx) stays as the shown example.
            pseudo["body"] = pseudo[f"{kind}_example_{idx}"]
            pseudo[f"{kind}_example"] = pseudo[f"{kind}_example_{3 - idx}"]
            pseudo[f"{opposite[kind]}_example"] = coin_flip_pick(pseudo, opposite[kind])
            pseudo["rule_violation"] = label
            frames.append(pseudo.drop(columns=example_cols))

    return pd.concat(frames, axis=0).drop_duplicates(ignore_index=True)

def build_dataset(df):
    """Turn a frame of examples into a ``datasets.Dataset`` of prompts.

    Adds a ``prompt`` column to ``df`` in place. When ``rule_violation`` is
    present, a ``completion`` column (1 -> ``positive``, 0 -> ``negative``) is
    added as well and included in the returned dataset.
    """
    df["prompt"] = df.apply(build_prompt, axis=1)
    if "rule_violation" not in df:
        return Dataset.from_pandas(df[["prompt"]])

    label_to_text = {1: positive, 0: negative}
    df["completion"] = df["rule_violation"].map(label_to_text)
    return Dataset.from_pandas(df[["prompt", "completion"]])

In [None]:
%%writefile train.py

import torch
import pandas as pd
from trl import SFTTrainer, SFTConfig
from peft import PeftModel, LoraConfig, get_peft_model
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import is_torch_bf16_gpu_available

from utils import *
from constants import *

def _load_base_model():
    """Load the base causal LM; non-GPTQ checkpoints are quantized to 4-bit NF4 on load."""
    shared_kwargs = dict(
        device_map="balanced_low_0",
        trust_remote_code=True,
        use_cache=False,
    )
    if use_gptq:
        return AutoModelForCausalLM.from_pretrained(base_model_path, **shared_kwargs)

    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    return AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=nf4_config,
        **shared_kwargs,
    )


def main():
    """Fine-tune a LoRA adapter on the prompt/completion dataset and save it to ``lora_path``.

    If the dataset is empty, an untrained adapter and the tokenizer are saved
    instead, so that ``lora_path`` still exists afterwards.
    """
    train_dataset = build_dataset(get_df())

    lora_config = LoraConfig(
        r=64,
        lora_alpha=128,
        lora_dropout=0.1,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )

    # Exactly one of bf16/fp16 is switched on, depending on what the GPU reports.
    bf16_supported = is_torch_bf16_gpu_available()
    training_args = SFTConfig(
        num_train_epochs=1,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        learning_rate=1e-4,
        weight_decay=0.01,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        bf16=bf16_supported,
        fp16=not bf16_supported,
        dataloader_pin_memory=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        save_strategy="no",
        report_to="none",
        completion_only_loss=True,
        packing=False,
        remove_unused_columns=False,
    )

    model = _load_base_model()
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    tokenizer.pad_token = tokenizer.eos_token

    if pretrain_lora_path:
        # Fold a previously trained adapter into the base weights before training a new one.
        model = PeftModel.from_pretrained(model, pretrain_lora_path).merge_and_unload()

    if len(train_dataset) == 0:
        get_peft_model(model, lora_config).save_pretrained(lora_path)
        tokenizer.save_pretrained(lora_path)
        return

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
    )
    trainer.train()
    trainer.save_model(lora_path)


if __name__ == "__main__":
    main()

In [None]:
%%writefile accelerate_config.yaml
# Accelerate launch settings, consumed by
# `accelerate launch --config_file accelerate_config.yaml train.py`.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # These three values match gradient_accumulation_steps, max_grad_norm and
  # per_device_train_batch_size in train.py's SFTConfig.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  train_micro_batch_size_per_gpu: 4
  
  # ZeRO stage 2 with no optimizer or parameter offloading.
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # NOTE(review): zero_stage is 2 above, so these stage3_* settings presumably
  # have no effect — confirm before relying on them.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  # Alternative fp16 settings, currently disabled:
  # fp16:
  #   enabled: true
  #   loss_scale: 0
  #   initial_scale_power: 16
  #   loss_scale_window: 1000
  #   hysteresis: 2
  #   min_loss_scale: 1
  # NOTE(review): bf16 is enabled here while mixed_precision below is fp16 —
  # confirm which precision is intended.
  bf16:
    enabled: true
  
distributed_type: DEEPSPEED
downcast_bf16: 'yes'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
# Two training processes on a single machine.
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

In [None]:
%%writefile inference.py

import os
os.environ["VLLM_USE_V1"] = "0"

import random
import vllm
import torch
import numpy as np
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from vllm.lora.request import LoRARequest
from utils import build_dataset
from constants import *
import multiprocessing as mp

def run_inference_on_device(df_slice):
    """Score one shard of the test set with the LoRA-adapted model via vLLM.

    Generates a single token per prompt, restricted to the ``positive`` /
    ``negative`` choices, and converts the returned log-probabilities to
    probabilities. Returns a DataFrame with one column per choice plus
    ``row_id`` taken from ``df_slice``.
    """
    engine = vllm.LLM(
        base_model_path,
        quantization="gptq" if use_gptq else None,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.98,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=2048,
        disable_log_stats=True,
        enable_prefix_caching=True,
        enable_lora=True,
        max_lora_rank=64,
    )
    tokenizer = engine.get_tokenizer()

    prompts = build_dataset(df_slice)["prompt"]
    choice_processor = MultipleChoiceLogitsProcessor(tokenizer, choices=[positive, negative])
    sampling = vllm.SamplingParams(
        skip_special_tokens=True,
        max_tokens=1,
        logits_processors=[choice_processor],
        logprobs=2,
    )
    generations = engine.generate(
        prompts,
        sampling,
        use_tqdm=True,
        lora_request=LoRARequest("lora1", 1, lora_path),
    )

    # One dict per prompt: decoded token -> probability, from the first (only) generated step.
    token_probs = []
    for generation in generations:
        first_step = generation.outputs[0].logprobs[0]
        token_probs.append(
            {entry.decoded_token: np.exp(entry.logprob) for entry in first_step.values()}
        )

    predictions = pd.DataFrame(token_probs)[[positive, negative]]
    predictions["row_id"] = df_slice["row_id"].values
    return predictions

def worker(device_id, df_slice, return_dict):
    """Process entry point: run inference for one shard on one GPU.

    Sets ``CUDA_VISIBLE_DEVICES`` to ``device_id`` before the model is created
    and stores the resulting predictions in ``return_dict[device_id]``.
    """
    visible_gpu = str(device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu
    print(f"[Worker {device_id}] Running on GPU {device_id}, data size={len(df_slice)}")
    return_dict[device_id] = run_inference_on_device(df_slice)

def main():
    """Run inference on the test set across two GPUs and write the submission CSV.

    Each test row gets one randomly chosen positive and negative example. The
    rows are split in half and each half is scored in its own process on GPUs
    0 and 1. The probability of ``positive`` is written as ``rule_violation``.

    Raises:
        RuntimeError: if a worker process exits abnormally or returns no
            predictions (previously this surfaced as an opaque ``KeyError``).
    """
    test_df = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")
    test_df["positive_example"] = test_df.apply(lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]), axis=1)
    test_df["negative_example"] = test_df.apply(lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]), axis=1)
    test_df = test_df.drop(columns=["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"], errors="ignore")

    mid = len(test_df) // 2
    shards = {
        0: test_df.iloc[:mid].reset_index(drop=True),
        1: test_df.iloc[mid:].reset_index(drop=True),
    }

    manager = mp.Manager()
    return_dict = manager.dict()
    processes = {
        device_id: mp.Process(target=worker, args=(device_id, shard, return_dict))
        for device_id, shard in shards.items()
    }
    for process in processes.values():
        process.start()
    for process in processes.values():
        process.join()

    # A worker that crashed (e.g. CUDA OOM) never fills its slot; fail loudly
    # with the device and exit code instead of a KeyError further down.
    failed = {
        device_id: process.exitcode
        for device_id, process in processes.items()
        if process.exitcode != 0 or device_id not in return_dict
    }
    if failed:
        raise RuntimeError(f"Inference worker(s) failed (device -> exit code): {failed}")

    predictions = pd.concat([return_dict[0], return_dict[1]], ignore_index=True)
    submission = predictions[["row_id", positive]].rename(columns={positive: "rule_violation"})
    submission.to_csv("/kaggle/working/submission_qwen.csv", index=False)


if __name__ == "__main__":
    main()

In [None]:
# Train the LoRA adapter with the accelerate/DeepSpeed config, then score the
# test set; inference.py writes /kaggle/working/submission_qwen.csv.
!accelerate launch --config_file accelerate_config.yaml train.py
!python inference.py


In [None]:
import os
import pandas as pd

In [None]:
%%writefile constants.py
# Base embedding checkpoint loaded in semantic.py.
# NOTE(review): the identifier is misspelled ("EMBDEDDING"); semantic.py imports
# it under this exact name, so any rename has to change both places together.
EMBDEDDING_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
# Directory of the LoRA adapter that semantic.py merges into the base model.
# NOTE(review): the path says "8b" while the base checkpoint above is the 0.6b
# model — confirm the adapter matches the base.
MODEL_OUTPUT_PATH = '/kaggle/input/qwen3-8b-embedding'
# Directory containing train.csv and test.csv.
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules"

# https://huggingface.co/Qwen/Qwen3-Embedding-0.6B/blob/main/config_sentence_transformers.json
# Passed as the `prompt` argument when encoding queries (test rows) only.
EMBEDDING_MODEL_QUERY = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"

# Whether prepare_dataframe() runs the text cleaner over the prompts.
CLEAN_TEXT = True
# Number of nearest corpus texts that contribute to each test row's score.
TOP_K = 2000
# Batch size used for embedding_model.encode().
BATCH_SIZE = 128

In [None]:
%%writefile utils.py
import pandas as pd
import torch.distributed as dist

from datasets import Dataset
from cleantext import clean
from tqdm.auto import tqdm

from constants import CLEAN_TEXT


def build_prompt(row):
    """Format one row as the text to embed: the subreddit line followed by the comment."""
    subreddit = row["subreddit"]
    comment = row["body"]
    return f"r/{subreddit}\nComment: {comment}"


def cleaner(text):
    """Normalise ``text`` with ``cleantext.clean``.

    Unicode is fixed and transliterated to ASCII; URLs, e-mail addresses and
    phone numbers are replaced by ``<URL>``, ``<EMAIL>`` and ``<PHONE>``.
    Case, line breaks, numbers, currency symbols and punctuation are kept.
    """
    options = {
        # Normalisation that is switched on.
        "fix_unicode": True,
        "to_ascii": True,
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": True,
        # Content that is deliberately left untouched.
        "lower": False,
        "no_line_breaks": False,
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        # Placeholders for the removed items.
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "lang": "en",
    }
    return clean(text, **options)



def get_dataframe_to_train(data_path):
    """Build the labelled corpus from train.csv plus examples taken from test.csv.

    Reads ``train.csv`` and a 60% sample (``random_state=42``) of ``test.csv``
    from ``data_path``. Every example column of the sampled test rows becomes a
    ``body`` labelled 1 (positive examples) or 0 (negative examples). Returns a
    frame with columns ``body``, ``rule``, ``subreddit``, ``rule_violation``
    with exact duplicate rows removed.
    """
    labelled = pd.read_csv(f"{data_path}/train.csv")
    example_source = (
        pd.read_csv(f"{data_path}/test.csv")
        .sample(frac=0.6, random_state=42)
        .reset_index(drop=True)
    )

    parts = [labelled[["body", "rule", "subreddit", "rule_violation"]]]
    for label, kind in ((1, "positive"), (0, "negative")):
        for idx in (1, 2):
            column = f"{kind}_example_{idx}"
            part = example_source[[column, "rule", "subreddit"]].rename(columns={column: "body"})
            part["rule_violation"] = label
            parts.append(part)

    return pd.concat(parts, axis=0).drop_duplicates(ignore_index=True)


def prepare_dataframe(dataframe):
    """Add the ``prompt`` column and recode labels, modifying ``dataframe`` in place.

    The prompt is built per row and, when ``CLEAN_TEXT`` is set, passed through
    ``cleaner``. If a ``rule_violation`` column exists, its values are mapped
    1 -> 1 and 0 -> -1 so they can be used as signed weights. Returns the same
    frame.
    """
    dataframe["prompt"] = dataframe.apply(build_prompt, axis=1)

    if CLEAN_TEXT:
        tqdm.pandas(desc="cleaner")
        dataframe["prompt"] = dataframe["prompt"].progress_apply(cleaner)

    has_labels = "rule_violation" in dataframe.columns
    if has_labels:
        signed_label = {1: 1, 0: -1}
        dataframe["rule_violation"] = dataframe["rule_violation"].map(signed_label)

    return dataframe

In [None]:
%%writefile semantic.py
import pandas as pd
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score
from tqdm.auto import tqdm
from peft import PeftModel, PeftConfig


from utils import get_dataframe_to_train, prepare_dataframe
from constants import DATA_PATH, EMBDEDDING_MODEL_PATH, EMBEDDING_MODEL_QUERY, TOP_K, BATCH_SIZE, MODEL_OUTPUT_PATH



def get_scores(test_dataframe):
    """Score every test row by label-weighted similarity to same-rule corpus texts.

    The LoRA adapter at ``MODEL_OUTPUT_PATH`` is merged into the base model,
    saved to ``Qwen3Emb_Finetuned`` and reloaded as a ``SentenceTransformer``.
    For each rule, test prompts (queries) and corpus prompts (documents) are
    embedded; each test row's score is the sum, over its ``TOP_K`` most
    similar corpus texts, of ``similarity * label`` where the label is +1 for
    violating and -1 for non-violating corpus texts.

    Args:
        test_dataframe: frame with ``row_id``, ``rule`` and ``prompt`` columns.

    Returns:
        DataFrame with ``row_id`` and ``rule_violation`` (the score), grouped
        by rule rather than in the original row order.
    """
    corpus_dataframe = get_dataframe_to_train(DATA_PATH)
    corpus_dataframe = prepare_dataframe(corpus_dataframe)

    # Load the base model and tokenizer.
    model = AutoModelForCausalLM.from_pretrained(EMBDEDDING_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(EMBDEDDING_MODEL_PATH)

    # Load the adapter, merge it into the base weights and save the result.
    adapter_config = PeftConfig.from_pretrained(MODEL_OUTPUT_PATH)
    lora_model = PeftModel.from_pretrained(model, MODEL_OUTPUT_PATH, config=adapter_config)
    merged_model = lora_model.merge_and_unload()
    tokenizer.save_pretrained("Qwen3Emb_Finetuned")
    merged_model.save_pretrained("Qwen3Emb_Finetuned")

    # Rebuild a SentenceTransformer from the merged encoder.
    embedding_model = SentenceTransformer(model_name_or_path="Qwen3Emb_Finetuned", device="cuda")

    print('Done loading model!')

    result = []
    for rule in tqdm(test_dataframe["rule"].unique(), desc="Generate scores for each rule"):
        test_dataframe_part = test_dataframe.query("rule == @rule").reset_index(drop=True)
        corpus_dataframe_part = corpus_dataframe.query("rule == @rule").reset_index(drop=True)

        # semantic_search reports hits by position in the encoded document
        # list, which is the row position in corpus_dataframe_part. A plain
        # list lookup therefore replaces the per-row DataFrame merge. Missing
        # labels become 0 so they contribute nothing, as the NaN-skipping
        # pandas sum did before.
        corpus_labels = corpus_dataframe_part["rule_violation"].fillna(0).tolist()

        query_embeddings = embedding_model.encode(
            sentences=test_dataframe_part["prompt"].tolist(),
            prompt=EMBEDDING_MODEL_QUERY,
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            convert_to_tensor=True,
            device="cuda",
            normalize_embeddings=True,
        )
        document_embeddings = embedding_model.encode(
            sentences=corpus_dataframe_part["prompt"].tolist(),
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            convert_to_tensor=True,
            device="cuda",
            normalize_embeddings=True,
        )
        hits_per_query = semantic_search(
            query_embeddings,
            document_embeddings,
            top_k=TOP_K,
            score_function=dot_score,
        )

        test_dataframe_part["rule_violation"] = [
            sum((hit["score"] * corpus_labels[hit["corpus_id"]] for hit in hits), 0.0)
            for hits in tqdm(hits_per_query, desc=f"Add label for {rule=}")
        ]
        result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())

    submission = pd.concat(result, axis=0)
    return submission


def generate_submission():
    """Score the test set and write ``submission_qwen3.csv`` in test.csv row order."""
    test_dataframe = prepare_dataframe(pd.read_csv(f"{DATA_PATH}/test.csv"))
    scores = get_scores(test_dataframe)

    # get_scores returns rows grouped by rule; a left merge on the test row_ids
    # puts them back in the order of test.csv.
    ordered = test_dataframe[["row_id"]].merge(scores, on="row_id", how="left")
    ordered.to_csv("submission_qwen3.csv", index=False)


if __name__ == "__main__":
    generate_submission()

In [None]:
# Run the semantic-search scorer; it writes submission_qwen3.csv to the working directory.
!python semantic.py

In [None]:
%%writefile triplet.py
#!/usr/bin/env python3

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from sentence_transformers.losses import TripletLoss
from sklearn.metrics.pairwise import cosine_similarity  # (원본 유지)
import re
from urllib.parse import urlparse
import faiss  # (원본 유지)
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Advanced clustering
from sklearn.cluster import AgglomerativeClustering
from umap import UMAP

# -----------------------------
# Helpers (same as the original)
# -----------------------------
def cleaner(text):
    """Replace every http(s) URL in ``text`` with a short ``<url>: (...)`` tag.

    The tag keeps the lower-cased domain (without a leading ``www.``) and at
    most the first two path segments, e.g.
    ``https://www.example.com/a/b/c`` -> ``<url>: (example.com/a/b)``.
    URLs that ``urlparse`` rejects become ``<url>: (unknown)``.

    Falsy input (``None``, ``""``) is returned unchanged; any other input is
    converted with ``str()`` first.
    """
    if not text:
        return text
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'

    def replace_url(match):
        url = match.group(0)
        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse raises ValueError for malformed network locations.
            # Only that is treated as "unknown", so unrelated errors and
            # KeyboardInterrupt are no longer swallowed.
            return "<url>: (unknown)"

        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        path_parts = [part for part in parsed.path.split('/') if part]
        if not path_parts:
            return f"<url>: ({domain})"
        important_path = '/'.join(path_parts[:2])
        return f"<url>: ({domain}/{important_path})"

    return re.sub(url_pattern, replace_url, str(text))


def load_test_data():
    """Read the competition test.csv and report its row count and number of distinct rules."""
    print("Loading test data...")
    frame = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
    for message in (
        lambda: f"Loaded {len(frame)} test examples",
        lambda: f"Unique rules: {frame['rule'].nunique()}",
    ):
        # Each message is built lazily, right before it is printed.
        print(message())
    return frame


def collect_all_texts(test_df):
    """Return the distinct cleaned texts found in the bodies and the four example columns.

    Missing values are skipped; every remaining value is stringified and
    passed through ``cleaner`` before de-duplication.
    """
    print("\nCollecting all texts for embedding...")
    unique_texts = {cleaner(str(body)) for body in test_df['body'] if pd.notna(body)}
    for column in ('positive_example_1', 'positive_example_2',
                   'negative_example_1', 'negative_example_2'):
        unique_texts.update(
            cleaner(str(value)) for value in test_df[column] if pd.notna(value)
        )
    texts = list(unique_texts)
    print(f"Collected {len(texts)} unique texts")
    return texts


def generate_embeddings(texts, model, batch_size=64):
    """Encode ``texts`` with ``model`` and return normalized, non-tensor embeddings."""
    print(f"Generating embeddings for {len(texts)} texts...")
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_tensor": False,
        "normalize_embeddings": True,
    }
    return model.encode(sentences=texts, **encode_options)


def create_test_triplet_dataset(test_df, augmentation_factor=2, random_seed=42, subsample_fraction=1.0):
    """Create a triplet dataset (anchor, positive, negative) from the test examples.

    The anchor is the cleaned rule text. Roles are inverted relative to the
    column names: ``negative_example_*`` (compliant comments) become the
    triplet *positive* and ``positive_example_*`` (violating comments) become
    the triplet *negative*.

    Args:
        test_df: frame with ``rule`` and the four example columns.
        augmentation_factor: when > 0, adds up to ``augmentation_factor *
            len(pool)`` extra random pairs per rule, drawn from that rule's
            de-duplicated example pools.
        random_seed: seed for ``random`` and ``numpy.random``.
        subsample_fraction: keep only this fraction of the shuffled triplets
            when < 1.0.

    Returns:
        ``datasets.Dataset`` with columns ``anchor``, ``positive``, ``negative``.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    compliant_cols = ['negative_example_1', 'negative_example_2']  # -> triplet positive
    violating_cols = ['positive_example_1', 'positive_example_2']  # -> triplet negative

    def cleaned_examples(row, columns):
        # Cleaned, non-missing examples of one row, in column order.
        return [cleaner(str(row[col])) for col in columns if pd.notna(row[col])]

    anchors, positives, negatives = [], [], []
    print("Creating rule-aligned triplets from test data...")
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing test rows"):
        rule = cleaner(str(row['rule']))
        pos_examples = cleaned_examples(row, compliant_cols)
        neg_examples = cleaned_examples(row, violating_cols)
        for pos_ex in pos_examples:
            for neg_ex in neg_examples:
                anchors.append(rule)
                positives.append(pos_ex)
                negatives.append(neg_ex)

    if augmentation_factor > 0:
        print(f"Adding {augmentation_factor}x augmentation...")
        for rule in test_df['rule'].unique():
            rule_df = test_df[test_df['rule'] == rule]
            pos_pool, neg_pool = [], []
            for _, row in rule_df.iterrows():
                pos_pool.extend(cleaned_examples(row, compliant_cols))
                neg_pool.extend(cleaned_examples(row, violating_cols))

            # De-duplicate while keeping first-seen order. list(set(...)) ordered
            # the pools by string hash, which changes between interpreter runs
            # and made the seeded random.choice draws below irreproducible.
            pos_pool = list(dict.fromkeys(pos_pool))
            neg_pool = list(dict.fromkeys(neg_pool))
            if not pos_pool or not neg_pool:
                continue

            clean_rule = cleaner(str(rule))
            n_samples = min(augmentation_factor * len(pos_pool), len(pos_pool) * len(neg_pool))
            for _ in range(n_samples):
                anchors.append(clean_rule)
                positives.append(random.choice(pos_pool))
                negatives.append(random.choice(neg_pool))

    combined = list(zip(anchors, positives, negatives))
    random.shuffle(combined)
    original_count = len(combined)
    if subsample_fraction < 1.0:
        n_samples = int(len(combined) * subsample_fraction)
        combined = combined[:n_samples]
        print(f"Subsampled {original_count} -> {len(combined)} triplets ({subsample_fraction*100:.1f}%)")

    # zip(*[]) cannot be unpacked into three names, hence the explicit empty fallback.
    anchors, positives, negatives = zip(*combined) if combined else ([], [], [])
    print(f"Created {len(anchors)} triplets from test data")
    dataset = Dataset.from_dict({'anchor': list(anchors), 'positive': list(positives), 'negative': list(negatives)})
    return dataset


def fine_tune_model(model, train_dataset, epochs=3, batch_size=32, learning_rate=2e-5, margin=0.25, output_dir="./models/test-finetuned-bge"):
    """Fine-tune ``model`` with triplet loss and save it under ``<output_dir>/final``.

    The number of optimizer steps is fixed explicitly as
    ``epochs * max(1, len(train_dataset) // batch_size)`` and passed as
    ``max_steps``.

    Returns:
        Tuple of the trained model and the path it was saved to.
    """
    n_triplets = len(train_dataset)
    print(f"Fine-tuning model on {n_triplets} triplets...")
    triplet_loss = TripletLoss(model=model, triplet_margin=margin)

    total_steps = epochs * max(1, n_triplets // batch_size)
    log_every = max(1, total_steps // 4)  # four log lines over the whole run

    training_args = SentenceTransformerTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        max_steps=total_steps,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=1,
        learning_rate=learning_rate,
        warmup_steps=0,
        max_grad_norm=1.0,
        fp16=True,  # kept from the original (assumes a GPU environment)
        gradient_checkpointing=True,
        dataloader_drop_last=False,
        logging_steps=log_every,
        save_strategy="epoch",
        save_total_limit=1,
        report_to="none",
    )
    SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        loss=triplet_loss,
    ).train()

    final_model_path = f"{output_dir}/final"
    print(f"Saving fine-tuned model to {final_model_path}...")
    model.save_pretrained(final_model_path)
    return model, final_model_path


def _build_mean_pooling_model(model_path, max_seq_length):
    """Wrap the transformer at ``model_path`` with explicit mean pooling."""
    word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length, do_lower_case=True)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])


def load_or_create_finetuned_model(test_df):
    """Load the fine-tuned model if it exists, otherwise fine-tune the base model on ``test_df``.

    An existing model at ``./models/test-finetuned-bge/final`` is reused.
    Otherwise the base BGE model is loaded, fine-tuned for one epoch on the
    triplets built from ``test_df`` and saved there. In both cases the
    returned model has been converted with ``.half()``.
    """
    fine_tuned_path = "./models/test-finetuned-bge/final"
    if os.path.exists(fine_tuned_path):
        print(f"Loading existing fine-tuned model from {fine_tuned_path}...")
        try:
            model = _build_mean_pooling_model(fine_tuned_path, max_seq_length=128)
            print("Loaded fine-tuned model with explicit pooling")
        except Exception as exc:
            # Fall back to the configuration stored with the model, but say why:
            # the previous bare `except:` hid the reason and also trapped
            # KeyboardInterrupt/SystemExit.
            print(f"Explicit pooling load failed ({exc!r}); using the saved configuration")
            model = SentenceTransformer(fine_tuned_path)
            print("Loaded fine-tuned model with default configuration")
        model.half()  # same as the original: run the model in half precision
        return model

    print("Fine-tuned model not found. Creating new one...")
    print("Loading base BGE embedding model...")
    try:
        model_path = "/kaggle/input/baai/transformers/bge-base-en-v1.5/1"
        base_model = _build_mean_pooling_model(model_path, max_seq_length=256)
        print("Loaded base model from Kaggle path with explicit pooling")
    except Exception as exc:
        print(f"Could not load base model from {model_path} ({exc!r}); trying the fallback path")
        # NOTE(review): the fallback path is empty (the original comment mentions
        # BAAI/bge-small-en-v1.5), so this branch will presumably fail as well
        # until a real path is filled in.
        model_path = ""  # BAAI/bge-small-en-v1.5
        base_model = _build_mean_pooling_model(model_path, max_seq_length=256)
        print("Loaded base model from local path with explicit pooling")

    triplet_dataset = create_test_triplet_dataset(test_df, augmentation_factor=2, subsample_fraction=1.)
    fine_tuned_model, model_path = fine_tune_model(
        model=base_model,
        train_dataset=triplet_dataset,
        epochs=1,
        batch_size=16,
        learning_rate=2e-5,
        margin=0.25
    )
    print(f"Fine-tuning completed. Model saved to: {model_path}")
    fine_tuned_model.half()  # same as the original
    return fine_tuned_model


def generate_rule_embeddings(test_df, model):
    """Return ``{rule: embedding}`` for every distinct rule, keyed by the raw rule text.

    Each rule is cleaned before encoding; embeddings are normalized and
    returned as non-tensor values.
    """
    print("Generating rule embeddings...")
    rule_embeddings = {
        rule: model.encode(cleaner(str(rule)), convert_to_tensor=False, normalize_embeddings=True)
        for rule in test_df['rule'].unique()
    }
    print(f"Generated embeddings for {len(rule_embeddings)} rules")
    return rule_embeddings


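# -----------------------------
# Key fix: avoid the UMAP crash (minimal change)
#   - Keep the original condition (sample count > 10 and > 32)
#   - When UMAP is actually called, only clamp n_components to min(32, N-2)
#   - Keep the spectral init (default) to stay as close to the original as possible
# -----------------------------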
def _unit_mean(vectors):
    """Mean of the row vectors, rescaled to unit L2 norm."""
    centroid = vectors.mean(axis=0)
    return centroid / np.linalg.norm(centroid)


def _cluster_centroids(embeddings, reduced, max_clusters=3):
    """Cluster on ``reduced`` and return unit-norm centroids computed from ``embeddings``.

    With a single sample there is nothing to cluster, so the one overall
    centroid is returned.
    """
    n_clusters = min(max_clusters, len(embeddings))
    if n_clusters <= 1:
        return [_unit_mean(embeddings)]
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(reduced)
    return [_unit_mean(embeddings[labels == cluster_id]) for cluster_id in np.unique(labels)]


def create_rule_centroids_with_hierarchical_clustering(test_df, text_to_embedding, rule_embeddings):
    """Build up to three positive and three negative centroids per rule.

    For each rule, the embeddings of its positive and negative examples are
    looked up in ``text_to_embedding`` (examples without an embedding are
    skipped). Cluster labels come from agglomerative clustering, run on a UMAP
    projection when there are more than 32 samples and on the raw embeddings
    otherwise. Centroids are always averaged in the original embedding space
    and normalized. Rules lacking positive or negative embeddings are omitted.

    Returns:
        ``{rule: {'positive_centroids', 'negative_centroids', 'pos_count',
        'neg_count', 'rule_embedding'}}``.
    """
    print(f"\nCreating rule centroids with Hierarchical Clustering + UMAP...")
    base_umap_components = 32

    def embeddings_for(rule_rows, columns):
        # Embeddings of all non-missing examples in `columns`, row by row.
        found = []
        for _, row in rule_rows.iterrows():
            for col in columns:
                if pd.notna(row[col]):
                    key = cleaner(str(row[col]))
                    if key in text_to_embedding:
                        found.append(text_to_embedding[key])
        return found

    def maybe_umap(X):
        # Original condition kept: reduce only when n > 10 and n > 32.
        n = X.shape[0]
        if not (n > 10 and n > base_umap_components):
            return X
        # Clamp n_components to N-2 right before the call to avoid the k >= N error.
        n_components_safe = min(base_umap_components, max(2, n - 2))
        reducer = UMAP(n_components=n_components_safe, random_state=42)  # init='spectral' (default)
        return reducer.fit_transform(X)

    rule_centroids = {}
    for rule in test_df['rule'].unique():
        rule_data = test_df[test_df['rule'] == rule]
        pos_embeddings = embeddings_for(rule_data, ['positive_example_1', 'positive_example_2'])
        neg_embeddings = embeddings_for(rule_data, ['negative_example_1', 'negative_example_2'])
        if not (pos_embeddings and neg_embeddings):
            continue

        pos_embeddings = np.array(pos_embeddings)
        neg_embeddings = np.array(neg_embeddings)

        pos_reduced = maybe_umap(pos_embeddings)
        neg_reduced = maybe_umap(neg_embeddings)

        # Agglomerative clustering (same as the original).
        pos_centroids = _cluster_centroids(pos_embeddings, pos_reduced)
        neg_centroids = _cluster_centroids(neg_embeddings, neg_reduced)

        rule_centroids[rule] = {
            'positive_centroids': pos_centroids,
            'negative_centroids': neg_centroids,
            'pos_count': len(pos_embeddings),
            'neg_count': len(neg_embeddings),
            'rule_embedding': rule_embeddings[rule]
        }

        print(f"  Rule: {rule[:50]}... - Pos: {len(pos_embeddings)}, Neg: {len(neg_embeddings)} - Clusters: Pos={len(pos_centroids)}, Neg={len(neg_centroids)}")

    print(f"Created hierarchical centroids for {len(rule_centroids)} rules")
    return rule_centroids


def predict_test_set_with_hierarchical_clustering(test_df, text_to_embedding, rule_centroids):
    """Score test rows by their distance to each rule's positive/negative centroids.

    For every row the score is ``nearest negative distance - nearest positive
    distance`` (Euclidean), so a larger value means the comment embedding lies
    closer to a positive centroid than to any negative one.

    Args:
        test_df: Frame providing the ``rule``, ``body`` and ``row_id`` columns.
        text_to_embedding: Mapping from cleaned body text to its embedding.
        rule_centroids: Mapping from rule text to a dict holding the
            ``positive_centroids`` and ``negative_centroids`` lists.

    Returns:
        ``(row_ids, predictions)`` where ``row_ids`` is a list and
        ``predictions`` is a NumPy array of the same length. Rows whose rule has
        no centroids, or whose cleaned body has no embedding, are left out.
    """

    def _nearest(vector, centroids):
        # Distance to the closest centroid; 1.0 stands in when there are none.
        return min((np.linalg.norm(vector - c) for c in centroids), default=1.0)

    print("\nMaking predictions on test set with Hierarchical Clustering centroids...")
    scored_ids, scores = [], []

    for rule in test_df['rule'].unique():
        print(f"  Processing rule: {rule[:50]}...")
        if rule not in rule_centroids:
            continue

        centroid_sets = rule_centroids[rule]
        positives = centroid_sets['positive_centroids']
        negatives = centroid_sets['negative_centroids']

        rows_for_rule = test_df[test_df['rule'] == rule]
        for _, record in rows_for_rule.iterrows():
            cleaned_body = cleaner(str(record['body']))
            current_id = record['row_id']
            if cleaned_body not in text_to_embedding:
                continue
            vector = text_to_embedding[cleaned_body]
            nearest_pos = _nearest(vector, positives)
            nearest_neg = _nearest(vector, negatives)
            scored_ids.append(current_id)
            scores.append(nearest_neg - nearest_pos)

    print(f"Made predictions for {len(scores)} test examples")
    return scored_ids, np.array(scores)


def main():
    """Run the triplet-embedding pipeline and write ``Triplet_submission.csv``.

    Steps, in order: load the test data, load or fine-tune the embedding model,
    embed every collected text and every rule, build per-rule clustered
    centroids, score the test rows against them, and save the scores.

    The output file is written to the current working directory. Rows that the
    prediction step could not score are absent from it, so the final summary
    reports how many rows were actually scored next to the size of the test set.
    """
    print("="*70)
    print("IMPROVED SIMILARITY CLASSIFIER - HIERARCHICAL CLUSTERING + UMAP (same results as your script)")
    print("="*70)

    test_df = load_test_data()

    print("\n" + "="*50)
    print("MODEL PREPARATION PHASE")
    print("="*50)
    model = load_or_create_finetuned_model(test_df)

    all_texts = collect_all_texts(test_df)

    print("\n" + "="*50)
    print("EMBEDDING GENERATION PHASE")
    print("="*50)
    all_embeddings = generate_embeddings(all_texts, model)

    text_to_embedding = dict(zip(all_texts, all_embeddings))

    rule_embeddings = generate_rule_embeddings(test_df, model)

    # Same logic as the original script; only the UMAP crash is avoided.
    rule_centroids = create_rule_centroids_with_hierarchical_clustering(test_df, text_to_embedding, rule_embeddings)

    print("\n" + "="*50)
    print("PREDICTION PHASE")
    print("="*50)
    row_ids, predictions = predict_test_set_with_hierarchical_clustering(test_df, text_to_embedding, rule_centroids)

    submission_df = pd.DataFrame({'row_id': row_ids, 'rule_violation': predictions})
    submission_df.to_csv('Triplet_submission.csv', index=False)

    # Only Triplet_submission.csv is written here, so the message names just that file.
    print(f"\nSaved predictions for {len(submission_df)} test examples to Triplet_submission.csv")

    print(f"\n{'='*70}")
    print("HIERARCHICAL CLUSTERING + UMAP INFERENCE COMPLETED")
    print("Model: Fine-tuned BGE on test data triplets")
    print("Method: Hierarchical clustering + UMAP dimensionality reduction")
    # Report the number of rows actually scored; skipped rows make it differ from len(test_df).
    print(f"Predicted on {len(submission_df)} of {len(test_df)} test examples")
    if predictions.size:
        print(f"Prediction stats: min={predictions.min():.4f}, max={predictions.max():.4f}, mean={predictions.mean():.4f}")
    else:
        # min()/max() raise on a zero-size array, so skip the stats instead of crashing.
        print("Prediction stats: unavailable (no predictions were produced)")
    print(f"{'='*70}")


# Run the pipeline only when this file is executed as a script (e.g. `python triplet.py`).
if __name__ == "__main__":
    main()


In [None]:
%%writefile deberta.py
# ============================================================
# DeBERTa Multi-Seed Train + Rank-Average Ensemble (Single File)
# Run with:  !python deberta.py
# → writes /kaggle/working/deberta_submission.csv by default
# ============================================================

import os
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")

import re
import argparse
import random
import numpy as np
import pandas as pd
import torch

from typing import List
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# =========================
# URL semantics extraction
# =========================
# Matches http(s) URLs up to the next whitespace character.
URL_PATTERN = re.compile(r'https?://[^\s/$.?#].[^\s]*', re.IGNORECASE)
# Captures the host labels that precede the final dot-separated suffix.
DOMAIN_PATTERN = re.compile(r"(?:https?://)?([a-z0-9\-\.]+)\.[a-z]{2,}", re.IGNORECASE)
# Strips common page extensions, fragments and query strings from a path part.
CLEAN_PATH_PART_PATTERN = re.compile(r"\.(html?|php|asp|jsp)$|#.*|\?.*", re.IGNORECASE)

def url_to_semantics(text: str) -> str:
    """Turn the URLs found in *text* into a short keyword suffix.

    Each URL is lower-cased; the dot-separated labels captured by
    ``DOMAIN_PATTERN`` become ``domain:<label>`` keywords and the alphanumeric
    pieces of the remaining path become ``path:<piece>`` keywords. Only tokens
    longer than three characters are kept, and a token is emitted once across
    all URLs and across both the domain and path categories.

    Returns:
        ``"\\nURL Keywords: ..."`` with space-separated keywords, or ``""`` when
        *text* is not a string, contains no URL, or yields no keyword.
    """
    if not isinstance(text, str):
        return ""

    keywords, seen = [], set()

    def _remember(prefix, token):
        # Keep a token only if it is long enough and has not been emitted yet.
        if token and len(token) > 3 and token not in seen:
            keywords.append(f"{prefix}:{token}")
            seen.add(token)

    for raw_url in URL_PATTERN.findall(text):
        lowered = raw_url.lower()

        domain_match = DOMAIN_PATTERN.search(lowered)
        if domain_match is not None:
            for label in domain_match.group(1).split('.'):
                _remember("domain", label)

        # Drop the scheme and host so that only the path remains.
        remainder = re.sub(r"^(?:https?://)?[a-z0-9\.-]+\.[a-z]{2,}/?", "", lowered)
        for piece in re.split(r'[/_.-]+', remainder):
            if not piece or not piece.isalnum():
                continue
            # NOTE(review): pieces reaching this point are purely alphanumeric, so this
            # substitution looks unable to match anything - confirm the intended order.
            _remember("path", CLEAN_PATH_PART_PATTERN.sub("", piece))

    if not keywords:
        return ""
    return "\nURL Keywords: " + " ".join(keywords)

# =========================
# Data assembly
# =========================
def _extract_violation_examples(df: pd.DataFrame) -> List[pd.DataFrame]:
    out = []
    for k in ("positive", "negative"):
        label = 1 if k == "positive" else 0
        for i in (1, 2):
            col = f"{k}_example_{i}"
            if col in df.columns:
                sub = df[[col, "rule", "subreddit"]].copy()
                sub.rename(columns={col: "body"}, inplace=True)
                sub["rule_violation"] = label
                sub.dropna(subset=["body"], inplace=True)
                sub = sub[sub["body"].str.strip().str.len() > 0]
                if not sub.empty:
                    out.append(sub)
    return out

def get_dataframe_to_train(data_path: str, seed: int = 42) -> pd.DataFrame:
    """Assemble the shuffled training frame from ``train.csv`` and ``test.csv``.

    The result combines the labelled train rows (when all of ``body``, ``rule``,
    ``subreddit`` and ``rule_violation`` are present) with the positive/negative
    example columns of both files. Duplicates are removed first on
    ``(body, rule, subreddit)`` and then on ``(body, rule)``, keeping the first
    occurrence, before the rows are shuffled with *seed*.

    Args:
        data_path: Directory containing ``train.csv`` and ``test.csv``.
        seed: Random state used for the shuffle.

    Returns:
        A frame with a fresh 0..n-1 index, or an empty frame with the four
        expected columns when nothing could be collected.
    """
    wanted = ["body","rule","subreddit","rule_violation"]
    train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
    test_df = pd.read_csv(os.path.join(data_path, "test.csv"))

    pieces = []
    if set(wanted).issubset(train_df.columns):
        pieces.append(train_df[wanted].copy())
    pieces += _extract_violation_examples(train_df)
    pieces += _extract_violation_examples(test_df)

    if not pieces:
        return pd.DataFrame(columns=wanted)

    merged = pd.concat(pieces, ignore_index=True)
    merged = merged.drop_duplicates(subset=["body","rule","subreddit"])
    merged = merged.drop_duplicates(subset=["body","rule"], keep="first")
    shuffled = merged.sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)

# =========================
# Reproducibility
# =========================
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also stores *seed* in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every generator receives the same seed value.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# =========================
# Dataset
# =========================
class JigsawDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    ``encodings`` is a mapping of field name to a per-example sequence and must
    contain ``"input_ids"``, which defines the dataset length. ``labels`` is an
    optional per-example sequence; when given, each item also carries a
    ``"labels"`` tensor.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        if self.labels is None:
            return sample
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# =========================
# Build inputs
# =========================
def add_url_semantics_column(series_body: pd.Series) -> pd.Series:
    """Append URL keywords to every body; non-string bodies become ``""`` first."""

    def _augment(value):
        text = value if isinstance(value, str) else ""
        return text + url_to_semantics(value)

    return series_body.apply(_augment)

def build_input_text(df: pd.DataFrame) -> pd.Series:
    """Join ``rule`` and ``body_with_url`` (both cast to str) with a literal ``[SEP]``."""
    rule_text = df["rule"].astype(str)
    body_text = df["body_with_url"].astype(str)
    return rule_text + "[SEP]" + body_text

def prepare_dataset(df: pd.DataFrame, tokenizer, max_length: int, is_train: bool) -> JigsawDataset:
    """Tokenize ``df["input_text"]`` and wrap the encodings in a ``JigsawDataset``.

    Texts are truncated to *max_length* and padded by the tokenizer. Labels are
    taken from ``df["rule_violation"]`` only when *is_train* is true.
    """
    texts = df["input_text"].tolist()
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    labels = df["rule_violation"].tolist() if is_train else None
    return JigsawDataset(encodings, labels)

# =========================
# Train & Predict (one seed)
# =========================
def train_and_predict_one_seed(
    seed: int,
    model_name_or_path: str,
    data_path: str,
    output_dir: str,
    epochs: int = 2,
    lr: float = 2e-5,
    max_length: int = 512,
    batch_size: int = 8,
    save_each_submission: bool = True
) -> pd.DataFrame:
    """Fine-tune a two-class sequence classifier for one seed and score ``test.csv``.

    Args:
        seed: Seed applied to all RNGs and to the training-data shuffle.
        model_name_or_path: Checkpoint passed to the tokenizer and model loaders.
        data_path: Directory containing ``train.csv`` and ``test.csv``.
        output_dir: Parent directory; the trainer writes to ``<output_dir>/s<seed>``.
        epochs: Number of training epochs.
        lr: Learning rate.
        max_length: Maximum token length used when tokenizing.
        batch_size: Per-device training batch size.
        save_each_submission: When true, also write
            ``/kaggle/working/submission_s<seed>.csv``.

    Returns:
        A frame with ``row_id`` and ``rule_violation``, the latter being the
        softmax probability of class 1 for each test row.
    """
    seed_everything(seed)

    def _with_model_inputs(frame):
        # Adds the URL-augmented body and the final "rule[SEP]body" model input.
        frame["body_with_url"] = add_url_semantics_column(frame["body"])
        frame["input_text"] = build_input_text(frame)
        return frame

    train_frame = _with_model_inputs(get_dataframe_to_train(data_path, seed=seed).copy())

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    train_set = prepare_dataset(train_frame, tokenizer, max_length, is_train=True)

    classifier = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=2)
    run_config = dict(
        output_dir=os.path.join(output_dir, f"s{seed}"),
        num_train_epochs=epochs,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        warmup_ratio=0.1,
        weight_decay=0.01,
        report_to="none",
        save_strategy="no",
        logging_steps=10,
        dataloader_num_workers=2,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    )
    trainer = Trainer(
        model=classifier,
        args=TrainingArguments(**run_config),
        train_dataset=train_set,
    )
    trainer.train()

    test_frame = _with_model_inputs(pd.read_csv(os.path.join(data_path, "test.csv")).copy())
    test_set = prepare_dataset(test_frame, tokenizer, max_length, is_train=False)

    logits = torch.tensor(trainer.predict(test_set).predictions)
    # Column 1 of the softmax is used as the violation probability.
    positive_prob = torch.nn.functional.softmax(logits, dim=1)[:, 1].numpy()

    submission = pd.DataFrame({"row_id": test_frame["row_id"].values, "rule_violation": positive_prob})
    if save_each_submission:
        per_seed_path = os.path.join("/kaggle/working", f"submission_s{seed}.csv")
        submission.to_csv(per_seed_path, index=False)
    return submission

# =========================
# Rank-average ensemble
# =========================
def rank_average_ensemble(subs: List[pd.DataFrame]) -> pd.DataFrame:
    """Blend several submissions by averaging their normalized ranks.

    The submissions are inner-joined on ``row_id`` (duplicate ids dropped per
    frame). Within each submission the scores are coerced to numbers, missing
    values are filled with the column median, and average ranks are divided by
    ``n + 1`` where ``n`` is the number of joined rows. The final score is the
    mean of those normalized ranks.

    Args:
        subs: Frames that each provide ``row_id`` and ``rule_violation``.

    Returns:
        A frame with ``row_id`` and ``rule_violation``, sorted by ``row_id``
        with a fresh index.
    """
    merged = subs[0][["row_id"]].drop_duplicates().copy()
    seed_cols = [f"s{position}" for position in range(len(subs))]

    for col, frame in zip(seed_cols, subs):
        scores = frame[["row_id","rule_violation"]].drop_duplicates("row_id")
        scores = scores.rename(columns={"rule_violation": col})
        merged = merged.merge(scores, on="row_id", how="inner")

    # Ranks are normalized by the row count of the fully joined frame.
    denominator = len(merged) + 1
    for col in seed_cols:
        numeric = pd.to_numeric(merged[col], errors="coerce").fillna(merged[col].median())
        merged[col] = numeric.rank(method="average") / denominator

    merged["rule_violation"] = merged[seed_cols].mean(axis=1)
    result = merged[["row_id","rule_violation"]].sort_values("row_id")
    return result.reset_index(drop=True)

# =========================
# Defaults & CLI
# =========================
def resolve_model_path(user_path: str) -> str:
    """Return the first existing path among *user_path* and the known fallbacks.

    The candidates are checked in order; when none exists, *user_path* is
    returned unchanged.
    """
    candidates = (
        user_path,
        "/kaggle/input/debertav3base",
        "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base",
        "/kaggle/input/deberta-v3-small/deberta-v3-small",
    )
    existing = (path for path in candidates if path and os.path.exists(path))
    return next(existing, user_path)

def parse_args():
    """Parse the command-line options of the DeBERTa ensemble script.

    Every option has a default, so the parser also succeeds when no arguments
    are passed. Returns the populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description="DeBERTa multi-seed training + rank-average ensemble (single file).",
        add_help=True,
    )
    # Options are registered in this order, which is also the order shown by --help.
    option_specs = [
        ("--seeds", dict(type=str, default="42,43,44",
                         help="Comma-separated seeds (default: 42,43,44)")),
        ("--model-path", dict(type=str, default="/kaggle/input/debertav3base",
                              help="Local model path (offline).")),
        ("--data-path", dict(type=str, default="/kaggle/input/jigsaw-agile-community-rules/",
                             help="Path containing train.csv and test.csv")),
        ("--output-dir", dict(type=str, default="./dbv3_base_ens_model",
                              help="Directory for trainer outputs")),
        ("--epochs", dict(type=int, default=2)),
        ("--batch-size", dict(type=int, default=8)),
        ("--max-length", dict(type=int, default=512)),
        ("--lr", dict(type=float, default=2e-5)),
        ("--final-outfile", dict(type=str, default="/kaggle/working/deberta_submission.csv",
                                 help="Final ensemble path")),
        ("--skip-save-per-seed", dict(action="store_true")),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()

def main():
    """Train one model per seed, rank-average the predictions and save the result.

    Seeds come from the comma-separated ``--seeds`` option; tokens that are not
    plain non-negative integers are ignored with a printed warning.

    Raises:
        ValueError: If ``--seeds`` contains no usable seed. Without this check
            the run would fail later with an opaque ``IndexError`` inside the
            ensemble step.
    """
    args = parse_args()  # all defaults chosen so `!python deberta.py` just works

    tokens = [tok.strip() for tok in args.seeds.split(",")]
    seeds = [int(tok) for tok in tokens if tok.isdigit()]
    ignored = [tok for tok in tokens if tok and not tok.isdigit()]
    if ignored:
        print(f"[WARN] Ignoring invalid seed value(s): {ignored}")
    if not seeds:
        raise ValueError(
            f"--seeds must contain at least one non-negative integer, got {args.seeds!r}"
        )

    model_path = resolve_model_path(args.model_path)
    os.makedirs(args.output_dir, exist_ok=True)

    # Create the destination directory up front so a bad path fails before training.
    final_dir = os.path.dirname(args.final_outfile)
    if final_dir:
        os.makedirs(final_dir, exist_ok=True)

    subs = [
        train_and_predict_one_seed(
            seed=seed,
            model_name_or_path=model_path,
            data_path=args.data_path,
            output_dir=args.output_dir,
            epochs=args.epochs,
            lr=args.lr,
            max_length=args.max_length,
            batch_size=args.batch_size,
            save_each_submission=not args.skip_save_per_seed,
        )
        for seed in seeds
    ]

    final_sub = rank_average_ensemble(subs)
    final_sub.to_csv(args.final_outfile, index=False)
    print(final_sub.head())
    print(f"[OK] Wrote final ensemble: {args.final_outfile} (rows={len(final_sub)})")

# Run the multi-seed training and ensemble only when executed as a script (e.g. `python deberta.py`).
if __name__ == "__main__":
    main()


In [None]:
!python triplet.py
%env RUN_MODE=train
!python deberta.py

In [None]:
import pandas as pd
import numpy as np

# ---------- 1) Read the files & unify the key names ----------
# Each submission keeps only row_id plus its score, renamed to a short per-model column.
deb = pd.read_csv("deberta_submission.csv")[["row_id", "rule_violation"]].rename(columns={"rule_violation":"deb"})
tri = pd.read_csv("Triplet_submission.csv")[["row_id", "rule_violation"]].rename(columns={"rule_violation":"tri"})
q   = pd.read_csv("submission_qwen.csv")[["row_id", "rule_violation"]].rename(columns={"rule_violation":"q"})
q3  = pd.read_csv("submission_qwen3.csv")[["row_id", "rule_violation"]].rename(columns={"rule_violation":"q3"})

# Intersection merge (inner join on row_id so that all models are aligned row by row).
# NOTE(review): an inner join keeps only the row_ids present in all four files, so a
# row missing from any single submission is absent from the blend - confirm every
# file covers the full test set.
df = deb.merge(tri, on="row_id", how="inner").merge(q, on="row_id", how="inner").merge(q3, on="row_id", how="inner")
models = ["deb", "tri", "q", "q3"]

# ---------- 2) Rank normalization into [0, 1] ----------
def to_rank01(s: pd.Series) -> pd.Series:
    """Return the average ranks of *s* divided by ``len(s) + 1``.

    Values are coerced to numbers first; entries that cannot be converted
    become NaN and stay NaN in the result.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    denominator = len(numeric) + 1
    ranks = numeric.rank(method="average")
    return ranks / denominator

# Rank-normalize every model's scores and remember the resulting column names.
rcols = []
for m in models:
    df[f"{m}_r"] = to_rank01(df[m])
    rcols.append(f"{m}_r")

# ---------- 3) Variance of each model around the ensemble mean ----------
rmean = df[rcols].mean(axis=1)
variances = {m: float(((df[f"{m}_r"] - rmean) ** 2).mean()) for m in models}

# ---------- 4) Inverse-variance weights + PRIOR (favoring q and tri) ----------
EPS = 1e-12  # keeps the division finite when a variance is exactly zero
base_w = {m: 1.0 / (variances[m] + EPS) for m in models}

# Desired priorities (fine-tune if needed): q highest, then tri.
# NOTE(review): the values below give deb (1.4) a larger prior than tri (1.3),
# which does not match the ordering stated above - confirm which is intended.
PRIORS = {
    "deb": 1.4,
    "tri": 1.3,  # boost for the triplet model
    "q"  : 1.6,  # strongest boost, for qwen
    "q3" : 0.5,  # auxiliary model
}

raw_w = {m: base_w[m] * PRIORS[m] for m in models}
wsum = sum(raw_w.values())
w = {m: raw_w[m] / wsum for m in models}  # normalized so the weights sum to 1

print("Model variances (lower is better):", variances)
print("Final weights:", w)

# ---------- 5) Weighted rank blending ----------
# Accumulate in model order, starting from 0.
blend = 0
for m in models:
    blend = blend + w[m] * df[f"{m}_r"]
df["rule_violation"] = blend

# ---------- 6) Save ----------
out = df[["row_id", "rule_violation"]].sort_values("row_id").reset_index(drop=True)
out.to_csv("/kaggle/working/submission.csv", index=False)
print(f"Saved -> /kaggle/working/submission.csv  (rows={len(out)})")

# For reference: correlation between the models after rank scaling.
print("Rank corr:\n", df[rcols].corr())
