Credit to Muhammad Danyal Javed for the initial notebook.

In [1]:
# !pip install vllm
# !pip install logits-processor-zoo==0.1.10
# !pip install triton==3.2.0
# !pip install clean-text

## 1. Qwen2.5 32B GPTQ Int4

In [2]:
! mkdir -p /tmp/src

In [None]:
%%writefile /tmp/src/infer_qwen.py

import os
import pandas as pd
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import torch
import vllm
import numpy as np
from vllm.lora.request import LoRARequest
import argparse
from scipy.special import softmax
df = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv")

MODEL_NAME = "/kaggle/input/qwen2-5-32b-instruct-gptq-int4"
LORA_PATH = "/kaggle/input/qwen2-5-32b-gptq-int4-batch4-full"
if __name__=='__main__':
    # Optimization: Disable v1 to force vLLM's optimized execution path
    os.environ["VLLM_USE_V1"] = "0"

    # Initialize vLLM engine
    # Using GPTQ quantization and half-precision (dtype="half") to fit the 
    # 32B parameter model within the GPU memory constraints (T4/P100).
    llm = vllm.LLM(
        MODEL_NAME,
        quantization = "gptq",
        tensor_parallel_size = torch.cuda.device_count(),
        gpu_memory_utilization = 0.95, # Aggressive memory usage for max context
        trust_remote_code = True,
        dtype = "half",
        enforce_eager = True,
        max_model_len = 4096,
        disable_log_stats = True,
        enable_prefix_caching = True, # Speed up processing for shared system prompts
        enable_lora = True,
    )
    tokenizer = llm.get_tokenizer()
    SYS_PROMPT = """
    You are given a comment on reddit. Your task is to classify if it violates the given rule. Only respond Yes/No.
    """
    
    # Prompt Construction: 
    # We use a 4-shot prompting strategy (2 positive, 2 negative examples) 
    # derived dynamically from the dataframe rows to ground the model's reasoning.
    prompts = []
    for i, row in df.iterrows():
        text = f"""
    r/{row.subreddit}
    Rule: {row.rule}
    
    1) {row.positive_example_1}
    Violation: Yes
    
    2) {row.positive_example_2}
    Violation: Yes
    
    3) {row.negative_example_1}
    Violation: No
    
    4) {row.negative_example_2}
    Violation: No
    
    5) {row.body}
    """
        
        messages = [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": text}
        ]
    
        # Apply standard chat template to ensure special tokens (<|im_start|>, etc.) are correct
        prompt = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt = True,
            tokenize = False,
        ) + "Answer:"
        prompts.append(prompt)
    
    df["prompt"] = prompts
    
    # Critical: Restrict the output space to only "Yes" or "No" tokens.
    # This prevents the model from generating hallucinated explanations or synonyms.
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices = ["Yes", "No"])

    outputs = llm.generate(
        prompts,
        vllm.SamplingParams(
            skip_special_tokens = True,
            max_tokens = 1, # Just need the first token classification
            logits_processors = [mclp],
            logprobs = 2, # Return the log probabilities for ensemble weighting
        ),
        use_tqdm = True,
        lora_request = LoRARequest("default", 1, LORA_PATH)
    )
    logprobs = [
        {lp.decoded_token: lp.logprob for lp in out.outputs[0].logprobs[0].values()}
        for out in outputs
    ]
    logit_matrix = pd.DataFrame(logprobs)[["Yes", "No"]]
    df = pd.concat([df, logit_matrix], axis = 1)
    
    # Normalize logprobs to probabilities using Softmax for consistent ensemble averaging
    df[["Yes", "No"]] = df[["Yes", "No"]].apply(lambda x: softmax(x.values), axis = 1, result_type = "expand")
    df["pred"] = df["Yes"]
    df["rule_violation"] = df["pred"]
    df[["row_id", "rule_violation"]].to_csv("submission_qwen.csv",index = False)
    pd.read_csv("submission_qwen.csv") # Verification step (optional)

Writing /tmp/src/infer_qwen.py


In [4]:
%cd /tmp
!python src/infer_qwen.py

/tmp
2025-08-18 19:27:41.847895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755545262.080186      68 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755545262.147420      68 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 08-18 19:27:55 [__init__.py:235] Automatically detected platform cuda.
INFO 08-18 19:28:11 [config.py:1604] Using max model len 4096
INFO 08-18 19:28:13 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='/kaggle/input/qwen2-5-32b-instruct-gptq-int4', speculative_config=None, tokenizer='/kaggle/input/qwen2-5-32b-instruct-gptq-int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=Non

## 2. Qwen3 8b Embedding

In [None]:
%%writefile constants.py
# Model Configuration
# Use the 0.6B parameter model for low-latency embedding generation
EMBDEDDING_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules"

# Instruction Tuning:
# Critical for this model architecture. Asymmetric retrieval requires specific 
# "Instruct" prefixes to differentiate between the query (Need) and the passage (Solution).
EMBEDDING_MODEL_QUERY = (
    "Instruct: Given a Reddit comment and the rule context, "
    "retrieve passages that indicate violation (positives) or compliance (negatives).\nQuery:"
)

EMBEDDING_MODEL_PASSAGE = (
    "Instruct: You are a passage for retrieval given a Reddit rule and a comment.\nPassage:"
)

# Pipeline Settings
CLEAN_TEXT = True
TOP_K = 300 # High recall window to maximize candidate capture before re-ranking
BATCH_SIZE = 64 # Optimized for T4 GPU VRAM limits

Writing constants.py


In [None]:
%%writefile utils.py
import pandas as pd
from datasets import Dataset
from cleantext import clean
from tqdm.auto import tqdm
from constants import CLEAN_TEXT


def build_prompt(row):
    """
    Constructs the input string for the embedding model.
    Structure: Subreddit -> Rule -> Comment
    """
    return (
        f"r/{row['subreddit']}\n"
        f"Rule: {row['rule']}\n"
        f"Comment: {row['body']}"
    )


def cleaner(text):
    """
    Normalization pipeline to reduce noise density in the embedding space.
    Removes PII (emails/phones) and standardizes formatting while preserving
    semantic content.
    """
    return clean(
        text,
        fix_unicode=True,
        to_ascii=True,
        lower=False,
        no_line_breaks=False,
        no_urls=True, # URLs are high-entropy noise for this task
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        lang="en",
    )


def get_dataframe_to_train(data_path):
    """
    Corpus Construction Strategy:
    We aggregate the official training data AND the few-shot examples provided 
    in the test set (positive/negative examples).
    
    This 'Corpus Expansion' maximizes the retrieval candidate pool, ensuring
    we can retrieve relevant precedents even if they appeared in the test prompts.
    """
    train_dataset = pd.read_csv(f"{data_path}/train.csv")
    test_dataset = pd.read_csv(f"{data_path}/test.csv")

    flatten = []
    # Add base training data
    flatten.append(train_dataset[["body", "rule", "subreddit", "rule_violation"]])
    
    # Extract and inject few-shot examples from the test set columns
    for violation_type in ["positive", "negative"]:
        for i in range(1, 3):
            sub_dataset = test_dataset[[f"{violation_type}_example_{i}", "rule", "subreddit"]].copy()
            sub_dataset = sub_dataset.rename(columns={f"{violation_type}_example_{i}": "body"})
            sub_dataset["rule_violation"] = 1 if violation_type == "positive" else 0
            flatten.append(sub_dataset)

    dataframe = pd.concat(flatten, axis=0)    
    dataframe = dataframe.drop_duplicates(ignore_index=True)
    return dataframe


def prepare_dataframe(dataframe):
    """
    Preprocessing pipeline:
    1. Construct prompts
    2. Clean text (optional)
    3. Remap labels for Cosine Similarity metric
    """
    dataframe["prompt"] = dataframe.apply(build_prompt, axis = 1)

    if CLEAN_TEXT:
        tqdm.pandas(desc = "cleaner")
        dataframe["prompt"] = dataframe["prompt"].progress_apply(cleaner)

    # Label Mapping Strategy:
    # We map 0 (Non-Violation) to -1. 
    # This aligns with Cosine Similarity, where we want non-violations to be
    # geometrically opposite (-1) to violations (+1).
    if "rule_violation" in dataframe.columns:
        dataframe["rule_violation"] = dataframe["rule_violation"].map(
            {
                1: 1,
                0: -1,
            }
        )

    return dataframe

Writing utils.py


In [None]:
%%writefile semantic.py
import pandas as pd

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score
from tqdm.auto import tqdm

from utils import get_dataframe_to_train, prepare_dataframe
from constants import (
    DATA_PATH, EMBDEDDING_MODEL_PATH,
    EMBEDDING_MODEL_QUERY, EMBEDDING_MODEL_PASSAGE,
    TOP_K, BATCH_SIZE
)


def get_scores(test_dataframe):
    """
    Executes the semantic retrieval pipeline.
    Strategy:
    1. Embed all queries (test rows) and documents (train corpus).
    2. Perform asymmetric vector search.
    3. Aggregate similarity scores weighted by ground-truth labels (+1/-1).
    """
    # Load and preprocess the knowledge base (Corpus)
    corpus_dataframe = get_dataframe_to_train(DATA_PATH)
    corpus_dataframe = prepare_dataframe(corpus_dataframe)
    
    # Initialize Vector DB Model
    # Loading to CUDA device for GPU-accelerated embedding generation
    embedding_model = SentenceTransformer(
        model_name_or_path = EMBDEDDING_MODEL_PATH,
        device = "cuda",
    )

    result = []

    # Partitioning Strategy:
    # Instead of a global search, we iterate by unique Rule.
    # This acts as a 'Hard Filter' before the vector search, ensuring we only
    # retrieve precedents relevant to the specific rule being adjudicated.
    # This significantly reduces search space noise and improves precision.
    for rule in tqdm(test_dataframe["rule"].unique(), desc = f"Generate scores for each rule"):
        # Slice data for the current partition
        test_dataframe_part = test_dataframe.query("rule == @rule").reset_index(drop = True)
        corpus_dataframe_part = corpus_dataframe.query("rule == @rule").reset_index(drop = True)
        corpus_dataframe_part = corpus_dataframe_part.reset_index(names="row_id")
        
        # Encode Queries (the incoming comments)
        # 'normalize_embeddings=True' ensures dot_score == Cosine Similarity
        query_embeddings = embedding_model.encode(
            sentences=test_dataframe_part["prompt"].tolist(),
            prompt=EMBEDDING_MODEL_QUERY,            
            batch_size=BATCH_SIZE,
            show_progress_bar=True,
            convert_to_tensor=True,
            device="cuda",
            normalize_embeddings=True,
        )

        # Encode Documents (the precedents/examples)
        document_embeddings = embedding_model.encode(
            sentences = corpus_dataframe_part["prompt"].tolist(),
            prompt = EMBEDDING_MODEL_PASSAGE,         
            batch_size = BATCH_SIZE,
            show_progress_bar = True,
            convert_to_tensor = True,
            device = "cuda",
            normalize_embeddings = True,
        )

        # Vector Search
        # Retrieve top_k semantically similar documents for each query
        test_dataframe_part["semantic"] = semantic_search(
            query_embeddings,
            document_embeddings,
            top_k = TOP_K,
            score_function = dot_score,
        )


        def get_score(semantic):
            """
            Soft Voting Mechanism:
            Calculates a violation score based on the 'Signed Similarity Sum'.
            
            Logic:
            - If we retrieve a 'Violation' document (+1), we ADD the similarity score.
            - If we retrieve a 'Non-Violation' document (-1), we SUBTRACT the similarity score.
            
            Result:
            - High Positive Score -> Strong evidence of violation.
            - High Negative Score -> Strong evidence of compliance.
            """
            semantic = pd.DataFrame(semantic)
            semantic = semantic.merge(
                corpus_dataframe_part[["row_id", "rule_violation"]],
                how = "left",
                left_on = "corpus_id",
                right_on = "row_id",
            )
            semantic["score"] = semantic["score"]*semantic["rule_violation"]
            return semantic["score"].sum()
            
        tqdm.pandas(desc = f"Add label for {rule=}")
        test_dataframe_part["rule_violation"] = test_dataframe_part["semantic"].progress_apply(get_score)
        result.append(test_dataframe_part[["row_id", "rule_violation"]].copy())
        
    submission = pd.concat(result, axis=0)
    
    return submission


def generate_submission():
    test_dataframe = pd.read_csv(f"{DATA_PATH}/test.csv")
    test_dataframe = prepare_dataframe(test_dataframe)
    
    submission = get_scores(test_dataframe)
    submission = test_dataframe[["row_id"]].merge(submission, on = "row_id", how = "left")
    submission.to_csv("submission_qwen3.csv", index = False)


if __name__ == "__main__":
    generate_submission()

Writing semantic.py


In [None]:
# Execution Entry Point
# Triggers the full semantic retrieval pipeline to generate the CSV submission file.
from semantic import generate_submission
generate_submission()

2025-08-18 19:33:55.057376: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755545635.081680      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755545635.089404      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


cleaner:   0%|          | 0/10 [00:00<?, ?it/s]

cleaner:   0%|          | 0/1983 [00:00<?, ?it/s]

Generate scores for each rule:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Add label for rule='No Advertising: Spam, referral links, unsolicited advertising, and promotional content are…

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Add label for rule='No legal advice: Do not offer or request legal advice.':   0%|          | 0/1 [00:00<?, ?i…

## 3. ENSEMBLE RESULT

In [None]:
import pandas as pd

# Load individual model outputs
q = pd.read_csv("submission_qwen.csv") 
l = pd.read_csv("submission_qwen3.csv")

# Rank Normalization:
# Since the models output different score distributions (Probability vs. Cosine Similarity),
# we convert raw scores to relative ranks (0-1) to make them comparable before blending.
rq = q["rule_violation"].rank(method = "average") / (len(q)+1)
rl = l["rule_violation"].rank(method = "average") / (len(l)+1)

# Weighted Ensemble:
# Apply a 70/30 weight favoring the LLM's deeper reasoning capabilities (0.7),
# while using the semantic retrieval score (0.3) as a supporting signal.
blend = 0.7 * rq + 0.3 * rl  

q["rule_violation"] = blend
q.to_csv("/kaggle/working/submission.csv", index = False)

## 4. CROSS VALIDATION

In [None]:
%%writefile cv_embedding.py
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search, dot_score

from utils import prepare_dataframe, get_dataframe_to_train
from constants import (
    DATA_PATH, EMBDEDDING_MODEL_PATH,
    EMBEDDING_MODEL_QUERY, EMBEDDING_MODEL_PASSAGE,
    TOP_K, BATCH_SIZE,
)

def _signed_sum(scores_df, corpus_labeled):
    """
    Aggregates retrieval scores. 
    Matches the 'weighted vote' logic used in inference.
    """
    # scores_df has columns: corpus_id, score
    df = scores_df.merge(
        corpus_labeled[["row_id", "rule_violation"]],
        how = "left", left_on = "corpus_id", right_on = "row_id"
    )
    # rule_violation is +1 / -1 after prepare_dataframe()
    return (df["score"] * df["rule_violation"]).sum()

def run_cv_embedding(n_splits=5, seed=42, verbose=True):
    # Load raw train (labels 0/1) and build groups
    train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
    y = train_raw["rule_violation"].astype(int).values

    # Corpus Construction
    # Build the 'Union Corpus' (Train + Test examples) to simulate the full
    # knowledge base available at inference time.
    corpus_all_raw = get_dataframe_to_train(DATA_PATH)
    corpus_all = prepare_dataframe(corpus_all_raw.copy())

    train_prep = prepare_dataframe(train_raw.copy())

    # Grouping Strategy
    # CRITICAL: group by 'Subreddit + Rule'. 
    # Standard KFold would cause data leakage because comments under the same rule 
    # are highly correlated. GroupKFold ensures a rule appears EITHER in train OR validation,
    # strictly testing generalization to unseen rules/contexts.
    groups = (train_raw["subreddit"].astype(str) + " | " + train_raw["rule"].astype(str)).values
    gkf = GroupKFold(n_splits = n_splits)

    # Model (load once to save overhead)
    model = SentenceTransformer(EMBDEDDING_MODEL_PATH, device = "cuda")

    # Storage for OOF scores
    oof_scores = np.full(len(train_raw), np.nan, dtype = float)
    fold_aucs = []

    for fold, (tr_idx, va_idx) in enumerate(gkf.split(train_raw, groups = groups), 1):
        va_raw = train_raw.iloc[va_idx].reset_index(drop = True)
        va_prep = train_prep.iloc[va_idx].reset_index(drop = True)

        # Anti-Leakage Mechanism:
        # Remove the current validation rows from the retrieval corpus.
        # If these are not removed, the model will simply retrieve the query itself (Cosine Similarity = 1.0),
        # leading to artificially inflated accuracy metrics.
        va_keys = va_raw[["body", "rule", "subreddit"]].drop_duplicates()
        corpus_fold = corpus_all.merge(
            va_keys.assign(_drop = 1),
            on = ["body", "rule", "subreddit"],
            how = "left"
        )
        corpus_fold = corpus_fold[corpus_fold["_drop"].isna()].drop(columns = ["_drop"]).reset_index(drop = True)
        corpus_fold = corpus_fold.reset_index(names = "row_id") 

        # Encode per rule. This keeps retrieval focused & small.
        scores_this_fold = np.zeros(len(va_prep), dtype = float)

        unique_rules = va_prep["rule"].unique()
        if verbose:
            pbar = tqdm(unique_rules, desc=f"[Fold {fold}] Scoring by rule")
        else:
            pbar = unique_rules

        for rule_text in pbar:
            q_part = va_prep.query("rule == @rule_text").reset_index()
            d_part = corpus_fold.query("rule == @rule_text").reset_index(drop = True)

            if len(q_part) == 0 or len(d_part) == 0:
                continue

            q_emb = model.encode(
                q_part["prompt"].tolist(),
                prompt = EMBEDDING_MODEL_QUERY,
                batch_size = BATCH_SIZE,
                show_progress_bar = False,
                convert_to_tensor = True,
                device = "cuda",
                normalize_embeddings = True,
            )
            d_emb = model.encode(
                d_part["prompt"].tolist(),
                prompt = EMBEDDING_MODEL_PASSAGE,
                batch_size = BATCH_SIZE,
                show_progress_bar = False,
                convert_to_tensor = True,
                device = "cuda",
                normalize_embeddings = True,
            )
            hits = semantic_search(q_emb, d_emb, top_k = TOP_K, score_function = dot_score)

            # Aggregate signed similarity for each query
            for i, hit in enumerate(hits):
                ssum = _signed_sum(pd.DataFrame(hit), d_part)
                scores_this_fold[q_part.loc[i, "index"]] = ssum

        # Fill OOF and compute fold AUC
        oof_scores[va_idx] = scores_this_fold
        fold_auc = roc_auc_score(y[va_idx], scores_this_fold)
        fold_aucs.append(fold_auc)
        if verbose:
            print(f"Fold {fold}: AUC = {fold_auc:.6f}")

    overall_auc = roc_auc_score(y, oof_scores)
    if verbose:
        print(f"\nOOF AUC (mean over folds): {np.mean(fold_aucs):.6f}  "
              f"(± {np.std(fold_aucs):.6f})")
        print(f"OOF AUC (all OOF preds):  {overall_auc:.6f}")

    # Save OOF for later blending and diagnostics
    out = pd.DataFrame({
        "row_id": train_raw.get("row_id", pd.Series(np.arange(len(train_raw)))),
        "y_true": y,
        "oof_score": oof_scores,
        "subreddit": train_raw["subreddit"],
        "rule": train_raw["rule"],
    })
    out.to_csv("/kaggle/working/oof_embedding.csv", index = False)

    return overall_auc, fold_aucs


Writing cv_embedding.py


In [None]:
from cv_embedding import run_cv_embedding

# Benchmark Execution:
# We run a full 5-fold cross-validation to establish the model's generalization capability.
# A fixed seed (42) ensures strict reproducibility for regression testing across experiments.
overall_auc, fold_aucs = run_cv_embedding(n_splits = 5, seed = 42, verbose = True)
overall_auc, fold_aucs

cleaner:   0%|          | 0/1983 [00:00<?, ?it/s]

cleaner:   0%|          | 0/2029 [00:00<?, ?it/s]

[Fold 1] Scoring by rule:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 1: AUC = 0.638677


[Fold 2] Scoring by rule:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 2: AUC = 0.461994


[Fold 3] Scoring by rule:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 3: AUC = 0.532712


[Fold 4] Scoring by rule:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 4: AUC = 0.532649


[Fold 5] Scoring by rule:   0%|          | 0/2 [00:00<?, ?it/s]

Fold 5: AUC = 0.613014

OOF AUC (mean over folds): 0.555809  (± 0.063262)
OOF AUC (all OOF preds):  0.536594


(0.536593555685571,
 [0.638676844783715,
  0.4619940640348804,
  0.5327120947388857,
  0.5326490713587488,
  0.6130141999707217])