In [None]:
%%writefile constants.py
# Location of the base Qwen3 embedding checkpoint inside the Kaggle input mount.
BASE_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"

# Directory where train.py saves the LoRA adapter.
LORA_PATH = "output/"

# Marker placed before every answer slot in the prompt.
COMPLETE = "Answer:"

# Task instruction placed at the top of every prompt (plain string, no placeholders).
prompt = (
    "You are given a comment from reddit and a rule. "
    "Your task is to classify whether the comment violates the rule. "
    "Answer 'yes' or 'no' only."
)

In [None]:
import pandas as pd

# Load the full training split and report how many rows it has.
train_data = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/train.csv")
print(len(train_data))

# Draw a reproducible 50% sample of the test split and report its size.
test_sample = pd.read_csv(
    "/kaggle/input/jigsaw-agile-community-rules/test.csv"
).sample(frac=0.5, random_state=42)
print(len(test_sample))

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch import Tensor
import kagglehub


# Pooling function: pick each sequence's hidden state at its last attended token
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Select one hidden-state vector per sequence: the one at its last attended position.

    Args:
        last_hidden_states: tensor indexed as [row, position, ...].
        attention_mask: tensor indexed as [row, position]; its per-row sum is
            used as the number of attended positions.

    Returns:
        ``last_hidden_states[:, -1]`` when every row's final mask entry sums to the
        row count (i.e. the batch is left-padded); otherwise the vector at index
        ``mask_row.sum() - 1`` for each row.
    """
    num_rows = attention_mask.shape[0]

    # If the final column of the mask is set for every row, padding sits on the left
    # and the last position already holds the last real token.
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # Right padding: the last real token of each row is at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]


# OOP wrapper around tokenizer + model + pooling
class QwenEmbedder(nn.Module):
    """Tokenizer + transformer + last-token pooling bundled as a single module.

    Calling the module with a list of strings returns one L2-normalized
    embedding row per string.
    """

    def __init__(self, model_dir: str, max_length: int = 8192, device: str = None):
        """Load tokenizer and model from ``model_dir`` and move the module to ``device``.

        Args:
            model_dir: path handed to ``AutoTokenizer`` / ``AutoModel.from_pretrained``.
            max_length: truncation length passed to the tokenizer in ``forward``.
            device: target device; when falsy, "cuda" is used if available, else "cpu".
        """
        super().__init__()
        # The tokenizer is configured to pad on the left.
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="left")
        self.model = AutoModel.from_pretrained(model_dir)
        self.max_length = max_length

        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Move the wrapped model to the chosen device.
        self.to(self.device)

    def forward(self, texts: list[str]) -> Tensor:
        """Embed ``texts`` and return the L2-normalized pooled vectors."""
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = encoded.to(self.device)

        hidden_states = self.model(**encoded).last_hidden_state

        # Reduce each sequence to the hidden state of its last attended token.
        pooled = last_token_pool(hidden_states, encoded["attention_mask"])

        # Unit-normalize each row.
        return F.normalize(pooled, p=2, dim=1)

if __name__ == "__main__":
    # Fetch the model files through kagglehub and wrap them in the embedder.
    model_dir = kagglehub.model_download("qwen-lm/qwen-3-embedding/transformers/0.6b")
    embedder = QwenEmbedder(model_dir)

    # Two instruction-prefixed search queries ...
    queries = [
        "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: What is the capital of China?",
        "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: Explain gravity",
    ]
    # ... and two candidate passages, embedded without an instruction prefix.
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other...",
    ]

    # Embed each group in its own forward pass.
    query_emb = embedder(queries)
    doc_emb = embedder(documents)

    # Query-by-document matrix of dot products between the normalized embeddings.
    scores = torch.matmul(query_emb, doc_emb.t())
    print(scores.tolist())

In [None]:
%%writefile utils.py

import pandas as pd 
from constants import prompt, COMPLETE
import numpy as np 
import random
import re
from datasets import Dataset

# Seed Python's and NumPy's global RNGs when this module is imported; the
# np.random.rand draws in get_data_for_training depend on the NumPy seed.
random.seed(42)
np.random.seed(42)

def url_to_semantics(txt : str) -> str:
    """Summarize the URLs found in ``txt`` as a line of keyword tokens.

    Every ``http(s)://`` URL is lower-cased and broken into domain labels and
    path segments. Tokens longer than three characters are emitted once each,
    in first-seen order, as ``domain:<label>`` or ``path:<segment>``.

    The query string / fragment and a trailing ``.html``/``.htm``/``.php``/
    ``.asp``/``.jsp`` extension are removed from the path *before* it is split
    into segments. (Previously that cleanup ran on already-split alphanumeric
    parts where it could never match, so extensions leaked as keywords such as
    ``path:html`` and any segment carrying a query string was discarded.)

    Args:
        txt: arbitrary text; non-string values (e.g. NaN) are accepted.

    Returns:
        ``"\\nURL Keywords: <tokens>"`` when at least one token was extracted,
        otherwise the empty string.
    """
    if not isinstance(txt, str):
        return ""

    urls = re.findall(r'https?://[^\s/$.?#].[^\s]*', txt)
    if not urls:
        return ""

    all_semantics = []
    seen_semantics = set()

    def _add(kind, token):
        # Keep a token only once, and skip short ones such as 'www'.
        if token and token not in seen_semantics and len(token) > 3:
            all_semantics.append(f"{kind}:{token}")
            seen_semantics.add(token)

    for url in urls:
        url_lower = url.lower()

        # Domain labels: everything before the final ".tld" of the host.
        domain_match = re.search(r"(?:https?://)?([a-z0-9\-\.]+)\.[a-z]{2,}", url_lower)
        if domain_match:
            for part in domain_match.group(1).split('.'):
                _add("domain", part)

        # Path: strip scheme + host, then query/fragment, then a trailing extension.
        path = re.sub(r"^(?:https?://)?[a-z0-9\.-]+\.[a-z]{2,}/?", "", url_lower)
        path = re.sub(r"[?#].*", "", path)
        path = re.sub(r"\.(html?|php|asp|jsp)$", "", path)

        # Split on common delimiters and keep purely alphanumeric segments.
        for part in re.split(r'[/_.-]+', path):
            if part.isalnum():
                _add("path", part)

    if not all_semantics:
        return ""

    return f"\nURL Keywords: {' '.join(all_semantics)}"

def build_prompt(row):
    """Render the classification prompt for one example row.

    Args:
        row: object exposing ``.get(key, default)`` (build_dataset passes each
            DataFrame row via ``df.apply(..., axis=1)``). Keys read:
            "subreddit", "rule", "positive_example", "negative_example", "body".
            A missing "subreddit" falls back to "unknown"; the others to "".

    Returns:
        The prompt text: the shared instruction, the subreddit and rule, one
        example answered "yes" and one answered "no", then the comment to
        classify followed by an open answer slot. The output of
        url_to_semantics is appended directly after each of the three texts.
    """
    subreddit = row.get("subreddit", "unknown")
    rule = row.get("rule", "")
    pos_example = row.get("positive_example", "")
    neg_example = row.get("negative_example", "")
    body = row.get("body", "")
    # URL keyword suffixes; each is "" when the text has no usable URL.
    url_features_body = url_to_semantics(body)
    url_features_pos = url_to_semantics(pos_example)
    url_features_neg = url_to_semantics(neg_example)
    # Whitespace inside the template is part of the prompt; keep it unchanged.
    return f"""
{prompt}

r/{subreddit} 
rule: {rule}
Examples : 
1) {pos_example}{url_features_pos}
{COMPLETE} yes
2) {neg_example}{url_features_neg}
{COMPLETE} no

------
Comment: {body}{url_features_body}
{COMPLETE} """


def get_data_for_training(fpath,sample_frac = 0.5):
    """Build the training frame from train.csv plus examples mined from test.csv.

    Training rows keep their own ``body`` / ``rule_violation`` and receive one
    randomly chosen positive and one randomly chosen negative example.

    A ``sample_frac`` sample of test.csv (``random_state=42``) is turned into
    extra labelled rows: each of its positive examples becomes a ``body``
    labelled 1 and each of its negative examples becomes a ``body`` labelled 0.
    The example of the same polarity shown in the prompt is always the *other*
    one (``3 - i``), so a body never appears as its own example.

    Fixes relative to the previous version:
      * the "no" branch wrote to a misspelled ``neagtive_example`` column,
        leaving ``negative_example`` as NaN for those rows;
      * columns were assigned / dropped in place on a slice of ``train_data``;
        an explicit copy is used instead.

    Args:
        fpath: directory containing ``train.csv`` and ``test.csv``.
        sample_frac: fraction of test.csv rows used to synthesize extra rows.

    Returns:
        A de-duplicated DataFrame (fresh RangeIndex) with at least the columns
        body, rule, subreddit, positive_example, negative_example and
        rule_violation. Columns present only in test.csv are NaN on train rows.
    """
    example_cols = ['positive_example_1','positive_example_2', 'negative_example_1','negative_example_2']

    train_data = pd.read_csv(f"{fpath}/train.csv")
    test_df = pd.read_csv(f"{fpath}/test.csv").sample(frac=sample_frac, random_state=42)

    # Explicit copy so the assignments below never target a view of train_data.
    train_df = train_data[['body','rule','subreddit', *example_cols, 'rule_violation']].copy()

    # Randomly pick which of the two examples of each polarity is shown.
    train_df['positive_example'] = np.where(
        np.random.rand(len(train_df)) < 0.5,
        train_df['positive_example_1'],
        train_df['positive_example_2'],
    )
    train_df['negative_example'] = np.where(
        np.random.rand(len(train_df)) < 0.5,
        train_df['negative_example_1'],
        train_df['negative_example_2'],
    )
    train_df = train_df.drop(columns=example_cols)

    dfs = [train_df]

    # Synthesize labelled rows from the test examples.
    for rule_violation in ['yes', 'no']:
        for i in range(1, 3):  # loop through both examples
            subdf = test_df.drop(columns=['body', *example_cols])

            if rule_violation == 'yes':   # the body is a known violation
                subdf['body'] = test_df[f'positive_example_{i}']
                subdf['positive_example'] = test_df[f'positive_example_{3-i}']
                subdf['negative_example'] = np.where(
                    np.random.rand(len(test_df)) < 0.5,
                    test_df[f'negative_example_{i}'],
                    test_df[f'negative_example_{3-i}'],
                )
                subdf['rule_violation'] = 1
            else:  # the body is a known non-violation
                subdf['body'] = test_df[f'negative_example_{i}']
                subdf['positive_example'] = np.where(
                    np.random.rand(len(test_df)) < 0.5,
                    test_df[f'positive_example_{i}'],
                    test_df[f'positive_example_{3-i}'],
                )
                subdf['negative_example'] = test_df[f'negative_example_{3-i}']
                subdf['rule_violation'] = 0
            dfs.append(subdf)

    df = pd.concat(dfs, axis=0).drop_duplicates(ignore_index=True)

    return df

def build_dataset(df):
    """Turn a frame of examples into a ``datasets.Dataset`` of prompts.

    A ``prompt`` column is rendered for every row with ``build_prompt``. When
    the frame has a ``rule_violation`` column, a ``completion`` column
    ("yes" for 1, "no" for 0) is added as well. Unlabelled frames, such as
    test.csv at inference time, yield a prompt-only dataset instead of raising
    ``KeyError``.

    The input frame is not modified. As before, the resulting table is printed
    and also written to ``/kaggle/working/dataset.csv``.

    Args:
        df: DataFrame whose rows provide the fields read by ``build_prompt``.

    Returns:
        Dataset with a ``prompt`` column and, for labelled input, ``completion``.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    df['prompt'] = df.apply(build_prompt, axis = 1)
    columns = ['prompt']

    # Labels only exist for training data.
    if 'rule_violation' in df.columns:
        df['completion'] = df['rule_violation'].map(
            {
                1 : 'yes',
                0 : 'no'
            }
        )
        columns.append('completion')

    df = df[columns]

    print(df)

    dataset = Dataset.from_pandas(df)
    dataset.to_pandas().to_csv("/kaggle/working/dataset.csv", index=False)
    return dataset

In [None]:
!pip install trl

In [None]:
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

In [None]:
!pip install bitsandbytes


In [None]:
%%writefile train.py
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from utils import get_data_for_training, build_dataset, build_prompt, url_to_semantics

# Lora imports
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from tqdm.auto import tqdm
from transformers.utils import is_torch_bf16_gpu_available
from constants import LORA_PATH, BASE_MODEL_PATH


def main():
    """Fine-tune the base model with a LoRA adapter and save it to ``LORA_PATH``.

    Steps: build the prompt/completion dataset from the competition CSVs,
    configure LoRA on the attention and MLP projection layers, train for one
    epoch with ``SFTTrainer``, then save the result.
    """
    data_path = "/kaggle/input/jigsaw-agile-community-rules/"
    df = get_data_for_training(data_path)
    # The Dataset is passed to the trainer directly; no pandas copy of it is needed.
    train_dataset = build_dataset(df)

    lora_config = LoraConfig(
        r = 16,
        lora_alpha= 32,
        lora_dropout = 0.1,
        bias = "none",
        target_modules = ["q_proj","k_proj","v_proj","o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type = "CAUSAL_LM")

    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    use_bf16 = is_torch_bf16_gpu_available()

    training_args = SFTConfig(
        num_train_epochs = 1,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        optim = "paged_adamw_8bit",
        learning_rate = 5e-5,
        weight_decay = 0.01,
        max_grad_norm = 1.0,

        lr_scheduler_type = "cosine",
        warmup_ratio=0.05,

        bf16= use_bf16,
        fp16=not use_bf16,
        dataloader_pin_memory=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs = {"use_reentrant": False},

        save_strategy= "no",
        report_to = "none",

        completion_only_loss = True,
        packing = True,
        remove_unused_columns = False
    )

    trainer = SFTTrainer(
        BASE_MODEL_PATH,
        args = training_args,
        train_dataset = train_dataset,
        peft_config = lora_config
    )

    trainer.train()
    trainer.save_model(LORA_PATH)


if __name__ == "__main__":
    main()
    


In [None]:
!pip install vllm

In [None]:
!pip install logits_processor_zoo

In [None]:
%%writefile inference.py

import random
import multiprocessing as mp
import numpy as np
import pandas as pd
import torch
import vllm
import os

from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
from utils import build_dataset
from constants import (
    BASE_MODEL_PATH)

def _filter_single_token_aliases(tokenizer, candidates):
    keep = []
    for s in candidates:
        ids = tokenizer.encode(s, add_special_tokens=False)
        if len(ids) == 1:
            keep.append(s)
    return keep


def _fallback_single_token(tokenizer, word: str) -> str:
    spaced = " " + word
    if len(tokenizer.encode(spaced, add_special_tokens=False)) == 1:
        return spaced
    if len(tokenizer.encode(word, add_special_tokens=False)) == 1:
        return word
    return spaced

def run_inference_on_device(df_slice: pd.DataFrame, device_id: int) -> pd.DataFrame:
    """Score every row of ``df_slice`` with vLLM and return violation probabilities.

    One token is generated per prompt, restricted by
    ``MultipleChoiceLogitsProcessor`` to the yes/no aliases. The score is
    ``p_yes / (p_yes + p_no)`` computed from the returned log-probabilities.

    Fixes relative to the previous version:
      * the candidate lists contained a literal ``...`` (Ellipsis object),
        which cannot be encoded by the tokenizer;
      * the fallback referenced undefined ``POSITIVE_ANSWER`` /
        ``NEGATIVE_ANSWER``; the training completions "yes" / "no" are used;
      * the LoRA adapter saved by train.py was never loaded, so predictions
        came from the base model. When ``LORA_PATH`` contains an adapter it is
        now attached via ``LoRARequest``; otherwise the base model is used as
        before and a notice is printed.

    Args:
        df_slice: rows to score; must contain ``row_id`` plus the fields that
            ``build_dataset`` / ``build_prompt`` read.
        device_id: not used inside this function; GPU selection is done by the
            caller through ``CUDA_VISIBLE_DEVICES``.

    Returns:
        DataFrame with columns ``row_id`` and ``rule_violation`` (float score).
    """
    # Function-scope imports keep this block self-contained.
    from vllm.lora.request import LoRARequest
    from constants import LORA_PATH

    has_adapter = os.path.exists(os.path.join(LORA_PATH, "adapter_config.json"))
    if not has_adapter:
        print(f"No LoRA adapter found in {LORA_PATH!r}; running the base model only.")

    llm = vllm.LLM(
        BASE_MODEL_PATH,
        #quantization="awq",
        tensor_parallel_size=1,
        gpu_memory_utilization=0.98,
        trust_remote_code=True,
        dtype="half",
        enforce_eager=True,
        max_model_len=2836,
        disable_log_stats=True,
        enable_prefix_caching=True,
        enable_lora=has_adapter,
        max_lora_rank=64,
    )
    lora_request = LoRARequest("rules_adapter", 1, LORA_PATH) if has_adapter else None

    tokenizer = llm.get_tokenizer()
    test_dataset = build_dataset(df_slice)
    # Materialize as a plain list of strings for vLLM.
    texts = list(test_dataset["prompt"])

    yes_candidates = ["Yes", " yes", "yes", "Yes."]
    no_candidates  = ["No",  " no",  "no", "No."]

    # Only single-token spellings can be scored with max_tokens=1.
    yes_alias = _filter_single_token_aliases(tokenizer, yes_candidates)
    no_alias  = _filter_single_token_aliases(tokenizer, no_candidates)
    if not yes_alias:
        yes_alias = [_fallback_single_token(tokenizer, "yes")]
    if not no_alias:
        no_alias = [_fallback_single_token(tokenizer, "no")]

    choices = yes_alias + no_alias
    mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=choices)
    outputs = llm.generate(
        texts,
        vllm.SamplingParams(
            temperature=0.0,
            skip_special_tokens=True,
            max_tokens=1,
            logits_processors=[mclp],
            logprobs=len(choices),
        ),
        use_tqdm=True,
        lora_request=lora_request,
    )

    rows = []
    for out, rid in zip(outputs, df_slice["row_id"].values):
        # Log-probabilities of the first (only) generated position, keyed by decoded token.
        lp0 = {t.decoded_token: t.logprob for t in out.outputs[0].logprobs[0].values()}
        # Aliases missing from the returned logprobs contribute exp(-1e9) == 0.
        p_yes = sum(np.exp(lp0.get(tok, -1e9)) for tok in yes_alias)
        p_no  = sum(np.exp(lp0.get(tok, -1e9)) for tok in no_alias)
        score = p_yes / (p_yes + p_no + 1e-12)
        rows.append({"row_id": rid, "rule_violation": float(score)})
    predictions = pd.DataFrame(rows, columns=["row_id", "rule_violation"])
    return predictions

def worker(device_id: int, df_slice: pd.DataFrame, return_dict):
    """Process entry point: run inference for one slice and publish the result.

    Sets ``CUDA_VISIBLE_DEVICES`` to ``device_id`` before inference starts and
    stores the returned predictions in ``return_dict`` under ``device_id``.
    """
    visible_gpu = str(device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu
    print(f"[Worker {device_id}] Running on GPU {device_id}, data size={len(df_slice)}")
    return_dict[device_id] = run_inference_on_device(df_slice, device_id)
    
def main():
    """Score test.csv (on up to two GPUs) and write ``submission_qwen.csv``.

    For every test row one positive and one negative example is picked at
    random (seeded). The frame is split into one contiguous slice per process,
    each slice is scored by ``run_inference_on_device``, and the concatenated
    predictions are written as the submission file.

    Raises:
        RuntimeError: when a worker process exits with a non-zero code or does
            not return predictions (previously this surfaced as a bare
            ``KeyError`` on the shared result dict).
    """
    random.seed(42)
    np.random.seed(42)
    DATA_PATH = "/kaggle/input/jigsaw-agile-community-rules/"

    test_dataframe = pd.read_csv(f"{DATA_PATH}/test.csv")
    test_dataframe["positive_example"] = test_dataframe.apply(
        lambda row: random.choice([row["positive_example_1"], row["positive_example_2"]]), axis=1
    )
    test_dataframe["negative_example"] = test_dataframe.apply(
        lambda row: random.choice([row["negative_example_1"], row["negative_example_2"]]), axis=1
    )

    test_dataframe = test_dataframe.drop(
        columns=["positive_example_1", "positive_example_2", "negative_example_1", "negative_example_2"],
        errors="ignore"
    ).reset_index(drop=True)
    n_gpus = max(1, torch.cuda.device_count())
    n_procs = min(2, n_gpus)

    # Split row positions (not the DataFrame itself) so the chunk sizes match
    # np.array_split while avoiding its deprecated DataFrame code path.
    index_chunks = np.array_split(np.arange(len(test_dataframe)), n_procs)
    df_slices = [test_dataframe.iloc[chunk].reset_index(drop=True) for chunk in index_chunks]

    if n_procs == 1:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        predictions = run_inference_on_device(df_slices[0], 0)
    else:
        manager = mp.Manager()
        return_dict = manager.dict()
        procs = []
        for dev_id in range(n_procs):
            p = mp.Process(target=worker, args=(dev_id, df_slices[dev_id], return_dict))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()

        # Fail loudly if any worker crashed or produced no result.
        failed = [
            dev_id
            for dev_id, p in enumerate(procs)
            if p.exitcode != 0 or dev_id not in return_dict
        ]
        if failed:
            raise RuntimeError(f"Inference worker(s) {failed} failed; see worker output above.")

        predictions = pd.concat([return_dict[i] for i in range(n_procs)], ignore_index=True)
    submission = predictions[["row_id", "rule_violation"]].copy()
    submission.to_csv("submission_qwen.csv", index=False)
    print("✅ Saved submission_qwen.csv")

if __name__ == "__main__":
    main()

In [None]:
!python train.py

In [None]:
!python inference.py