In [2]:
%%writefile constants.py
BASE_MODEL_PATH = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"
LORA_PATH = "output/"
COMPLETE = "Answer:"
prompt = f"You are given a comment from reddit and a rule. Your task is to classify whether the comment violates the rule. Answer 'yes' or 'no' only."

Overwriting constants.py


In [3]:
import pandas as pd

train_data = pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/train.csv")
print(len(train_data))
test_sample=    pd.read_csv("/kaggle/input/jigsaw-agile-community-rules/test.csv").sample(frac=0.5,random_state = 42 )

print(len(test_sample))

2029
5


In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-agile-community-rules/sample_submission.csv
/kaggle/input/jigsaw-agile-community-rules/train.csv
/kaggle/input/jigsaw-agile-community-rules/test.csv
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/config.json
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/merges.txt
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/README.md
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/tokenizer.json
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/vocab.json
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/tokenizer_config.json
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/model.safetensors
/kaggle/input/qwen-3-embedding/transformers/0.6b/1/generation_config.json


In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch import Tensor
import kagglehub


# Pooling function (same as before)
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # Check whether padding is on the left
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


# OOP wrapper around tokenizer + model + pooling
class QwenEmbedder(nn.Module):
    def __init__(self, model_dir: str, max_length: int = 8192, device: str = None):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, padding_side="left")
        self.model = AutoModel.from_pretrained(model_dir)
        self.max_length = max_length
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)  # move model to device

    def forward(self, texts: list[str]) -> Tensor:
        # Tokenize batch
        batch_dict = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        # Forward pass
        outputs = self.model(**batch_dict)

        # Pool to sequence embedding
        embeddings = last_token_pool(outputs.last_hidden_state, batch_dict["attention_mask"])

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings

if __name__ == "__main__":
    model_dir = kagglehub.model_download("qwen-lm/qwen-3-embedding/transformers/0.6b")
    embedder = QwenEmbedder(model_dir)

    # Queries and docs
    queries = [
        "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: What is the capital of China?",
        "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: Explain gravity"
    ]
    documents = [
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies towards each other..."
    ]

    # Get embeddings
    query_emb = embedder(queries)
    doc_emb = embedder(documents)

    # Compute similarity
    scores = query_emb @ doc_emb.T
    print(scores.tolist())

In [5]:
%%writefile utils.py

import pandas as pd 
from constants import prompt, COMPLETE
import numpy as np 
import random
import re
from datasets import Dataset

random.seed(42)
np.random.seed(42)

def url_to_semantics(txt : str) -> str:
    if not isinstance(txt,str):
        return ""
    url_pattern = r'https?://[^\s/$.?#].[^\s]*'
    urls = re.findall(url_pattern, txt)
    
    if not urls:
        return "" 

    all_semantics = []
    seen_semantics = set()

    for url in urls:
        url_lower = url.lower()
        
        domain_match = re.search(r"(?:https?://)?([a-z0-9\-\.]+)\.[a-z]{2,}", url_lower)
        if domain_match:
            full_domain = domain_match.group(1)
            parts = full_domain.split('.')
            for part in parts:
                if part and part not in seen_semantics and len(part) > 3: # Avoid short parts like 'www'
                    all_semantics.append(f"domain:{part}")
                    seen_semantics.add(part)

        # Extract path parts
        path = re.sub(r"^(?:https?://)?[a-z0-9\.-]+\.[a-z]{2,}/?", "", url_lower)
        path_parts = [p for p in re.split(r'[/_.-]+', path) if p and p.isalnum()] # Split by common delimiters

        for part in path_parts:
            # Clean up potential file extensions or query params
            part_clean = re.sub(r"\.(html?|php|asp|jsp)$|#.*|\?.*", "", part)
            if part_clean and part_clean not in seen_semantics and len(part_clean) > 3:
                all_semantics.append(f"path:{part_clean}")
                seen_semantics.add(part_clean)

    if not all_semantics:
        return ""

    return f"\nURL Keywords: {' '.join(all_semantics)}"

def build_prompt(row):
    subreddit = row.get("subreddit", "unknown")
    rule = row.get("rule", "")
    pos_example = row.get("positive_example", "")
    neg_example = row.get("negative_example", "")
    body = row.get("body", "")
    url_features_body = url_to_semantics(body)
    url_features_pos = url_to_semantics(pos_example)
    url_features_neg = url_to_semantics(neg_example)
    return f"""
{prompt}

r/{subreddit} 
rule: {rule}
Examples : 
1) {pos_example}{url_features_pos}
{COMPLETE} yes
2) {neg_example}{url_features_neg}
{COMPLETE} no

------
Comment: {body}{url_features_body}
{COMPLETE} """


def get_data_for_training(fpath,sample_frac = 0.5):
    train_data = pd.read_csv(f"{fpath}/train.csv")
    
    test_df= pd.read_csv(f"{fpath}/test.csv").sample(frac=sample_frac,random_state = 42 )

    


    train_df = train_data[['body','rule','subreddit','positive_example_1','positive_example_2', 'negative_example_1','negative_example_2','rule_violation']]
    
    #randomly assign examples
    train_df['positive_example'] = np.where(np.random.rand(len(train_df)) <0.5 , train_df['positive_example_1'],train_df['positive_example_2'])
    train_df['negative_example'] = np.where(np.random.rand(len(train_df)) <0.5 , train_df['negative_example_1'], train_df['negative_example_2'])
    train_df.drop(columns = ['positive_example_1','positive_example_2', 'negative_example_1','negative_example_2'], inplace = True)

    dfs = [train_df]
    
    # build test df 
    
    for rule_violation in ['yes', 'no']:
        for i in range(1,3): #loop through both examples
            subdf =  test_df.copy().drop(columns=['body','positive_example_1','positive_example_2', 'negative_example_1','negative_example_2'])

            if rule_violation == 'yes':   # case when rule is violated 
                subdf['body'] = test_df[f'positive_example_{i}']
                subdf['positive_example'] = test_df[f'positive_example_{3-i}']
                subdf['negative_example'] = np.where(np.random.rand(len(test_df))<0.5, test_df[f'negative_example_{i}'],test_df[f'negative_example_{3-i}'])
                subdf['rule_violation'] = 1
            else:  # case when rule is not violated 
                subdf['body'] = test_df[f'negative_example_{i}']
                subdf['positive_example'] = np.where(np.random.rand(len(test_df))<0.5, test_df[f'positive_example_{i}'],test_df[f'positive_example_{3-i}'])

                subdf['neagtive_example'] = test_df[f'negative_example_{3-i}']
                subdf['rule_violation'] = 0
            dfs.append(subdf)

    df =  pd.concat(dfs, axis = 0).drop_duplicates(ignore_index = True)
    
    return df

def build_dataset(df):
    df['prompt'] = df.apply(build_prompt, axis = 1)

    df['completion'] = df['rule_violation'].map(
        {
            1 : 'yes',
            0 : 'no'
        }
    )
    df = df[['prompt','completion']]

    print(df)

    dataset = Dataset.from_pandas(df)
    dataset.to_pandas().to_csv("/kaggle/working/dataset.csv", index=False)
    return dataset

Writing utils.py


In [6]:
!pip install trl

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers>=4.56.1 (from trl)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub>=0.21.0 (from accelerate>=1.4.0->trl)
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers>=4.56.1->trl)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runti

In [7]:
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

2025-09-25 17:34:34.954157: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758821675.123431      84 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758821675.174693      84 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
pip install bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
%%writefile train.py
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from utils import get_data_for_training, build_dataset, build_prompt, url_to_semantics

# Lora imports
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from tqdm.auto import tqdm
from transformers.utils import is_torch_bf16_gpu_available
from constants import LORA_PATH, BASE_MODEL_PATH


def main():
    data_path = "/kaggle/input/jigsaw-agile-community-rules/"
    df = get_data_for_training(data_path)
    train_dataset = build_dataset(df)
    df_train = pd.DataFrame(train_dataset)

    df_train = pd.DataFrame(train_dataset)

    lora_config = LoraConfig(
        r = 16,
        lora_alpha= 32,
        lora_dropout = 0.1,
        bias = "none",
        target_modules = ["q_proj","k_proj","v_proj","o_proj", "gate_proj", "up_proj", "down_proj"],
        task_type = "CAUSAL_LM")
    
    training_args = SFTConfig(
        num_train_epochs = 1,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        optim = "paged_adamw_8bit",
        learning_rate = 5e-5,
        weight_decay = 0.01,
        max_grad_norm = 1.0,

        lr_scheduler_type = "cosine",
        warmup_ratio=0.05,

        bf16= is_torch_bf16_gpu_available(),
        fp16=not is_torch_bf16_gpu_available(),
        dataloader_pin_memory=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs = {"use_reentrant": False},

        save_strategy= "no",
        report_to = "none",

        completion_only_loss = True,
        packing = True,
        remove_unused_columns = False
    )
    
    trainer = SFTTrainer(
        BASE_MODEL_PATH,
        args = training_args,
        train_dataset = train_dataset,
        peft_config = lora_config
    )

    trainer.train()
    trainer.save_model(LORA_PATH)
    #print(df_train.head(10))
    

if __name__ == "__main__":
    main()
    


                                                 prompt completion
0     \nYou are given a comment from reddit and a ru...         no
1     \nYou are given a comment from reddit and a ru...         no
2     \nYou are given a comment from reddit and a ru...        yes
3     \nYou are given a comment from reddit and a ru...        yes
4     \nYou are given a comment from reddit and a ru...        yes
...                                                 ...        ...
2044  \nYou are given a comment from reddit and a ru...         no
2045  \nYou are given a comment from reddit and a ru...         no
2046  \nYou are given a comment from reddit and a ru...         no
2047  \nYou are given a comment from reddit and a ru...         no
2048  \nYou are given a comment from reddit and a ru...         no

[2049 rows x 2 columns]


Adding EOS to train dataset:   0%|          | 0/2049 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2049 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/2049 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,14.1239
20,9.8737
30,8.7262
40,8.1736
50,7.9733
60,7.9142


In [None]:
%%writefile inference.py

In [None]:
!python train.py