# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

## Install packages on Kaggle: Add-ons > Install Dependencies 

```bash
pip install pip3-autoremove
pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
pip install unsloth vllm
pip install scikit-learn
```

In [5]:
%%writefile load_data.py
import kagglehub
import pandas as pd
import os
import glob

# Choose the data source: when the KAGGLE_KERNEL_RUN_TYPE environment variable
# is absent the CSVs are read from ./data/, otherwise from the Kaggle input dir.
if 'KAGGLE_KERNEL_RUN_TYPE' not in os.environ:
    # Local run: each split is globbed, read and concatenated.
    base_path = "./data/"

    # Train split -- handled completely before the test split is looked up.
    train_files = glob.glob(f"{base_path}train.csv")
    if not train_files:
        raise FileNotFoundError(f"No train files found in {base_path}")
    train_dfs = [pd.read_csv(csv_path) for csv_path in train_files]
    df_train = pd.concat(train_dfs, ignore_index=True)
    print(f"Concatenated {len(train_files)} train files: {train_files}")

    # Test split.
    test_files = glob.glob(f"{base_path}test.csv")
    if not test_files:
        raise FileNotFoundError(f"No test files found in {base_path}")
    test_dfs = [pd.read_csv(csv_path) for csv_path in test_files]
    df_test = pd.concat(test_dfs, ignore_index=True)
    print(f"Concatenated {len(test_files)} test files: {test_files}")
else:
    # Kaggle run: read the two competition CSVs directly.
    base_path = "/kaggle/input/jigsaw-agile-community-rules/"
    df_train = pd.read_csv(f"{base_path}train.csv")
    df_test = pd.read_csv(f"{base_path}test.csv")

# The test split gets a constant 0 label column; both splits then get a
# placeholder 'rule_violation_yn' column that is overwritten further down.
df_test["rule_violation"] = 0
for split_df in (df_train, df_test):
    split_df["rule_violation_yn"] = "na"

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(df_train.columns)

# Columns kept for the rest of the pipeline (everything else is discarded).
req_cols = [
    'subreddit',
    'rule',
    'positive_example_1',
    'negative_example_1',
    'positive_example_2',
    'negative_example_2',
    'body',
    'rule_violation',
    'rule_violation_yn',
]

df_train = df_train[req_cols]
df_test = df_test[req_cols]


# Improved normalization function that handles quotes
def normalize_rule_violation(value):
    """Normalize a raw ``rule_violation`` value to ``'Yes'`` or ``'No'``.

    The value is converted to text, stripped of surrounding whitespace and
    quote characters, and lower-cased before it is matched.

    Args:
        value: Raw label (int, float, bool or str). Missing values
            (anything ``pd.isna`` reports as NA) are treated as invalid.

    Returns:
        ``'Yes'`` for truthy spellings (``yes``, ``true``, ``y``, ``t`` or a
        number equal to 1), ``'No'`` for falsy spellings (``no``, ``false``,
        ``n``, ``f`` or a number equal to 0), and ``None`` for anything else
        so the caller can drop the row.
    """
    if pd.isna(value):
        return None

    # Convert to string, strip whitespace and quotes, then convert to lowercase
    normalized = str(value).strip().strip('"').strip("'").strip().lower()

    # Map all textual variations to Yes/No
    if normalized in ('yes', 'true', '1', 'y', 't'):
        return 'Yes'
    if normalized in ('no', 'false', '0', 'n', 'f'):
        return 'No'

    # Numeric fallback: a float label such as 1.0 stringifies to "1.0", which
    # the token lists above do not contain. Without this, float-typed label
    # columns would have every row rejected as invalid.
    try:
        numeric = float(normalized)
    except ValueError:
        return None  # Invalid values will become NaN
    if numeric == 1:
        return 'Yes'
    if numeric == 0:
        return 'No'
    return None  # Any other number is not a valid label

# Apply normalization to both datasets
# Normalize the labels of both splits. The frames are mutated in place through
# the loop variable, so df_train / df_test see the new column and the dropped rows.
for split_label, split_frame in (("train", df_train), ("test", df_test)):
    title = split_label.capitalize()

    print(f"\n{title} - Before normalization:")
    print(f"Unique values:\n{split_frame['rule_violation'].value_counts()}")

    split_frame["rule_violation_yn"] = split_frame["rule_violation"].apply(normalize_rule_violation)

    print(f"\n{title} - After normalization:")
    print(f"Unique values:\n{split_frame['rule_violation_yn'].value_counts()}")

    # Rows whose label could not be normalized hold None/NaN and are removed.
    rows_before = len(split_frame)
    split_frame.dropna(subset=["rule_violation_yn"], inplace=True)
    rows_dropped = rows_before - len(split_frame)
    print(f"Dropped {rows_dropped} rows from {split_label} due to invalid 'rule_violation'")

# Report, per required column, how many train rows contain a missing value.
print("\nChecking for missing values in other columns:")
for col in req_cols:
    missing_count = int(df_train[col].isna().sum())
    print(f"{col}: {missing_count} rows would be dropped")

# Drop every row with a missing required value, then shuffle with a fixed seed.
df_train = (
    df_train[req_cols]
    .dropna()
    .sample(frac=1, random_state=21)
    .reset_index(drop=True)
)
df_test = (
    df_test[req_cols]
    .dropna()
    .sample(frac=1, random_state=21)
    .reset_index(drop=True)
)

print(f"\nUsing path: {base_path}")
print("\nFinal results:")
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

print("\nSample of processed data:")
print(df_train.head(2))


import pandas as pd
import random

def generate_minimal_synthetic_data(df_test, random_seed=21):
    """Build labelled synthetic rows from the few-shot examples in ``df_test``.

    Rows are grouped by ``rule`` and each group is walked in consecutive
    chunks of five rows. The fifth row of a chunk donates its four example
    comments (two positive, two negative); each of the first four rows is
    copied with its ``body`` replaced by one of those comments (in shuffled
    order) and its labels set to match (positive -> 1/'Yes', negative -> 0/'No').
    Rows left over after the last complete chunk of a group are not used.

    Args:
        df_test: DataFrame containing at least the columns listed in
            ``req_cols`` below.
        random_seed: Seed for the shuffle of example-to-row assignment.
            Note that this reseeds the global ``random`` module state.

    Returns:
        DataFrame with exactly the ``req_cols`` columns, four rows per complete
        chunk. If no rule group has at least five rows, an empty DataFrame with
        those columns is returned (previously this raised ``KeyError``).
    """
    req_cols = ['subreddit', 'rule', 'positive_example_1', 'negative_example_1',
                'positive_example_2', 'negative_example_2', 'body',
                'rule_violation', 'rule_violation_yn']

    synthetic_rows = []
    random.seed(random_seed)

    for _rule, group in df_test.groupby('rule'):
        group = group.reset_index(drop=True)

        # Step through complete chunks of five rows; a trailing partial chunk is skipped.
        for chunk_start in range(0, len(group) - 4, 5):
            source_row = group.iloc[chunk_start + 4]

            # (comment text, numeric label, Yes/No label) taken from the 5th row
            examples = [
                (source_row['positive_example_1'], 1, 'Yes'),
                (source_row['positive_example_2'], 1, 'Yes'),
                (source_row['negative_example_1'], 0, 'No'),
                (source_row['negative_example_2'], 0, 'No'),
            ]

            # Shuffle examples so assignment is randomized but unique
            random.shuffle(examples)

            # Rows 0..3 of the chunk each receive exactly one shuffled example
            for offset, (example_text, label, label_yn) in enumerate(examples):
                synthetic_row = group.iloc[chunk_start + offset].copy()
                synthetic_row['body'] = example_text
                synthetic_row['rule_violation'] = label
                synthetic_row['rule_violation_yn'] = label_yn
                synthetic_rows.append(synthetic_row)

    if not synthetic_rows:
        # pd.DataFrame([]) has no columns, so selecting req_cols would raise
        # KeyError; return a correctly-shaped empty frame instead.
        return pd.DataFrame(columns=req_cols)

    df_synthetic = pd.DataFrame(synthetic_rows)[req_cols]
    return df_synthetic

    
# Generate the synthetic training rows from the test split, report their
# shape, then shuffle with a fixed seed and persist them for train_model.py.
synthetic_unshuffled = generate_minimal_synthetic_data(df_test)
print(synthetic_unshuffled.shape)
df_test_train = (
    synthetic_unshuffled
    .sample(frac=1, random_state=21)
    .reset_index(drop=True)
)
df_test_train.to_csv("df_train_synthetic.csv", index=False)

Overwriting load_data.py


## Set imports, variable names, parameters

In [6]:
%%writefile train_model.py

from unsloth import FastLanguageModel
import pandas as pd
import torch
import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = "0"

# Loading options forwarded to FastLanguageModel.from_pretrained below.
# NOTE(review): dtype=None presumably lets Unsloth pick the dtype itself -- confirm.
dtype = ( None )
load_in_4bit = True
load_in_8bit = False

######---Parameters to change---#######
# Model location used when running on Kaggle.
kaggle_model_path="/kaggle/input/model/transformers/1b-instruct/1"
# Model location used when running locally.
# NOTE(review): the first assignment is dead -- it is overwritten by the line
# right after it, so only the ./lora/... path ever takes effect.
local_model_path="unsloth/Qwen2.5-7B-Instruct"
local_model_path="./lora/Qwen25_3B_Instruct_unsloth_lora_fp16_r64_a64_s105759_e_1_msl3072-0-swap-cr123-kdsr1-gksr1_merged_fp16"
# Synthetic training rows written by load_data.py.
df_test_train=pd.read_csv("df_train_synthetic.csv")
print(df_test_train.shape)

max_seq_length = 3072   # passed to both from_pretrained and SFTTrainer
Rank=64                 # LoRA rank (r)
LORA_ALPHA=64           # LoRA alpha
# Number of training rows. NOTE(review): not referenced again in the visible
# part of this script.
sample_len=int(df_test_train.shape[0])
max_iter_steps= 10      # passed as TrainingArguments.max_steps
Epochs=-1               # passed as TrainingArguments.num_train_epochs


# Load the base model; the path depends on whether we run on Kaggle.
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
    model_path=kaggle_model_path
else:
    model_path=local_model_path

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    load_in_8bit=load_in_8bit
)

# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
print(model.dtype)
model = FastLanguageModel.get_peft_model(
    model,
    r = Rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_ALPHA,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 123,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

#prepare dataset
import pandas as pd
from datasets import Dataset
import kagglehub
import os
import glob

def formatting_prompts_func(examples, tokenizer):
    """
    Render a batch of moderation rows into chat-template training texts.

    Args:
        examples: Column-oriented batch (mapping of column name -> sequence)
            with the keys 'subreddit', 'rule', 'positive_example_1',
            'positive_example_2', 'negative_example_1', 'negative_example_2',
            'body' and 'rule_violation_yn'.
        tokenizer: Object exposing ``apply_chat_template``; it is called once
            per row with ``tokenize=False`` and ``add_generation_prompt=False``.

    Returns:
        ``{"text": [...]}`` with one rendered conversation per input row.
    """
    def _few_shot(comment, verdict):
        # One <example> element; `verdict` is the literal Yes/No tag content.
        return (
            "<example>\n"
            f"<comment>{comment}</comment>\n"
            f"<rule_violation>{verdict}</rule_violation>\n"
            "</example>"
        )

    rendered = []
    for idx, subreddit in enumerate(examples['subreddit']):
        # System prompt. The first line intentionally ends with ". " (period
        # plus a space) before the newline, matching the original template.
        system_msg = (
            f"You are a really experienced moderator for the subreddit /r/{subreddit}. \n"
            "Your job is to determine if the following reported comment violates the given rule. "
            "Answer with only Yes or No."
        )

        # User prompt: the rule, two violating and two non-violating examples,
        # then the comment to classify.
        user_msg = "\n".join([
            "<rule>",
            f"{examples['rule'][idx]}",
            "</rule>",
            "<examples>",
            _few_shot(examples['positive_example_1'][idx], "Yes"),
            _few_shot(examples['positive_example_2'][idx], "Yes"),
            _few_shot(examples['negative_example_1'][idx], "No"),
            _few_shot(examples['negative_example_2'][idx], "No"),
            "</examples>",
            "<test_comment>",
            f"{examples['body'][idx]}",
            "</test_comment>",
        ])

        # The assistant turn is the row's Yes/No label.
        conversation = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": examples['rule_violation_yn'][idx]},
        ]

        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )

    return {"text": rendered}


def build_dataset(tokenizer):
    """
    Build the training dataset from the module-level ``df_test_train`` frame.

    Each batch is passed through ``formatting_prompts_func`` with the given
    tokenizer, which adds a ``text`` column holding the rendered conversation.
    Only a training dataset is produced; no test/eval dataset is built here.

    Args:
        tokenizer: Tokenizer forwarded to ``formatting_prompts_func``.

    Returns:
        The mapped training dataset.
    """
    def _render_batch(batch):
        # Bind the tokenizer so the mapper only needs the batch.
        return formatting_prompts_func(batch, tokenizer)

    raw_train = Dataset.from_pandas(df_test_train)
    return raw_train.map(_render_batch, batched=True)

# Build the training dataset and print the first rendered conversation as a
# sanity check of the prompt format.
dataset_train = build_dataset(tokenizer)
print(dataset_train['text'][0])

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the rendered "text" column; no evaluation is run.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    eval_dataset = None,  # no evaluation dataset is passed
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # NOTE(review): Epochs is -1 while max_steps is 10; presumably the
        # positive max_steps is what bounds training -- confirm against the
        # TrainingArguments documentation.
        num_train_epochs = Epochs, 
        max_steps = max_iter_steps,
        learning_rate = 1e-4,
        fp16 = False,  # Disable FP16 to avoid gradient unscaling issues
        bf16 = is_bfloat16_supported(),  # Use BF16 instead if supported
        logging_steps = 10,
        optim = "adamw_torch",  # Use adamw_torch instead of adamw_8bit for better compatibility
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 123,
        eval_strategy = "no", 
        eval_steps = 2000000,  # NOTE(review): presumably unused while eval_strategy is "no"
        max_grad_norm = 1.0,  # Add explicit gradient clipping
        # Save settings - checkpoint saving is disabled during training
        save_strategy = "no",
        save_total_limit = 1,  # At most one checkpoint kept; presumably moot while save_strategy is "no"
        #load_best_model_at_end = True,
        #metric_for_best_model = "eval_loss",
        output_dir = "none",  # literal directory name "none", not a disabled setting
        report_to = "none", # Use this for WandB etc
    ),
)

trainer.train()

# Save the trained model and tokenizer to a local directory.
# NOTE(review): `model` is the LoRA-wrapped model returned by get_peft_model,
# so save_pretrained presumably writes adapter weights rather than a merged
# 16-bit model -- confirm before relying on this directory for inference.
import os
dir_path = "./trained_model_lora_1"
os.makedirs(dir_path, exist_ok=True)
model.save_pretrained(f"{dir_path}/")  # Local saving
tokenizer.save_pretrained(f"{dir_path}/")

Overwriting train_model.py


In [7]:
!python load_data.py

Concatenated 1 train files: ['./data/train.csv']
Concatenated 1 test files: ['./data/test.csv']
Train shape: (2029, 10)
Test shape: (15, 10)
Index(['row_id', 'body', 'rule', 'subreddit', 'positive_example_1',
       'positive_example_2', 'negative_example_1', 'negative_example_2',
       'rule_violation', 'rule_violation_yn'],
      dtype='object')

Train - Before normalization:
Unique values:
rule_violation
1    1031
0     998
Name: count, dtype: int64

Train - After normalization:
Unique values:
rule_violation_yn
Yes    1031
No      998
Name: count, dtype: int64
Dropped 0 rows from train due to invalid 'rule_violation'

Test - Before normalization:
Unique values:
rule_violation
0    15
Name: count, dtype: int64

Test - After normalization:
Unique values:
rule_violation_yn
No    15
Name: count, dtype: int64
Dropped 0 rows from test due to invalid 'rule_violation'

Checking for missing values in other columns:
subreddit: 0 rows would be dropped
rule: 0 rows would be dropped
positive_ex

In [8]:
!python train_model.py

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-21 19:35:27 [__init__.py:235] Automatically detected platform cuda.
Switching to PyTorch attention since your Xformers is broken.

Requires Flash-Attention version >=2.7.1,<=2.8.0 but got 2.8.3.
🦥 Unsloth Zoo will now patch everything to make training faster!
(8, 9)
==((====))==  Unsloth 2025.10.1: Fast Qwen2 patching. Transformers: 4.55.4. vLLM: 0.10.0.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:06<00:00,  3.10s/it]
torch.bfloat16
Unsloth 2025.10.1 patched 36 