# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [3]:
#pip install packages
!pip install --upgrade pip
!pip install trl
!pip install optimum
!pip install auto-gptq
!pip install bitsandbytes
!pip install peft accelerate deepspeed
!pip install kagglehub

[0m

In [4]:
# Authenticate against Kaggle. The call renders an interactive widget (see the
# VBox output below); the `kagglehub` module is used again by the upload cell
# at the end of the notebook.
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [5]:
%%writefile config.py

# ---------------------------------------------------------------------------
# Run mode and filesystem locations
# ---------------------------------------------------------------------------
RESUME_TRAINING = False  # when True, train.py loads an existing adapter from LORA_IN_PATH
LOCAL_MODEL_PATH = "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4"  # base model passed to from_pretrained
LORA_IN_PATH = "./lora"    # adapter to resume from (only read when RESUME_TRAINING is True)
LORA_OUT_PATH = "./"       # parent directory for the saved adapter (joined with TRAIN_DIR)
DATA_PATH = "./"           # not referenced by the other cells visible in this notebook
OUTPUT_PATH = "./outputs/" # SFTConfig.output_dir in train.py

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
MAX_SEQ_LENGTH = 4096
RANK = 64            # LoRA rank (LoraConfig.r)
MAX_ITER_STEPS = -1  # forwarded to SFTConfig(max_steps=...)
EPOCHS = 2
SAMPLE_LEN = "25k"   # only used as a label inside TRAIN_DIR

# ---------------------------------------------------------------------------
# Kaggle upload configuration
# ---------------------------------------------------------------------------
MODEL_SLUG = "qwen25-32b-gptq-int4-jigsaw-acrc-lora"
VARIATION_SLUG = "01"

# ---------------------------------------------------------------------------
# Derived names (do not edit by hand)
# ---------------------------------------------------------------------------
DATASET_ID = "1to9yn"
# Last path component of the model id, with dots replaced so it is safe in a directory name.
BASE_MODEL = LOCAL_MODEL_PATH.rsplit("/", 1)[-1].replace(".", "p")
TRAIN_DIR = (
    f"{BASE_MODEL}_lora_fp16_r{RANK}"
    f"_s{SAMPLE_LEN}_e_{EPOCHS}"
    f"_msl{MAX_SEQ_LENGTH}-{DATASET_ID}"
)
# Printed on every import of this module so each run logs its output directory name.
print("TRAIN_DIR", TRAIN_DIR)

Writing config.py


In [None]:
%%writefile get_dataset.py
import pandas as pd
from datasets import Dataset
import kagglehub
import os
import glob

def load_data(base_path=None):
    """Load the Jigsaw ACRC train/test CSVs and normalise the label column.

    Every ``*train*.csv`` / ``*test*.csv`` file found directly under
    ``base_path`` is read and concatenated. The frames are then reduced to the
    required columns, ``violates_rule`` is normalised to ``"Yes"``/``"No"``,
    and rows with an unrecognised label or any missing required value are
    dropped.

    Args:
        base_path: Directory to search. When ``None`` (default) the Kaggle
            input directory is used if ``KAGGLE_KERNEL_RUN_TYPE`` is set in
            the environment, otherwise the current directory.

    Returns:
        ``(df_train, df_test)`` as pandas DataFrames holding only the required
        columns. The index is not reset after filtering.

    Raises:
        FileNotFoundError: no matching train or test CSV exists in ``base_path``.
        KeyError: a loaded split lacks one of the required columns.
    """
    if base_path is None:
        if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
            # Running on Kaggle
            base_path = "/kaggle/input/jigsaw-agile-community-rules/"
        else:
            # Running locally
            base_path = "./"

    def _read_split(split):
        # pd.read_csv does not expand wildcards, so the pattern must be
        # resolved with glob first (in every environment, Kaggle included).
        files = sorted(glob.glob(os.path.join(base_path, f"*{split}*.csv")))
        if not files:
            raise FileNotFoundError(f"No {split} files found in {base_path}")
        frame = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
        print(f"Concatenated {len(files)} {split} files: {files}")
        return frame

    df_train = _read_split("train")
    df_test = _read_split("test")

    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")
    print(df_train.columns)

    req_cols = ['subreddit', 'rule', 'positive_example_1', 'negative_example_1', 'positive_example_2',
                'negative_example_2', 'test_comment', 'violates_rule']
    # Normalize "True"/"False" -> "Yes"/"No"; anything else maps to NaN and is dropped.
    label_map = {"True": "Yes", "False": "No", "Yes": "Yes", "No": "No"}

    normalized = {}
    for name, frame in [("train", df_train), ("test", df_test)]:
        missing_cols = [col for col in req_cols if col not in frame.columns]
        if missing_cols:
            raise KeyError(f"{name} data is missing required columns: {missing_cols}")

        # Work on an explicit copy so the column assignment below never
        # writes into a view of the originally loaded frame.
        frame = frame[req_cols].copy()
        frame["violates_rule"] = frame["violates_rule"].astype(str).str.strip().map(label_map)

        before = len(frame)
        frame = frame.dropna(subset=["violates_rule"])
        print(f"Dropped {before - len(frame)} rows from {name} due to invalid 'violates_rule'")
        normalized[name] = frame

    df_train, df_test = normalized["train"], normalized["test"]

    # Report, per column, how many training rows the dropna below removes.
    for col in req_cols:
        rows_with_nan = int(df_train[col].isna().sum())
        print(f"{col}: {rows_with_nan} rows would be dropped")

    df_train = df_train.dropna()
    df_test = df_test.dropna()

    # At this point every label is exactly "Yes" or "No" (guaranteed by the
    # mapping plus dropna above), so no further label filtering is required.
    print(f"Using path: {base_path}")
    print("\n After dropping:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    return df_train, df_test

def formatting_prompts_func(examples):
    """
    Render a batch of moderation rows as ChatML conversations.

    `examples` is a column-oriented batch: each key maps to a sequence holding
    one value per row. The number of rows is taken from the 'subreddit'
    column. Each row becomes a system / user / assistant exchange in which the
    assistant turn is the row's 'violates_rule' value.

    Returns a dict with a single "text" key holding the rendered strings.
    """

    def render_row(idx):
        # Small accessor so every column lookup for this row reads the same way.
        def field(name):
            return examples[name][idx]

        # System turn: moderator persona for this row's subreddit.
        system_msg = (
            f"You are a really experienced moderator for the subreddit /r/{field('subreddit')}. "
            "Your job is to determine if the following reported comment violates the given rule. "
            "Answer with only \"Yes\" or \"No\"."
        )

        # User turn: the rule, four labelled examples, then the comment to judge.
        user_msg = "\n".join([
            f"Rule: {field('rule')}",
            "Example 1:",
            f"{field('positive_example_1')}",
            "Rule violation: Yes",
            "Example 2:",
            f"{field('negative_example_1')}",
            "Rule violation: No",
            "Example 3:",
            f"{field('positive_example_2')}",
            "Rule violation: Yes",
            "Example 4:",
            f"{field('negative_example_2')}",
            "Rule violation: No",
            "Test sentence:",
            f"{field('test_comment')}",
        ])

        # Assistant turn: the gold label for this row.
        assistant_msg = field('violates_rule')

        return (
            f"<|im_start|>system\n{system_msg}<|im_end|>\n"
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_msg}<|im_end|>"
        )

    row_count = len(examples['subreddit'])
    return {"text": [render_row(idx) for idx in range(row_count)]}

def build_dataset():
    """
    Load the cleaned train/test frames and turn each into a ChatML dataset.

    Each frame returned by load_data() is converted to a `Dataset` and mapped
    in batches through formatting_prompts_func, which adds the "text" column.

    Returns (train_dataset, test_dataset).
    """
    def _as_chatml_dataset(frame):
        # DataFrame -> Dataset, then add the rendered "text" column batch-wise.
        return Dataset.from_pandas(frame).map(
            lambda batch: formatting_prompts_func(batch),
            batched=True,
        )

    train_frame, test_frame = load_data()

    train_dataset = _as_chatml_dataset(train_frame)
    test_dataset = _as_chatml_dataset(test_frame)

    return train_dataset, test_dataset

In [7]:
%%writefile train.py

import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, PeftModel
from transformers.utils import is_torch_bf16_gpu_available
from get_dataset import build_dataset
from config import LOCAL_MODEL_PATH, LORA_IN_PATH, LORA_OUT_PATH, RANK, MAX_SEQ_LENGTH, EPOCHS, TRAIN_DIR, MAX_ITER_STEPS, OUTPUT_PATH, RESUME_TRAINING
import os

# ----------------------------
# Load model & tokenizer
# ----------------------------
# Script overview: load the base model (optionally wrapped with an existing
# LoRA adapter), build the ChatML datasets, fine-tune with TRL's SFTTrainer
# and save the result under LORA_OUT_PATH/TRAIN_DIR.

model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODEL_PATH,
    torch_dtype="auto",
)

if RESUME_TRAINING:
    # Continue from a previously saved adapter and keep its weights trainable.
    model = PeftModel.from_pretrained(model, LORA_IN_PATH, is_trainable=True)


tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_PATH)
if tokenizer.pad_token is None:
    # Fall back to the EOS token so a pad token is always defined.
    tokenizer.pad_token = tokenizer.eos_token
model.gradient_checkpointing_enable()  # reduce memory usage

# ----------------------------
# Build datasets
# ----------------------------
train_dataset, test_dataset = build_dataset()

# ----------------------------
# LoRA config
# ----------------------------
if RESUME_TRAINING:
    # The model is already a PeftModel at this point, so the trainer is not
    # given a second adapter configuration.
    lora_config = None
else:
    lora_config = LoraConfig(
        r=RANK,
        lora_alpha=64,
        lora_dropout=0.0,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        task_type="CAUSAL_LM",
    )

# ----------------------------
# SFT config
# ----------------------------
# Probe once: bf16 is used when the GPU supports it, fp16 otherwise.
# NOTE(review): accelerate_config.yaml requests fp16 mixed precision; confirm
# that the two settings agree on bf16-capable hardware.
use_bf16 = is_torch_bf16_gpu_available()

sft_config = SFTConfig(
    output_dir=OUTPUT_PATH,
    num_train_epochs=EPOCHS,
    max_steps=MAX_ITER_STEPS,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_length=min(MAX_SEQ_LENGTH, 4096),

    optim="paged_adamw_8bit",
    learning_rate=5e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,

    lr_scheduler_type="linear",
    # Warmup is specified once, in steps. The script previously also passed
    # warmup_ratio=0.05, but transformers lets warmup_steps override
    # warmup_ratio, so the ratio never took effect; it was removed to keep the
    # configuration unambiguous (the effective schedule is unchanged).
    warmup_steps=5,

    bf16=use_bf16,
    fp16=not use_bf16,
    dataloader_pin_memory=True,

    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    logging_steps=10,
    eval_steps=1000,
    eval_strategy="steps",
    save_strategy="epoch",
    save_total_limit=3,

    report_to="none",
    packing=False,
    remove_unused_columns=False,
    dataset_text_field="text",  # column produced by formatting_prompts_func

)

# ----------------------------
# Trainer
# ----------------------------
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=lora_config,
    args=sft_config,
)

trainer.train()
# The saved directory is the one the upload cell later reads (LORA_OUT_PATH/TRAIN_DIR).
trainer.save_model(os.path.join(LORA_OUT_PATH, TRAIN_DIR))
print("success")

Writing train.py


In [8]:
%%writefile accelerate_config.yaml
# Accelerate launch configuration for train.py (single machine, DeepSpeed).
#
# ZeRO stage reference (author's guidance on GPU counts):
# ZeRO Stage 1: Optimizer State Partitioning (2-4 GPU)
# ZeRO Stage 2: + Gradient Partitioning (4-8 GPU)
# ZeRO Stage 3: + Parameter Partitioning (8+ GPU)
#

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # These mirror train.py's SFTConfig: accumulation 4, micro-batch 4 per GPU,
  # clipping 1.0 (max_grad_norm). train_batch_size 16 = 4 x 4 x num_processes (1).
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  train_batch_size: 16
  train_micro_batch_size_per_gpu: 4
  
  # ZeRO stage 2 with no CPU/NVMe offload.
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # NOTE(review): the stage3_* keys presumably only matter when zero_stage is 3;
  # with zero_stage: 2 they look inert - confirm before relying on them.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  # NOTE(review): train.py enables bf16 instead of fp16 when the GPU supports it;
  # confirm that does not clash with the fp16 block here and mixed_precision below.
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
# Single process on a single machine.
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


Writing accelerate_config.yaml


In [None]:
# Run train.py through Accelerate, using the settings written to
# accelerate_config.yaml by the previous cell.
!accelerate launch --config_file accelerate_config.yaml train.py

[2025-08-23 06:22:35,105] [INFO] [real_accelerator.py:260:get_accelerator] Setting ds_accelerator to cuda (auto detect)
df: /home/.triton/autotune: No such file or directory
[2025-08-23 06:22:36,247] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
TRAIN_DIR Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_1_msl4096-1to9
config.json: 1.26kB [00:00, 6.73MB/s]
  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
model.safetensors.index.json: 172kB [00:00, 215MB/s]
Fetching 5 files:   0%|                                   | 0/5 [00:00<?, ?it/s]
model-00005-of-00005.safetensors:   0%|             | 0.00/3.48G [00:00<?, ?B/s][A

model-00001-of-00005.safetensors:   0%|             | 0.00/3.95G [00:00<?, ?B/s][A[A


model-00002-of-00005.safetensors:   0%|             | 0.00/3.98G 

In [None]:
# Upload the directory written by train.py (LORA_OUT_PATH/TRAIN_DIR) to Kaggle Models.
# Imports are repeated here so the cell also works after a kernel restart,
# e.g. when it is run on its own once training has finished.
# NOTE(review): a restarted kernel may still need Kaggle credentials
# (see the kagglehub.login() cell) before the upload succeeds.
import os

import kagglehub

from config import LORA_OUT_PATH, TRAIN_DIR, MODEL_SLUG, VARIATION_SLUG

# Path to the directory containing the model files.
LOCAL_MODEL_DIR = os.path.join(LORA_OUT_PATH, TRAIN_DIR)

# Fail early with a clear message instead of starting an upload of nothing.
if not os.path.isdir(LOCAL_MODEL_DIR):
    raise FileNotFoundError(f"Model directory not found: {LOCAL_MODEL_DIR}")

# MODEL_SLUG and VARIATION_SLUG come from config.py; edit them there.
# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
kagglehub.model_upload(
    handle=f"vinothkumarsekar89/{MODEL_SLUG}/transformers/{VARIATION_SLUG}",
    local_model_dir=LOCAL_MODEL_DIR,
    version_notes='LoRA Merged',
)