# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [22]:
#pip install packages
!pip install --upgrade pip
!pip install trl
!pip install optimum
!pip install auto-gptq
!pip install bitsandbytes
!pip install peft accelerate deepspeed
!pip install kagglehub

[0m

In [None]:
# Authenticate this session with Kaggle; kagglehub.model_upload is called
# later in the notebook to publish the trained adapter.
import kagglehub
kagglehub.login()

In [23]:
%%writefile config.py

# --- Locations -------------------------------------------------------------
# Base model identifier handed to the trainer, plus the folders used for the
# adapter, the input CSVs and the trainer checkpoints.
LOCAL_MODEL_PATH = "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4"
LORA_PATH = "./"
DATA_PATH = "./"
OUTPUT_PATH = "./outputs/"

# --- Training parameters ---------------------------------------------------
MAX_SEQ_LENGTH = 2048
RANK = 64
MAX_ITER_STEPS = -1
EPOCHS = 1
SAMPLE_LEN = "25k"

# --- Kaggle upload configuration -------------------------------------------
MODEL_SLUG = "qwen25-32b-gptq-int4-jigsaw-acrc-lora---"
VARIATION_SLUG = "01"

# --- Derived names (do not edit by hand) -----------------------------------
DATASET_ID = "1to9"
# Last path component of the model id, with dots replaced by "p".
BASE_MODEL = LOCAL_MODEL_PATH.rpartition("/")[2].replace(".", "p")
# Run directory name encoding the main hyper-parameters of this run.
TRAIN_DIR = "{}_lora_fp16_r{}_s{}_e_{}_msl{}-{}".format(
    BASE_MODEL, RANK, SAMPLE_LEN, EPOCHS, MAX_SEQ_LENGTH, DATASET_ID
)
print("TRAIN_DIR", TRAIN_DIR)

Overwriting config.py


In [24]:
%%writefile get_dataset.py
import pandas as pd
from datasets import Dataset
import kagglehub
import os
import glob

def _read_matching_csvs(base_path, split):
    """Read every CSV under `base_path` whose file name contains `split` and stack them.

    Args:
        base_path: Directory prefix that is prepended verbatim to the glob
            pattern, so it must already end with a path separator.
        split: Substring identifying the split, e.g. "train" or "test".

    Returns:
        One DataFrame holding the rows of all matching files, with a fresh index.

    Raises:
        FileNotFoundError: If nothing matches ``{base_path}*{split}*.csv``.
    """
    # pd.read_csv does not expand wildcards, so resolve the pattern with glob first.
    # Sorting keeps the concatenation order independent of directory listing order.
    matches = sorted(glob.glob(f"{base_path}*{split}*.csv"))
    if not matches:
        raise FileNotFoundError(f"No {split} files found in {base_path}")
    frame = pd.concat([pd.read_csv(path) for path in matches], ignore_index=True)
    print(f"Concatenated {len(matches)} {split} files: {matches}")
    return frame


def load_data():
    """Load Jigsaw ACRC dataset from Kaggle or local files

    The input directory is the competition folder when the
    KAGGLE_KERNEL_RUN_TYPE environment variable is set, otherwise the current
    working directory. In both cases all ``*train*.csv`` / ``*test*.csv`` files
    found there are concatenated.

    Returns:
        (df_train, df_test): DataFrames restricted to the required columns,
        with rows containing missing values removed and `violates_rule`
        converted to the strings "True" / "False" (any other label is dropped).

    Raises:
        FileNotFoundError: If no train or no test CSV is found.
        KeyError: If a required column is missing from the loaded data.
    """
    # Check if running on Kaggle
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        base_path = "/kaggle/input/jigsaw-agile-community-rules/"
    else:
        base_path = "./"

    df_train = _read_matching_csvs(base_path, "train")
    df_test = _read_matching_csvs(base_path, "test")

    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")
    print(df_train.columns)

    req_cols = ['subreddit', 'rule', 'positive_example_1', 'negative_example_1', 'positive_example_2',
                'negative_example_2', 'test_comment', 'violates_rule']

    df_train = df_train[req_cols]
    df_test = df_test[req_cols]

    # Report per-column how many training rows are about to be lost to NaNs.
    for col in req_cols:
        dropped_rows = int(df_train[col].isna().sum())
        print(f"{col}: {dropped_rows} rows would be dropped")

    df_train = df_train.dropna()
    df_test = df_test.dropna()

    print(f"Using path: {base_path}")
    print("\n After dropping:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    # Labels become the literal response text of the prompt, so normalise them to
    # strings; assign() builds new frames instead of writing into a possible view.
    df_train = df_train.assign(violates_rule=df_train["violates_rule"].astype(str))
    df_test = df_test.assign(violates_rule=df_test["violates_rule"].astype(str))

    valid_values = {"True", "False"}
    df_train = df_train[df_train["violates_rule"].isin(valid_values)]
    df_test = df_test[df_test["violates_rule"].isin(valid_values)]
    print("\n After checking True/False:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    return df_train, df_test

def formatting_prompts_func(examples):
    """
    Render a batch of moderation rows into Alpaca-style training strings.

    `examples` is a column-oriented batch (column name -> list of values). One
    prompt is produced per entry of examples['subreddit'], consisting of the
    instruction, the rule with its four labelled examples plus the test
    comment, and the `violates_rule` value as the response.

    Returns a dict {"text": [prompt, ...]}.
    """
    # Line breaks (and the trailing space after "context.") are spelled out
    # explicitly so the prompt text cannot be altered by whitespace trimming.
    template = (
        "Below is an instruction that describes a task, paired with an input that provides further context. \n"
        "Write a response that appropriately completes the request.\n"
        "### Instruction:\n"
        "{}\n"
        "### Input:\n"
        "{}\n"
        "### Response:\n"
        "{}"
    )

    def render(idx):
        # Instruction part: who the model is and what it must answer.
        instruction = (
            f"You are a really experienced moderator for the subreddit /r/{examples['subreddit'][idx]}. \n"
            "Your job is to determine if the following reported comment violates the given rule.\n"
            'Answer with only "True" or "False".'
        )

        # Input part: the rule, two violating / two non-violating examples, then the comment to judge.
        input_text = (
            f"Rule: {examples['rule'][idx]}\n"
            "Example 1:\n"
            f"{examples['positive_example_1'][idx]}\n"
            "Rule violation: True\n"
            "Example 2:\n"
            f"{examples['negative_example_1'][idx]}\n"
            "Rule violation: False\n"
            "Example 3:\n"
            f"{examples['positive_example_2'][idx]}\n"
            "Rule violation: True\n"
            "Example 4:\n"
            f"{examples['negative_example_2'][idx]}\n"
            "Rule violation: False\n"
            "Test sentence:\n"
            f"{examples['test_comment'][idx]}"
        )

        # The label is inserted as-is as the response text.
        response = examples['violates_rule'][idx]
        return template.format(instruction, input_text, response)

    return {"text": [render(idx) for idx in range(len(examples['subreddit']))]}

def build_dataset():
    """
    Load the train/test frames and turn each into a Dataset with a "text" column.

    Both splits go through the same pipeline: DataFrame -> Dataset -> batched
    map with formatting_prompts_func. Returns (train_dataset, test_dataset).
    """
    def to_prompt_dataset(frame):
        # Batched map: formatting_prompts_func receives column-oriented batches.
        return Dataset.from_pandas(frame).map(formatting_prompts_func, batched=True)

    train_frame, test_frame = load_data()
    train_dataset = to_prompt_dataset(train_frame)
    test_dataset = to_prompt_dataset(test_frame)
    return train_dataset, test_dataset

Overwriting get_dataset.py


In [25]:
%%writefile train.py

import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
from transformers.utils import is_torch_bf16_gpu_available
from get_dataset import build_dataset
from config import LOCAL_MODEL_PATH, LORA_PATH, RANK, MAX_SEQ_LENGTH, EPOCHS, TRAIN_DIR, MAX_ITER_STEPS, OUTPUT_PATH

# ----------------------------
# Training entry point
# ----------------------------
def main():
    """Fine-tune LOCAL_MODEL_PATH with LoRA through TRL's SFTTrainer and save the result.

    All tunables come from config.py. The datasets are produced by
    get_dataset.build_dataset(); the trained model is written to
    LORA_PATH + TRAIN_DIR, while intermediate checkpoints go to OUTPUT_PATH.
    """
    # ----------------------------
    # Build datasets
    # ----------------------------
    train_dataset, test_dataset = build_dataset()

    # ----------------------------
    # LoRA config
    # ----------------------------
    lora_config = LoraConfig(
        r=RANK,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        # Adapters on every attention and MLP projection.
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        task_type="CAUSAL_LM",
    )

    # ----------------------------
    # SFT config
    # ----------------------------
    # Prefer bf16 when the GPU supports it, otherwise fall back to fp16.
    use_bf16 = is_torch_bf16_gpu_available()

    sft_config = SFTConfig(
        output_dir=OUTPUT_PATH,
        num_train_epochs=EPOCHS,
        max_steps=MAX_ITER_STEPS,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        max_length=min(MAX_SEQ_LENGTH, 2048),

        optim="paged_adamw_8bit",
        learning_rate=5e-4,
        weight_decay=0.01,
        max_grad_norm=1.0,

        lr_scheduler_type="cosine",
        # The script used to pass warmup_ratio=0.05 as well. warmup_steps takes
        # precedence over warmup_ratio, so the ratio never had any effect; only
        # the value that was actually applied is kept.
        warmup_steps=5,

        bf16=use_bf16,
        fp16=not use_bf16,
        dataloader_pin_memory=True,

        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},

        logging_steps=10,
        eval_steps=1000,
        eval_strategy="steps",
        save_strategy="epoch",
        save_total_limit=3,

        report_to="none",
        packing=False,
        remove_unused_columns=False,
        # Column produced by formatting_prompts_func.
        dataset_text_field="text",
    )

    # ----------------------------
    # Trainer
    # ----------------------------
    # The model is passed as an identifier string and no tokenizer is supplied,
    # so loading both is left to SFTTrainer.
    trainer = SFTTrainer(
        LOCAL_MODEL_PATH,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        peft_config=lora_config,
        args=sft_config,
    )

    trainer.train()
    trainer.save_model(LORA_PATH + TRAIN_DIR)


if __name__ == "__main__":
    main()


Overwriting train.py


In [26]:
%%writefile accelerate_config.yaml
# """
# ZeRO Stage 1: Optimizer State Partitioning (2-4 GPU)
# ZeRO Stage 2: + Gradient Partitioning (4-8 GPU)
# ZeRO Stage 3: + Parameter Partitioning (8+ GPU)
# """

compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  train_batch_size: 16
  train_micro_batch_size_per_gpu: 4
  
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false


Overwriting accelerate_config.yaml


In [27]:
# Run train.py through the Accelerate launcher, using the settings written to
# accelerate_config.yaml by the previous cell.
!accelerate launch --config_file accelerate_config.yaml train.py

[2025-08-19 09:03:26,250] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-08-19 09:03:27,264] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
TRAIN_DIR Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9
Concatenated 1 train files: ['./df_train.csv']
Concatenated 1 test files: ['./df_test.csv']
Train shape: (26899, 8)
Test shape: (6449, 8)
Index(['subreddit', 'rule', 'positive_example_1', 'negative_example_1',
       'positive_example_2', 'negative_example_2', 'test_comment',
       'violates_rule'],
      dtype='object')
subreddit: 0 rows would be dropped
rule: 0 rows would be dropped
positive_example_1: 0 rows would be dropped
negative_example_1: 0 rows would be dropped
positive_example_2: 0 rows would be dropped
negative_example_2: 0 rows would be dropped
test_comment: 0 rows would be dropped
violates_rule: 0 rows would be dropped
Using path: ./

 After droppi

In [29]:
# Publish the trained model directory to Kaggle Models.
# kagglehub is imported in this cell so it does not depend on an earlier
# (possibly un-run) cell after a kernel restart.
import kagglehub

from config import LORA_PATH, TRAIN_DIR, MODEL_SLUG, VARIATION_SLUG

# Directory that train.py wrote via trainer.save_model(LORA_PATH + TRAIN_DIR).
LOCAL_MODEL_DIR = LORA_PATH + TRAIN_DIR

# MODEL_SLUG and VARIATION_SLUG come from config.py; edit them there.
# Learn more about naming model variations at
# https://www.kaggle.com/docs/models#name-model.
# NOTE(review): version_notes says 'LoRA Merged', but the recorded upload log
# lists adapter files only - confirm the wording before the next upload.
kagglehub.model_upload(
  handle = f"vinothkumarsekar89/{MODEL_SLUG}/transformers/{VARIATION_SLUG}",
  local_model_dir = LOCAL_MODEL_DIR,
  version_notes = 'LoRA Merged')

Uploading Model https://www.kaggle.com/models/vinothkumarsekar89/qwen25-32b-gptq-int4-jigsaw-acrc-lora---/transformers/00 ...
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/README.md


Uploading: 100%|██████████| 5.24k/5.24k [00:00<00:00, 5.52kB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/README.md (5KB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/adapter_model.safetensors



Uploading: 100%|██████████| 1.07G/1.07G [00:56<00:00, 18.9MB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/adapter_model.safetensors (1GB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/adapter_config.json



Uploading: 100%|██████████| 946/946 [00:00<00:00, 1.03kB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/adapter_config.json (946B)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/chat_template.jinja



Uploading: 100%|██████████| 2.51k/2.51k [00:00<00:00, 2.60kB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/chat_template.jinja (2KB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/tokenizer_config.json



Uploading: 100%|██████████| 4.69k/4.69k [00:00<00:00, 4.97kB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/tokenizer_config.json (5KB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/special_tokens_map.json



Uploading: 100%|██████████| 613/613 [00:01<00:00, 529B/s]  

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/special_tokens_map.json (613B)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/added_tokens.json



Uploading: 100%|██████████| 605/605 [00:00<00:00, 649B/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/added_tokens.json (605B)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/vocab.json



Uploading: 100%|██████████| 2.78M/2.78M [00:03<00:00, 911kB/s] 

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/vocab.json (3MB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/merges.txt



Uploading: 100%|██████████| 1.67M/1.67M [00:02<00:00, 632kB/s] 

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/merges.txt (2MB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/tokenizer.json



Uploading: 100%|██████████| 11.4M/11.4M [00:03<00:00, 3.43MB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/tokenizer.json (11MB)
Starting upload for file ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/training_args.bin



Uploading: 100%|██████████| 7.38k/7.38k [00:06<00:00, 1.23kB/s]

Upload successful: ./Qwen2p5-32B-Instruct-GPTQ-Int4_lora_fp16_r64_s25k_e_0_msl2048-1to9/training_args.bin (7KB)





Your model instance has been created.
Files are being processed...
See at: https://www.kaggle.com/models/vinothkumarsekar89/qwen25-32b-gptq-int4-jigsaw-acrc-lora---/transformers/00
