# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [7]:
## Speed-up work in progress; no speed-up observed yet

In [1]:
%%writefile config.py

# --- Run mode and filesystem locations --------------------------------------
# When True, train.py loads an existing LoRA adapter from LORA_IN_PATH and
# keeps training it instead of creating a fresh adapter.
RESUME_TRAINING = False
LOCAL_MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4"
LORA_IN_PATH = "./"
LORA_OUT_PATH = "./"
DATA_PATH = "./"
OUTPUT_PATH = "./"

# --- Training hyperparameters -----------------------------------------------
MAX_SEQ_LENGTH = 3600
RANK = 32
LORA_ALPHA = 64
MAX_ITER_STEPS = 20   # passed to the trainer as max_steps
EPOCHS = -1           # passed to the trainer as num_train_epochs
SAMPLE_LEN = "24k"    # label only: used solely in the TRAIN_DIR name below

# --- Kaggle upload configuration --------------------------------------------
# NOTE(review): the trailing "==" looks unusual for a slug - confirm it is intended.
MODEL_SLUG = "qwen25-7b-gptq-int4-jigsaw-acrc-lora-cml-0=="
VARIATION_SLUG = "01"

# --- Derived names ----------------------------------------------------------
DATASET_ID = "1to9"
# Last path component of the model id, with "." replaced by "p".
BASE_MODEL = LOCAL_MODEL_PATH.rsplit("/", 1)[-1].replace(".", "p")
# Name of the sub-directory under LORA_OUT_PATH where the adapter is saved.
TRAIN_DIR = f"{BASE_MODEL}_lora_fp16_r{RANK}_s{SAMPLE_LEN}_e_{EPOCHS}_msl{MAX_SEQ_LENGTH}-{DATASET_ID}"
print("TRAIN_DIR", TRAIN_DIR)

Overwriting config.py


In [2]:
%%writefile get_dataset.py
import pandas as pd
from datasets import Dataset
import kagglehub
import os
import glob

def _read_concat_csvs(base_path, pattern, label):
    """Read every CSV under ``base_path`` matching ``pattern`` into one frame.

    Args:
        base_path: Directory prefix to search (e.g. ``"./"``).
        pattern: Glob pattern for the file names (e.g. ``"*train*.csv"``).
        label: Human-readable split name used in log and error messages.

    Returns:
        A single DataFrame with all matching files concatenated.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    # pd.read_csv does not expand wildcards, so resolve the pattern first.
    # Sorting makes the concatenation order deterministic.
    files = sorted(glob.glob(os.path.join(base_path, pattern)))
    if not files:
        raise FileNotFoundError(f"No {label} files found in {base_path}")
    frame = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)
    print(f"Concatenated {len(files)} {label} files: {files}")
    return frame


def _normalize_labels(df, name):
    """Map True/False/Yes/No labels to "Yes"/"No" and drop rows with anything else."""
    labels = (
        df["violates_rule"]
        .astype(str).str.strip()
        .map({"True": "Yes", "False": "No", "Yes": "Yes", "No": "No"})
    )
    # assign() builds a new frame, so no slice of the raw data is mutated.
    cleaned = df.assign(violates_rule=labels).dropna(subset=["violates_rule"])
    print(f"Dropped {len(df) - len(cleaned)} rows from {name} due to invalid 'violates_rule'")
    return cleaned


def load_data():
    """Load the Jigsaw ACRC train/test CSVs from Kaggle input or the working directory.

    On Kaggle (``KAGGLE_KERNEL_RUN_TYPE`` set) files are searched under
    ``/kaggle/input/jigsaw-agile-community-rules/``; otherwise under ``./``.
    All ``*train*.csv`` / ``*test*.csv`` files are concatenated, reduced to the
    required columns, labels are normalised to ``"Yes"``/``"No"``, and rows with
    an invalid label or any missing required value are dropped.

    Returns:
        ``(df_train, df_test)``: cleaned DataFrames with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If no train or test CSV is found.
        KeyError: If a required column is missing from the data.
    """
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        # Running on Kaggle
        base_path = "/kaggle/input/jigsaw-agile-community-rules/"
    else:
        # Running locally
        base_path = "./"

    df_train = _read_concat_csvs(base_path, "*train*.csv", "train")
    df_test = _read_concat_csvs(base_path, "*test*.csv", "test")

    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")
    print(df_train.columns)

    req_cols = ['subreddit', 'rule', 'positive_example_1', 'negative_example_1', 'positive_example_2',
                'negative_example_2', 'test_comment', 'violates_rule']

    # Fail early with a readable message instead of pandas' generic KeyError.
    for name, df in [("train", df_train), ("test", df_test)]:
        missing = [col for col in req_cols if col not in df.columns]
        if missing:
            raise KeyError(f"{name} data is missing required columns: {missing}")

    # Normalize "True"/"False" -> "Yes"/"No" and drop anything else
    df_train = _normalize_labels(df_train[req_cols], "train")
    df_test = _normalize_labels(df_test[req_cols], "test")

    for col in req_cols:
        dropped_rows = int(df_train[col].isna().sum())
        print(f"{col}: {dropped_rows} rows would be dropped")

    df_train = df_train.dropna()
    df_test = df_test.dropna()

    print(f"Using path: {base_path}")
    print("\n After dropping:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    df_train = df_train.assign(violates_rule=df_train["violates_rule"].astype(str))
    df_test = df_test.assign(violates_rule=df_test["violates_rule"].astype(str))

    # Defensive re-check; after normalisation only "Yes"/"No" should remain.
    valid_values = {"Yes", "No"}
    df_train = df_train[df_train["violates_rule"].isin(valid_values)]
    df_test = df_test[df_test["violates_rule"].isin(valid_values)]
    print("\n After checking Yes/No:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    # Dropped rows leave gaps in the index; a gapped index would be carried
    # into Dataset.from_pandas as an extra "__index_level_0__" column.
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True)

def formatting_prompts_func(examples, tokenizer):
    """Render a batch of moderation rows as chat-template training texts.

    Args:
        examples: Column-oriented batch (mapping of column name to a sequence of
            values) holding 'subreddit', 'rule', the four example columns,
            'test_comment' and 'violates_rule'.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        ``{"text": [...]}`` with one formatted conversation per row, each made
        of a system, a user and an assistant message.
    """

    def render(idx, subreddit):
        def field(column):
            return examples[column][idx]

        # System turn: sets the moderator persona and the Yes/No answer format.
        system_msg = (
            f"You are a really experienced moderator for the subreddit /r/{subreddit}. "
            "Your job is to determine if the following reported comment violates the given rule. "
            'Answer with only "Yes" or "No".'
        )

        # User turn: the rule, four labelled examples, then the comment to judge.
        user_msg = "\n".join([
            f"Rule: {field('rule')}",
            "Example 1:",
            f"{field('positive_example_1')}",
            "Rule violation: Yes",
            "Example 2:",
            f"{field('negative_example_1')}",
            "Rule violation: No",
            "Example 3:",
            f"{field('positive_example_2')}",
            "Rule violation: Yes",
            "Example 4:",
            f"{field('negative_example_2')}",
            "Rule violation: No",
            "Test sentence:",
            f"{field('test_comment')}",
        ])

        # Assistant turn: the label itself.
        conversation = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": field('violates_rule')},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

    rendered = [render(idx, sub) for idx, sub in enumerate(examples['subreddit'])]
    return {"text": rendered}

def build_dataset(tokenizer):
    """Load the cleaned train/test frames and attach chat-formatted text.

    Args:
        tokenizer: Tokenizer forwarded to ``formatting_prompts_func``.

    Returns:
        ``(train_dataset, test_dataset)``: each a ``Dataset`` built from the
        corresponding DataFrame with the output of
        ``formatting_prompts_func`` merged in by a batched ``map``.
    """

    def _format_batch(batch):
        return formatting_prompts_func(batch, tokenizer)

    def _as_text_dataset(frame):
        # Convert to a Dataset first, then format in batches.
        return Dataset.from_pandas(frame).map(_format_batch, batched=True)

    df_train, df_test = load_data()

    # Train is converted before test, as in a plain two-step sequence.
    train_dataset = _as_text_dataset(df_train)
    test_dataset = _as_text_dataset(df_test)
    return train_dataset, test_dataset

Overwriting get_dataset.py


In [3]:
%%writefile train.py
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, PeftModel
from transformers.utils import is_torch_bf16_gpu_available
from get_dataset import build_dataset
from config import LOCAL_MODEL_PATH, LORA_IN_PATH, LORA_OUT_PATH, RANK, MAX_SEQ_LENGTH, EPOCHS, TRAIN_DIR, MAX_ITER_STEPS, OUTPUT_PATH, RESUME_TRAINING, LORA_ALPHA
import os
import gc

# ----------------------------
# Process-level settings
# ----------------------------
# Set before the model is loaded so they are in place for the first GPU work.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): confirm that PyTorch actually reads this variable; if not,
# TF32 has to be configured through torch.backends instead.
os.environ["TORCH_CUDNN_ALLOW_TF32"] = "1"

# Flash Attention 2 is optional: enable it only when the package imports cleanly.
try:
    import flash_attn  # noqa: F401 - imported only to probe availability
    use_flash_attn = True
    print("Flash Attention 2 available - will enable")
except ImportError:
    use_flash_attn = False
    print("Flash Attention 2 not available")

# Probe bf16 support once so the model dtype and the trainer precision flags
# always agree (bf16 where supported, fp16 otherwise).
use_bf16 = is_torch_bf16_gpu_available()

# ----------------------------
# Load model & tokenizer
# ----------------------------
print("Loading model and tokenizer...")

model_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16 if use_bf16 else torch.float16,
    "low_cpu_mem_usage": True,
    "trust_remote_code": True,
    "use_cache": False,  # the KV cache is not needed for training
}

if use_flash_attn:
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL_PATH, **model_kwargs)

# When resuming, wrap the base model with the existing adapter and keep it trainable.
if RESUME_TRAINING:
    model = PeftModel.from_pretrained(model, LORA_IN_PATH, is_trainable=True)

tokenizer = AutoTokenizer.from_pretrained(
    LOCAL_MODEL_PATH,
    use_fast=True,
    padding_side="right"
)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.gradient_checkpointing_enable()

# ----------------------------
# Build datasets
# ----------------------------
print("Building datasets...")
train_dataset, test_dataset = build_dataset(tokenizer)

# Release memory held over from loading before training starts.
torch.cuda.empty_cache()
gc.collect()

# ----------------------------
# LoRA config
# ----------------------------
if RESUME_TRAINING:
    # The model is already a PeftModel; the trainer must not add a second adapter.
    lora_config = None
else:
    lora_config = LoraConfig(
        r=RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=0.0,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        task_type="CAUSAL_LM",
    )

# ----------------------------
# SFT config
# ----------------------------
sft_config = SFTConfig(
    output_dir=OUTPUT_PATH,
    num_train_epochs=EPOCHS,
    max_steps=MAX_ITER_STEPS,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_length=min(MAX_SEQ_LENGTH, 4096),
    optim="paged_adamw_8bit",
    learning_rate=5e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    # Only warmup_steps is set: it overrides warmup_ratio, so passing both
    # (as before, with warmup_ratio=0.05) left the ratio without effect.
    warmup_steps=5,

    # Mixed precision follows the same hardware probe as the model dtype.
    bf16=use_bf16,
    fp16=not use_bf16,
    dataloader_pin_memory=True,

    # Logging / evaluation / checkpointing
    logging_steps=10,
    eval_steps=3000,
    eval_strategy="no",
    save_strategy="no",
    save_total_limit=1,
    report_to="none",
    packing=False,
    remove_unused_columns=False,
    dataset_text_field="text",
)

# ----------------------------
# Trainer
# ----------------------------
print("Initializing trainer...")
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=lora_config,
    args=sft_config,
)

print("Starting training...")
trainer.train()

# save_strategy is "no", so this explicit save is the only artifact written.
trainer.save_model(os.path.join(LORA_OUT_PATH, TRAIN_DIR))
torch.cuda.empty_cache()
print("success")

Overwriting train.py


In [4]:
%%writefile accelerate_config.yaml
# Accelerate launch configuration: one local machine, one process,
# DeepSpeed backend with ZeRO stage 2 and fp16 mixed precision.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # Batch arithmetic: train_batch_size (8) =
  #   train_micro_batch_size_per_gpu (2) x gradient_accumulation_steps (4) x num_processes (1).
  # These values mirror per_device_train_batch_size, gradient_accumulation_steps
  # and max_grad_norm in train.py; keep both places in sync.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  train_batch_size: 8
  train_micro_batch_size_per_gpu: 2
  
  # ZeRO stage 2 with no CPU/NVMe offload of optimizer state or parameters.
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # Stage-3 tuning knobs.
  # NOTE(review): presumably unused while zero_stage is 2 - confirm before tuning them.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  # train.py uses optim="paged_adamw_8bit"; these flags relate to running
  # ZeRO with an optimizer other than DeepSpeed's own.
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  contiguous_gradients: true
  
  # fp16 loss-scaling settings.
  # NOTE(review): loss_scale 0 should mean dynamic loss scaling in DeepSpeed - confirm
  # against the DeepSpeed config reference, and confirm this nested block is honoured
  # when supplied through an accelerate config file.
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
# NOTE(review): train.py enables bf16 when the GPU supports it, while this file
# requests fp16 - confirm which precision is actually in effect.
mixed_precision: fp16
num_machines: 1
num_processes: 1
use_cpu: false

Overwriting accelerate_config.yaml


In [5]:
## 20 iterations take about 1:17-1:20 (min:sec), i.e. ~3.9 s/iter

In [6]:
!accelerate launch --config_file accelerate_config.yaml train.py

[2025-09-19 18:25:49,106] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-09-19 18:25:50,715] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
TRAIN_DIR Qwen2p5-7B-Instruct-GPTQ-Int4_lora_fp16_r32_s24k_e_-1_msl3600-1to9
✓ Flash Attention 2 available - will enable
Loading model and tokenizer...
`torch_dtype` is deprecated! Use `dtype` instead!
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.60it/s]
Building datasets...
Concatenated 1 train files: ['./df_train.csv']
Concatenated 1 test files: ['./df_test.csv']
Train shape: (2029, 11)
Test shape: (2029, 11)
Index(['row_id', 'body', 'rule',