# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [8]:
# Basic training
#python train.py

# Training with custom config
#python train.py --config config.yaml --use-wandb

# Training with CLI overrides
#python train.py --epochs 5 --batch-size 4 --learning-rate 1e-4

# Evaluation
#python evaluate.py --model-path output/ --config config.yaml

In [1]:
%%writefile config.py
"""Simple configuration management."""

import yaml
from dataclasses import dataclass, field
from typing import List

@dataclass
class Config:
    """Settings shared by the training and evaluation scripts.

    Every field has a default, so ``Config()`` is usable as-is. Values can
    also come from a YAML file via :meth:`load`. Numeric fields supplied as
    strings are converted in ``__post_init__`` because PyYAML reads
    scientific notation without a decimal point (e.g. ``5e-5``) as a string.
    """

    # Model paths
    base_model_path: str = "/kaggle/input/qwen2.5/transformers/0.5b-instruct-gptq-int4/1"
    lora_output_path: str = "output/"
    data_path: str = "/kaggle/input/jigsaw-agile-community-rules/"

    # Prompt settings
    positive_answer: str = "Yes"
    negative_answer: str = "No"
    complete_phrase: str = "Answer:"

    # LoRA settings
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.1
    target_modules: List[str] = field(default_factory=lambda: [
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
    ])

    # Training settings
    num_epochs: int = 1
    batch_size: int = 16
    gradient_accumulation_steps: int = 2
    learning_rate: float = 5e-5
    weight_decay: float = 0.01
    warmup_ratio: float = 0.05
    max_seq_length: int = 2048

    # System settings
    seed: int = 42
    output_dir: str = "outputs"

    def __post_init__(self):
        """Convert numeric fields that were supplied as strings.

        Raises:
            ValueError: If a string value cannot be parsed as the declared
                numeric type.
        """
        import dataclasses

        for spec in dataclasses.fields(self):
            value = getattr(self, spec.name)
            if not isinstance(value, str):
                continue
            # Annotations may be real types or their string form.
            if spec.type in (float, "float"):
                setattr(self, spec.name, float(value))
            elif spec.type in (int, "int"):
                setattr(self, spec.name, int(value))

    @property
    def base_prompt(self) -> str:
        """Instruction sentence placed at the top of every prompt."""
        return f"Reddit moderation: Does the comment violate the rule? Answer '{self.positive_answer}' or '{self.negative_answer}' only."

    @classmethod
    def load(cls, config_path: str) -> 'Config':
        """Load config from YAML file.

        Args:
            config_path: Path of the YAML file to read.

        Returns:
            A ``Config`` whose fields are overridden by the keys in the file.
            An empty file yields the defaults.

        Raises:
            TypeError: If the file is not a mapping or contains a key that is
                not a ``Config`` field.
        """
        with open(config_path, 'r') as f:
            data = yaml.safe_load(f)
        # An empty YAML document loads as None; treat it as "no overrides".
        if data is None:
            data = {}
        if not isinstance(data, dict):
            raise TypeError(
                f"Config file {config_path!r} must contain a mapping, got {type(data).__name__}"
            )
        return cls(**data)

    def save(self, config_path: str):
        """Save config to YAML file, creating parent directories if needed.

        Args:
            config_path: Destination path of the YAML file.
        """
        import dataclasses
        from pathlib import Path

        # The target directory may not exist yet on a first run.
        Path(config_path).parent.mkdir(parents=True, exist_ok=True)
        with open(config_path, 'w') as f:
            yaml.dump(dataclasses.asdict(self), f, default_flow_style=False)

Writing config.py


In [2]:
%%writefile utils.py
"""Data processing utilities."""

import pandas as pd
from datasets import Dataset

def build_prompt(row, config):
    """Render the moderation prompt for one comment.

    Args:
        row: Mapping-like record providing "subreddit", "rule" and "body".
        config: Object providing ``base_prompt`` and ``complete_phrase``.

    Returns:
        The prompt text: a leading newline, the base instruction, the rule
        line, the comment line, a ``---`` separator and the completion cue.
    """
    segments = (
        "",  # leading empty segment reproduces the newline that opens the prompt
        f"{config.base_prompt}",
        f'r/{row["subreddit"]} rule: {row["rule"]}',
        f'Comment: {row["body"]}',
        "---",
        f"{config.complete_phrase}",
    )
    return "\n".join(segments)

def load_and_combine_data(config) -> pd.DataFrame:
    """Load and combine training data.

    Reads ``train.csv`` and ``test.csv`` from ``config.data_path``. The
    labelled training rows are kept as-is. Each ``positive_example_{1,2}``
    and ``negative_example_{1,2}`` column found in the test file is turned
    into extra rows labelled 1 and 0 respectively.

    Args:
        config: Object providing ``data_path``.

    Returns:
        DataFrame with columns "body", "rule", "subreddit" and
        "rule_violation", de-duplicated on (body, rule, subreddit) keeping
        the first occurrence.
    """
    # Load datasets
    train_df = pd.read_csv(f"{config.data_path}/train.csv")
    test_df = pd.read_csv(f"{config.data_path}/test.csv")

    # Start from the original training data
    dataframes = [train_df[["body", "rule", "subreddit", "rule_violation"]].copy()]

    # Add positive and negative examples from test set
    for violation_type, label in (("positive", 1), ("negative", 0)):
        for i in range(1, 3):
            column_name = f"{violation_type}_example_{i}"
            if column_name not in test_df.columns:
                continue

            sub_df = test_df[["rule", "subreddit", column_name]].rename(
                columns={column_name: "body"}
            )

            # Remove empty entries. The mask is built on a string view because
            # a column that is entirely empty is read as float64, and the
            # .str accessor refuses non-string dtypes.
            sub_df = sub_df.dropna(subset=["body"])
            has_text = sub_df["body"].astype(str).str.strip() != ""
            sub_df = sub_df[has_text].copy()
            if sub_df.empty:
                # Nothing to add; also keeps empty frames out of concat.
                continue

            sub_df["rule_violation"] = label
            dataframes.append(sub_df)

    # Combine and deduplicate
    combined_df = pd.concat(dataframes, axis=0, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=["body", "rule", "subreddit"], ignore_index=True)

    labels = combined_df["rule_violation"]
    print(f"Combined dataset: {len(combined_df)} samples")
    print(f"Positive: {(labels == 1).sum()}")
    print(f"Negative: {(labels == 0).sum()}")

    return combined_df

def create_dataset(config) -> Dataset:
    """Build the prompt/completion dataset used for fine-tuning.

    Args:
        config: Object providing the data location, prompt settings and the
            ``positive_answer`` / ``negative_answer`` strings.

    Returns:
        A ``Dataset`` with two columns: "prompt" (from ``build_prompt``) and
        "completion" (the answer string mapped from ``rule_violation``).
    """
    frame = load_and_combine_data(config)

    # Prompt text for every row
    frame["prompt"] = frame.apply(lambda record: build_prompt(record, config), axis=1)

    # Label 1 -> positive answer, label 0 -> negative answer
    answer_for_label = {
        1: config.positive_answer,
        0: config.negative_answer,
    }
    frame["completion"] = frame["rule_violation"].map(answer_for_label)

    # Keep only the two text columns
    training_frame = frame[["prompt", "completion"]].copy()
    dataset = Dataset.from_pandas(training_frame, preserve_index=False)

    print(f"Training dataset: {len(dataset)} samples")
    return dataset

Writing utils.py


In [3]:
%%writefile train.py
"""Main training script with distributed support."""

import os
import argparse
import torch
from pathlib import Path

from accelerate import Accelerator
from accelerate.utils import set_seed
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_torch_bf16_gpu_available
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

from config import Config
from utils import create_dataset

def get_training_args(config, accelerator) -> SFTConfig:
    """Create training arguments.

    Args:
        config: Object providing the training hyper-parameters, ``seed`` and
            ``output_dir``.
        accelerator: Accepted for call-site compatibility; not used here.

    Returns:
        An ``SFTConfig`` for completion-only LoRA fine-tuning.
    """
    # Auto-detect mixed precision. fp16 is only requested when a CUDA device
    # exists, so a CPU-only run is not asked for half precision.
    use_bf16 = is_torch_bf16_gpu_available()
    use_fp16 = torch.cuda.is_available() and not use_bf16

    # Create output directory
    Path(config.output_dir).mkdir(parents=True, exist_ok=True)

    # Newer TRL releases call the sequence-length field ``max_length``; older
    # ones only know ``max_seq_length``. Use whichever this version exposes.
    known_fields = getattr(SFTConfig, "__dataclass_fields__", {})
    length_field = "max_length" if "max_length" in known_fields else "max_seq_length"

    return SFTConfig(
        output_dir=config.output_dir,

        # Training params
        num_train_epochs=config.num_epochs,
        per_device_train_batch_size=config.batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,

        # Optimization
        optim="paged_adamw_8bit",
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        max_grad_norm=1.0,

        # Scheduler
        lr_scheduler_type="cosine",
        warmup_ratio=config.warmup_ratio,

        # Mixed precision
        bf16=use_bf16,
        fp16=use_fp16,

        # Memory optimization
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        dataloader_pin_memory=True,

        # Logging
        logging_steps=10,
        save_strategy="epoch",

        # SFT specific
        completion_only_loss=True,
        packing=False,
        remove_unused_columns=False,

        # Simple reporting
        report_to="none",
        seed=config.seed,

        # Sequence length, under the field name chosen above
        **{length_field: config.max_seq_length},
    )

def get_lora_config(config) -> LoraConfig:
    """Translate the LoRA fields of ``config`` into a PEFT ``LoraConfig``.

    Args:
        config: Object providing ``lora_r``, ``lora_alpha``,
            ``lora_dropout`` and ``target_modules``.

    Returns:
        A causal-LM ``LoraConfig`` with no bias adaptation.
    """
    adapter_settings = {
        "r": config.lora_r,
        "lora_alpha": config.lora_alpha,
        "lora_dropout": config.lora_dropout,
        "bias": "none",
        "target_modules": config.target_modules,
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**adapter_settings)

def main():
    """Fine-tune the base model with LoRA and save the adapter.

    Command-line flags: ``--config`` (YAML file), ``--epochs``,
    ``--batch-size`` and ``--learning-rate``. The last three override the
    loaded or default config when given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="Config file path")
    parser.add_argument("--epochs", type=int, help="Number of epochs")
    parser.add_argument("--batch-size", type=int, help="Batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    args = parser.parse_args()

    # Initialize accelerator for distributed training
    accelerator = Accelerator()

    # Load config
    config = Config.load(args.config) if args.config else Config()

    # Override with CLI args. Compare against None so an explicit falsy value
    # (e.g. ``--learning-rate 0``) is not silently ignored.
    if args.epochs is not None:
        config.num_epochs = args.epochs
    if args.batch_size is not None:
        config.batch_size = args.batch_size
    if args.learning_rate is not None:
        config.learning_rate = args.learning_rate

    # Set seed for reproducibility
    set_seed(config.seed)

    # Setup logging
    if accelerator.is_main_process:
        print(f"Training on {accelerator.num_processes} process(es)")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU count: {torch.cuda.device_count()}")
        print(f"Config: {config}")

        # Save config. The output directory may not exist yet on a first run,
        # so create it before writing into it.
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)
        config.save(f"{config.output_dir}/config.yaml")

    # Create dataset
    train_dataset = create_dataset(config)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        config.base_model_path,
        trust_remote_code=True,
        padding_side="right"
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Get configs
    training_args = get_training_args(config, accelerator)
    lora_config = get_lora_config(config)

    # Create trainer. ``processing_class`` is the current name of the
    # tokenizer argument; the old ``tokenizer=`` keyword is deprecated in TRL.
    trainer = SFTTrainer(
        model=config.base_model_path,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
    )

    # Train
    if accelerator.is_main_process:
        print("Starting training...")

    trainer.train()

    # Save model
    output_path = Path(config.lora_output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    trainer.save_model(str(output_path))

    if accelerator.is_main_process:
        print(f"Training completed! Model saved to {output_path}")

if __name__ == "__main__":
    main()

Writing train.py


In [4]:
%%writefile evaluate.py
"""Simple evaluation script."""

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from sklearn.metrics import accuracy_score, classification_report
import argparse

from config import Config
from utils import build_prompt

def load_model(config, model_path):
    """Load the base model, attach the trained adapter and return both.

    Args:
        config: Object providing ``base_model_path``.
        model_path: Location of the saved PEFT adapter.

    Returns:
        Tuple ``(model, tokenizer)`` with the model switched to eval mode.
    """
    print(f"Loading model from {model_path}")

    base_path = config.base_model_path

    # Tokenizer comes from the base checkpoint
    tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)

    # Base weights in half precision, placed automatically across devices
    backbone = AutoModelForCausalLM.from_pretrained(
        base_path,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Wrap the base model with the fine-tuned adapter
    adapted = PeftModel.from_pretrained(backbone, model_path)
    adapted.eval()

    return adapted, tokenizer

def predict_single(model, tokenizer, prompt, config):
    """Make prediction for single prompt.

    Generates up to 10 new tokens greedily and maps the text to a label
    using the first stand-alone answer word it contains.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text, normally produced by ``build_prompt``.
        config: Object providing ``positive_answer`` and ``negative_answer``.

    Returns:
        1 if the first answer word found is the positive answer, otherwise 0
        (including when neither answer appears).
    """
    import re

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # Greedy decoding: no sampling, so no temperature is passed.
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Keep only the tokens produced after the prompt
    prompt_length = inputs.input_ids.shape[1]
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Convert to binary prediction. A plain substring test would treat
    # "No, ... yes" or "Yesterday" as positive, so take the first answer that
    # appears as a whole word instead.
    positive = config.positive_answer.lower()
    candidates = [a for a in (config.positive_answer, config.negative_answer) if a]
    if not candidates:
        return 0
    alternatives = "|".join(re.escape(a) for a in candidates)
    first_answer = re.search(rf"(?<!\w)(?:{alternatives})(?!\w)", response, re.IGNORECASE)
    if first_answer is not None and first_answer.group(0).lower() == positive:
        return 1
    return 0

def evaluate_model(config, model_path, test_data):
    """Evaluate model on test data.

    Args:
        config: Object providing the model path and prompt settings.
        model_path: Location of the saved PEFT adapter.
        test_data: DataFrame with the prompt columns and ``rule_violation``.

    Returns:
        Tuple ``(accuracy, predictions, true_labels)``.
    """
    model, tokenizer = load_model(config, model_path)

    predictions = []
    true_labels = []

    total = len(test_data)
    print(f"Evaluating on {total} samples...")

    # Count rows by position. The index label from iterrows() is not
    # guaranteed to be a contiguous integer (e.g. after filtering).
    for position, (_, row) in enumerate(test_data.iterrows(), start=1):
        prompt = build_prompt(row, config)
        pred = predict_single(model, tokenizer, prompt, config)

        predictions.append(pred)
        true_labels.append(row['rule_violation'])

        if position % 50 == 0:
            print(f"Processed {position}/{total}")

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)

    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    return accuracy, predictions, true_labels

def main():
    """Parse the command line, load the data and run the evaluation.

    Flags: ``--model-path`` (required), ``--config`` and ``--test-file``.
    Without ``--test-file`` the ``test.csv`` under ``config.data_path`` is
    used. If the data has no ``rule_violation`` column, a column of zeros is
    added and a warning is printed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", required=True, help="Path to trained model")
    cli.add_argument("--config", help="Config file path")
    cli.add_argument("--test-file", help="Test data file")
    options = cli.parse_args()

    # Config from file when given, defaults otherwise
    config = Config.load(options.config) if options.config else Config()

    # Explicit test file wins over the default location
    csv_path = options.test_file if options.test_file else f"{config.data_path}/test.csv"
    test_data = pd.read_csv(csv_path)

    # Placeholder labels when the data carries none (adjust based on your data)
    has_labels = 'rule_violation' in test_data.columns
    if not has_labels:
        print("Warning: No rule_violation column found. Using dummy labels for demo.")
        test_data['rule_violation'] = 0  # or load from somewhere else

    # Run the evaluation
    evaluate_model(config, options.model_path, test_data)

if __name__ == "__main__":
    main()

Writing evaluate.py


In [5]:
%%writefile config.yaml
# Simple configuration file
# Keys must match the field names of the Config dataclass in config.py.
base_model_path: "/kaggle/input/qwen2.5/transformers/0.5b-instruct-gptq-int4/1"
lora_output_path: "output/"
data_path: "/kaggle/input/jigsaw-agile-community-rules/"

# Prompt settings
positive_answer: "Yes"
negative_answer: "No"
complete_phrase: "Answer:"

# LoRA settings
lora_r: 16
lora_alpha: 32
lora_dropout: 0.1
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# Training settings
num_epochs: 3
batch_size: 8
gradient_accumulation_steps: 4
# Keep the decimal point: PyYAML reads "5e-5" as a string, not a float.
learning_rate: 5.0e-5
weight_decay: 0.01
warmup_ratio: 0.05
max_seq_length: 2048

# System
seed: 42
output_dir: "outputs"

Writing config.yaml


In [6]:
%%writefile accelerate_config.yaml
# Launch settings for `accelerate launch`: one machine, 2 processes,
# DeepSpeed as the distributed backend, fp16 mixed precision.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # NOTE(review): keep in sync with the gradient_accumulation_steps used by
  # train.py — verify against the training config.
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  # NOTE(review): 8 per GPU x 4 accumulation steps x 2 processes = 64, not 32
  # — confirm which value is intended.
  train_batch_size: 32
  train_micro_batch_size_per_gpu: 8
  
  zero_stage: 2
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  
  # NOTE(review): the stage3_* keys presumably only matter for ZeRO stage 3,
  # while zero_stage is 2 here — confirm they are needed.
  stage3_gather_16bit_weights_on_model_save: false
  stage3_max_live_parameters: 1e8
  stage3_max_reuse_distance: 1e8
  stage3_prefetch_bucket_size: 5e7
  stage3_param_persistence_threshold: 1e5
  
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: false
  
  # fp16 settings; mixed_precision below is also fp16.
  fp16:
    enabled: true
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    min_loss_scale: 1
  
distributed_type: DEEPSPEED
downcast_bf16: 'no'
# NOTE(review): presumably turns on torch.compile with the inductor backend
# — confirm this is intended.
dynamo_config:
  dynamo_backend: INDUCTOR
  dynamo_use_fullgraph: false
  dynamo_use_dynamic: false
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
# Number of processes to launch.
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

Writing accelerate_config.yaml


In [14]:
%%writefile run_training.sh
#!/bin/bash
# Simple training script with distributed support
#
# Runs single-GPU training. The multi-GPU, custom-parameter and evaluation
# commands are kept below as commented-out examples.

# Stop at the first failing command (and on unset variables or failed pipes)
# so the closing message is never printed after a failed training run.
set -euo pipefail

# Configure accelerate (run once)
# accelerate config --config_file accelerate_config.yaml

echo "=== Reddit Moderation Training ==="

# Single GPU training
echo "Single GPU training:"
python train.py --config config.yaml

# Multi-GPU training with accelerate and DeepSpeed (disabled; uncomment to run)
echo "Multi-GPU training with accelerate:"
# accelerate launch --config_file accelerate_config.yaml train.py --config config.yaml

# With custom parameters (disabled; uncomment to run)
echo "Training with custom parameters:"
# python train.py --epochs 5 --batch-size 4 --learning-rate 1e-4

# Evaluation (disabled; uncomment to run)
echo "Evaluation:"
# python evaluate.py --model-path output/ --config config.yaml

echo "Training script ready!"

Writing run_training.sh


In [16]:
# # Setup accelerate (first time only)
# accelerate config --config_file accelerate_config.yaml

# # Single GPU
# python train.py --config config.yaml

# # Multi-GPU with DeepSpeed
# accelerate launch --config_file accelerate_config.yaml train.py --config config.yaml

# # Custom parameters
# python train.py --epochs 5 --batch-size 4 --learning-rate 1e-4

# # Evaluation
# python evaluate.py --model-path output/ --config config.yaml