# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [13]:
#!/usr/bin/python3
# NOTE: the shebang-style line above is only a comment inside a notebook cell; it does not select an interpreter.
# Show which python3 executable the notebook's shell resolves on PATH.
!which python3

/usr/bin/python3


In [14]:
# Print the working directory; the relative paths used later (myvenv, config.py, run.sh, log.log) resolve against it.
!pwd

/data


In [15]:
%%bash
set -e  # exit immediately if a command fails

VENV_DIR="myvenv"

# Step 1: Ensure uv is installed globally so it can create the venv
if ! command -v uv &> /dev/null; then
  echo "Installing uv globally to create venv..."
  python3 -m ensurepip --upgrade || true
  python3 -m pip install --upgrade pip setuptools wheel
  python3 -m pip install --upgrade uv
else
  echo "✅ uv is already installed globally."
fi

# pip may have placed the `uv` executable in a scripts directory that is not
# on PATH; in that case invoke uv as a module of the interpreter that owns it.
if command -v uv &> /dev/null; then
  UV=(uv)
else
  UV=(python3 -m uv)
fi

# Step 2: Create virtual environment if it doesn't exist
if [ -d "$VENV_DIR" ]; then
  echo "✅ Virtual environment '$VENV_DIR' already exists. Skipping creation."
else
  echo "Creating virtual environment with uv..."
  "${UV[@]}" venv "$VENV_DIR"
fi

# A directory left behind by an interrupted run passes the check above but
# cannot be activated; report that explicitly instead of failing in `source`.
if [ ! -f "$VENV_DIR/bin/activate" ]; then
  echo "❌ '$VENV_DIR' exists but has no bin/activate script. Remove it and re-run this cell." >&2
  exit 1
fi

# Step 3: Activate virtual environment
echo "Activating virtual environment..."
source "$VENV_DIR/bin/activate"

# Step 4: Upgrade pip, setuptools, wheel, and uv *inside* the venv
# (from here on `python3` resolves to the venv interpreter)
echo "Upgrading pip, setuptools, wheel, and uv inside the venv..."
python3 -m ensurepip --upgrade || true
python3 -m pip install --upgrade pip setuptools wheel --progress-bar=on
python3 -m pip install --upgrade uv --progress-bar=on

echo "Checking Python binary path:"
which python3

echo "Checking Python version:"
python3 --version

echo "Checking uv version inside venv:"
uv --version

echo "✅ Virtual environment setup complete and up to date!"


✅ uv is already installed globally.
✅ Virtual environment 'myvenv' already exists. Skipping creation.
Activating virtual environment...
Upgrading pip, setuptools, wheel, and uv inside the venv...
Looking in links: /tmp/tmpea0z1jbk
Checking Python binary path:
/data/myvenv/bin/python3
Checking Python version:
Python 3.11.13
Checking uv version inside venv:
uv 0.8.18
✅ Virtual environment setup complete and up to date!


In [16]:
%%bash
set -e  # Exit immediately if a command fails

# Interpreter of the venv created by the setup cell; uv was pip-installed into
# that venv there, so it is invoked as a module of this interpreter.
PYTHON=./myvenv/bin/python

echo "Installing required libraries with uv..."
# unsloth, transformers, datasets and pandas are imported directly by
# train.py / get_dataset.py, so they are listed explicitly instead of relying
# on them being present already or arriving as transitive dependencies.
"$PYTHON" -m uv pip install \
  unsloth \
  transformers \
  datasets \
  pandas \
  trl \
  optimum \
  auto-gptq \
  bitsandbytes \
  peft \
  accelerate \
  deepspeed \
  kagglehub

echo "✅ Done installing packages into myvenv using uv."


Installing required libraries with uv...


[2mUsing Python 3.11.13 environment at: myvenv[0m
[2mAudited [1m8 packages[0m [2min 79ms[0m[0m


✅ Done installing packages into myvenv using uv.


In [17]:
%%writefile config.py

# ---------------------------------------------------------------------------
# Run mode
# ---------------------------------------------------------------------------
RESUME_TRAINING = False

# ---------------------------------------------------------------------------
# Model and filesystem locations
# ---------------------------------------------------------------------------
# Alternative base model: "Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4"
LOCAL_MODEL_PATH = "Qwen/Qwen2.5-3B-Instruct"
LORA_IN_PATH = "/workspace/lora"
LORA_OUT_PATH = "/workspace"
DATA_PATH = "./"
OUTPUT_PATH = "/workspace"

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
MAX_SEQ_LENGTH = 3600
RANK = 64
LORA_ALPHA = 64
MAX_ITER_STEPS = -1
EPOCHS = 2
SAMPLE_LEN = "35k"

# ---------------------------------------------------------------------------
# Kaggle upload configuration
# ---------------------------------------------------------------------------
MODEL_SLUG = "qwen25-3b-instruct-jigsaw-acrc-lora"
VARIATION_SLUG = "15"

# ---------------------------------------------------------------------------
# Derived names (computed from the values above)
# ---------------------------------------------------------------------------
DATASET_ID = "0-cr2"

# Last path component of the model id, with dots replaced by "p".
BASE_MODEL = LOCAL_MODEL_PATH.rsplit("/", 1)[-1].replace(".", "p")

# Run-specific directory name encoding the main settings of this run.
TRAIN_DIR = (
    f"{BASE_MODEL}_lora_fp16"
    f"_r{RANK}_s{SAMPLE_LEN}_e_{EPOCHS}"
    f"_msl{MAX_SEQ_LENGTH}-{DATASET_ID}"
)
print("TRAIN_DIR", TRAIN_DIR)

Overwriting config.py


In [None]:
%%writefile get_dataset.py
import pandas as pd
from datasets import Dataset
import kagglehub
import os
import glob

def load_data(base_path=None):
    """Load the train/test CSV files and normalise the ``violates_rule`` label.

    Every file matching ``*train*.csv`` (respectively ``*test*.csv``) under
    ``base_path`` is read and concatenated. Only the required columns are
    kept, labels are normalised to ``"Yes"``/``"No"``, and rows with an
    unrecognised label or any missing required value are dropped.

    Args:
        base_path: Directory to search. When ``None`` (the default) the Kaggle
            input directory is used if ``KAGGLE_KERNEL_RUN_TYPE`` is present
            in the environment, otherwise the current directory.

    Returns:
        Tuple ``(df_train, df_test)`` of DataFrames restricted to the required
        columns, each with a fresh ``0..n-1`` index.

    Raises:
        FileNotFoundError: If no train or no test CSV file is found.
        KeyError: If a required column is missing from the loaded data.
    """
    if base_path is None:
        if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
            # Running on Kaggle
            base_path = "/kaggle/input/jigsaw-agile-community-rules/"
        else:
            # Running locally
            base_path = "./"

    def _read_split(split):
        # pd.read_csv does not expand wildcards, so resolve the pattern with
        # glob first; sorting keeps the concatenation order deterministic.
        files = sorted(glob.glob(os.path.join(base_path, f"*{split}*.csv")))
        if not files:
            raise FileNotFoundError(f"No {split} files found in {base_path}")
        frame = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)
        print(f"Concatenated {len(files)} {split} files: {files}")
        return frame

    df_train = _read_split("train")
    df_test = _read_split("test")

    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")
    print(df_train.columns)

    req_cols = ['subreddit', 'rule', 'positive_example_1', 'negative_example_1', 'positive_example_2',
                'negative_example_2', 'test_comment', 'violates_rule']
    label_map = {"True": "Yes", "False": "No", "Yes": "Yes", "No": "No"}

    # Normalize "True"/"False" -> "Yes"/"No" and drop anything else.
    # Work on explicit copies so the assignment never targets a slice of the
    # frame that was read from disk (avoids SettingWithCopyWarning).
    normalised = {}
    for name, frame in (("train", df_train), ("test", df_test)):
        frame = frame[req_cols].copy()
        frame["violates_rule"] = frame["violates_rule"].astype(str).str.strip().map(label_map)
        before = len(frame)
        # Labels outside the map became NaN above.
        frame = frame.dropna(subset=["violates_rule"])
        print(f"Dropped {before - len(frame)} rows from {name} due to invalid 'violates_rule'")
        normalised[name] = frame
    df_train, df_test = normalised["train"], normalised["test"]

    for col in req_cols:
        dropped_rows = int(df_train[col].isna().sum())
        print(f"{col}: {dropped_rows} rows would be dropped")

    # Reset the index so the gaps left by dropped rows do not travel along
    # with the frame (e.g. as an extra index column in Dataset.from_pandas).
    df_train = df_train.dropna().reset_index(drop=True)
    df_test = df_test.dropna().reset_index(drop=True)

    print(f"Using path: {base_path}")
    print("\n After dropping:")
    print(f"Train shape: {df_train.shape}")
    print(f"Test shape: {df_test.shape}")

    # No further Yes/No filtering is needed: after the mapping and dropna
    # above, every remaining label is exactly "Yes" or "No".
    return df_train, df_test

def formatting_prompts_func(examples, tokenizer):
    """
    Render each row of a batch as one chat-formatted string.

    ``examples`` is a column-oriented batch (column name -> list of values).
    For every row a system / user / assistant conversation is built and passed
    to ``tokenizer.apply_chat_template`` with ``tokenize=False`` and
    ``add_generation_prompt=False``. Returns ``{"text": [...]}`` with one
    rendered string per row.
    """
    columns = ('subreddit', 'rule', 'positive_example_1', 'positive_example_2',
               'negative_example_1', 'negative_example_2', 'test_comment', 'violates_rule')
    batch_size = len(examples['subreddit'])

    def render(idx):
        # Collect the values of row `idx` from every column used by the prompt.
        row = {name: examples[name][idx] for name in columns}

        # System turn: moderator persona and the expected answer format.
        system_msg = f"""You are a really experienced moderator for the subreddit /r/{row['subreddit']}. 
Your job is to determine if the following reported comment violates the given rule. Answer with only Yes or No."""

        # User turn: the rule, two violating and two non-violating examples,
        # then the comment to classify.
        user_msg = f"""<rule>
{row['rule']}
</rule>

<examples>
<example>
<comment>{row['positive_example_1']}</comment>
<rule_violation>Yes</rule_violation>
</example>

<example>
<comment>{row['positive_example_2']}</comment>
<rule_violation>Yes</rule_violation>
</example>

<example>
<comment>{row['negative_example_1']}</comment>
<rule_violation>No</rule_violation>
</example>

<example>
<comment>{row['negative_example_2']}</comment>
<rule_violation>No</rule_violation>
</example>
</examples>

<test_comment>
{row['test_comment']}
</test_comment>"""

        # Assistant turn is the label itself.
        conversation = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": row['violates_rule']},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

    return {"text": [render(idx) for idx in range(batch_size)]}

def build_dataset(tokenizer):
    """
    Load the data and return ``(train_dataset, test_dataset)``.

    Each DataFrame returned by ``load_data()`` is converted with
    ``Dataset.from_pandas`` and mapped in batches through
    ``formatting_prompts_func`` using the given tokenizer.
    """
    def to_chat_dataset(frame):
        # Convert one split and apply the chat-template formatting batch-wise.
        return Dataset.from_pandas(frame).map(
            lambda batch: formatting_prompts_func(batch, tokenizer),
            batched=True,
        )

    train_frame, test_frame = load_data()

    # Train split is processed first, then the test split.
    train_dataset = to_chat_dataset(train_frame)
    test_dataset = to_chat_dataset(test_frame)
    return train_dataset, test_dataset

In [None]:
%%writefile train.py

# LoRA fine-tuning script: loads the base model with Unsloth, attaches LoRA
# adapters, trains with TRL's SFTTrainer on the chat-formatted dataset, and
# saves the adapter plus tokenizer under LORA_OUT_PATH/TRAIN_DIR.

# unsloth stays the first import, as in the original script.
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import os
import torch
from trl import SFTTrainer
from transformers import TrainingArguments

# Settings and the dataset builder live in the sibling files written by the
# earlier notebook cells; without these imports every name below is undefined.
from config import (
    EPOCHS,
    LOCAL_MODEL_PATH,
    LORA_ALPHA,
    LORA_OUT_PATH,
    MAX_ITER_STEPS,
    MAX_SEQ_LENGTH,
    RANK,
    TRAIN_DIR,
)
from get_dataset import build_dataset
#os.environ["CUDA_LAUNCH_BLOCKING"] = "0"


#load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=LOCAL_MODEL_PATH,
    #model_name="./local-model-name",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=False,
    load_in_8bit=False
)

#set peft config
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_ALPHA,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 123,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

#load dataset
dataset_train, dataset_test = build_dataset(tokenizer)


#SFT trainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    eval_dataset = dataset_test,  # Add test dataset here
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = EPOCHS,
        max_steps = MAX_ITER_STEPS,
        learning_rate = 5e-4,
        fp16 = False,  # Disable FP16 to avoid gradient unscaling issues
        bf16 = is_bfloat16_supported(),  # Use BF16 instead if supported
        logging_steps = 10,
        optim = "adamw_torch",  # Use adamw_torch instead of adamw_8bit for better compatibility
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 123,
        eval_strategy = "steps",
        eval_steps = 2000,
        max_grad_norm = 1.0,  # Add explicit gradient clipping
        # Save settings - save after each epoch
        save_strategy = "epoch",
        save_total_limit = 3,  # Keep only last 3 checkpoints to save space
        #load_best_model_at_end = True,
        #metric_for_best_model = "eval_loss",
        output_dir = "outputs",
        #report_to = "mlflow", # Use this for WandB etc
    ),
)

#train model
trainer.train()


#save lora weights
# upload.py reads the adapter from this same LORA_OUT_PATH/TRAIN_DIR location.
save_dir = os.path.join(LORA_OUT_PATH, TRAIN_DIR)
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)  # Local saving
tokenizer.save_pretrained(save_dir)

print("success")

Overwriting train.py


In [21]:
%%bash
set -e  # stop at the first failure; otherwise only the last command's status is reported

# The API token is expected next to the notebook. Check explicitly so that a
# missing file is not masked by an older token already in ~/.kaggle.
if [ ! -f kaggle.json ]; then
  echo "❌ kaggle.json not found in $(pwd)." >&2
  exit 1
fi

mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/
# Restrict the token to the current user.
chmod 600 ~/.kaggle/kaggle.json

In [22]:
%%writefile upload.py
import os

import kagglehub

from config import LORA_OUT_PATH, MODEL_SLUG, TRAIN_DIR, VARIATION_SLUG

# Local directory to upload: LORA_OUT_PATH joined with the run's TRAIN_DIR.
LOCAL_MODEL_DIR = os.path.join(LORA_OUT_PATH, TRAIN_DIR)

# Model handle built from the slugs configured in config.py.
model_handle = f"vinothkumarsekar89/{MODEL_SLUG}/transformers/{VARIATION_SLUG}"

# The run's directory name doubles as the version note.
kagglehub.model_upload(
    handle=model_handle,
    local_model_dir=LOCAL_MODEL_DIR,
    version_notes=TRAIN_DIR,
)



Writing upload.py


In [23]:
%%writefile run.sh
#!/usr/bin/env bash
# Train the LoRA adapter, then upload the result to Kaggle.
set -e  # do not attempt the upload if training failed

./myvenv/bin/python train.py
# upload.py is a plain Python script, so it is passed directly to the
# interpreter (a stray "launch" argument would be taken as the script name).
./myvenv/bin/python upload.py

Writing run.sh


In [1]:
%%bash
# Mark the script executable (it is nevertheless started through `bash` below).
chmod +x run.sh
# Start the run in the background under nohup; stdout is redirected to log.log
# first and stderr is then pointed at the same file, so both streams land there.
nohup bash run.sh > log.log 2>&1 &

In [4]:
%%bash
# Show the most recent log lines and return. `tail -f` never exits, which
# blocks the kernel and prevents "Run All" from finishing; re-run this cell
# to refresh the view.
tail -n 50 log.log