# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [1]:
import kagglehub
import pandas as pd
import os

# Directory holding the prepared training CSV.
base_path = "./data/final/zothers/"
df_train = pd.read_csv(os.path.join(base_path, "rule_comment.csv"))
print(df_train.shape)

# Work on a small random subset to keep embedding time short.
# - `sample` already draws rows in random order, so no separate shuffle pass
#   over the full frame is needed.
# - A fixed seed makes the subset (and every score derived from it)
#   reproducible between runs.
# - `min(...)` avoids a ValueError when the file has fewer than 1000 rows.
# - The index is reset so that labels and positions coincide downstream.
df_train = df_train.sample(
    n=min(1000, len(df_train)),
    random_state=42,
).reset_index(drop=True)
print(df_train.shape)

(251740, 4)
(1000, 4)


In [2]:
# Build the text that gets embedded: the rule followed by the comment,
# each introduced by a short textual tag.
rule_part = "rule: " + df_train["rule"]
comment_part = " comment: " + df_train["test_comment"]
df_train["chunk"] = rule_part + comment_part

In [4]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

class FastEmbedder:
    """Lightweight text embedder built on a Qwen3-Embedding checkpoint.

    Loads a Hugging Face tokenizer/model pair, runs batched forward passes
    and turns the final hidden states into L2-normalised vectors using
    last-token pooling.

    Args:
        model_name: Hugging Face model id or local path.
        output_dim: Number of leading embedding dimensions to keep
            (Matryoshka-style truncation). A falsy value, or a value that is
            not smaller than the model's embedding width, keeps the full vector.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated by the tokenizer.
    """

    def __init__(self, model_name='Qwen/Qwen3-Embedding-0.6B', output_dim=512, max_length=1024):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True,
            # FP16 is used for efficiency on GPU. On CPU half precision is
            # typically slow and some operators may be unsupported, so fall
            # back to float32 there.
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
        )
        self.model.to(self.device)
        self.model.eval()
        self.output_dim = output_dim
        # Token budget per text; inputs beyond it are truncated in `encode`.
        self.max_length = max_length

    def _embedding_width(self):
        """Return the width of the vectors produced by `encode`.

        Assumes the loaded model exposes `config.hidden_size` (true for the
        usual Hugging Face transformer configs — verify for custom models).
        """
        hidden_size = self.model.config.hidden_size
        if self.output_dim and self.output_dim < hidden_size:
            return self.output_dim
        return hidden_size

    def last_token_pool(self, last_hidden_states, attention_mask):
        """Pick the hidden state of the last real (non-padding) token per row.

        Args:
            last_hidden_states: Tensor indexed as (batch, sequence, hidden).
            attention_mask: Tensor indexed as (batch, sequence) with 1 for
                real tokens and 0 for padding.

        Returns:
            Tensor with one hidden-state vector per batch row.
        """
        # If every row has a real token in the final position the batch is
        # left-padded (or unpadded) and the last column is the answer.
        left_padding = bool(attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]

        # Right padding: the last real token sits at (number of real tokens - 1).
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        row_ids = torch.arange(batch_size, device=last_hidden_states.device)
        return last_hidden_states[row_ids, sequence_lengths]

    def encode(self, texts, batch_size=32, convert_to_numpy=True, instruction=None):
        """
        Encode texts to L2-normalised float32 embeddings.

        Args:
            texts: Iterable of texts to encode; a single string is treated as
                one text.
            batch_size: Number of texts per forward pass (must be >= 1).
            convert_to_numpy: Return a numpy array when True, otherwise a CPU
                torch tensor.
            instruction: Optional task instruction (e.g., "Given a rule and
                comment, retrieve similar examples"); when given, every text
                is wrapped as "Instruct: ...\\nQuery: ...".

        Returns:
            Array/tensor with one float32 row per input text. An empty input
            yields an empty result with zero rows instead of raising.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Materialise once so slicing works for any iterable; a bare string
        # would otherwise be sliced character-wise.
        texts = [texts] if isinstance(texts, str) else list(texts)

        if not texts:
            # torch.cat cannot concatenate an empty list, so short-circuit.
            empty = torch.empty((0, self._embedding_width()), dtype=torch.float32)
            return empty.numpy() if convert_to_numpy else empty

        # Add instruction prefix if provided
        if instruction:
            texts = [f"Instruct: {instruction}\nQuery: {text}" for text in texts]

        all_embeddings = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=self.max_length,
            )
            encoded = {k: v.to(self.device) for k, v in encoded.items()}

            with torch.no_grad():
                outputs = self.model(**encoded)

                # Use last token pooling (recommended for Qwen3)
                embeddings = self.last_token_pool(
                    outputs.last_hidden_state,
                    encoded['attention_mask'],
                )

                # Do the remaining arithmetic in float32: it avoids fp16
                # rounding in the norm and is the dtype FAISS expects.
                embeddings = embeddings.float()

                # Truncate to desired dimension (Matryoshka)
                if self.output_dim and self.output_dim < embeddings.shape[1]:
                    embeddings = embeddings[:, :self.output_dim]

                # A single normalisation after truncation is equivalent to
                # normalise -> truncate -> renormalise.
                embeddings = F.normalize(embeddings, p=2, dim=1)

            all_embeddings.append(embeddings.cpu())

        embeddings = torch.cat(all_embeddings, dim=0)

        if convert_to_numpy:
            return embeddings.numpy()
        return embeddings

def build_faiss_index(train_vectors, use_gpu=True):
    """Build a FAISS inner-product index over L2-normalised training vectors.

    Because the vectors are normalised before being added, inner-product
    search on the returned index ranks neighbours by cosine similarity.

    Args:
        train_vectors: 2-D collection of embeddings, one row per training
            example. Accepts a numpy array, a torch tensor, or anything
            `numpy.array` can convert. The caller's data is not modified.
        use_gpu: Move the index to GPU 0 when FAISS reports at least one GPU.

    Returns:
        A FAISS index (CPU or GPU) containing all training vectors.

    Raises:
        ValueError: If `train_vectors` is not 2-dimensional.
    """
    if hasattr(train_vectors, "detach"):
        # Torch tensor: drop autograd tracking and upcast first so that
        # grad-requiring or bfloat16 tensors convert cleanly.
        train_vectors = train_vectors.detach().cpu().float().numpy()

    # FAISS works on C-contiguous float32 buffers. Taking an explicit copy
    # also keeps the in-place normalisation below away from the caller's array.
    vectors = np.array(train_vectors, dtype=np.float32, order="C", copy=True)
    if vectors.ndim != 2:
        raise ValueError(
            f"train_vectors must be 2-dimensional (n_vectors, dim), got shape {vectors.shape}"
        )

    # Normalize for cosine similarity
    faiss.normalize_L2(vectors)

    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)

    if use_gpu and faiss.get_num_gpus() > 0:
        print(f"Using GPU with {faiss.get_num_gpus()} GPU(s)")
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    else:
        print("Using CPU")

    index.add(vectors)
    return index

def evaluate_df_test(df_test, model, faiss_index, df_train_values, top_k=20, instruction=None):
    """Score test rows by averaging the values of their nearest training rows.

    Each test row is turned into a "rule: ... comment: ..." text, embedded
    with `model`, and looked up in `faiss_index`. The mean of the matching
    training values becomes the score; scores above 0.5 give decision 1.

    Args:
        df_test: DataFrame with "rule" and "test_comment" columns.
            NOTE: a "chunk" column is added to it in place.
        model: Object with an `encode(texts, convert_to_numpy=..., instruction=...)`
            method returning one vector per text.
        faiss_index: Index whose row order matches `df_train_values`.
        df_train_values: pandas Series of training values, addressed by
            position (`.iloc`) with the ids returned by the index.
        top_k: Number of neighbours to retrieve per test row (must be >= 1).
        instruction: Optional instruction forwarded to `model.encode`.

    Returns:
        DataFrame with columns "rule_violation" (mean neighbour value) and
        "decision" (1 if the mean is > 0.5, else 0), one row per test row.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Create chunks
    df_test["chunk"] = "rule: " + df_test["rule"] + " comment: " + df_test["test_comment"]

    result_columns = ["rule_violation", "decision"]
    if len(df_test) == 0:
        # Nothing to embed or search; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=result_columns)

    # Encode with instruction
    new_vectors = model.encode(
        df_test["chunk"].tolist(),
        convert_to_numpy=True,
        instruction=instruction,
    )

    # Ensure a private C-contiguous float32 buffer, then normalize in place
    new_vectors = np.array(new_vectors, dtype=np.float32, order="C")
    faiss.normalize_L2(new_vectors)

    # Search
    similarities, indices = faiss_index.search(new_vectors, top_k)

    results = []
    for neighbour_ids in indices:
        # FAISS pads with -1 when fewer than top_k neighbours exist; with
        # .iloc a -1 would silently select the LAST training row, so drop it.
        found = neighbour_ids[neighbour_ids >= 0]
        avg_value = df_train_values.iloc[found].mean()
        # A NaN mean (no neighbours found) compares False and yields 0.
        decision = 1 if avg_value > 0.5 else 0
        results.append({"rule_violation": avg_value, "decision": decision})

    return pd.DataFrame(results, columns=result_columns)

# Main execution: embed the training subset, index it, then score the test set.
if __name__ == "__main__":
    print("Loading Qwen3-Embedding-0.6B model...")
    model = FastEmbedder(model_name='Qwen/Qwen3-Embedding-0.6B', output_dim=1024)

    # Task instruction that is prepended to every text before embedding.
    instruction = (
        "Given a rule and comment, "
        "retrieve similar training examples for classification"
    )

    print("Encoding training data...")
    # Text to embed for each training row: the rule followed by the comment.
    df_train["chunk"] = (
        "rule: " + df_train["rule"] + " comment: " + df_train["test_comment"]
    )
    train_texts = df_train["chunk"].tolist()
    train_vectors = model.encode(train_texts, instruction=instruction)

    print(f"Training vectors shape: {train_vectors.shape}, dtype: {train_vectors.dtype}")

    print("Building FAISS index...")
    faiss_index = build_faiss_index(train_vectors, use_gpu=True)

    print("Loading and evaluating test data...")
    df_test = pd.read_csv("./data/final/df_test_cr_12.csv")

    # The training values are passed in the same row order as the vectors
    # that were added to the index.
    train_values = df_train["value"]
    result_df = evaluate_df_test(
        df_test,
        model,
        faiss_index,
        train_values,
        top_k=10,
        instruction=instruction,
    )

    print(result_df)

Loading Qwen3-Embedding-0.6B model...
Encoding training data...
Training vectors shape: (1000, 1024), dtype: float32
Building FAISS index...
Using GPU with 1 GPU(s)
Loading and evaluating test data...
      rule_violation  decision
0                0.9         1
1                0.5         0
2                0.7         1
3                0.8         1
4                0.8         1
...              ...       ...
1995             0.3         0
1996             0.2         0
1997             0.0         0
1998             0.3         0
1999             0.1         0

[2000 rows x 2 columns]


In [5]:
from sklearn.metrics import f1_score

# Debug: show the raw label and prediction values before any conversion
print("Unique values in violates_rule:")
print(df_test["violates_rule"].unique())
print("\nUnique values in decision:")
print(result_df["decision"].unique())

# Clean the textual labels step by step (stringify, trim whitespace, trim
# double then single quotes, lower-case) and map them to 1 / 0.
label_to_int = {"yes": 1, "no": 0}
cleaned_labels = (
    df_test["violates_rule"]
    .astype(str)
    .str.strip()
    .str.strip('"')
    .str.strip("'")
    .str.lower()
)
y_true = cleaned_labels.map(label_to_int)
y_pred = result_df["decision"].astype(int)

# Report both series after conversion: first the distinct values, then the counts
named_series = (("y_true", y_true), ("y_pred", y_pred))
print("\nAfter conversion:")
for series_name, series in named_series:
    print(f"{series_name} unique: {series.unique()}")
for series_name, series in named_series:
    print(f"\n{series_name} value counts:\n{series.value_counts()}")

# F1 score of the predicted decisions against the converted labels
f1 = f1_score(y_true, y_pred)
print(f"\nF1-Score: {f1:.4f}")

Unique values in violates_rule:
['Yes' 'No']

Unique values in decision:
[1 0]

After conversion:
y_true unique: [1 0]
y_pred unique: [1 0]

y_true value counts:
violates_rule
1    1000
0    1000
Name: count, dtype: int64

y_pred value counts:
decision
1    1087
0     913
Name: count, dtype: int64

F1-Score: 0.8970


In [6]:
# # write to submissions.csv
# df_test["rule_violation"]=result_df["rule_violation"].copy()
# df_test[["row_id","rule_violation"]].to_csv("submission.csv",index=False)
# print("wrote results to submission.csv")