# Jigsaw - Agile Community Rules Classification
### https://www.kaggle.com/competitions/jigsaw-agile-community-rules

In [29]:
import kagglehub
import pandas as pd
import os

base_path = "./data/final/"
df_train = pd.read_csv(f"{base_path}df_train_qkdsr_03.csv")
print(df_train.shape)
df=df_train.copy()
df_train.head(1)

df_clean=pd.read_csv(f"./data/synthetic_generation/community_rules.csv")
print(df_clean.shape)

(5000, 12)
(4, 1)


In [30]:
df_train.columns

Index(['subreddit', 'rule', 'formatted_rule', 'positive_example_1',
       'negative_example_1', 'positive_example_2', 'negative_example_2',
       'test_comment', 'violates_rule', 'raw_response',
       'example_comments_used', 'error'],
      dtype='object')

In [31]:
df_clean.columns

Index(['formatted_rule'], dtype='object')

In [32]:
df_clean["rule"]=df_clean["formatted_rule"].copy()

In [33]:
# def filter_similar_rules_batched(df_train, df_clean, similarity_threshold=0.7, batch_size=1000):
#     """
#     Memory-efficient version for very large datasets
#     """
#     model = SentenceTransformer('all-MiniLM-L6-v2')
    
#     # Generate clean embeddings once
#     clean_embeddings = model.encode(df_clean['rule'].tolist())
    
#     filtered_indices = []
#     similarities = []
    
#     # Process train data in batches
#     for i in range(0, len(df_train), batch_size):
#         batch = df_train.iloc[i:i+batch_size]
#         batch_embeddings = model.encode(batch['rule'].tolist())
        
#         # Compute similarity for this batch
#         batch_similarities = cosine_similarity(batch_embeddings, clean_embeddings)
#         max_sims = np.max(batch_similarities, axis=1)
        
#         # Find indices that meet threshold
#         valid_batch_indices = np.where(max_sims >= similarity_threshold)[0] + i
#         filtered_indices.extend(valid_batch_indices)
#         similarities.extend(max_sims[max_sims >= similarity_threshold])
        
#         print(f"Processed batch {i//batch_size + 1}/{(len(df_train)-1)//batch_size + 1}")
    
#     # Create filtered dataframe
#     filtered_df = df_train.iloc[filtered_indices].copy()
#     filtered_df['max_similarity'] = similarities
    
#     return filtered_df

# # Usage for large datasets:
# # filtered_train = filter_similar_rules_batched(df_train, df_clean, similarity_threshold=0.7)

In [34]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

def filter_similar_rules(df_train, df_clean, similarity_threshold=0.7, method='sentence_transformer'):
    """
    Filter df_train rules based on similarity to df_clean rules
    
    Parameters:
    - df_train: DataFrame with 'rule' column (30k+ entries)
    - df_clean: DataFrame with 'rule' column (200 entries) 
    - similarity_threshold: float, minimum similarity score to keep (0-1)
    - method: 'sentence_transformer' or 'tfidf'
    """
    
    if method == 'sentence_transformer':
        # Load pre-trained sentence transformer model
        model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and good quality
        
        # Generate embeddings
        print("Generating embeddings for clean rules...")
        clean_embeddings = model.encode(df_clean['rule'].tolist(), show_progress_bar=True)
        
        print("Generating embeddings for train rules...")
        train_embeddings = model.encode(df_train['rule'].tolist(), show_progress_bar=True)
        
    elif method == 'tfidf':
        # Alternative: TF-IDF approach (faster but less semantic)
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        
        # Fit on combined data
        all_rules = df_clean['rule'].tolist() + df_train['rule'].tolist()
        vectorizer.fit(all_rules)
        
        clean_embeddings = vectorizer.transform(df_clean['rule'].tolist()).toarray()
        train_embeddings = vectorizer.transform(df_train['rule'].tolist()).toarray()
    
    # Compute similarity matrix (clean_rules vs train_rules)
    print("Computing similarity matrix...")
    similarity_matrix = cosine_similarity(train_embeddings, clean_embeddings)
    
    # Get maximum similarity for each train rule
    max_similarities = np.max(similarity_matrix, axis=1)
    
    # Filter based on threshold
    similar_mask = max_similarities >= similarity_threshold
    filtered_df_train = df_train[similar_mask].copy()
    
    # Add similarity scores
    filtered_df_train['max_similarity'] = max_similarities[similar_mask]
    filtered_df_train['most_similar_clean_rule_idx'] = np.argmax(similarity_matrix[similar_mask], axis=1)
    
    # Add the most similar clean rule for reference
    most_similar_clean_rules = df_clean.iloc[filtered_df_train['most_similar_clean_rule_idx']]['rule'].values
    filtered_df_train['most_similar_clean_rule'] = most_similar_clean_rules
    
    print(f"Original train rules: {len(df_train)}")
    print(f"Filtered train rules: {len(filtered_df_train)}")
    print(f"Reduction: {(1 - len(filtered_df_train)/len(df_train))*100:.1f}%")
    
    return filtered_df_train, similarity_matrix

# Example usage:
if __name__ == "__main__":
    # Assuming you have your dataframes ready
    # df_train = pd.read_csv('train_data.csv')  # 30k+ rules
    # df_clean = pd.read_csv('clean_data.csv')  # 200 rules
    
    # Method 1: Using Sentence Transformers (recommended for semantic similarity)
    filtered_train, sim_matrix = filter_similar_rules(
        df_train, 
        df_clean, 
        similarity_threshold=0.7,  # Adjust this threshold
        method='sentence_transformer'
    )
    
    # Method 2: Using TF-IDF (faster alternative)
    # filtered_train, sim_matrix = filter_similar_rules(
    #     df_train, 
    #     df_clean, 
    #     similarity_threshold=0.3,  # Lower threshold for TF-IDF
    #     method='tfidf'
    # )
    
    # Display results
    print("\nTop 5 most similar matches:")
    print(filtered_train.nlargest(5, 'max_similarity')[['rule', 'max_similarity', 'most_similar_clean_rule']])

Generating embeddings for clean rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings for train rules...


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Computing similarity matrix...
Original train rules: 5000
Filtered train rules: 16
Reduction: 99.7%

Top 5 most similar matches:
                                                   rule  max_similarity  \
5     "No Medical or Clinical Advice: Do not give or...        0.754827   
470   "No Medical or Clinical Advice: Do not give or...        0.754827   
702   "No Medical or Clinical Advice: Do not give or...        0.754827   
1315  "No Medical or Clinical Advice: Do not give or...        0.754827   
1638  "No Medical or Clinical Advice: Do not give or...        0.754827   

                                most_similar_clean_rule  
5     no medical advice: do not offer or request spe...  
470   no medical advice: do not offer or request spe...  
702   no medical advice: do not offer or request spe...  
1315  no medical advice: do not offer or request spe...  
1638  no medical advice: do not offer or request spe...  


In [35]:
filtered_train.shape

(16, 15)

In [36]:
#filtered_train.to_csv("df_train_f0p6_10k.csv",index=False)

In [37]:
filtered_train["rule"].nunique()

1