In [None]:
## 📝 Convert to Instruction-Tuned Format

print("📝 Converting to instruction-following format...")

def create_instruction_sample(sample):
    """Convert sample to instruction format for LLM training"""
    
    # Create instruction prompt
    instruction = "Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak)."
    
    # Create the response based on our labels
    if sample['label'] == 'safe':
        response = f"This content is safe. Category: {sample['category']}, Severity: {sample['severity']}"
    else:
        response = f"This content is {sample['label']}. Category: {sample['category']}, Severity: {sample['severity']}"
    
    # Create instruction-tuned format
    return {
        "instruction": instruction,
        "input": sample['text'],
        "output": response,
        "label": sample['label'],
        "category": sample['category'],
        "severity": sample['severity'],
        "is_algospeak": sample.get('is_algospeak_variant', False)
    }

# Convert all samples to instruction format
instruction_samples = []
for sample in all_samples:
    instruction_sample = create_instruction_sample(sample)
    instruction_samples.append(instruction_sample)

print("✅ Converted to instruction format!")
print("   Total instruction samples:", len(instruction_samples))

# Show examples
print("\n📝 Instruction format examples:")
for i in range(3):
    sample = instruction_samples[i]
    print(f"\n{i+1}. Instruction: {sample['instruction']}")
    print(f"   Input: \"{sample['input'][:60]}...\"")
    print(f"   Output: {sample['output']}")
    print(f"   Algospeak variant: {sample['is_algospeak']}")
    print("-" * 80)

# Show distribution
labels = [s['label'] for s in instruction_samples]
algospeak_count = sum(1 for s in instruction_samples if s['is_algospeak'])

print("\n📊 Final instruction dataset:")
print(f"   Total samples: {len(instruction_samples):,}")
print(f"   Algospeak variants: {algospeak_count:,} ({algospeak_count/len(instruction_samples)*100:.1f}%)")

from collections import Counter
print("\n📊 Label distribution:")
for label, count in Counter(labels).most_common():
    pct = (count / len(instruction_samples)) * 100
    print(f"   {label}: {count:,} ({pct:.1f}%)")

📝 Converting to instruction-following format...
✅ Converted to instruction format!
   Total instruction samples: 52913

📝 Instruction format examples:

1. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "This is so cool. It's like, 'would you want your mother to r..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

2. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "Thank you!! This would make my life a lot less anxiety-induc..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

3. Instruction: Analyze this content for harmfulness and

In [76]:
import polars as pl
df = pl.read_csv("train.csv")
df.head()

id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
i64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,f64,i64,str,i64,i64,i64,i64,i64,f64,i64,i64
59848,0.0,"""This is so cool. It's like, 'w…",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,"""2015-09-29 10:50:41.987077+00""",2,,2006,"""rejected""",0,0,0,0,0,0.0,0,4
59849,0.0,"""Thank you!! This would make my…",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,"""2015-09-29 10:50:42.870083+00""",2,,2006,"""rejected""",0,0,0,0,0,0.0,0,4
59852,0.0,"""This is such an urgent design …",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,"""2015-09-29 10:50:45.222647+00""",2,,2006,"""rejected""",0,0,0,0,0,0.0,0,4
59855,0.0,"""Is this something I'll be able…",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,"""2015-09-29 10:50:47.601894+00""",2,,2006,"""rejected""",0,0,0,0,0,0.0,0,4
59856,0.893617,"""haha you guys are a bunch of l…",0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""2015-09-29 10:50:48.488476+00""",2,,2006,"""rejected""",0,0,0,1,0,0.0,4,47


In [77]:
# Make it vertical like pandas df.info()
print("📊 Dataset Info (pandas-style):")
print("Shape:", df.shape)                           # Show rows and columns  
print("Memory usage:", df.estimated_size('mb'), "MB")  # Show memory usage
print("📋 Column count:", len(df.columns))
print()
print("Column info:")
print("   Column Name                          Data Type")
print("   " + "-" * 50)                             # Print separator line

# Loop through schema to print each column vertically
for col_name, col_type in df.schema.items():        # schema.items() gives (name, type) pairs
    print("   " + col_name.ljust(35) + str(col_type))  # ljust(35) makes column name left-aligned in 35 characters

📊 Dataset Info (pandas-style):
Shape: (1804874, 45)
Memory usage: 1159.576078414917 MB
📋 Column count: 45

Column info:
   Column Name                          Data Type
   --------------------------------------------------
   id                                 Int64
   target                             Float64
   comment_text                       String
   severe_toxicity                    Float64
   obscene                            Float64
   identity_attack                    Float64
   insult                             Float64
   threat                             Float64
   asian                              Float64
   atheist                            Float64
   bisexual                           Float64
   black                              Float64
   buddhist                           Float64
   christian                          Float64
   female                             Float64
   heterosexual                       Float64
   hindu                              Float

In [78]:
# Check for missing values in the columns we care about for content moderation
key_columns = ["comment_text", "target", "severe_toxicity", "obscene", "identity_attack", "insult", "threat"]

print("❓ Missing values in key columns:")
for col in key_columns:                           # Loop through each important column
    null_count = df.select(col).null_count().item()  # Count nulls in this column, .item() gets the actual number
    total_rows = df.shape[0]                      # Total number of rows
    percentage = (null_count / total_rows) * 100  # Calculate percentage of missing data
    print("   " + col + ": " + str(null_count) + " missing (" + str(round(percentage, 2)) + "%)")

❓ Missing values in key columns:
   comment_text: 0 missing (0.0%)
   target: 0 missing (0.0%)
   severe_toxicity: 0 missing (0.0%)
   obscene: 0 missing (0.0%)
   identity_attack: 0 missing (0.0%)
   insult: 0 missing (0.0%)
   threat: 0 missing (0.0%)


In [79]:
# Let's see the target column distribution to understand toxicity scores
print("📊 Target column (toxicity scores) statistics:")
target_stats = df.select("target").describe()  # Get basic statistics
print(target_stats)

📊 Target column (toxicity scores) statistics:
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ target     │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1.804874e6 │
│ null_count ┆ 0.0        │
│ mean       ┆ 0.103017   │
│ std        ┆ 0.197076   │
│ min        ┆ 0.0        │
│ 25%        ┆ 0.0        │
│ 50%        ┆ 0.0        │
│ 75%        ┆ 0.166667   │
│ max        ┆ 1.0        │
└────────────┴────────────┘


In [80]:
# Let's see some actual full comments (not truncated)
print("💬 Sample comments with full text:")
sample_data = df.select(["comment_text", "target", "insult", "threat"]).head(10)  # Get first 10 with key scores

for i in range(10):                           # Loop through each comment
    row = sample_data.row(i)                  # Get the row data  
    comment = str(row[0])                     # Full comment text (not truncated)
    target_score = row[1]                     # Toxicity score
    insult_score = row[2]                     # Insult score
    threat_score = row[3]                     # Threat score
    
    print("Row " + str(i+1) + ":")
    print("   Target: " + str(round(target_score, 3)) + " | Insult: " + str(round(insult_score, 3)) + " | Threat: " + str(round(threat_score, 3)))
    print("   Comment: " + comment)
    print("   " + "-" * 80)

💬 Sample comments with full text:
Row 1:
   Target: 0.0 | Insult: 0.0 | Threat: 0.0
   Comment: This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!
   --------------------------------------------------------------------------------
Row 2:
   Target: 0.0 | Insult: 0.0 | Threat: 0.0
   Comment: Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!
   --------------------------------------------------------------------------------
Row 3:
   Target: 0.0 | Insult: 0.0 | Threat: 0.0
   Comment: This is such an urgent design problem; kudos to you for taking it on. Very impressive!
   --------------------------------------------------------------------------------
Row 4:
   Target: 0.0 | Insult: 0.0 | Threat: 0.0
   Comment: Is this something I'll be able to install on my site? When will you be releasing it?
   --------------------------------------------------------------------------

In [81]:
# Let's see how many comments fall into different toxicity ranges
print("📊 Toxicity level distribution:")

# Create ranges similar to our content moderation categories
high_toxic = df.filter(df["target"] >= 0.8).shape[0]          # Very toxic (>= 0.8)
moderate_toxic = df.filter((df["target"] >= 0.5) & (df["target"] < 0.8)).shape[0]  # Moderate toxic (0.5-0.8)
low_toxic = df.filter((df["target"] >= 0.2) & (df["target"] < 0.5)).shape[0]       # Low toxic (0.2-0.5) 
safe = df.filter(df["target"] < 0.2).shape[0]                 # Safe (< 0.2)


total = df.shape[0]
print("   Extremely harmful (>= 0.8): " + str(high_toxic) + " (" + str(round(high_toxic/total*100, 1)) + "%)")
print("   Harmful (0.5-0.8): " + str(moderate_toxic) + " (" + str(round(moderate_toxic/total*100, 1)) + "%)")  
print("   Potentially harmful (0.2-0.5): " + str(low_toxic) + " (" + str(round(low_toxic/total*100, 1)) + "%)")
print("   Safe (< 0.2): " + str(safe) + " (" + str(round(safe/total*100, 1)) + "%)")

📊 Toxicity level distribution:
   Extremely harmful (>= 0.8): 30831 (1.7%)
   Harmful (0.5-0.8): 113503 (6.3%)
   Potentially harmful (0.2-0.5): 233352 (12.9%)
   Safe (< 0.2): 1427188 (79.1%)


In [82]:
# Let's explore the different types of toxicity in our dataset
print("🔍 Toxicity subcategory analysis:")
print()

# Get basic stats for each toxicity type  
toxicity_columns = ["severe_toxicity", "obscene", "identity_attack", "insult", "threat"]

for col in toxicity_columns:                    # Loop through each toxicity type
    stats = df.select(col).describe()           # Get statistics for this column
    mean_val = stats.filter(stats["statistic"] == "mean")[col].item()  # Extract mean value
    
    # Count how many comments have this type of toxicity (> 0.5 threshold)
    toxic_count = df.filter(df[col] >= 0.5).shape[0]   # Count comments above 0.5
    percentage = round(toxic_count / df.shape[0] * 100, 1)  # Calculate percentage
    
    print("📊 " + col + ":")
    print("   Mean score: " + str(round(mean_val, 3)))      # Average score
    print("   Toxic count: " + str(toxic_count) + " (" + str(percentage) + "%)")  # Count above 0.5
    print()

🔍 Toxicity subcategory analysis:

📊 severe_toxicity:
   Mean score: 0.005
   Toxic count: 13 (0.0%)

📊 obscene:
   Mean score: 0.014
   Toxic count: 9603 (0.5%)

📊 identity_attack:
   Mean score: 0.023
   Toxic count: 13410 (0.7%)

📊 insult:
   Mean score: 0.081
   Toxic count: 106534 (5.9%)

📊 threat:
   Mean score: 0.009
   Toxic count: 4280 (0.2%)



In [83]:
# Let's start preparing our training data
print("🎯 Data preparation for fine-tuning:")
print()

# First, let's see what identity columns we have for comprehensive coverage
identity_columns = ["asian", "atheist", "bisexual", "black", "buddhist", "christian", 
                   "female", "heterosexual", "hindu", "homosexual_gay_or_lesbian",
                   "intellectual_or_learning_disability", "jewish", "latino", "male", 
                   "muslim", "transgender", "white"]

print("🏷️ Identity-based annotation columns available:")
for i, col in enumerate(identity_columns, 1):           # Loop through identity columns with numbers
    count = df.filter(df[col] >= 0.5).shape[0]         # Count comments with this identity annotation
    if count > 0:                                       # Only show columns that have data
        print("   " + str(i) + ". " + col + ": " + str(count) + " comments")

🎯 Data preparation for fine-tuning:

🏷️ Identity-based annotation columns available:
   1. asian: 4578 comments
   2. atheist: 1412 comments
   3. bisexual: 287 comments
   4. black: 14901 comments
   5. buddhist: 588 comments
   6. christian: 40423 comments
   7. female: 53429 comments
   8. heterosexual: 1291 comments
   9. hindu: 580 comments
   10. homosexual_gay_or_lesbian: 10997 comments
   11. intellectual_or_learning_disability: 93 comments
   12. jewish: 7651 comments
   13. latino: 2004 comments
   14. male: 44484 comments
   15. muslim: 21006 comments
   16. transgender: 2499 comments
   17. white: 25082 comments


In [84]:
# Let's see what short comments we're removing to decide if 10 chars is right
print("🔍 Analyzing short comments we're removing...")

# Get comments that are less than 10 characters
short_comments = df.filter(pl.col("comment_text").str.len_chars() < 10)
print("   Short comments count:", short_comments.shape[0])

# Look at some examples of short comments and their toxicity scores
sample_short = short_comments.select(["comment_text", "target", "insult", "threat"]).head(20)

print("\n📝 Examples of short comments (< 10 chars):")
for i in range(min(20, sample_short.shape[0])):
    row = sample_short.row(i)
    comment = str(row[0])
    target = row[1]
    insult = row[2] 
    threat = row[3]
    
    print(f"   '{comment}' (len={len(comment)}) | Target: {target:.3f} | Insult: {insult:.3f} | Threat: {threat:.3f}")

# Check how many short comments are actually harmful
harmful_short = short_comments.filter(pl.col("target") >= 0.5).shape[0]
print(f"\n🚨 Harmful short comments (target >= 0.5): {harmful_short}")
print(f"   That's {harmful_short/short_comments.shape[0]*100:.1f}% of short comments")

🔍 Analyzing short comments we're removing...
   Short comments count: 12927

📝 Examples of short comments (< 10 chars):
   'Awesome!' (len=8) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'Me too!' (len=7) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   ':(' (len=2) | Target: 0.100 | Insult: 0.000 | Threat: 0.000
   'I agree!' (len=8) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'Perhaps!' (len=8) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   ';)' (len=2) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'log' (len=3) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   '101th!' (len=6) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'No' (len=2) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'yes' (len=3) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'Spot on.' (len=8) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'YES!' (len=4) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'BYE!' (len=4) | Target: 0.000 | Insult: 0.000 | Threat: 0.000
   'lovel

In [85]:
# Better cleaning approach - keep harmful short comments, remove only junk
print("🧹 Smarter dataset cleaning...")

# Remove rows with missing key data
df_clean = df.filter(
    pl.col("comment_text").is_not_null() & 
    pl.col("target").is_not_null()
)

# Keep ALL harmful content regardless of length (target >= 0.2)
# Only filter short comments that are safe (target < 0.2)
df_clean = df_clean.filter(
    (pl.col("comment_text").str.len_chars() >= 10) |  # Keep long comments OR
    (pl.col("target") >= 0.2)                         # Keep harmful short comments
)

print("✅ Smart cleaning complete!")
print("   Original rows:", df.shape[0])
print("   Clean rows:", df_clean.shape[0])
print("   Removed:", df.shape[0] - df_clean.shape[0], "rows")

# Verify we kept the harmful short comments
harmful_short_kept = df_clean.filter(
    (pl.col("comment_text").str.len_chars() < 10) & 
    (pl.col("target") >= 0.5)
).shape[0]

print("   🎯 Harmful short comments kept:", harmful_short_kept)

🧹 Smarter dataset cleaning...
✅ Smart cleaning complete!
   Original rows: 1804874
   Clean rows: 1793024
   Removed: 11850 rows
   🎯 Harmful short comments kept: 615


In [86]:
## 🎯 Map to Our Content Moderation Categories

# Create our classification system mapping
def get_label_and_category(target, severe, obscene, insult, threat, identity_attack):
    """Map Jigsaw scores to our content moderation system"""
    
    # Determine primary label based on target score
    if target >= 0.8:
        label = "extremely_harmful"
    elif target >= 0.5:
        label = "harmful"  
    elif target >= 0.2:
        label = "potentially_harmful"
    else:
        label = "safe"
    
    # Determine category based on highest subcategory score
    if identity_attack > 0.3:
        category = "hate_speech"
    elif threat > 0.3:
        category = "threats"
    elif insult > 0.3:
        category = "harassment"
    elif obscene > 0.3:
        category = "profanity"
    elif severe > 0.3:
        category = "severe"
    else:
        category = "general"
    
    # Calculate severity (0-3 scale)
    if target >= 0.8: 
        severity = 3
    elif target >= 0.5: 
        severity = 2  
    elif target >= 0.2: 
        severity = 1
    else:
        severity = 0
    
    return label, category, severity

# Apply the mapping
print("🏷️ Mapping to content moderation categories...")

# Create a list to store results (first 50k for speed)
sample_size = 50000  # Process subset first to test
mapped_data = []

count = 0
for row in df_clean.iter_rows():
    if count >= sample_size:
        break
        
    # Get the relevant scores
    target = row[1]  # target column
    severe = row[3]  # severe_toxicity  
    obscene = row[4]  # obscene
    identity_attack = row[5]  # identity_attack
    insult = row[6]  # insult
    threat = row[7]  # threat
    
    # Get our mappings
    label, category, severity = get_label_and_category(target, severe, obscene, insult, threat, identity_attack)
    
    # Store the result
    mapped_data.append({
        'id': row[0],
        'text': row[2],  # comment_text
        'target': target,
        'label': label,
        'category': category, 
        'severity': severity
    })
    
    count += 1

print("✅ Categories created!")
print("   Processed:", len(mapped_data), "comments")

# Show distribution
from collections import Counter
labels = [item['label'] for item in mapped_data]
categories = [item['category'] for item in mapped_data]

print("\n📊 Label distribution:")
for label, count in Counter(labels).most_common():
    pct = (count / len(mapped_data)) * 100
    print(f"   {label}: {count:,} ({pct:.1f}%)")

print("\n📊 Category distribution:")
for category, count in Counter(categories).most_common():
    pct = (count / len(mapped_data)) * 100
    print(f"   {category}: {count:,} ({pct:.1f}%)")

🏷️ Mapping to content moderation categories...
✅ Categories created!
   Processed: 50000 comments

📊 Label distribution:
   safe: 41,145 (82.3%)
   potentially_harmful: 5,520 (11.0%)
   harmful: 2,589 (5.2%)
   extremely_harmful: 746 (1.5%)

📊 Category distribution:
   general: 45,828 (91.7%)
   harassment: 3,299 (6.6%)
   hate_speech: 458 (0.9%)
   threats: 265 (0.5%)
   profanity: 150 (0.3%)


In [None]:
## 🎭 Create Algospeak Variants

# Load our algospeak patterns from Stage 1
import json
import random

print("📖 Loading algospeak patterns...")

# Load patterns from local dataset folder
with open('algospeak_patterns.json', 'r') as f:
    patterns = json.load(f)

print("✅ Patterns loaded!")
print("   Direct mappings:", len(patterns.get('direct_mappings', {})))
print("   Homophones:", len(patterns.get('homophones', {})))
print("   Leetspeak:", len(patterns.get('leetspeak', {})))

# Create reverse mapping (normal word -> algospeak variants)
reverse_mappings = {}

# Add direct mappings (algospeak -> normal becomes normal -> [algospeak])
for algospeak, normal in patterns.get('direct_mappings', {}).items():
    if normal not in reverse_mappings:
        reverse_mappings[normal] = []
    reverse_mappings[normal].append(algospeak)

# Add homophones and leetspeak
for pattern_type in ['homophones', 'leetspeak']:
    for algospeak, normal in patterns.get(pattern_type, {}).items():
        if normal not in reverse_mappings:
            reverse_mappings[normal] = []
        reverse_mappings[normal].append(algospeak)

print("🔄 Reverse mappings created for", len(reverse_mappings), "words")

# Show some examples
print("\n📝 Example mappings:")
for word, variants in list(reverse_mappings.items())[:5]:
    print(f"   {word} → {variants}")

📖 Loading algospeak patterns...
✅ Patterns loaded!
   Direct mappings: 74
   Homophones: 10
   Leetspeak: 11
🔄 Reverse mappings created for 60 words

📝 Example mappings:
   kill → ['unalive']
   killing → ['unaliving']
   killed → ['unalived']
   suicide → ['sewer slide', 'endgame', '13×']
   commit suicide → ['kermit sewer slide']


In [88]:
## 🔀 Generate Algospeak Variants

import re

print("🎭 Creating algospeak variants...")

# Focus on harmful content (where algospeak is most used)
harmful_samples = [item for item in mapped_data 
                  if item['label'] in ['harmful', 'extremely_harmful', 'potentially_harmful']]

print("📊 Working with", len(harmful_samples), "harmful samples")

algospeak_variants = []
variant_count = 0

for sample in harmful_samples:
    text = sample['text'].lower()
    words = text.split()
    
    # Try to create algospeak variants
    for i, word in enumerate(words):
        # Clean the word (remove punctuation)
        clean_word = re.sub(r'[^\w\s]', '', word)
        
        # Check if this word has algospeak variants
        if clean_word in reverse_mappings:
            # Create variants with algospeak substitutions
            for algospeak_variant in reverse_mappings[clean_word][:2]:  # Max 2 variants per word
                new_words = words.copy()
                new_words[i] = algospeak_variant
                variant_text = ' '.join(new_words)
                
                # Don't create variants identical to original
                if variant_text != text:
                    algospeak_variants.append({
                        'id': f"algospeak_{variant_count}",
                        'text': variant_text,
                        'target': sample['target'],
                        'label': sample['label'],
                        'category': sample['category'], 
                        'severity': sample['severity'],
                        'is_algospeak_variant': True,
                        'original_text': sample['text']
                    })
                    variant_count += 1

print("✅ Created", len(algospeak_variants), "algospeak variants!")

# Show some examples
print("\n📝 Algospeak variant examples:")
for i, variant in enumerate(algospeak_variants[:5]):
    print(f"{i+1}. Original: \"{variant['original_text'][:50]}...\"")
    print(f"   Variant:  \"{variant['text'][:50]}...\"")
    print(f"   Category: {variant['category']} ({variant['label']})")
    print("-" * 60)

# Combine original data with variants
all_samples = mapped_data + algospeak_variants
print(f"\n📊 Final training dataset:")
print(f"   Original samples: {len(mapped_data):,}")
print(f"   Algospeak variants: {len(algospeak_variants):,}")
print(f"   Total samples: {len(all_samples):,}")

🎭 Creating algospeak variants...
📊 Working with 8855 harmful samples
✅ Created 2913 algospeak variants!

📝 Algospeak variant examples:
1. Original: "This is a great story. Man. I wonder if the person..."
   Variant:  "this is a great story. man. i wonder if the person..."
   Category: harassment (potentially_harmful)
------------------------------------------------------------
2. Original: "This bitch is nuts. Who would read a book by a wom..."
   Variant:  "this b1tch is nuts. who would read a book by a wom..."
   Category: hate_speech (extremely_harmful)
------------------------------------------------------------
3. Original: "“Will Whiteness History Month make all white peopl..."
   Variant:  "“will whiteness history month make all white peopl..."
   Category: hate_speech (harmful)
------------------------------------------------------------
4. Original: "“Will Whiteness History Month make all white peopl..."
   Variant:  "“will whiteness history month make all white peopl..."
   C

In [89]:
## 📝 Convert to Instruction-Tuned Format

print("📝 Converting to instruction-following format...")

def create_instruction_sample(sample):
    """Convert sample to instruction format for LLM training"""
    
    # Create instruction prompt
    instruction = "Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak)."
    
    # Create the response based on our labels
    if sample['label'] == 'safe':
        response = f"This content is safe. Category: {sample['category']}, Severity: {sample['severity']}"
    else:
        response = f"This content is {sample['label']}. Category: {sample['category']}, Severity: {sample['severity']}"
    
    # Create instruction-tuned format
    return {
        "instruction": instruction,
        "input": sample['text'],
        "output": response,
        "label": sample['label'],
        "category": sample['category'],
        "severity": sample['severity'],
        "is_algospeak": sample.get('is_algospeak_variant', False)
    }

# Convert all samples to instruction format
instruction_samples = []
for sample in all_samples:
    instruction_sample = create_instruction_sample(sample)
    instruction_samples.append(instruction_sample)

print("✅ Converted to instruction format!")
print("   Total instruction samples:", len(instruction_samples))

# Show examples
print("\n📝 Instruction format examples:")
for i in range(3):
    sample = instruction_samples[i]
    print(f"\n{i+1}. Instruction: {sample['instruction']}")
    print(f"   Input: \"{sample['input'][:60]}...\"")
    print(f"   Output: {sample['output']}")
    print(f"   Algospeak variant: {sample['is_algospeak']}")
    print("-" * 80)

# Show distribution
labels = [s['label'] for s in instruction_samples]
algospeak_count = sum(1 for s in instruction_samples if s['is_algospeak'])

print("\n📊 Final instruction dataset:")
print(f"   Total samples: {len(instruction_samples):,}")
print(f"   Algospeak variants: {algospeak_count:,} ({algospeak_count/len(instruction_samples)*100:.1f}%)")

from collections import Counter
print("\n📊 Label distribution:")
for label, count in Counter(labels).most_common():
    pct = (count / len(instruction_samples)) * 100
    print(f"   {label}: {count:,} ({pct:.1f}%)")

📝 Converting to instruction-following format...
✅ Converted to instruction format!
   Total instruction samples: 52913

📝 Instruction format examples:

1. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "This is so cool. It's like, 'would you want your mother to r..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

2. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "Thank you!! This would make my life a lot less anxiety-induc..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

3. Instruction: Analyze this content for harmfulness and

In [90]:
## 💾 Export Training Dataset for Google Colab

import json

print("💾 Exporting training dataset for Google Colab...")

# Export to JSON format in dataset folder (at root level)
output_file = "../dataset/training_dataset_colab.json"  # Save to dataset/ folder at root

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(instruction_samples, f, ensure_ascii=False, indent=2)

# Get file size
import os
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)

print("✅ Training dataset exported!")
print(f"   File: dataset/{output_file}")
print(f"   Size: {file_size_mb:.1f} MB")
print(f"   Samples: {len(instruction_samples):,}")
print(f"   Algospeak variants: {sum(1 for s in instruction_samples if s['is_algospeak']):,}")

# Show final summary
print("\n🎯 Dataset Summary for TrustLab Interview:")
print("   ✅ Jigsaw dataset processed with Polars")
print("   ✅ Smart cleaning (kept harmful short content)")  
print("   ✅ Content moderation categories mapped")
print("   ✅ Algospeak variants created from Stage 1 patterns")
print("   ✅ Instruction-tuned format for Qwen2.5-3B")
print("   ✅ Ready for Google Colab QLoRA fine-tuning!")

print(f"\n📁 File saved in: {output_file}")
print("🚀 When ready: Copy this file and upload to Google Colab!")

💾 Exporting training dataset for Google Colab...
✅ Training dataset exported!
   File: dataset/training_dataset_colab.json
   Size: 34.0 MB
   Samples: 52,913
   Algospeak variants: 2,913

🎯 Dataset Summary for TrustLab Interview:
   ✅ Jigsaw dataset processed with Polars
   ✅ Smart cleaning (kept harmful short content)
   ✅ Content moderation categories mapped
   ✅ Algospeak variants created from Stage 1 patterns
   ✅ Instruction-tuned format for Qwen2.5-3B
   ✅ Ready for Google Colab QLoRA fine-tuning!

📁 File saved in: /dataset/training_dataset_colab.json
🚀 When ready: Copy this file and upload to Google Colab!


In [None]:
## 📝 Convert to Instruction-Tuned Format

print("📝 Converting to instruction-following format...")

def create_instruction_sample(sample):
    """Convert sample to instruction format for LLM training"""
    
    # Create instruction prompt
    instruction = "Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak)."
    
    # Create the response based on our labels
    if sample['label'] == 'safe':
        response = f"This content is safe. Category: {sample['category']}, Severity: {sample['severity']}"
    else:
        response = f"This content is {sample['label']}. Category: {sample['category']}, Severity: {sample['severity']}"
    
    # Create instruction-tuned format
    return {
        "instruction": instruction,
        "input": sample['text'],
        "output": response,
        "label": sample['label'],
        "category": sample['category'],
        "severity": sample['severity'],
        "is_algospeak": sample.get('is_algospeak_variant', False)
    }

# Convert all samples to instruction format
instruction_samples = []
for sample in all_samples:
    instruction_sample = create_instruction_sample(sample)
    instruction_samples.append(instruction_sample)

print("✅ Converted to instruction format!")
print("   Total instruction samples:", len(instruction_samples))

# Show examples
print("\n📝 Instruction format examples:")
for i in range(3):
    sample = instruction_samples[i]
    print(f"\n{i+1}. Instruction: {sample['instruction']}")
    print(f"   Input: \"{sample['input'][:60]}...\"")
    print(f"   Output: {sample['output']}")
    print(f"   Algospeak variant: {sample['is_algospeak']}")
    print("-" * 80)

# Show distribution
labels = [s['label'] for s in instruction_samples]
algospeak_count = sum(1 for s in instruction_samples if s['is_algospeak'])

print("\n📊 Final instruction dataset:")
print(f"   Total samples: {len(instruction_samples):,}")
print(f"   Algospeak variants: {algospeak_count:,} ({algospeak_count/len(instruction_samples)*100:.1f}%)")

from collections import Counter
print("\n📊 Label distribution:")
for label, count in Counter(labels).most_common():
    pct = (count / len(instruction_samples)) * 100
    print(f"   {label}: {count:,} ({pct:.1f}%)")

📝 Converting to instruction-following format...
✅ Converted to instruction format!
   Total instruction samples: 52913

📝 Instruction format examples:

1. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "This is so cool. It's like, 'would you want your mother to r..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

2. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "Thank you!! This would make my life a lot less anxiety-induc..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

3. Instruction: Analyze this content for harmfulness and

In [None]:
## 📝 Convert to Instruction-Tuned Format

print("📝 Converting to instruction-following format...")

def create_instruction_sample(sample):
    """Convert sample to instruction format for LLM training"""
    
    # Create instruction prompt
    instruction = "Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak)."
    
    # Create the response based on our labels
    if sample['label'] == 'safe':
        response = f"This content is safe. Category: {sample['category']}, Severity: {sample['severity']}"
    else:
        response = f"This content is {sample['label']}. Category: {sample['category']}, Severity: {sample['severity']}"
    
    # Create instruction-tuned format
    return {
        "instruction": instruction,
        "input": sample['text'],
        "output": response,
        "label": sample['label'],
        "category": sample['category'],
        "severity": sample['severity'],
        "is_algospeak": sample.get('is_algospeak_variant', False)
    }

# Convert all samples to instruction format
instruction_samples = []
for sample in all_samples:
    instruction_sample = create_instruction_sample(sample)
    instruction_samples.append(instruction_sample)

print("✅ Converted to instruction format!")
print("   Total instruction samples:", len(instruction_samples))

# Show examples
print("\n📝 Instruction format examples:")
for i in range(3):
    sample = instruction_samples[i]
    print(f"\n{i+1}. Instruction: {sample['instruction']}")
    print(f"   Input: \"{sample['input'][:60]}...\"")
    print(f"   Output: {sample['output']}")
    print(f"   Algospeak variant: {sample['is_algospeak']}")
    print("-" * 80)

# Show distribution
labels = [s['label'] for s in instruction_samples]
algospeak_count = sum(1 for s in instruction_samples if s['is_algospeak'])

print("\n📊 Final instruction dataset:")
print(f"   Total samples: {len(instruction_samples):,}")
print(f"   Algospeak variants: {algospeak_count:,} ({algospeak_count/len(instruction_samples)*100:.1f}%)")

from collections import Counter
print("\n📊 Label distribution:")
for label, count in Counter(labels).most_common():
    pct = (count / len(instruction_samples)) * 100
    print(f"   {label}: {count:,} ({pct:.1f}%)")

📝 Converting to instruction-following format...
✅ Converted to instruction format!
   Total instruction samples: 52913

📝 Instruction format examples:

1. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "This is so cool. It's like, 'would you want your mother to r..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

2. Instruction: Analyze this content for harmfulness and classify it appropriately. Consider both direct language and coded language (algospeak).
   Input: "Thank you!! This would make my life a lot less anxiety-induc..."
   Output: This content is safe. Category: general, Severity: 0
   Algospeak variant: False
--------------------------------------------------------------------------------

3. Instruction: Analyze this content for harmfulness and