In [15]:
#toxicity detection model - want -> a contextual language model akin to BERT/RoBERTa/XLNet
#want -> 
# - detection for obvious slang
# - not associate repeated punctuation marks with negative scoring
# - ability to provide scoring that is as accurate as possible for sarcastic comments
# - more..? 

In [16]:
#core data handling
import os
import json
from typing import List, Optional, Any

import pandas as pd
import numpy as np

from datasets import load_dataset #rtp streaming

#models + evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    classification_report,
)

import secrets
#seed for the stochastic steps of this notebook
# - drawn fresh on every run unless the RANDOM_SEED environment variable pins it
# - printed so that a given run can be reproduced later by exporting the same value
_seed_override = os.environ.get("RANDOM_SEED")
RANDOM_SEED = int(_seed_override) if _seed_override else secrets.randbelow(10000000)
print("Random seed:", RANDOM_SEED)

Helpers for data processing

In [17]:
#Data cleaning
# - if text/comment field is NaN (empty) -> ""
# - "  hello world!  " -> "hello world!" (only surrounding whitespace is removed)
# - o/w convert to string + strip surrounding whitespace
def cleanText(s: Any) -> str:
    """Normalize a raw text cell into a whitespace-trimmed string.

    Args:
        s: value taken from a text column - a string, a missing-value marker
            (None / NaN / pd.NA / NaT) or any other object.

    Returns:
        "" for a missing scalar, otherwise str(s) with leading/trailing
        whitespace removed. Inner whitespace and punctuation are left as-is.
    """
    #strings are the common case and can never be "missing"
    if isinstance(s, str):
        return s.strip()
    #pd.isna() on a list/array returns an array whose truth value is ambiguous,
    #so only scalars go through the missing-value check
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return ""
    return str(s).strip()

#linear scaling: 
# - clip value (if needed) to lie within the [prev_min, prev_max] range -> prevent outliers
# - normalize/scale the value now in [0, 1]  
# - stretch to [1, 5] from prev range!
def toToxicRange(value: float, prev_min: float, prev_max: float) -> float:
    """Linearly rescale a score from [prev_min, prev_max] onto the [1, 5] scale.

    Args:
        value: score on the source scale; None / NaN are passed through as NaN.
        prev_min: lower bound of the source scale (mapped to 1).
        prev_max: upper bound of the source scale (mapped to 5).

    Returns:
        A float in [1, 5], or NaN when `value` is missing. Values outside the
        source range are clipped to the nearest bound before scaling.

    Raises:
        ValueError: if prev_max is not strictly greater than prev_min.
    """
    #an empty or inverted range would divide by zero or silently flip/flatten the scale
    if prev_max <= prev_min:
        raise ValueError(
            f"prev_max ({prev_max}) must be greater than prev_min ({prev_min})"
        )
    #None is checked first because np.isnan() cannot handle it
    if value is None or np.isnan(value):
        return np.nan
    clipped = float(np.clip(value, prev_min, prev_max))
    scaled = (clipped - prev_min) / (prev_max - prev_min)  #now in [0, 1]
    return 1 + 4 * scaled

#Removes only:
# - empty strings
# - missing values
# - '[deleted]', '[removed]'
# - pure punctuation - texts with no letters/numbers (e.g. '!!!', '...', '?!!?')
def isInformativeText(s: str) -> bool:
    """Tell whether a comment is worth keeping.

    Returns False for empty / whitespace-only text, for the placeholders
    '[deleted]' and '[removed]' (compared case-insensitively after trimming)
    and for text that isPunctuationOnly() reports as pure punctuation.
    Everything else returns True.
    """
    if not s:
        return False
    normalized = s.strip().lower()
    is_placeholder = normalized in {"", "[deleted]", "[removed]"}
    #the punctuation check only runs when the text is not already a placeholder
    return not (is_placeholder or isPunctuationOnly(normalized))

def isPunctuationOnly(s: str) -> bool:
    """Tell whether a text consists of nothing but punctuation marks and whitespace.

    Args:
        s: text to inspect.

    Returns:
        True when every character is whitespace or one of the allowed
        punctuation characters below (so '', '   ', '!!!' and '!!! ???' are
        all True); False as soon as any other character appears.
    """
    #allowed punctuation chars
    punct_chars = set("!?,.;:-—'\"()[]{}*/\\")
    #whitespace between marks (e.g. '!!! ???', '. . .') must not make the text
    #look informative; all() over an empty string is True, which keeps the
    #"empty / whitespace-only -> True" rule
    return all(ch in punct_chars or ch.isspace() for ch in s)


Jigsaw unintended bias

In [18]:
#load jigsaw data from csv and turn multiple binary toxicity columns into a 1–5 score
# - reads only `max_rows` rows for speed (if given!)
# - returns df with ['text', 'toxicity_score']
def loadJigsaw(
    path: str,
    text_col: str = "comment_text",
    tox_cols: Optional[List[str]] = None,
    max_rows: Optional[int] = 6000,
) -> pd.DataFrame:
    """Read a Jigsaw csv and collapse its toxicity label columns into one 1-5 score.

    Args:
        path: csv file to read.
        text_col: column holding the comment text.
        tox_cols: label columns to add up; defaults to the six columns listed
            below. Requested columns missing from the file are skipped.
        max_rows: number of rows to read (None reads the whole file).

    Returns:
        DataFrame with the columns ['text', 'toxicity_score'].

    Raises:
        ValueError: if none of the requested label columns is in the file.
    """
    label_cols = tox_cols if tox_cols is not None else [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]

    frame = pd.read_csv(path, nrows=max_rows)
    frame["text"] = frame[text_col].apply(cleanText)

    #only the label columns that really are in the file take part in the sum
    present = [col for col in label_cols if col in frame.columns]
    if len(present) == 0:
        raise ValueError(
            f"This column is not present! Here's what we DO have -> {list(frame.columns)}"
        )

    #unparseable / missing labels count as 0
    for col in present:
        frame[col] = pd.to_numeric(frame[col], errors="coerce").fillna(0.0)

    #largest possible row sum if every label equals 1
    upper = float(len(present))
    frame["raw_toxic_sum"] = frame[present].sum(axis=1)

    #[0..upper] -> [1..5]
    frame["toxicity_score"] = frame["raw_toxic_sum"].apply(
        lambda total: toToxicRange(total, 0.0, upper)
    )

    return frame[["text", "toxicity_score"]]

Reddit Annotated (Ruddit)

In [19]:
#loads ruddit csv with each row score in [-1, 1] -> converted 
# to a [1–5] toxicity scale
def loadRuddit(
    path: str,
    text_col: str = "body",
    score_col: str = "score",
    max_rows: Optional[int] = None,  #default None -> read the whole (short) dataset
) -> pd.DataFrame:
    """Read a Ruddit csv and map its score column from [-1, 1] to the 1-5 scale.

    Args:
        path: csv file to read.
        text_col: column holding the comment text.
        score_col: column holding the score; values that cannot be parsed as
            numbers are treated as missing.
        max_rows: number of rows to read (None reads the whole file).

    Returns:
        DataFrame with the columns ['text', 'toxicity_score']; rows without a
        usable score are left out.
    """
    frame = pd.read_csv(path, nrows=max_rows)
    frame["text"] = frame[text_col].apply(cleanText)
    frame[score_col] = pd.to_numeric(frame[score_col], errors="coerce")

    #rows whose score is missing / unparseable are dropped
    scored = frame.dropna(subset=[score_col])

    #[-1, 1] -> [1, 5]
    rescaled = scored[score_col].apply(lambda raw: toToxicRange(raw, -1.0, 1.0))

    return scored.assign(toxicity_score=rescaled)[["text", "toxicity_score"]]

Real Toxicity Prompts (allenai) - processing

In [20]:
#streaming sample of RTP data from huggingface 
# - convert to [1–5] scale
# - uses 'continuation.toxicity' if available, else 'prompt.toxicity'
# - toxicity scores are originally in [0,1]
def _firstNotNone(*candidates):
    """Return the first candidate that is not None (None when all of them are)."""
    for candidate in candidates:
        if candidate is not None:
            return candidate
    return None


def loadRTP(n_samples: int = 6000, prefer_continuation: bool = True) -> pd.DataFrame:
    """Stream the first records of allenai/real-toxicity-prompts and score them 1-5.

    Args:
        n_samples: number of streamed records to read (records whose toxicity
            turns out to be missing still count towards this number).
        prefer_continuation: when True and the continuation has text, only the
            continuation is used as text; otherwise prompt and continuation
            text are joined with a space.

    Returns:
        DataFrame with the columns ['text', 'toxicity_score']. The score is the
        continuation toxicity when it is set, else the prompt toxicity, mapped
        from [0, 1] to [1, 5]; it is NaN when neither is usable.
    """
    stream = load_dataset(
        "allenai/real-toxicity-prompts",
        split="train",
        streaming=True
    )

    rows = []
    for i, rec in enumerate(stream):
        if i >= n_samples:
            break

        #a field that is present but null (or not a dict) is handled like an empty one,
        #so the .get() calls below can never hit a non-dict
        p = rec.get("prompt")
        c = rec.get("continuation")
        if not isinstance(p, dict):
            p = {}
        if not isinstance(c, dict):
            c = {}

        prompt_text = cleanText(p.get("text", ""))
        cont_text = cleanText(c.get("text", ""))

        if prefer_continuation and cont_text:
            text = cont_text
        else:
            text = (prompt_text + " " + cont_text).strip()

        #dict.get(key, default) only falls back when the key is ABSENT; a key
        #holding None would win over a valid prompt score, so test for None explicitly
        #NOTE(review): with joined prompt+continuation text the label is still the
        #continuation score first - confirm that this is the intended label
        tox = _firstNotNone(c.get("toxicity"), p.get("toxicity"))

        rows.append({"text": text, "raw_toxicity": tox})

    #explicit columns keep the frame well-formed even when no record was read
    df = pd.DataFrame(rows, columns=["text", "raw_toxicity"])
    df["raw_toxicity"] = pd.to_numeric(df["raw_toxicity"], errors="coerce")
    df["toxicity_score"] = df["raw_toxicity"].apply(
        lambda v: toToxicRange(v, 0.0, 1.0)
    )

    return df[["text", "toxicity_score"]]


clean and merge helpers

In [21]:
#applies a 2nd round of cleaning:
# - enforces text type,
# - removes empty / '[deleted]' / '[removed]' / punctuation-only texts - if any
# - removes rows with a missing toxicity score
# - drop duplicates
def cleanAndMergeTrainingData(df: pd.DataFrame) -> pd.DataFrame:
    """Second cleaning pass over a ['text', 'toxicity_score'] frame.

    Works on a copy, so the caller's frame is left untouched.

    Steps:
        - normalize every text with cleanText()
        - keep only rows whose text passes isInformativeText()
        - drop rows with a missing toxicity_score
        - drop repeated texts (the first occurrence is kept)

    Returns:
        The cleaned frame with a fresh 0..n-1 index.
    """
    cleaned = df.copy()
    cleaned["text"] = cleaned["text"].apply(cleanText)

    #force a bool mask: on an empty frame apply() may hand back a non-boolean
    #Series, which pandas would read as a list of column labels instead of a row mask
    keep = cleaned["text"].apply(isInformativeText).astype(bool)
    cleaned = cleaned[keep]

    #dropna already covers every missing marker, so no second NaN filter is needed
    cleaned = cleaned.dropna(subset=["toxicity_score"])
    cleaned = cleaned.drop_duplicates(subset=["text"])

    return cleaned.reset_index(drop=True)

#merge multiple [text, toxicity_score] dfs into one
def mergeDatasets(dfs: List[pd.DataFrame], random_state: Optional[int] = 42) -> pd.DataFrame:
    """Stack several ['text', 'toxicity_score'] frames, clean them and shuffle the rows.

    Args:
        dfs: frames to merge; columns other than the two required ones are ignored.
        random_state: seed for the final shuffle. The default 42 reproduces the
            previous fixed ordering; None gives a different order on every call.

    Returns:
        One cleaned, de-duplicated and shuffled frame with a fresh 0..n-1 index.

    Raises:
        ValueError: if no frame is given or a frame lacks a required column.
    """
    required = ["text", "toxicity_score"]

    parts = []
    for d in dfs:
        if not set(required).issubset(d.columns):
            raise ValueError("Dataset missing required columns.")
        #selecting columns already yields a new frame - no extra copy needed
        parts.append(d[required])

    #fail with a clear message instead of pandas' generic concat error
    if not parts:
        raise ValueError("No datasets to merge.")

    combined = pd.concat(parts, ignore_index=True)
    combined = cleanAndMergeTrainingData(combined)
    combined = combined.sample(frac=1.0, random_state=random_state)

    return combined.reset_index(drop=True)


In [22]:
#build the three source frames - each loader returns ['text', 'toxicity_score']
jigsawDF = loadJigsaw("jigsaw_train.csv")
rudditDF = loadRuddit("ruddit_train.csv")
rtpDF = loadRTP(6000)

#shape of every source, one per line -> (# of entries, # of columns)
print(jigsawDF.shape, rudditDF.shape, rtpDF.shape, sep="\n")

#clean, de-duplicate and shuffle all sources into a single frame
combinedDF = mergeDatasets([jigsawDF, rudditDF, rtpDF])
print("Combined dataframe looks like this -> ", combinedDF.shape)
print(combinedDF.head())

#write the merged data to disk (without the index column)
combinedDF.to_csv("combined_toxicity.csv", index=False)

(6000, 2)
(5966, 2)
(6000, 2)
Combined dataframe looks like this ->  (17667, 2)
                                                text  toxicity_score
0  It's not blind if we are following other count...        2.666000
1  week earlier in a 47-34 loss at Mississippi St...        1.309592
2        Unit Test Project and a sample MVC project.        1.022699
3  I hate spiders, but I hate mosquitos, wasps an...        3.458000
4  I usually excuse myself to the bathroom and wa...        3.852000
