Toxicity detection model 
<span style="font-size: 15px;">
        \
        - a contextual language model akin to BERT/RoBERTa/XLNet\
        - detection of obvious slang\
        - does not associate repeated punctuation marks with negative scoring\
        - ability to score sarcastic comments as accurately as possible\
        - more..?\
</span>

---------------------------------------------------------------------------------------------------------------------------------------------------

Training data:
<span style="font-size: 15px;">
        \
        - Jigsaw unintended bias - https://www.kaggle.com/datasets/julian3833/jigsaw-toxic-comment-classification-challenge?select=train.csv \
        - Ruddit - https://github.com/hadarishav/Ruddit/tree/main/Dataset \
        - Real toxicity prompts - https://huggingface.co/datasets/allenai/real-toxicity-prompts
</span>

---------------------------------------------------------------------------------------------------------------------------------------------------

Testing data:
<span style="font-size: 15px;">
        \
        - Chatlogs from LoL tribunal (system later phased out) - https://www.kaggle.com/datasets/simshengxue/league-of-legends-tribunal-chatlogs \
        - Friends tv show dialogues - https://www.kaggle.com/datasets/thedevastator/friends-tv-show-dialog-sequences \
        - Jigsaw unint. bias given test data - https://www.kaggle.com/datasets/julian3833/jigsaw-toxic-comment-classification-challenge?select=test.csv
</span>

In [56]:
#core data handling
import os
import json
from typing import List, Optional, Any

import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset

from datasets import load_dataset #rtp streaming

#models + evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error, root_mean_squared_error, r2_score, 
    precision_recall_fscore_support, roc_auc_score
)

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, 
    DataCollatorWithPadding
)

import secrets
RANDOM_SEED = secrets.randbelow(10000000)
#print("Random seed:", RANDOM_SEED)

Helpers for data processing

In [57]:
def cleanText(s: Any) -> str:
    """Normalize one raw text cell into a whitespace-trimmed string.

    - missing value (whatever `pd.isna` reports as NA, e.g. None/NaN) -> ""
    - string -> leading/trailing whitespace removed,
      e.g. "  hello world!  " -> "hello world!" (punctuation is kept)
    - anything else -> `str(value)`, trimmed the same way
    """
    if pd.isna(s):
        return ""
    text = s if isinstance(s, str) else str(s)
    return text.strip()

def toToxicRange(value: float, prev_min: float, prev_max: float) -> float:
    """Linearly rescale `value` from [prev_min, prev_max] onto the [1, 5] toxicity scale.

    Steps:
    - clip `value` into [prev_min, prev_max] so outliers cannot leave the target range
    - normalize the clipped value to [0, 1]
    - stretch to [1, 5]

    Missing input (None or NaN) is returned as NaN so callers can drop it later.

    Raises:
        ValueError: if `prev_max` is not strictly greater than `prev_min`
            (the rescaling would divide by zero or be meaningless).
    """
    if prev_max <= prev_min:
        raise ValueError(
            f"Invalid source range: prev_min={prev_min}, prev_max={prev_max}"
        )
    #None check comes first: np.isnan(None) raises TypeError
    if value is None or np.isnan(value):
        return np.nan

    clipped = float(np.clip(value, prev_min, prev_max))
    scaled = (clipped - prev_min) / (prev_max - prev_min)
    return 1 + 4 * scaled

def isInformativeText(s: str) -> bool:
    """Return True when a text is worth keeping as a training example.

    Rejected (returns False):
    - falsy input (e.g. "" or None)
    - text that is empty after trimming whitespace
    - the placeholders '[deleted]' / '[removed]' (case-insensitive)
    - text for which `isPunctuationOnly` returns True (e.g. '!!!', '...', '?!!?')
    """
    if not s:
        return False

    normalized = s.strip().lower()
    is_placeholder = normalized in {"", "[deleted]", "[removed]"}
    #short-circuit keeps the punctuation scan off placeholder rows
    return not (is_placeholder or isPunctuationOnly(normalized))

#characters treated as "punctuation" by isPunctuationOnly (built once, not per call)
_PUNCT_CHARS = frozenset("!?,.;:-—'\"()[]{}*/\\")

def isPunctuationOnly(s: str) -> bool:
    """Return True when `s` contains nothing but punctuation and whitespace.

    Empty and whitespace-only strings count as punctuation-only.
    Whitespace between punctuation runs is ignored, so '!! ??' is
    punctuation-only just like '!!??'. Any character outside the allowed
    punctuation set (letters, digits, emoji, '@', ...) makes the text
    non-punctuation.
    """
    return all(ch in _PUNCT_CHARS or ch.isspace() for ch in s)


Jigsaw unintended bias

In [58]:
def loadJigsaw(
    path: str,
    text_col: str = "comment_text",
    tox_cols: Optional[List[str]] = None,
    max_rows: Optional[int] = 6000,
) -> pd.DataFrame:
    """Load the Jigsaw csv and collapse its binary toxicity flags into a 1-5 score.

    - reads only the first `max_rows` rows for speed (None -> whole file)
    - every flag column that exists in the file is coerced to a number
      (unparseable/missing -> 0) and the flags are summed per row
    - the sum, in [0, number of flag columns found], is mapped onto [1, 5]

    Returns a dataframe with ['text', 'toxicity_score'].

    Raises:
        ValueError: if none of the requested flag columns exist in the file.
    """
    flag_candidates = tox_cols if tox_cols is not None else [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]

    frame = pd.read_csv(path, nrows=max_rows)
    frame["text"] = frame[text_col].apply(cleanText)

    #flags an error when no requested column exists in the dataset (unlikely)
    present = [col for col in flag_candidates if col in frame.columns]
    if not present:
        raise ValueError(
            f"This column is not present! Here's what we DO have -> {list(frame.columns)}"
        )

    for col in present:
        frame[col] = pd.to_numeric(frame[col], errors="coerce").fillna(0.0)

    #largest possible row sum: every flag found in the file set to 1
    upper = float(len(present))
    frame["raw_toxic_sum"] = frame[present].sum(axis=1)

    def rescale(total):
        return toToxicRange(total, 0.0, upper)

    frame["toxicity_score"] = frame["raw_toxic_sum"].apply(rescale)
    return frame[["text", "toxicity_score"]]

Reddit Annotated (Ruddit)

In [59]:
def loadRuddit(
    path: str,
    text_col: str = "body",
    score_col: str = "score",
    max_rows: Optional[int] = None,  #None -> we're using the whole (short) dataset
) -> pd.DataFrame:
    """Load the Ruddit csv and convert its scores to the 1-5 toxicity scale.

    Scores are treated as lying in [-1, 1]; `toToxicRange` clips anything
    outside that range before mapping onto [1, 5]. Rows whose score is
    missing or not numeric are dropped.

    Returns a dataframe with ['text', 'toxicity_score'].
    """
    frame = pd.read_csv(path, nrows=max_rows)
    frame["text"] = frame[text_col].apply(cleanText)
    frame[score_col] = pd.to_numeric(frame[score_col], errors="coerce")

    #entries with missing score removed!
    frame = frame.dropna(subset=[score_col])

    def rescale(raw):
        #from [-1, 1] -> [1, 5]
        return toToxicRange(raw, -1.0, 1.0)

    frame["toxicity_score"] = frame[score_col].apply(rescale)
    return frame[["text", "toxicity_score"]]

Real Toxicity Prompts (allenai) - processing

In [60]:
def loadRTP(n_samples=6000, prefer_continuation=True):
    """Stream a sample of Real Toxicity Prompts from HuggingFace on the 1-5 scale.

    - reads at most `n_samples` records from the streamed 'train' split
    - text: the continuation alone when `prefer_continuation` is set and the
      continuation is non-empty, otherwise prompt + " " + continuation
    - score: 'continuation.toxicity' if available, else 'prompt.toxicity';
      a null value counts as "not available", not only a missing key
    - raw toxicity is treated as lying in [0, 1] and mapped onto [1, 5];
      records with no usable score keep a NaN 'toxicity_score'

    Returns a dataframe with ['text', 'toxicity_score'].
    """
    stream = load_dataset(
        "allenai/real-toxicity-prompts",
        split="train",
        streaming=True
    )

    def section(rec, key):
        #a missing or null section becomes {} so .get() is always safe below
        part = rec.get(key)
        return part if isinstance(part, dict) else {}

    rows = []
    for i, rec in enumerate(stream):
        if i >= n_samples:
            break

        p = section(rec, "prompt")
        c = section(rec, "continuation")

        prompt_text = cleanText(p.get("text", ""))
        cont_text = cleanText(c.get("text", ""))

        if prefer_continuation and cont_text:
            text = cont_text
        else:
            text = (prompt_text + " " + cont_text).strip()

        #dict.get's default only covers a missing key, so nulls need an explicit fallback
        tox = c.get("toxicity")
        if tox is None:
            tox = p.get("toxicity")

        rows.append({"text": text, "raw_toxicity": tox})

    #explicit columns keep the frame well-formed even when no rows were read
    df = pd.DataFrame(rows, columns=["text", "raw_toxicity"])
    df["raw_toxicity"] = pd.to_numeric(df["raw_toxicity"], errors="coerce")
    df["toxicity_score"] = df["raw_toxicity"].apply(
        lambda v: toToxicRange(v, 0.0, 1.0)
    )

    return df[["text", "toxicity_score"]]


Retraining model using in-game chat log data - GameTox

In [61]:
def loadGameTox(path: str, text_col: str = "message", label_col: str = "label",
    max_rows: Optional[int] = 6000) -> pd.DataFrame:
    """Load the GameTox csv and map its class labels onto the 1-5 toxicity scale.

    - reads only the first `max_rows` rows (None -> whole file)
    - labels are coerced to numbers first, as in the other loaders, so a
      blank or non-numeric label becomes a NaN score instead of raising
      inside `toToxicRange`
    - the multi-class intent label (0..5) is rescaled linearly to [1..5]

    Returns a dataframe with ['text', 'toxicity_score'].
    """
    df = pd.read_csv(path, nrows=max_rows)
    df["text"] = df[text_col].apply(cleanText)

    labels = pd.to_numeric(df[label_col], errors="coerce")

    #convert multi-class intent (0..5) -> [1..5] toxicity range
    df["toxicity_score"] = labels.apply(lambda v: toToxicRange(v, 0.0, 5.0))

    #keeps only required columns
    return df[["text", "toxicity_score"]]

clean and merge helpers

In [62]:
def cleanAndMergeTrainingData(df: pd.DataFrame):
    """Apply a second round of cleaning to a [text, toxicity_score] dataframe.

    Works on a copy of `df` and:
    - re-normalizes the text column with `cleanText`
    - drops rows whose text fails `isInformativeText`
      (empty, '[deleted]'/'[removed]', punctuation-only)
    - drops rows with a missing toxicity score
    - drops duplicate texts (first occurrence kept) and resets the index
    """
    df = df.copy()
    df["text"] = df["text"].apply(cleanText)

    #force a real boolean mask: on an empty frame apply() hands back an
    #object-dtype Series, which pandas does not treat as a row mask
    informative = df["text"].apply(isInformativeText).astype(bool)
    df = df[informative]

    #one dropna is enough; the extra per-row isna filter repeated the same check
    df = df.dropna(subset=["toxicity_score"])

    df = df.drop_duplicates(subset=["text"])
    return df.reset_index(drop=True)

def mergeDatasets(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Merge multiple [text, toxicity_score] dataframes into one.

    Each input is checked for the two required columns, the inputs are
    stacked, run through `cleanAndMergeTrainingData`, then shuffled with a
    fixed seed so the merged order is reproducible.

    Raises:
        ValueError: if any input lacks 'text' or 'toxicity_score'.
    """
    required = {"text", "toxicity_score"}

    parts = []
    for frame in dfs:
        if not required.issubset(frame.columns):
            raise ValueError("Dataset missing required columns.")
        parts.append(frame[["text", "toxicity_score"]])

    stacked = pd.concat(parts, ignore_index=True)
    shuffled = cleanAndMergeTrainingData(stacked).sample(frac=1.0, random_state=42)
    return shuffled.reset_index(drop=True)


In [63]:
#one [text, toxicity_score] dataframe per source corpus
jigsawDF = loadJigsaw("jigsaw_train.csv")
rudditDF = loadRuddit("ruddit_train.csv")
rtpDF = loadRTP(6000)
gametoxDF = loadGameTox("gametox_train.csv")

#individual dataframes: one shape per line -> (# of entries, # of columns)
print(jigsawDF.shape, rudditDF.shape, rtpDF.shape, gametoxDF.shape, sep="\n")

(6000, 2)
(5966, 2)
(6000, 2)
(6000, 2)


In [64]:
#combined dataframe: the four sources stacked, cleaned, de-duplicated on text and shuffled by mergeDatasets
combinedDF = mergeDatasets([jigsawDF, rudditDF, rtpDF, gametoxDF])
print("Combined dataframe looks like this -> ", combinedDF.shape)
print(combinedDF.head())
#persist the merged training table (row index not written)
combinedDF.to_csv("combined_toxicity.csv", index=False)

Combined dataframe looks like this ->  (22028, 2)
                                                text  toxicity_score
0               play some good footy here long-term.        1.254586
1  with Packers and Cardinals coming in at 10/1 (...        1.409145
2  It's the Palace of Westminster not Buckingham ...        2.292000
3                    ya spotting the hil im guessing        1.000000
4  and I had on Twitter shortly after LeVar and I...        1.165766


Train/Validation/Test  -- splitting - widely recommended 80/10/10 in NLP

In [65]:
#step 1: hold out 20% of the merged data (X_temp / y_temp); the other 80% is the training split
X_train, X_temp, y_train, y_temp = train_test_split(
    #80(train)-20(in temp -> later split for validation + test) split
    combinedDF["text"], combinedDF["toxicity_score"], test_size=0.2, random_state=123
)
#step 2: cut the held-out 20% in half -> 10% validation + 10% test overall
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=123)
#sizes of the train / validation / test splits
print(len(X_train), len(X_val), len(X_test))


17622 2203 2203


Regression metrics - Helpers

In [73]:
def regression_metrics(y_true, y_pred):
    """Score continuous predictions against the true 1-5 toxicity scores.

    Returns a dict with keys 'MAE', 'RMSE' and 'R2'.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

def to_binary_labels(y, threshold=3.0):
    """Binarize toxicity scores: 1 where score >= threshold, else 0."""
    is_toxic = np.array(y) >= threshold
    return is_toxic.astype(int)

def classification_metrics(y_true_scores, y_pred_scores, threshold=3.0):
    """Evaluate score predictions as a toxic / non-toxic classifier.

    Both score arrays are binarized at `threshold` (>= threshold -> toxic) for
    precision / recall / F1. ROC AUC is computed from the binarized true
    labels against the raw predicted scores.

    Returns a dict with keys 'Precision', 'Recall', 'F1' and 'ROC_AUC'.
    'ROC_AUC' is NaN when scikit-learn rejects the input with a ValueError
    (e.g. only one class present in the true labels).
    """
    y_true = to_binary_labels(y_true_scores, threshold)
    y_pred = to_binary_labels(y_pred_scores, threshold)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )

    try:
        auc = roc_auc_score(y_true, y_pred_scores)
    except ValueError:
        #only the "AUC undefined for this input" case is tolerated; a bare
        #except would also swallow KeyboardInterrupt and genuine bugs
        auc = float("nan")

    return {
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "ROC_AUC": auc,
    }

Model 1

In [None]:
#Ridge
# - strong linear baseline
# - standard linear model often used in text regression/classification tasks with TF-IDF

#unigram + bigram TF-IDF features, vocabulary capped at 40k terms seen in >= 3 documents
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_features=40_000)

#the vocabulary is fit on the training split only; val/test are just transformed
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)
X_test_vec = tfidf.transform(X_test)

#fit() returns the estimator itself, so construction and training can be chained
ridge = Ridge(alpha=1.0).fit(X_train_vec, y_train)

ridge_pred = ridge.predict(X_test_vec)


In [None]:
#score the Ridge test-set predictions as regression (MAE/RMSE/R2) and as
#thresholded classification (precision/recall/F1/ROC AUC)
ridge_reg = regression_metrics(y_test, ridge_pred)
ridge_cls = classification_metrics(y_test, ridge_pred)

print("Ridge Regression -> ", ridge_reg)
print("Ridge Classification -> ", ridge_cls)


Model2

In [None]:
#RandomForest
# - non-linear classical ml baseline
# - intended to capture non-linear interactions between TF–IDF features
# - trained on the same TF-IDF matrices (X_train_vec / X_test_vec) as the Ridge model
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=123,
    n_jobs=-1
)

rf.fit(X_train_vec, y_train)
rf_pred = rf.predict(X_test_vec)

#same two metric families as the Ridge baseline, for a like-for-like comparison
rf_reg = regression_metrics(y_test, rf_pred)
rf_cls = classification_metrics(y_test, rf_pred)

print("RF Regression -> ", rf_reg)
print("RF Classification -> ", rf_cls)

Model3 -> ⭐ Chosen One ⭐

In [66]:
#DistilRoBERTa
# - deep neural contextual language model
# - input representation: token embeddings + attention
# - captures context, sarcasm, slang, punctuation nuance
# - learns toxic patterns beyond simple word frequency
# - represents the modern SOTA transformer approach
# - faster+lightweight compared to BERT/RoBERTa

model_name = "distilroberta-base"

#tokenizer and model are loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
#single-output regression setup; the classifier head weights are newly
#initialized (see the warning printed below), so the model must be fine-tuned before use
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1, #regression head
    problem_type="regression",
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
class ToxicityDataset(Dataset):
    """Torch dataset pairing raw texts with float toxicity scores.

    Each item is tokenized on access and returned as a dict of 1-D tensors
    (the tokenizer's outputs with the leading batch dimension squeezed away)
    plus a float 'labels' tensor.

    Args:
        texts: iterable of input strings.
        labels: iterable of numeric scores, same length as `texts`.
        text_tokenizer: callable used to encode a text. When None (default)
            the module-level `tokenizer` is looked up at access time, as before.
        max_length: every item is truncated/padded to this many tokens.
            Default 64 -- changed 128 -> 64 (baseline); some text is cut off
            by lowering this.

    Raises:
        ValueError: if `texts` and `labels` differ in length.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=64):
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
            )
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        #fall back to the module-level tokenizer only when none was injected
        encode = self.text_tokenizer if self.text_tokenizer is not None else tokenizer
        enc = encode(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        #return_tensors="pt" yields a batch of one; drop that leading dimension
        item = {key: val.squeeze(0) for key, val in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

#wrap each split (texts + 1-5 scores) for the Trainer
train_ds = ToxicityDataset(X_train, y_train)
val_ds   = ToxicityDataset(X_val, y_val)
test_ds  = ToxicityDataset(X_test, y_test)


In [68]:
#batch collator; ToxicityDataset already pads every item with padding="max_length",
#so the collator presumably has no extra padding left to do - TODO confirm
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Predict w/distilroberta

In [69]:
#fine-tuning hyperparameters for the DistilRoBERTa regression model
training_args = TrainingArguments(
    output_dir="./distilroberta_toxicity",
    learning_rate=3e-5,                 #slightly higher for small batch size
    per_device_train_batch_size=16,      #smaller batch (faster, safer)
    per_device_eval_batch_size=16,
    num_train_epochs=1,                 #1 epoch -> enough (and fast)
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),     #mixed precision only when a CUDA GPU is present; off on CPU-only machines
    logging_steps=50,
)

In [70]:
#wire the model, hyperparameters, datasets and collator together
#NOTE(review): training_args sets no evaluation schedule, so val_ds is presumably
#only used if trainer.evaluate() is called explicitly - confirm
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
)

In [71]:
#fine-tune on train_ds; the training loss is logged every 50 steps (logging_steps)
trainer.train()



Step,Training Loss
50,0.8778
100,0.4039
150,0.3423
200,0.3498
250,0.3607
300,0.3162
350,0.3185
400,0.3206
450,0.2636
500,0.2971


TrainOutput(global_step=1102, training_loss=0.30011083188377147, metrics={'train_runtime': 3571.8422, 'train_samples_per_second': 4.934, 'train_steps_per_second': 0.309, 'total_flos': 291787358683392.0, 'train_loss': 0.30011083188377147, 'epoch': 1.0})

In [74]:
#predict on the held-out test split; flatten() turns the model output into one
#score per example (presumably shaped (n, 1) from the single-output head - TODO confirm)
pred_output = trainer.predict(test_ds)
distilroberta_pred = pred_output.predictions.flatten()

#same two metric families as the Ridge / RandomForest baselines
distilroberta_reg = regression_metrics(y_test, distilroberta_pred)
distilroberta_cls = classification_metrics(y_test, distilroberta_pred)

print("RoBERTa Regression:", distilroberta_reg)
print("RoBERTa Classification:", distilroberta_cls)


RoBERTa Regression: {'MAE': 0.27435488114883483, 'RMSE': 0.4308894795310864, 'R2': 0.7865369541069497}
RoBERTa Classification: {'Precision': 0.7580071174377224, 'Recall': 0.75, 'F1': 0.7539823008849558, 'ROC_AUC': 0.9627795433360979}


In [77]:
#store the final trained model
save_dir = "final_toxicity_model_adapted"
trainer.save_model(save_dir)

#saves tokenizer files such as vocab, merges, tokenizer.json, etc
# - written to the same directory as the model so both can be loaded from one path
tokenizer.save_pretrained(save_dir)

('final_toxicity_model_adapted\\tokenizer_config.json',
 'final_toxicity_model_adapted\\special_tokens_map.json',
 'final_toxicity_model_adapted\\vocab.json',
 'final_toxicity_model_adapted\\merges.txt',
 'final_toxicity_model_adapted\\added_tokens.json',
 'final_toxicity_model_adapted\\tokenizer.json')

Test suite will consist of: - test code cleared, test results stored!\
<span style="font-size: 15px;">
    \
    (more related to our project)\
    - jigsaw test data - same domain as training data\
    - game chats - LoL - tribunal chat logs\
</span>
<span style="font-size: 15px;">
    (fun, side testing)\
    - daily conversation, funny/sarcastic comments - TV shows\
</span>

In [None]:
#load trained model from HuggingFace repo
# - rebinds `model_name`, `tokenizer` and `model` from the training cells above
#model_name = "visha007/ToxiMuncher-Lite" #baseline model
model_name = "visha007/ToxiMuncher-Pro" #NOTE(review): originally tagged "baseline model" like the Lite line above - confirm which variant is the baseline
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
#model.eval()