For now we are going to be using Jigsaw as the dataset

In [1]:
!pip install datasets



In [4]:
!pip install -q scikit-learn datasets torch fasttext-wheel psutil

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/4.6 MB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m2.4/4.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.6/4.6 MB[0m [31m54.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/293.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.6/293.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import time
import psutil
import os
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

# Data

Overall, we take the bare Jigsaw comment classification, the only changes are:

- we only take the 0 or 1 toxicity scores, -1 and nans are invalidated
- we trim the comments at max_lenth = 1500 and we ignore comments shorter than 20 chars

In [32]:
from datasets import load_dataset

train = load_dataset("thesofakillers/jigsaw-toxic-comment-classification-challenge", split="train")
# test  = load_dataset("thesofakillers/jigsaw-toxic-comment-classification-challenge", split="test")

In [33]:
train = train.filter(lambda x: x["comment_text"] is not None)

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

In [34]:
train = train.filter(lambda x: len(x["comment_text"]) >= 20)

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

In [35]:
train = train.map(
    lambda x: {"comment_text": x["comment_text"].strip()}
)

MAX_LEN = 1500

def trim(example):
    text = example["comment_text"]
    return {"comment_text": text if len(text) <= MAX_LEN else text[:MAX_LEN]}

train = train.map(trim)

Map:   0%|          | 0/159287 [00:00<?, ? examples/s]

Map:   0%|          | 0/159287 [00:00<?, ? examples/s]

In [44]:
X = np.array(train["comment_text"])
y = np.array(train["toxic"]).astype(int)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=1337,
    stratify=y
)

In [48]:
len(X_train)

127429

In [49]:
len(y_train)

127429

In [50]:
len(X_test)

31858

In [51]:
len(y_test)

31858

In [8]:
def measure_latency(model, X, predict_fn, n_runs=5000):
    times = []
    for i in range(n_runs):
        x = X[i % len(X)]
        start = time.perf_counter()
        predict_fn(model, x)
        times.append(time.perf_counter() - start)

    times = np.array(times) * 1000  # ms
    return {
        "mean_ms": times.mean(),
        "p95_ms": np.percentile(times, 95),
        "throughput_msg_s": 1000 / times.mean()
    }


In [9]:
def model_size_mb(obj):
    import pickle, tempfile
    with tempfile.NamedTemporaryFile(delete=False) as f:
        pickle.dump(obj, f)
        size = os.path.getsize(f.name)
    return size / (1024 * 1024)

In [10]:
def linear_flops(nnz):
    # dot product: nnz mults + (nnz - 1) adds
    return 2 * nnz

# Baseline 1 — BoW + Logistic Regression

In [52]:
bow = CountVectorizer(
    max_features=100_000,
    ngram_range=(1, 2),
    min_df=5
)

Xtr = bow.fit_transform(X_train)
Xte = bow.transform(X_test)

lr = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)
lr.fit(Xtr, y_train)

probs = lr.predict_proba(Xte)[:, 1]
roc_bow = roc_auc_score(y_test, probs)

In [53]:
def bow_predict(model, text):
    x = bow.transform([text])
    return model.predict_proba(x)[0, 1]

lat_bow = measure_latency(lr, X_test, bow_predict)
avg_nnz = Xtr.nnz / Xtr.shape[0]
flops_bow = linear_flops(avg_nnz)

In [54]:
print(f'BOW + LR ROC_AUC = {roc_bow}')
print(f'BOW + LR Latency = {lat_bow}')
print(f'BOW + LR FLOPS = {flops_bow}')

BOW + LR ROC_AUC = 0.9521565233043973
BOW + LR Latency = {'mean_ms': np.float64(1.3387594617972354), 'p95_ms': np.float64(5.011783050053964), 'throughput_msg_s': np.float64(746.9601736054487)}
BOW + LR FLOPS = 144.93134215916314


# Baseline 2 — TF-IDF + Linear SVM

In [55]:
tfidf = TfidfVectorizer(
    max_features=100_000,
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True
)

Xtr = tfidf.fit_transform(X_train)
Xte = tfidf.transform(X_test)

svm = LinearSVC()
svm.fit(Xtr, y_train)

scores = svm.decision_function(Xte)
roc_svm = roc_auc_score(y_test, scores)

In [56]:
def svm_predict(model, text):
    x = tfidf.transform([text])
    return model.decision_function(x)[0]

lat_svm = measure_latency(svm, X_test, svm_predict)
avg_nnz = Xtr.nnz / Xtr.shape[0]
flops_svm = linear_flops(avg_nnz)

In [57]:
print(f'ROC_AUC = {roc_svm}')
print(f'Latency = {lat_svm}')
print(f'FLOPS = {flops_svm}')

ROC_AUC = 0.9701528507562371
Latency = {'mean_ms': np.float64(1.9603527394000593), 'p95_ms': np.float64(5.752878550083551), 'throughput_msg_s': np.float64(510.1122771945814)}
FLOPS = 144.93134215916314


# Baseline 3 — Character n-gram Logistic Regression

In [58]:
char_vec = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    max_features=200_000,
    min_df=5
)

Xtr = char_vec.fit_transform(X_train)
Xte = char_vec.transform(X_test)

char_lr = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)
char_lr.fit(Xtr, y_train)

probs = char_lr.predict_proba(Xte)[:, 1]
roc_char = roc_auc_score(y_test, probs)

In [59]:
def char_predict(model, text):
    x = char_vec.transform([text])
    return model.predict_proba(x)[0, 1]

lat_char = measure_latency(char_lr, X_test, char_predict)
avg_nnz = Xtr.nnz / Xtr.shape[0]
flops_char = linear_flops(avg_nnz)

In [60]:
print(f'ROC_AUC = {roc_char}')
print(f'Latency = {lat_char}')
print(f'FLOPS = {flops_char}')

ROC_AUC = 0.9747515128352414
Latency = {'mean_ms': np.float64(2.1315872014129127), 'p95_ms': np.float64(5.7825506003609926), 'throughput_msg_s': np.float64(469.13398585671496)}
FLOPS = 1497.0243351199492


# Baseline 4 — FastText-Style Neural Model

In [61]:
from collections import Counter

def tokenize(text):
    return text.lower().split()

counter = Counter()
for t in X_train:
    counter.update(tokenize(t))

vocab = {w:i+1 for i,(w,_) in enumerate(counter.most_common(50_000))}

In [62]:
class FastText(nn.Module):
    def __init__(self, vocab_size, dim=100):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, dim, padding_idx=0)
        self.fc = nn.Linear(dim, 1)

    def forward(self, x):
        emb = self.emb(x)
        pooled = emb.mean(dim=1)
        return self.fc(pooled).squeeze(1)

In [63]:
def encode(text, max_len=100):
    ids = [vocab.get(w, 0) for w in tokenize(text)[:max_len]]
    return ids + [0] * (max_len - len(ids))

Xtr = torch.tensor([encode(t) for t in X_train[:200_000]])
ytr = torch.tensor(y_train[:200_000]).float()

model = FastText(len(vocab)+1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(3):
    opt.zero_grad()
    out = model(Xtr)
    loss = loss_fn(out, ytr)
    loss.backward()
    opt.step()

In [64]:
Xte = torch.tensor([encode(t) for t in X_test[:20_000]])
with torch.no_grad():
    probs = torch.sigmoid(model(Xte)).numpy()

roc_ft = roc_auc_score(y_test[:20_000], probs)

In [65]:
def ft_predict(model, text):
    x = torch.tensor([encode(text)])
    with torch.no_grad():
        return torch.sigmoid(model(x)).item()

lat_ft = measure_latency(model, X_test, ft_predict)
flops_ft = 2 * 100 * Xtr.shape[1]  # L * d

In [66]:
print(f'ROC_AUC = {roc_ft}')
print(f'Latency = {lat_ft}')
print(f'FLOPS = {flops_ft}')

ROC_AUC = 0.42390666223454476
Latency = {'mean_ms': np.float64(0.2431024892019195), 'p95_ms': np.float64(0.3371620497091499), 'throughput_msg_s': np.float64(4113.4914055503805)}
FLOPS = 20000


# Baseline 5 — Toxic-BERT

In [67]:
!pip install transformers detoxify

Collecting detoxify
  Downloading detoxify-0.5.2-py3-none-any.whl.metadata (13 kB)
Downloading detoxify-0.5.2-py3-none-any.whl (12 kB)
Installing collected packages: detoxify
Successfully installed detoxify-0.5.2


In [68]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [69]:
model_name = "unitary/toxic-bert"
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [70]:
def toxicbert_predict(texts):
    inputs = tok(texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
    with torch.no_grad():
        outs = model(**inputs).logits.squeeze(-1)
        probs = torch.sigmoid(outs).numpy()
    return probs

In [71]:
probs = []
batch_size = 32
for i in range(0, len(X_test), batch_size):
    batch = X_test[i:i+batch_size]
    probs.extend(toxicbert_predict(batch))

roc_toxicbert = roc_auc_score(y_test, probs)
print("Toxic-BERT ROC-AUC:", roc_toxicbert)

ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [None]:
def bert_latency(model, tokenizer, texts, n_runs=1000):
    import time
    times = []
    for i in range(n_runs):
        text = texts[i % len(texts)]
        start = time.perf_counter()
        _ = model(**tokenizer(text, return_tensors="pt"))
        times.append(time.perf_counter() - start)
    times = np.array(times) * 1000
    return {
        "mean_ms": times.mean(),
        "p95_ms": np.percentile(times, 95),
        "throughput_msg_s": 1000 / times.mean(),
    }

lat_toxicbert = bert_latency(model, tok, X_test[:1000])
print(lat_toxicbert)

In [None]:
# test of autofix nb