In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("../data/train.csv")

# Basic cleaning
df = df.dropna(subset=["comment_text"])
df["clean_comment"] = df["comment_text"].str.lower().str.replace(r"[^a-z\s]", "", regex=True)

# Define features and labels
X = df["clean_comment"]
y = df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

# Split (same as before — make sure random_state matches)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
model_name = "unitary/toxic-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# Convert test comments to a list
test_texts = X_test.tolist()
batch_size = 32

# Store all outputs
all_logits = []

# Run inference in batches (no gradients needed)
with torch.no_grad():
    for i in tqdm(range(0, len(test_texts), batch_size)):
        batch = test_texts[i:i+batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**encoded)
        logits = torch.sigmoid(outputs.logits)  # Multi-label sigmoid activation
        all_logits.append(logits.cpu().numpy())

# Combine all batch outputs into one array
predictions = np.vstack(all_logits)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 998/998 [1:31:31<00:00,  5.50s/it]


In [11]:
# Convert probabilities to binary (0/1) with threshold = 0.5
threshold = 0.5
binary_preds = (predictions >= threshold).astype(int)

# Get ground truth
y_true = y_test.values

# Confirm shape matches
print("Predictions shape:", binary_preds.shape)
print("Ground truth shape:", y_true.shape)


Predictions shape: (31915, 6)
Ground truth shape: (31915, 6)


In [13]:
from sklearn.metrics import classification_report

# Evaluate per label
for i, label in enumerate(y_test.columns):
    print(f"\nClassification Report for {label}")
    print(classification_report(y_true[:, i], binary_preds[:, i]))



Classification Report for toxic
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     28859
           1       0.88      0.92      0.90      3056

    accuracy                           0.98     31915
   macro avg       0.93      0.95      0.94     31915
weighted avg       0.98      0.98      0.98     31915


Classification Report for severe_toxic
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     31594
           1       0.51      0.50      0.50       321

    accuracy                           0.99     31915
   macro avg       0.75      0.75      0.75     31915
weighted avg       0.99      0.99      0.99     31915


Classification Report for obscene
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     30200
           1       0.82      0.94      0.88      1715

    accuracy                           0.99     31915
   macro avg       0

In [15]:
np.save("../outputs/toxic_bert_preds.npy", predictions)
