In [13]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time
import os
import re

global_start_time = time.time()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Running on CPU.')

cell_start = time.time()
print(f'Cell 1 time: {time.time() - cell_start:.2f} seconds')

CUDA available: False
Running on CPU.
Cell 1 time: 0.00 seconds


In [14]:
cell_start = time.time()

# Load dataset
english_data_path = 'train.csv'

if not os.path.exists(english_data_path):
    raise FileNotFoundError(f'Dataset not found at {os.path.abspath(english_data_path)}')

# Load train.csv dataset (sample 10,000 rows)
df = pd.read_csv(english_data_path).sample(10000, random_state=42)
df['language'] = df['comment_text'].apply(lambda x: 'marathi_hindi' if bool(re.search(r'[\u0900-\u097F]', str(x))) else 'english')

texts = df['comment_text'].tolist()
labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

print(f'Total dataset size: {len(df)}')
print(f'Label distribution:\n{df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum()}')
print(f'Cell 2 time: {time.time() - cell_start:.2f} seconds')

Total dataset size: 10000
Label distribution:
toxic            904
severe_toxic      91
obscene          498
threat            17
insult           475
identity_hate     87
dtype: int64
Cell 2 time: 23.18 seconds


In [15]:
cell_start = time.time()

# Load tokenizer and model
local_path = './multilingual_toxic_detector_model'
if not os.path.exists(local_path):
    os.makedirs(local_path)
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-multilingual-cased',
        num_labels=6,
        problem_type='multi_label_classification'
    )
    tokenizer.save_pretrained(local_path)
    model.save_pretrained(local_path)
    print(f'Downloaded and saved model to {local_path}')

tokenizer = DistilBertTokenizer.from_pretrained(local_path)
config = DistilBertConfig.from_pretrained(
    local_path,
    num_labels=6,
    problem_type='multi_label_classification',
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)
model = DistilBertForSequenceClassification.from_pretrained(local_path, config=config, ignore_mismatched_sizes=True)
model.to(device)
model.eval()

print(f'Cell 3 time: {time.time() - cell_start:.2f} seconds')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloaded and saved model to ./multilingual_toxic_detector_model
Cell 3 time: 20.48 seconds


In [16]:
cell_start = time.time()

# Split data: 80% train, 10% validation, 10% test
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Encode texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

print(f'Train size: {len(train_texts)}, Validation size: {len(val_texts)}, Test size: {len(test_texts)}')
print(f'Cell 4 time: {time.time() - cell_start:.2f} seconds')

Train size: 8000, Validation size: 1000, Test size: 1000
Cell 4 time: 12.47 seconds


In [17]:
cell_start = time.time()

class ToxicDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __getitem__(self, idx):
        item = {key: val[idx].to(device) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].to(device)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)
test_dataset = ToxicDataset(test_encodings, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

print(f'Cell 5 time: {time.time() - cell_start:.2f} seconds')

Cell 5 time: 1.10 seconds


In [19]:
cell_start = time.time()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

print(f'Cell 6 time: {time.time() - cell_start:.2f} seconds')

Cell 6 time: 0.00 seconds


In [20]:
cell_start = time.time()

def train_model(epochs=3, patience=2):
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']
            outputs = model(**inputs).logits
            loss = loss_fn(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']
                outputs = model(**inputs).logits
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()
                preds = torch.sigmoid(outputs) > 0.5
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = accuracy_score(val_true, val_preds)
        scheduler.step(avg_val_loss)
        
        print(f'Epoch {epoch+1}/{epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print('Early stopping triggered.')
                break

train_model(epochs=3, patience=2)
print('Training completed.')
print(f'Cell 7 time: {time.time() - cell_start:.2f} seconds')

Epoch 1/3: Train Loss: 0.1024, Val Loss: 0.0605, Val Accuracy: 0.9110
Epoch 2/3: Train Loss: 0.0543, Val Loss: 0.0663, Val Accuracy: 0.9110
Epoch 3/3: Train Loss: 0.0424, Val Loss: 0.0550, Val Accuracy: 0.8990
Training completed.
Cell 7 time: 8769.68 seconds


In [21]:
cell_start = time.time()

# Load best model
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val for key, val in batch.items() if key != 'labels'}
        labels = batch['labels']
        outputs = model(**inputs).logits
        preds = torch.sigmoid(outputs) > 0.5
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

precision = precision_score(all_labels, all_preds, average='micro', zero_division=0)
recall = recall_score(all_labels, all_preds, average='micro', zero_division=0)
accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}')

print(f'Cell 8 time: {time.time() - cell_start:.2f} seconds')

Test Precision: 0.7418, Recall: 0.6109, Accuracy: 0.9040
Cell 8 time: 103.02 seconds


In [22]:
cell_start = time.time()

def is_marathi_hindi(text):
    devanagari_regex = r'[\u0900-\u097F]'
    return bool(re.search(devanagari_regex, text))

def score_comment(text):
    print(f'Processing comment: {text!r}')
    text = ' '.join(text.strip().split())
    
    # Check for toxic patterns: asterisk, backslash, forward slash, or ellipsis
    has_toxic_pattern = bool(re.search(r'\b[\w*\\\/]*[\*\\\/\.]{1,}[\w*\\\/]*\b', text))
    
    # Tokenize input
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    
    # Run inference
    with torch.no_grad():
        outputs = model(**inputs).logits
    probs = torch.sigmoid(outputs).cpu().numpy()[0]
    
    # Define categories (aligned with model training)
    categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    scores = {cat: float(prob) for cat, prob in zip(categories, probs)}
    
    # Extract words
    words = re.findall(r'\b[\w\'*\\\/-]+\b', text.lower())
    
    # Determine if text is toxic (model threshold 0.3 or toxic pattern)
    is_toxic = has_toxic_pattern or any(prob > 0.3 for prob in probs)
    
    # Extract toxic words
    toxic_words = []
    if is_toxic:
        for word in words:
            if word in ['tu', 'ahe', 'kya', 'kar', 'raha', 'hai', 'ka', 'se']:
                continue
            if any(p in word for p in ['*', '\\', '/', '...']):
                toxic_words.append(word)
                continue
            word_inputs = tokenizer(word, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
            with torch.no_grad():
                word_outputs = model(**word_inputs).logits
            word_probs = torch.sigmoid(word_outputs).cpu().numpy()[0]
            if any(word_prob > 0.3 for word_prob in word_probs):
                toxic_words.append(word)
    
    toxic_words = list(dict.fromkeys(toxic_words))
    
    return {
        'is_toxic': is_toxic,
        'toxic_words': toxic_words,
        'scores': scores
    }

# Test example comments
example_comments = [
    'Nature is beautiful but some people are just awful',
    'are murkha',
    'a/s/h/h/o/l',
    'f..ck',
    'WTH'
]
for comment in example_comments:
    print(f'\nScoring example comment: {comment}')
    result = score_comment(comment)
    print('Result:', result)

print(f'Cell 9 time: {time.time() - cell_start:.2f} seconds')


Scoring example comment: Nature is beautiful but some people are just awful
Processing comment: 'Nature is beautiful but some people are just awful'
Result: {'is_toxic': False, 'toxic_words': [], 'scores': {'toxic': 0.021533053368330002, 'severe_toxic': 8.854931365931407e-05, 'obscene': 0.0016309237107634544, 'threat': 0.0001629415201023221, 'insult': 0.0014743577921763062, 'identity_hate': 0.00025027836090885103}}

Scoring example comment: are murkha
Processing comment: 'are murkha'
Result: {'is_toxic': True, 'toxic_words': ['murkha'], 'scores': {'toxic': 0.9014849662780762, 'severe_toxic': 0.08181779831647873, 'obscene': 0.44995173811912537, 'threat': 0.05206559970974922, 'insult': 0.5787873268127441, 'identity_hate': 0.14748023450374603}}

Scoring example comment: a/s/h/h/o/l
Processing comment: 'a/s/h/h/o/l'
Result: {'is_toxic': True, 'toxic_words': ['a/s/h/h/o/l'], 'scores': {'toxic': 0.14381268620491028, 'severe_toxic': 0.010317770764231682, 'obscene': 0.044958434998989105, 'thr

In [23]:
cell_start = time.time()

model.save_pretrained('./multilingual_toxic_detector_model')
tokenizer.save_pretrained('./multilingual_toxic_detector_model')
print(f'\nTotal execution time: {(time.time() - global_start_time) / 60:.2f} minutes')

print(f'Cell 10 time: {time.time() - cell_start:.2f} seconds')


Total execution time: 362.07 minutes
Cell 10 time: 13.84 seconds
