In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time
import os

global_start_time = time.time()
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU.")

cell_start = time.time()
print(f"Cell 1 time: {time.time() - cell_start:.2f} seconds")

CUDA available: False
Running on CPU.
Cell 1 time: 0.00 seconds


In [2]:
cell_start = time.time()
data_path = "marathi_hindi_toxicity_dataset.csv"
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {os.path.abspath(data_path)}")

df = pd.read_csv(data_path)
texts = df['comment_text'].tolist()
labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values  # Six labels

print(f"Dataset size: {len(df)}")
print(f"Label distribution:\n{df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum()}")
print(f"Cell 2 time: {time.time() - cell_start:.2f} seconds")

Dataset size: 1037
Label distribution:
toxic            520
severe_toxic      80
obscene          441
threat             1
insult           520
identity_hate     28
dtype: int64
Cell 2 time: 0.03 seconds


In [3]:
cell_start = time.time()

# Define local path for model
local_path = "./distilbert_local"
if not os.path.exists(local_path):
    os.makedirs(local_path)
    # Download model and tokenizer
    try:
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
        model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-multilingual-cased",
            num_labels=6,
            problem_type="multi_label_classification"
        )
        tokenizer.save_pretrained(local_path)
        model.save_pretrained(local_path)
        print(f"Downloaded and saved model to {local_path}")
    except Exception as e:
        raise Exception(f"Failed to download model: {e}. Ensure internet connection or use pre-downloaded model.")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(local_path)

# Create config for multi-label classification
config = DistilBertConfig.from_pretrained(
    local_path,
    num_labels=6,
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)

# Load model with ignore_mismatched_sizes
model = DistilBertForSequenceClassification.from_pretrained(
    local_path,
    config=config,
    ignore_mismatched_sizes=True
)
device = torch.device("cpu")  # Force CPU
model.to(device)

print(f"Cell 3 time: {time.time() - cell_start:.2f} seconds")

Cell 3 time: 2.03 seconds


In [4]:
cell_start = time.time()

# Split data: 80% train, 10% validation, 10% test
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Encode texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

print(f"Train size: {len(train_texts)}, Validation size: {len(val_texts)}, Test size: {len(test_texts)}")
print(f"Cell 4 time: {time.time() - cell_start:.2f} seconds")

Train size: 829, Validation size: 104, Test size: 104
Cell 4 time: 0.34 seconds


In [5]:
cell_start = time.time()

class ToxicDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __getitem__(self, idx):
        item = {key: val[idx].to(device) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].to(device)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)
test_dataset = ToxicDataset(test_encodings, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

print(f"Cell 5 time: {time.time() - cell_start:.2f} seconds")

Cell 5 time: 0.01 seconds


In [6]:
cell_start = time.time()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

print(f"Cell 6 time: {time.time() - cell_start:.2f} seconds")

Cell 6 time: 5.08 seconds


In [7]:
cell_start = time.time()

def train_model(epochs=3, patience=2):
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']
            outputs = model(**inputs).logits
            loss = loss_fn(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']
                outputs = model(**inputs).logits
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()
                preds = torch.sigmoid(outputs) > 0.5
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = accuracy_score(val_true, val_preds)
        scheduler.step(avg_val_loss)
        
        print(f"Epoch {epoch+1}/{epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "Marathi_hindi_best_model.pt")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

train_model(epochs=3, patience=2)
print(f"Training completed.")
print(f"Cell 7 time: {time.time() - cell_start:.2f} seconds")

Epoch 1/3: Train Loss: 0.2810, Val Loss: 0.0920, Val Accuracy: 0.9615
Epoch 2/3: Train Loss: 0.0668, Val Loss: 0.0306, Val Accuracy: 1.0000
Epoch 3/3: Train Loss: 0.0285, Val Loss: 0.0148, Val Accuracy: 1.0000
Training completed.
Cell 7 time: 444.60 seconds


In [8]:
cell_start = time.time()

# Load best model
model.load_state_dict(torch.load("Marathi_hindi_best_model.pt"))
model.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val for key, val in batch.items() if key != 'labels'}
        labels = batch['labels']
        outputs = model(**inputs).logits
        preds = torch.sigmoid(outputs) > 0.5
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

precision = precision_score(all_labels, all_preds, average='micro', zero_division=0)
recall = recall_score(all_labels, all_preds, average='micro', zero_division=0)
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}")

print(f"Cell 8 time: {time.time() - cell_start:.2f} seconds")

Test Precision: 1.0000, Recall: 1.0000, Accuracy: 1.0000
Cell 8 time: 3.37 seconds


In [10]:
cell_start = time.time()

def score_comment(comment):
    inputs = tokenizer(comment, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs).logits
    probs = torch.sigmoid(outputs).cpu().numpy()[0]
    categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    return "\n".join(f"{cat}: {prob:.4f}" for cat, prob in zip(categories, probs))

example_comment = "are murkha"
print("\nScoring example comment:")
print(score_comment(example_comment))

print(f"Cell 9 time: {time.time() - cell_start:.2f} seconds")


Scoring example comment:
toxic: 0.7244
severe_toxic: 0.4342
obscene: 0.7337
threat: 0.3156
insult: 0.7361
identity_hate: 0.3365
Cell 9 time: 0.16 seconds


In [11]:
cell_start = time.time()

model.save_pretrained("./marathi_hindi_toxic_detector_model")
tokenizer.save_pretrained("./marathi_hindi_toxic_detector_model")
print(f"\nTotal execution time: {(time.time() - global_start_time) / 60:.2f} minutes")

print(f"Cell 10 time: {time.time() - cell_start:.2f} seconds")


Total execution time: 8.62 minutes
Cell 10 time: 0.57 seconds
