In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time
import os

global_start_time = time.time()
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU.")

cell_start = time.time()
print(f"Cell 1 time: {time.time() - cell_start:.2f} seconds")

CUDA available: False
Running on CPU.
Cell 1 time: 0.00 seconds


In [6]:
cell_start = time.time()
data_path = "train.csv"  # Update if needed
if not os.path.exists(data_path):
    raise FileNotFoundError(f"train.csv not found at {os.path.abspath(data_path)}")

df = pd.read_csv(data_path).sample(10000, random_state=42)
texts = df['comment_text'].tolist()
labels = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

print(f"Cell 2 time: {time.time() - cell_start:.2f} seconds")

Cell 2 time: 1.56 seconds


In [8]:
cell_start = time.time()
from transformers import DistilBertConfig

local_path = "./distilbert_local"
tokenizer = DistilBertTokenizer.from_pretrained(local_path)

# Create a config with dropout settings
config = DistilBertConfig.from_pretrained(
    local_path,
    num_labels=6,
    problem_type="multi_label_classification",
    hidden_dropout_prob=0.3,  # Set dropout for hidden layers
    attention_probs_dropout_prob=0.3  # Set dropout for attention layers
)

# Load model with the custom config
model = DistilBertForSequenceClassification.from_pretrained(local_path, config=config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Cell 3 time: {time.time() - cell_start:.2f} seconds")

Cell 3 time: 0.73 seconds


In [9]:
cell_start = time.time()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

print(f"Cell 4 time: {time.time() - cell_start:.2f} seconds")

Cell 4 time: 14.49 seconds


In [10]:
cell_start = time.time()

class ToxicDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __getitem__(self, idx):
        item = {key: val[idx].to(device) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx].to(device)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)  # Smaller batch size
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

print(f"Cell 5 time: {time.time() - cell_start:.2f} seconds")

Cell 5 time: 0.02 seconds


In [11]:
cell_start = time.time()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # Lower learning rate
loss_fn = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1)

print(f"Cell 6 time: {time.time() - cell_start:.2f} seconds")

Cell 6 time: 0.00 seconds


In [12]:
cell_start = time.time()

def train_model(epochs=3, patience=2):
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {key: val for key, val in batch.items() if key != 'labels'}
            labels = batch['labels']
            outputs = model(**inputs).logits
            loss = loss_fn(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            train_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val for key, val in batch.items() if key != 'labels'}
                labels = batch['labels']
                outputs = model(**inputs).logits
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        scheduler.step(avg_val_loss)
        
        print(f"Epoch {epoch+1}/{epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pt")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

train_model(epochs=3, patience=2)
print(f"Training completed.")
print(f"Cell 7 time: {time.time() - cell_start:.2f} seconds")

Epoch 1/3: Train Loss: 0.1027, Val Loss: 0.0573
Epoch 2/3: Train Loss: 0.0440, Val Loss: 0.0556
Epoch 3/3: Train Loss: 0.0323, Val Loss: 0.0680
Training completed.
Cell 7 time: 11155.33 seconds


In [13]:
cell_start = time.time()

# Load best model
model.load_state_dict(torch.load("best_model.pt"))
model.eval()

all_preds, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        inputs = {key: val for key, val in batch.items() if key != 'labels'}
        labels = batch['labels']
        outputs = model(**inputs).logits
        preds = torch.sigmoid(outputs) > 0.5
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)
precision = precision_score(all_labels, all_preds, average='micro', zero_division=0)
recall = recall_score(all_labels, all_preds, average='micro', zero_division=0)
accuracy = accuracy_score(all_labels, all_preds)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}")

print(f"Cell 8 time: {time.time() - cell_start:.2f} seconds")

Precision: 0.7608, Recall: 0.7091, Accuracy: 0.9135
Cell 8 time: 311.78 seconds


In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import time

cell_start = time.time()

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load sentiment model
sentiment_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
sentiment_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
sentiment_model.to(device)
sentiment_model.eval()

def score_comment_with_sentiment(comment):
    # Toxicity scoring
    inputs = tokenizer(comment, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs).logits
    toxicity_probs = torch.sigmoid(outputs).cpu().numpy()[0]
    toxicity_categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # Sentiment analysis
    sentiment_inputs = sentiment_tokenizer(comment, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        sentiment_outputs = sentiment_model(**sentiment_inputs).logits
    sentiment_probs = torch.softmax(sentiment_outputs, dim=1).cpu().numpy()[0]
    sentiment_score = sentiment_probs[1] - sentiment_probs[0]  # Positive - Negative score (-1 to 1)
    sentiment_label = "Positive" if sentiment_score > 0 else "Negative" if sentiment_score < 0 else "Neutral"
    
    # Format output
    output = "Toxicity Scores:\n" + "\n".join(f"{cat}: {prob:.4f}" for cat, prob in zip(toxicity_categories, toxicity_probs))
    output += "\n\nSentiment Analysis:\n"
    output += f"Sentiment Label: {sentiment_label}\n"
    output += f"Sentiment Score: {sentiment_score:.4f} (-1 to 1)\n"
    output += f"Positive Probability: {sentiment_probs[1]:.4f}\n"
    output += f"Negative Probability: {sentiment_probs[0]:.4f}"
    return output

# Test with example
example_comment = "Nature is beautiful but some people are just awful"
print("\nScoring example comment with sentiment:")
print(score_comment_with_sentiment(example_comment))

print(f"Cell time: {time.time() - cell_start:.2f} seconds")


Scoring example comment with sentiment:


NameError: name 'tokenizer' is not defined

In [16]:
cell_start = time.time()

def score_comment(comment):
    inputs = tokenizer(comment, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs).logits
    probs = torch.sigmoid(outputs).cpu().numpy()[0]
    categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    return "\n".join(f"{cat}: {prob:.4f}" for cat, prob in zip(categories, probs))

example_comment = "Nature encompasses all the natural world, from the smallest insect to the largest mountain, and the intricate web of life that sustains us all. It provides essential resources like clean air, water, and food, and its beauty and balance inspire and uplift us. Protecting and preserving nature is crucial for the health of our planet and the well-being of future generations"
print("\nScoring example comment:")
print(score_comment(example_comment))

print(f"Cell 9 time: {time.time() - cell_start:.2f} seconds")


Scoring example comment:
toxic: 0.0069
severe_toxic: 0.0008
obscene: 0.0019
threat: 0.0011
insult: 0.0022
identity_hate: 0.0017
Cell 9 time: 11.80 seconds


In [15]:
cell_start = time.time()
model.save_pretrained("./improved_toxic_detector_model")
tokenizer.save_pretrained("./improved_toxic_detector_model")
print(f"\nTotal execution time: {(time.time() - global_start_time) / 60:.2f} minutes")

print(f"Cell 10 time: {time.time() - cell_start:.2f} seconds")


Total execution time: 204.18 minutes
Cell 10 time: 12.05 seconds
