In [1]:
import os
import time
import numpy as np
import csv
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
from torch.profiler import profile, record_function, ProfilerActivity
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def train_and_evaluate_model(model_name, pretrained_model_path, tokenizer_class, model_class, train_data, val_data, batch_size, save_path, max_length=128, accumulation_steps=4, early_stopping_patience=3, num_epochs=50, learning_rate=2e-5, weight_decay=0.01):
    """Fine-tune a single-logit sequence classifier and record per-epoch metrics.

    The texts in the ``'comment'`` column are tokenized to ``max_length`` tokens,
    the ``'isToxic'`` column is used as the binary target, and the model is
    trained with AdamW, gradient accumulation, a linear-decay learning-rate
    schedule and early stopping on the validation loss. Mixed precision is used
    only when the module-level ``device`` is a CUDA device.

    Args:
        model_name: Identifier passed to ``tokenizer_class.from_pretrained`` and
            echoed back in the returned dictionary.
        pretrained_model_path: Identifier/path passed to
            ``model_class.from_pretrained``.
        tokenizer_class: Class exposing ``from_pretrained`` that returns a
            callable tokenizer.
        model_class: Class exposing ``from_pretrained(..., num_labels=1)``.
        train_data: DataFrame with ``'comment'`` and ``'isToxic'`` columns.
        val_data: DataFrame with ``'comment'`` and ``'isToxic'`` columns.
        batch_size: Batch size for both data loaders.
        save_path: Directory receiving ``metrics.csv`` and the saved model.
        max_length: Padding/truncation length used by the tokenizer.
        accumulation_steps: Number of batches accumulated per optimizer step.
        early_stopping_patience: Number of consecutive epochs without a
            validation-loss improvement after which training stops.
        num_epochs: Maximum number of epochs.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        dict with keys ``'model_name'``, ``'train_losses'``, ``'val_losses'``,
        ``'train_accuracies'``, ``'val_accuracies'``, ``'best_epoch'``
        (1-based) and ``'best_val_loss'``.

    Side effects:
        Writes ``metrics.csv`` and the model to ``save_path``, a checkpoint to
        ``../../temp/best_model.pth``, and profiler stack files
        (``profiler_stacks.txt`` and, on CUDA, ``profiler_stacks_gpu.txt``) to
        the current working directory.
    """
    # Load tokenizer
    tokenizer = tokenizer_class.from_pretrained(model_name)

    # Function to tokenize data
    def tokenize_data(texts):
        return tokenizer([str(text) for text in texts], padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

    # Tokenize the train and validation data
    train_encodings = tokenize_data(train_data['comment'].tolist())
    val_encodings = tokenize_data(val_data['comment'].tolist())

    # Create labels tensors
    train_labels = torch.tensor(train_data['isToxic'].values)
    val_labels = torch.tensor(val_data['isToxic'].values)

    # Create TensorDatasets for train and validation sets
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

    # Create DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True)

    # Load model
    model = model_class.from_pretrained(pretrained_model_path, num_labels=1)
    model.to(device)

    # AMP, gradient scaling and CUDA profiling only make sense on a CUDA device.
    on_cuda = device.type == "cuda"

    # Initialize optimizer, loss, and gradient scaler
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()
    scaler = GradScaler(enabled=on_cuda)

    # Training parameters
    total_batches = len(train_dataloader)
    print_every = max(1, total_batches // 10)
    best_val_loss = float('inf')
    best_epoch = 0
    early_stopping_counter = 0
    checkpoint_saved = False

    # Scheduler: linear decay. The optimizer also steps on the trailing partial
    # accumulation group of every epoch, so steps per epoch is a ceiling division.
    steps_per_epoch = (total_batches + accumulation_steps - 1) // accumulation_steps
    num_training_steps = num_epochs * steps_per_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Create directory to save the best model
    model_save_dir = '../../temp/'
    os.makedirs(model_save_dir, exist_ok=True)
    best_model_path = os.path.join(model_save_dir, 'best_model.pth')
    custom_save_dir = save_path
    os.makedirs(custom_save_dir, exist_ok=True)

    metrics_filename = os.path.join(custom_save_dir, 'metrics.csv')
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    profiler_activities = [ProfilerActivity.CPU]
    if on_cuda:
        profiler_activities.append(ProfilerActivity.CUDA)
    prof = None

    with open(metrics_filename, 'w', newline='') as csvfile:
        fieldnames = ['Epoch', 'Train Loss', 'Val Loss', 'Train Accuracy', 'Val Accuracy']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for epoch in range(num_epochs):
            epoch_start_time = time.time()
            # from_pretrained() hands back the model in eval mode, and validation
            # switches to eval mode too, so enable training mode every epoch.
            model.train()
            running_loss = 0.0
            all_predictions = []
            all_targets = []
            optimizer.zero_grad()

            # with_stack=True is required for export_stacks() after training.
            # NOTE(review): profiling a whole epoch with stacks is memory-hungry;
            # consider restricting it to a few batches if host memory is tight.
            with profile(activities=profiler_activities, record_shapes=True, with_stack=True) as prof:
                with record_function("train_epoch"):
                    for batch_idx, (input_ids, attention_mask, target) in enumerate(train_dataloader):
                        input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
                        target = target.unsqueeze(1).float()

                        with autocast(enabled=on_cuda):
                            outputs = model(input_ids, attention_mask=attention_mask)
                            loss = criterion(outputs.logits, target)

                        # Scale the loss so accumulated gradients average over the
                        # group; backward runs outside the autocast region.
                        loss = loss / accumulation_steps
                        scaler.scale(loss).backward()

                        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == total_batches:
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()

                        # Undo the accumulation scaling for reporting.
                        running_loss += loss.item() * accumulation_steps

                        preds = torch.sigmoid(outputs.logits).detach().cpu().numpy()
                        preds = (preds > 0.5).astype(int)
                        all_predictions.extend(preds)
                        all_targets.extend(target.detach().cpu().numpy())

                        if batch_idx % print_every == 0:
                            print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{total_batches}: Loss: {loss.item() * accumulation_steps:.4f}")

            epoch_time = time.time() - epoch_start_time

            epoch_loss = running_loss / total_batches
            epoch_accuracy = accuracy_score(np.vstack(all_predictions), np.vstack(all_targets))

            train_losses.append(epoch_loss)
            train_accuracies.append(epoch_accuracy)

            # Validation step
            model.eval()
            val_loss = 0.0
            val_predictions = []
            val_targets = []

            with torch.no_grad():
                for input_ids, attention_mask, target in val_dataloader:
                    input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
                    target = target.unsqueeze(1).float()

                    with autocast(enabled=on_cuda):
                        outputs = model(input_ids, attention_mask=attention_mask)
                        loss = criterion(outputs.logits, target)

                    val_loss += loss.item()

                    preds = torch.sigmoid(outputs.logits).detach().cpu().numpy()
                    preds = (preds > 0.5).astype(int)
                    val_predictions.extend(preds)
                    val_targets.extend(target.detach().cpu().numpy())

            val_loss /= len(val_dataloader)
            val_accuracy = accuracy_score(np.vstack(val_predictions), np.vstack(val_targets))

            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)

            # Save metrics to CSV and flush so an interrupted run keeps its history
            writer.writerow({
                'Epoch': epoch + 1,
                'Train Loss': epoch_loss,
                'Val Loss': val_loss,
                'Train Accuracy': epoch_accuracy,
                'Val Accuracy': val_accuracy
            })
            csvfile.flush()

            # Report before the early-stopping check so the last epoch is logged too.
            print(f"Epoch {epoch+1} completed in {epoch_time:.2f}s: Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")
            print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                early_stopping_counter = 0
                torch.save(model.state_dict(), best_model_path)
                checkpoint_saved = True
            else:
                early_stopping_counter += 1
                if early_stopping_counter >= early_stopping_patience:
                    print("Early stopping triggered")
                    break

    print(f"Best epoch: {best_epoch + 1}, Best validation loss: {best_val_loss:.4f}")
    print("Training completed.")

    # Load the best model and save it. Only reload when this run wrote the
    # checkpoint: the temp path is shared between runs and may hold stale weights.
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        print("Warning: no checkpoint was saved during this run; saving the current weights.")
    model.save_pretrained(custom_save_dir)

    # Export profiler stacks of the last profiled epoch, after the model is safely
    # on disk. export_stacks() accepts only the "self_*_time_total" metric names.
    if prof is not None:
        prof.export_stacks("profiler_stacks.txt", "self_cpu_time_total")
        if on_cuda:
            prof.export_stacks("profiler_stacks_gpu.txt", "self_cuda_time_total")

    return {
        'model_name': model_name,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accuracies': train_accuracies,
        'val_accuracies': val_accuracies,
        'best_epoch': best_epoch + 1,
        'best_val_loss': best_val_loss
    }

def check_overfitting(metrics):
    """Print an overfitting verdict for one model's training history.

    Compares training and validation accuracy at the best epoch
    (``metrics['best_epoch']`` is 1-based) and reports possible overfitting
    when training accuracy exceeds validation accuracy by more than 0.05.
    """
    idx = metrics['best_epoch'] - 1
    train_acc = metrics['train_accuracies'][idx]
    val_acc = metrics['val_accuracies'][idx]

    # Guard clause: the common, healthy case returns early.
    if not (train_acc > val_acc + 0.05):
        print(f"Model {metrics['model_name']} does not show significant signs of overfitting.")
        return

    print(f"Model {metrics['model_name']} might be overfitting. Training accuracy is significantly higher than validation accuracy at the best epoch.")

def plot_metrics(models_metrics):
    """Plot validation accuracy for all models and loss curves per model.

    Args:
        models_metrics: Iterable of metric dictionaries with the keys
            ``'model_name'``, ``'train_losses'``, ``'val_losses'``,
            ``'val_accuracies'``, ``'best_epoch'`` (1-based) and
            ``'best_val_loss'``.

    All curves are drawn against 1-based epoch numbers so the x-axis agrees
    with ``'best_epoch'``; plotting the lists alone would use 0-based indices
    and place the best-epoch marker one epoch too late.
    """
    # Plot validation accuracy per epoch for all models
    plt.figure(figsize=(12, 8))
    for metrics in models_metrics:
        epochs = range(1, len(metrics['val_accuracies']) + 1)
        plt.plot(epochs, metrics['val_accuracies'], label=f"{metrics['model_name']} - Validation Accuracy")
    plt.title('Validation Accuracy per Epoch for All Models', fontsize=20)
    plt.xlabel('Epoch', fontsize=16)
    plt.ylabel('Validation Accuracy', fontsize=16)
    plt.legend(fontsize=14)
    plt.grid(True)
    plt.show()

    # Plot loss per epoch for each model separately
    for metrics in models_metrics:
        plt.figure(figsize=(12, 8))
        train_epochs = range(1, len(metrics['train_losses']) + 1)
        val_epochs = range(1, len(metrics['val_losses']) + 1)
        plt.plot(train_epochs, metrics['train_losses'], label='Training Loss', color='blue', linewidth=2, marker='o', markersize=5)
        plt.plot(val_epochs, metrics['val_losses'], label='Validation Loss', color='orange', linewidth=2, marker='s', markersize=5)
        # 'best_epoch' is already 1-based, matching the x-axis used above.
        best_epoch = metrics['best_epoch']
        plt.axvline(best_epoch, linestyle='--', color='green', label='Best Epoch', linewidth=1.5)
        plt.annotate(f'Best Epoch: {best_epoch}', xy=(best_epoch, metrics['best_val_loss']), xytext=(best_epoch + 2, metrics['best_val_loss'] + 0.02),
                     arrowprops=dict(facecolor='black', shrink=0.05), fontsize=12, color='green')
        plt.title(f'{metrics["model_name"]} - Loss per Epoch', fontsize=20)
        plt.xlabel('Epoch', fontsize=16)
        plt.ylabel('Loss', fontsize=16)
        plt.legend(fontsize=14)
        plt.grid(True)
        plt.tight_layout()
        plt.show()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Read the English training and validation splits from disk.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Datasets/Dataset_English/train/train.csv',
        '../../Datasets/Dataset_English/val/val.csv',
    )
)

In [None]:
# Check for null values
print("Checking for null values...")
print(train_data.isnull().sum())

# Replace missing comments with empty strings and make sure every comment is a
# string. Assign the result back instead of calling fillna(..., inplace=True)
# on the column selection: that chained in-place form warns on recent pandas
# and does not modify the frame under copy-on-write. Re-running is idempotent.
train_data['comment'] = train_data['comment'].fillna('').astype(str)

# Clean the validation split the same way; otherwise its missing comments
# would be tokenized as the literal string "nan".
val_data['comment'] = val_data['comment'].fillna('').astype(str)

# Verify the data again
print(train_data.head())

Checking for null values...
comment      102
comment.1    102
isToxic        0
dtype: int64
                                             comment  \
0  outrageous block outrageous un wiki lauren cai...   
1  except never dare say something new neil harbo...   
2  thanks reply explanation clarified issue perfe...   
3                           attempted generalization   
4     seem vandalising moving reference stupid place   

                                           comment.1  isToxic  
0  outrageous block outrageous un wiki lauren cai...        0  
1  except never dare say something new neil harbo...        0  
2  thanks reply explanation clarified issue perfe...        0  
3                           attempted generalization        0  
4     seem vandalising moving reference stupid place        1  


In [None]:
# Experiment configuration: one checkpoint, output directory and batch size per model.
models_metrics = []
models_to_train = [
    "MLRS/mBERTu",
    "bert-base-uncased",
    "xlm-roberta-base",
    "roberta-base",
]
custom_paths = [
    "../../models/Experiments/Validation/Experiment-3/mBERTu_ENG",
    "../../models/Experiments/Validation/Experiment-3/BERT_ENG",
    "../../models/Experiments/Validation/Experiment-3/XLM-R_ENG",
    "../../models/Experiments/Validation/Experiment-3/RoBERTa_ENG",
]
batch_sizes = [16, 16, 16, 16]

# Arguments that are identical for every run.
shared_kwargs = dict(
    tokenizer_class=AutoTokenizer,
    model_class=AutoModelForSequenceClassification,
    train_data=train_data,
    val_data=val_data,
    max_length=128,
    accumulation_steps=4,
    early_stopping_patience=4,
)

# Train the models one after another, collecting each run's metrics as it finishes.
for model_name, custom_path, batch in zip(models_to_train, custom_paths, batch_sizes):
    metrics = train_and_evaluate_model(
        model_name=model_name,
        pretrained_model_path=model_name,
        batch_size=batch,
        save_path=custom_path,
        **shared_kwargs,
    )
    models_metrics.append(metrics)

# Plot metrics
plot_metrics(models_metrics)

# Check for overfitting
for metrics in models_metrics:
    check_overfitting(metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at MLRS/mBERTu and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Batch 1/8976: Loss: 0.7101
