# 1. Environment Setup & Imports

In [11]:
import re
import torch
import numpy as np
import optuna
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    DistilBertConfig,
    get_scheduler
)
from tqdm.auto import tqdm

In [12]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.backends.cudnn.benchmark = True  # Enable CuDNN optimizations

# 2. Data Loading and Preprocessing

In [13]:
df = pd.read_csv('datasets/all_it_jobs.csv')
columns_to_keep = ['review_text', 'sentiment']
df = df[columns_to_keep].dropna(subset=['review_text'])

label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
df['label'] = df['sentiment'].map(label_map)

In [16]:
df_sampled = (df.groupby("sentiment")
              .sample(n=500, random_state=42, replace=True)
              .reset_index(drop=True))

In [17]:
# Diagnostics: print sample reviews before cleaning
print("=== Sample reviews (raw) ===")
print(df_sampled['review_text'].head(3).to_list())

=== Sample reviews (raw) ===
['depends on teams avoid proddev bonuses are good on time salary other benefits like good location depends on teams avoid proddev rude senior management', 'do not expect more than 15 of hike while joining company provides the required benefits to the employees like all other big companies do if you are looking for 2040 percent of hike while joining cgi you are not a right candidate for them even though you are very good talented candidate if you negotiate a lot they can give you not more than 15 hike from your current ctc which is ridiculous', 'disrespectful and ancient generally nice people dinosaur it retro 1980s good for understanding how things shouldnt be done secretive great if you want to get in to 1960s spy work james bond stylebletchley park triplicate paper lack of training or when training is provided its interupted no investment in it everything is done on a shoe string']


In [18]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [19]:
df_sampled['cleaned_review'] = df_sampled['review_text'].apply(clean_text)

In [20]:
# Diagnostics: print sample reviews after cleaning
print("\n=== Sample reviews (cleaned) ===")
print(df_sampled['cleaned_review'].head(3).to_list())


=== Sample reviews (cleaned) ===
['depends on teams avoid proddev bonuses are good on time salary other benefits like good location depends on teams avoid proddev rude senior management', 'do not expect more than 15 of hike while joining company provides the required benefits to the employees like all other big companies do if you are looking for 2040 percent of hike while joining cgi you are not a right candidate for them even though you are very good talented candidate if you negotiate a lot they can give you not more than 15 hike from your current ctc which is ridiculous', 'disrespectful and ancient generally nice people dinosaur it retro 1980s good for understanding how things shouldnt be done secretive great if you want to get in to 1960s spy work james bond stylebletchley park triplicate paper lack of training or when training is provided its interupted no investment in it everything is done on a shoe string']


# 3. Optimized Tokenization with Shorter Sequence Length

In [21]:
def tokenize_data(df, max_length=256):
    """Batch tokenization with optimized settings"""
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    return tokenizer(
        text=df['cleaned_review'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False
    )

In [22]:
tokenized = tokenize_data(df_sampled)
input_ids = tokenized['input_ids']
attention_mask = tokenized['attention_mask']
labels = torch.tensor(df_sampled['label'].values, dtype=torch.long)

# 4. Optimized Data Splitting

In [23]:
def stratified_split(inputs, masks, labels, test_size=0.3):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=SEED)
    train_idx, temp_idx = next(sss.split(inputs, labels))
    return (inputs[train_idx], masks[train_idx], labels[train_idx]), \
           (inputs[temp_idx], masks[temp_idx], labels[temp_idx])

In [24]:
(train_inputs, train_masks, train_labels), (temp_inputs, temp_masks, temp_labels) = stratified_split(input_ids, attention_mask, labels)
(val_inputs, val_masks, val_labels), (test_inputs, test_masks, test_labels) = stratified_split(temp_inputs, temp_masks, temp_labels, test_size=0.5)

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# 5. Optimized Training Utilities

In [26]:
def create_dataloader(inputs, masks, labels, batch_size, shuffle=False):
    """Ultra-fast DataLoader configuration"""
    return DataLoader(
        TensorDataset(inputs, masks, labels),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2
    )

In [27]:
def validate(model, val_loader):
    """Batch-wise validation with mixed precision"""
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
        for batch in val_loader:
            inputs, masks, lbls = [t.to(device, non_blocking=True) for t in batch]
            logits = model(inputs, attention_mask=masks).logits
            all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
            all_labels.extend(lbls.cpu().numpy())
    return accuracy_score(all_labels, all_preds)


# 6. Optimized Optuna Objective Function

In [28]:
def objective(trial):
    """Hyper-optimized objective function with early pruning"""
    params = {
        'lr': trial.suggest_float('lr', 1e-5, 5e-5, log=True),
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True),
        'num_epochs': trial.suggest_int('num_epochs', 5, 10),
        'hidden_dropout': trial.suggest_float('hidden_dropout', 0.2, 0.4),
        'attention_dropout': trial.suggest_float('attention_dropout', 0.2, 0.4),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64]),
        'grad_clip': trial.suggest_float('grad_clip', 0.5, 1.5),
        'grad_accum_steps': trial.suggest_int('grad_accum_steps', 1, 2)
    }
    
    print(f"\n--- Starting trial with params: {params} ---")
    
    # Create DataLoaders
    train_loader = create_dataloader(train_inputs, train_masks, train_labels, 
                                     params['batch_size'], shuffle=True)
    val_loader = create_dataloader(val_inputs, val_masks, val_labels, 
                                   params['batch_size'])

    # Model setup
    config = DistilBertConfig.from_pretrained(
        'distilbert-base-uncased',
        num_labels=3,
        hidden_dropout_prob=params['hidden_dropout'],
        attention_probs_dropout_prob=params['attention_dropout']
    )
    model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    config=config
).to(device)
    
    # Optimizer setup
    optimizer = optim.AdamW(model.parameters(), lr=params['lr'], 
                            weight_decay=params['weight_decay'])
    total_steps = (len(train_loader) // params['grad_accum_steps']) * params['num_epochs']
    scheduler = get_scheduler('linear', optimizer, 
                              num_warmup_steps=int(total_steps*0.1), 
                              num_training_steps=total_steps)

    # Training loop
    scaler = torch.cuda.amp.GradScaler() 
    best_acc = 0
    for epoch in range(params['num_epochs']):
        model.train()
        optimizer.zero_grad()
        for step, batch in enumerate(train_loader):
            inputs, masks, lbls = [t.to(device, non_blocking=True) for t in batch]
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(inputs, attention_mask=masks, labels=lbls)
                loss = outputs.loss / params['grad_accum_steps']
            
            scaler.scale(loss).backward()
            
            if (step + 1) % params['grad_accum_steps'] == 0:
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), params['grad_clip'])
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
        
        # Early validation and pruning
        val_acc = validate(model, val_loader)
        print(f"Trial {trial.number}, Epoch {epoch+1}/{params['num_epochs']} - Val Acc: {val_acc:.4f}")
        trial.report(val_acc, epoch)
        if trial.should_prune():
            print("Trial pruned!")
            raise optuna.TrialPruned()
        
        if val_acc > best_acc:
            best_acc = val_acc

    print(f"Trial {trial.number} finished with best validation accuracy: {best_acc:.4f}")
    return best_acc

# 7. Parallelized Optuna Study

In [None]:
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=1)
)
study.optimize(objective, n_trials=20, n_jobs=2, show_progress_bar=True)

[I 2025-02-06 16:21:12,561] A new study created in memory with name: no-name-7a22356b-fa6a-407d-875d-bfc72f5826f8


  0%|          | 0/20 [00:00<?, ?it/s]


--- Starting trial with params: {'lr': 2.295679162486461e-05, 'weight_decay': 1.2257352124341086e-06, 'num_epochs': 10, 'hidden_dropout': 0.2175567299478534, 'attention_dropout': 0.2190715109748077, 'batch_size': 64, 'grad_clip': 0.9405184584750129, 'grad_accum_steps': 1} ---

--- Starting trial with params: {'lr': 2.3593578994251797e-05, 'weight_decay': 7.75963334522764e-06, 'num_epochs': 9, 'hidden_dropout': 0.3825242272843598, 'attention_dropout': 0.35805707750424864, 'batch_size': 32, 'grad_clip': 1.0447225346682156, 'grad_accum_steps': 1} ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()


In [None]:
try:
    import optuna.visualization as vis
    vis.plot_optimization_history(study).show()
except Exception as e:
    print("Visualization failed:", e)

# 8. Final Training with Best Parameters

In [18]:
best_params = study.best_params.copy()
print("\nBest Hyperparameters Found:")
print(best_params)

Best hyperparameters: {'lr': 0.0004138040112561013, 'weight_decay': 8.200518402245835e-06, 'num_epochs': 5, 'hidden_dropout': 0.3736932106048628, 'attention_dropout': 0.27606099749584057}


In [None]:
# Create DataLoaders
train_loader = create_dataloader(train_inputs, train_masks, train_labels, 
                               best_params['batch_size'], shuffle=True)
val_loader = create_dataloader(val_inputs, val_masks, val_labels, 
                             best_params['batch_size'])
test_loader = create_dataloader(test_inputs, test_masks, test_labels, 
                              best_params['batch_size'])

In [22]:
# Model initialization
config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
    hidden_dropout_prob=best_params['hidden_dropout'],
    attention_probs_dropout_prob=best_params['attention_dropout']
)
model = DistilBertForSequenceClassification(config).to(device)

In [None]:
# Optimizer setup
optimizer = optim.AdamW(model.parameters(), lr=best_params['lr'], 
                      weight_decay=best_params['weight_decay'])
total_steps = (len(train_loader) // best_params['grad_accum_steps']) * best_params['num_epochs']
scheduler = get_scheduler('linear', optimizer, 
                        num_warmup_steps=int(total_steps*0.1), 
                        num_training_steps=total_steps)

# 9. Optimized Training Loop with Mixed Precision

In [23]:
# 9. Optimized Training Loop with Mixed Precision (Fixed)
def optimized_train(model, train_loader, optimizer, scheduler, params):
    """Ultra-efficient training loop with all optimizations"""
    model.train()
    total_loss = 0
    scaler = torch.cuda.amp.GradScaler()

    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(train_loader, desc="Training")):  # Fixed: Added step counter
        inputs, masks, lbls = [t.to(device, non_blocking=True) for t in batch]

        with torch.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(inputs, attention_mask=masks, labels=lbls)
            loss = outputs.loss / params['grad_accum_steps']

        scaler.scale(loss).backward()

        if (step + 1) % params['grad_accum_steps'] == 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), params['grad_clip'])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        total_loss += loss.item() * params['grad_accum_steps']

    return total_loss / len(train_loader)

# 10. Early Stopped Training Execution

In [24]:
train_losses = []
val_accuracies = []

best_val_acc = 0
patience_counter = 0

for epoch in range(best_params['num_epochs']):
    train_loss = optimized_train(model, train_loader, optimizer, scheduler, best_params)
    train_losses.append(train_loss)
    
    val_acc = validate(model, val_loader)
    val_accuracies.append(val_acc)
    
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Acc = {val_acc:.4f}")
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), 'best_uncased_bert_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= 3:
            print("Early stopping triggered")
            break

In [None]:
plt.figure(figsize=(12, 5))

# Plot Training Loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss', color='blue', marker='o')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plot Validation Accuracy
plt.subplot(1, 2, 2)
plt.plot(val_accuracies, label='Validation Accuracy', color='green', marker='o')
plt.title('Validation Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Final Evaluation

In [None]:
def evaluate_with_confusion_matrix(model, test_loader):
    """Evaluate model and generate confusion matrix"""
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
        for batch in test_loader:
            inputs, masks, lbls = [t.to(device, non_blocking=True) for t in batch]
            logits = model(inputs, attention_mask=masks).logits
            all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
            all_labels.extend(lbls.cpu().numpy())
    
    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Final Test Accuracy: {accuracy:.4f}")
    
    # Generate confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    return cm, accuracy

In [None]:
# Load the best model
model.load_state_dict(torch.load('best_uncased_bert_model.pth'))

In [None]:
# Evaluate and plot confusion matrix
cm, test_accuracy = evaluate_with_confusion_matrix(model, test_loader)

In [None]:
# Plot confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Negative', 'Neutral', 'Positive'], 
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.title(f'Confusion Matrix (Test Accuracy: {test_accuracy:.2%})')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()