In [1]:
# Imports and Initial Setup
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd
import numpy as np
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import itertools

# Name to use for saving the model
model_path = 'DistilBERT Sentiment Model'

# Dataset (CSV) Column Names
sentence_column_name = "Sentence"
sentiment_column_name = "Final_Sent"

# To Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cpu')

In [2]:
# Data Splitting
# Load the dataset from CSV file
df = pd.read_csv('annotated_dataset.csv')
# Split data into 80% training+validation and 20% test
remaining, test = train_test_split(df, test_size=0.2, random_state=42)
# Split the remaining 80% into 70% training and 10% validation (0.125 of 80% = 10% overall)
train, val = train_test_split(remaining, test_size=0.125, random_state=42)

train, val, test

(                                               Sentence  \
 1292  As the Democratic National Convention approach...   
 2200                    He's like fighter, fighter man.   
 2780  Donald Trump over performed by almost 5% accor...   
 2960  If Kamala wants my vote, I want her to take a ...   
 2870  Around 80% of the African Americans here in Mi...   
 ...                                                 ...   
 2748     If Michigan goes to Donald Trump, he will win.   
 636   And Harris's ability to appeal to them could p...   
 861   Donald Trump, on the other hand, in a town hal...   
 1763  You see Obama won Eaton County twice so did Tr...   
 1177  It's all there publicly, but Donald Trump's go...   
 
      Presidential_Candidate         State  Vote_1  Vote_2  Final_Sent  
 1292          Kamala Harris  Pennsylvania       1       1           1  
 2200           Donald Trump      Michigan       1       1           1  
 2780           Donald Trump      Michigan       1       0 

In [3]:
# Set up the model and compute class weights
def compute_class_weights(labels):
    """
    Calculate weights for each class to handle imbalanced data
    For example, if we have 100 positive but only 10 negative samples,
    negative samples will get higher weight to balance their importance
    """
    # Shift labels for model [-1, 0, 1] to [0, 1, 2]
    mapped_labels = labels + 1
    # Count how many samples we have of each class
    class_counts = np.bincount(mapped_labels)
    # Give higher weights to classes with fewer samples
    weights = 1. / class_counts
    # Normalize weights to sum to number of classes
    weights = weights * len(class_counts) / weights.sum()
    return torch.FloatTensor(weights)

# Calculate weights for each class from training data
class_weights = compute_class_weights(train[sentiment_column_name].values)
class_weights = class_weights.to(device)  # Move weights to GPU if available

class_weights

tensor([1.0569, 1.0804, 0.8626])

In [4]:
# Base DistilBERT model to use
model_name = 'distilbert-base-uncased'

# Create a custom DistilBERT model that can handle weighted loss
class DistilBertWithWeightedLoss(DistilBertForSequenceClassification):
    """
    Custom DistilBERT model that applies different weights to each class
    This helps handle imbalanced datasets better
    """
    def __init__(self, config, class_weights):
        super().__init__(config)
        self.class_weights = class_weights # Store class weights for loss calculation
    
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get model outputs without computing loss
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None # Set to None to prevent automatic loss calculation
        )
        
        # Calculate weighted loss if labels are provided (training phase)
        if labels is not None:
            # Create loss function with class weights
            loss_fct = CrossEntropyLoss(weight=self.class_weights)
            # Calculate loss using model predictions and true labels
            loss = loss_fct(
                outputs.logits.view(-1, self.num_labels),  # Reshape predictions
                labels.view(-1)                            # Reshape labels
            )
            outputs.loss = loss  # Add loss to outputs
        
        return outputs

# Initialize the tokenizer that will convert text to numbers
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

tokenizer

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [5]:
# Data Preparation Functions
def create_data_loader(data, tokenizer, batch_size):
    """
    Convert text data into a format DistilBERT can understand and create batches

    Args:
        data: DataFrame containing text and labels
        tokenizer: DistilBERT tokenizer to convert text to numbers
        batch_size: How many samples to process at once

    Returns:
        DataLoader that yields batches of processed data
    """
    # Convert text to DistilBERT input format with progress bar
    encodings = tokenizer(
        data[sentence_column_name].tolist(), # Convert sentences to list
        truncation=True, # Cut texts longer than max_length
        padding=True, # Pad texts shorter than max_length
        max_length=128, # Maximum sequence length
        return_tensors='pt', # Return PyTorch tensors
        verbose=True # Show progress
    )

    # Create dataset by combining inputs and labels
    dataset = torch.utils.data.TensorDataset(
        encodings['input_ids'], # Tokenized text
        encodings['attention_mask'], # Attention mask for padding
        torch.tensor(data[sentiment_column_name].tolist()) # Labels
    )

    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [6]:
# Evaluation Function
def evaluate_model(model, data_loader, device):
    """
    Evaluate model performance using various metrics
    
    Args:
        model: The DistilBERT model to evaluate
        data_loader: DataLoader containing validation or test data
        device: CPU or GPU
    
    Returns:
        Dictionary containing various performance metrics
    """
    model.eval() # Set model to evaluation mode
    val_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad(): # Don't compute gradients during evaluation
        for batch in tqdm(data_loader, desc="Evaluation"):
            # Move batch to GPU if available
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            adjusted_labels = labels + 1 # Shift labels for model

            # Get model predictions
            outputs = model(input_ids, attention_mask, labels=adjusted_labels)
            val_loss += outputs.loss.item()

            # Store predictions and true labels
            _, predicted = torch.max(outputs.logits, 1)
            all_preds.extend((predicted - 1).cpu().numpy())
            all_labels.extend((adjusted_labels - 1).cpu().numpy())

    # Calculate various performance metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    return {
        'loss': val_loss / len(data_loader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [7]:
# Training Function
def train_model(model, train_loader, val_loader, device, epochs, learning_rate):
    """
    Train the model and periodically evaluate its performance
    
    Args:
        model: The DistilBERT model to train
        train_loader: DataLoader with training data
        val_loader: DataLoader with validation data
        device: CPU or GPU
        epochs: Number of times to process all training data
        learning_rate: How quickly the model should learn
    
    Returns:
        Trained model and its best validation metrics
    """
    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    best_metrics = None
    best_model = None
    
    # Training loop
    for epoch in range(epochs):
        model.train() # Set model to training mode
        total_loss = 0
        
        # Process each batch
        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} Training")
        for batch in pbar:
            # Move batch to GPU if available
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            adjusted_labels = labels + 1 # Shift labels for model [-1, 0, 1] to [0, 1, 2]

            # Training step
            optimizer.zero_grad() # Clear previous gradients
            outputs = model(input_ids, attention_mask, labels=adjusted_labels) # Forward pass
            
            loss = outputs.loss
            total_loss += loss.item() # Accumulate loss

            # Update model weights
            loss.backward() # Backward pass
            optimizer.step() # Update weights

            # Update progress bar with current loss
            pbar.set_postfix(loss=loss.item())

        # Calculate average loss for this epoch
        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Average training loss: {avg_train_loss:.4f}')
        
        val_metrics = evaluate_model(model, val_loader, device)
        print(f'Validation metrics: {val_metrics}')
        
        # Update best parameters if accuracy score improves
        if best_metrics == None or val_metrics['accuracy'] > best_metrics['accuracy']:
            best_metrics = val_metrics
            best_model = model
    
    return best_model, best_metrics

In [8]:
# Define different values to try for each parameter
param_grid = {
    'learning_rate': [2e-5, 5e-5], # DistilBERT works well with similar learning rates as BERT
    'batch_size': [32], # Same batch size as BERT works well
    'epochs': [5] # DistilBERT might converge faster due to its smaller size
}

# Hyperparameter Tuning Function
def hyperparameter_tuning(train_data, val_data, device, class_weights):
    """
    Try different combinations of hyperparameters to find the best ones
    
    Args:
        train_data: Training data DataFrame
        val_data: Validation data DataFrame
        device: CPU or GPU
        class_weights: Weights for each class
    
    Returns:
        Best parameters and their corresponding metrics
    """    
    # Create all possible combinations of parameters
    param_combinations = [
        dict(zip(param_grid.keys(), v)) 
        for v in itertools.product(*param_grid.values())
    ]

    best_model = None
    best_metrics = None
    best_params = None

    # Try each combination of parameters
    for params in param_combinations:
        print(f"\nTrying parameters: {params}")

        # Create data loaders with current batch size
        train_loader = create_data_loader(train_data, tokenizer, params['batch_size'])
        val_loader = create_data_loader(val_data, tokenizer, params['batch_size'])

        # Initialize the custom DistilBERT model
        model = DistilBertWithWeightedLoss.from_pretrained(
            model_name,
            # Configure DistilBERT for classification
            config=DistilBertForSequenceClassification.from_pretrained(
                model_name,
                num_labels=3,
                output_attentions=False, # Don't output attention weights
                output_hidden_states=False, # Don't output hidden states
            ).config,
            class_weights=class_weights
        )
        # Move model to GPU if available
        model.to(device)

        # Train model with current parameters
        model, val_metrics = train_model(
            model, 
            train_loader, 
            val_loader,
            device,
            params['epochs'],
            params['learning_rate']
        )

        # Update best parameters if accuracy score improves
        if best_metrics == None or val_metrics['accuracy'] > best_metrics['accuracy']:
            best_model = model
            best_params = params
            best_metrics = val_metrics
    
    return best_model, best_params, best_metrics

In [9]:
# Run Hyperparameter Tuning
best_model, best_params, best_metrics = hyperparameter_tuning(train, val, device, class_weights)
print(f"\nBest parameters: {best_params}")
print(f"Best validation metrics: {best_metrics}")

# Save the best model
best_model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

best_model


Trying parameters: {'learning_rate': 2e-05, 'batch_size': 32, 'epochs': 5}


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertWithWeightedLoss were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:35<00:00,  4.76s/it, loss=0.617]


Epoch 1/5, Average training loss: 0.9748


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:12<00:00,  1.00s/it]


Validation metrics: {'loss': 0.8230836888154348, 'accuracy': 0.6375661375661376, 'precision': 0.637078456522901, 'recall': 0.6375661375661376, 'f1': 0.6149331550447501}


Epoch 2/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:33<00:00,  4.74s/it, loss=0.555]


Epoch 2/5, Average training loss: 0.7155


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.02it/s]


Validation metrics: {'loss': 0.6955294162034988, 'accuracy': 0.6984126984126984, 'precision': 0.6895134641302094, 'recall': 0.6984126984126984, 'f1': 0.6883485616098476}


Epoch 3/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:31<00:00,  4.71s/it, loss=0.518]


Epoch 3/5, Average training loss: 0.4995


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.03it/s]


Validation metrics: {'loss': 0.6915995875994364, 'accuracy': 0.7116402116402116, 'precision': 0.707520732256182, 'recall': 0.7116402116402116, 'f1': 0.7088566923039965}


Epoch 4/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:35<00:00,  4.76s/it, loss=0.303]


Epoch 4/5, Average training loss: 0.3339


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.03it/s]


Validation metrics: {'loss': 0.8327804307142893, 'accuracy': 0.6984126984126984, 'precision': 0.6931015963259949, 'recall': 0.6984126984126984, 'f1': 0.6896182944526121}


Epoch 5/5 Training: 100%|█████████████████████████████████████████████████| 83/83 [06:38<00:00,  4.80s/it, loss=0.0448]


Epoch 5/5, Average training loss: 0.2051


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.01it/s]


Validation metrics: {'loss': 0.8988261272509893, 'accuracy': 0.6957671957671958, 'precision': 0.6926488675392184, 'recall': 0.6957671957671958, 'f1': 0.6927282976972418}

Trying parameters: {'learning_rate': 5e-05, 'batch_size': 32, 'epochs': 5}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertWithWeightedLoss were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:58<00:00,  5.04s/it, loss=0.559]


Epoch 1/5, Average training loss: 0.9082


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.02it/s]


Validation metrics: {'loss': 0.7382264683643976, 'accuracy': 0.6693121693121693, 'precision': 0.6638640465079928, 'recall': 0.6693121693121693, 'f1': 0.6639456513316695}


Epoch 2/5 Training: 100%|██████████████████████████████████████████████████| 83/83 [06:54<00:00,  4.99s/it, loss=0.595]


Epoch 2/5, Average training loss: 0.5844


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:12<00:00,  1.01s/it]


Validation metrics: {'loss': 0.7595544954140981, 'accuracy': 0.6746031746031746, 'precision': 0.6994809299781407, 'recall': 0.6746031746031746, 'f1': 0.6812595837668226}


Epoch 3/5 Training: 100%|███████████████████████████████████████████████████| 83/83 [06:54<00:00,  4.99s/it, loss=0.54]


Epoch 3/5, Average training loss: 0.3508


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.00it/s]


Validation metrics: {'loss': 0.7778427749872208, 'accuracy': 0.716931216931217, 'precision': 0.7128518651080411, 'recall': 0.716931216931217, 'f1': 0.7101108896408577}


Epoch 4/5 Training: 100%|█████████████████████████████████████████████████| 83/83 [06:56<00:00,  5.02s/it, loss=0.0198]


Epoch 4/5, Average training loss: 0.1814


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.02it/s]


Validation metrics: {'loss': 0.9560512825846672, 'accuracy': 0.701058201058201, 'precision': 0.7094646395984178, 'recall': 0.701058201058201, 'f1': 0.7040260695556704}


Epoch 5/5 Training: 100%|█████████████████████████████████████████████████| 83/83 [06:58<00:00,  5.04s/it, loss=0.0119]


Epoch 5/5, Average training loss: 0.1006


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 12/12 [00:12<00:00,  1.03s/it]


Validation metrics: {'loss': 1.1578800280888875, 'accuracy': 0.7037037037037037, 'precision': 0.7046887879360371, 'recall': 0.7037037037037037, 'f1': 0.7008976250432833}

Best parameters: {'learning_rate': 5e-05, 'batch_size': 32, 'epochs': 5}
Best validation metrics: {'loss': 0.7778427749872208, 'accuracy': 0.716931216931217, 'precision': 0.7128518651080411, 'recall': 0.716931216931217, 'f1': 0.7101108896408577}


DistilBertWithWeightedLoss(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
# Training Time
# 06:35 00:12
# 06:33 00:11
# 06:31 00:11
# 06:35 00:11
# 06:38 00:11

# 06:58 00:11
# 06:54 00:12
# 06:54 00:11
# 06:56 00:11
# 06:58 00:12

In [10]:
# Final Evaluation on Test Set
test_loader = create_data_loader(test, tokenizer, best_params['batch_size'])
print("\nEvaluating final model on test set...")
test_metrics = evaluate_model(best_model, test_loader, device)
print(f"Test set metrics: {test_metrics}")


Evaluating final model on test set...


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 24/24 [00:34<00:00,  1.44s/it]

Test set metrics: {'loss': 1.1459480722745259, 'accuracy': 0.6917989417989417, 'precision': 0.6900590897555074, 'recall': 0.6917989417989417, 'f1': 0.6902151615550367}





In [11]:
# Set the model to evaluation mode
best_model.eval()

# Example text for prediction
example_text = "harris leads by 1%"

# Tokenize the input text
encoded_input = tokenizer(
    example_text,
    return_tensors="pt", # Return PyTorch tensors
    truncation=True,
    padding=True
)
# Remove token_type_ids if not used
encoded_input.pop("token_type_ids", None)

# Perform prediction without gradient computation
with torch.no_grad():
    outputs = best_model(**encoded_input)

# Get the logits from the model's output
logits = outputs.logits

# Get the predicted class (0, 1, or 2)
predicted_class = torch.argmax(logits, dim=1).item() - 1
print("Predicted class (-1, 0, 1):", predicted_class)

Predicted class (-1, 0, 1): 1
