In [21]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

# Name of the dataset column holding the sentence text (read when tokenizing)
sentence_column_name = "Sentence"
# Name of the dataset column holding the sentiment label
# (read when computing class weights and when building the label tensor)
sentiment_column_name = "Final_Sent"

In [22]:
# 1. Data Loading and Splitting
# Load the CSV file containing the sentiment data
# (path is relative to the notebook's working directory)
df = pd.read_csv('annotated_dataset.csv')
# Split data into 80% training+validation and 20% test
# random_state is fixed so the same split is produced on every run.
# NOTE(review): no `stratify=` argument is given, so class proportions are not
# guaranteed to match across the splits.
remaining, test = train_test_split(df, test_size=0.2, random_state=42)
# Split the remaining 80% into 70% training and 10% validation (0.125 of 80% = 10% overall)
train, val = train_test_split(remaining, test_size=0.125, random_state=42)

# Display the three splits as the cell output
train, val, test

(                                               Sentence  \
 1292  As the Democratic National Convention approach...   
 2200                    He's like fighter, fighter man.   
 2780  Donald Trump over performed by almost 5% accor...   
 2960  If Kamala wants my vote, I want her to take a ...   
 2870  Around 80% of the African Americans here in Mi...   
 ...                                                 ...   
 2748     If Michigan goes to Donald Trump, he will win.   
 636   And Harris's ability to appeal to them could p...   
 861   Donald Trump, on the other hand, in a town hal...   
 1763  You see Obama won Eaton County twice so did Tr...   
 1177  It's all there publicly, but Donald Trump's go...   
 
      Presidential_Candidate         State  Vote_1  Vote_2  Final_Sent  
 1292          Kamala Harris  Pennsylvania       1       1           1  
 2200           Donald Trump      Michigan       1       1           1  
 2780           Donald Trump      Michigan       1       0 

In [9]:
# Function to calculate weights for each class to handle class imbalance
def compute_class_weights(labels, num_classes=3):
    """Compute inverse-frequency class weights for an imbalanced label set.

    Labels are expected in the range ``-1 .. num_classes - 2`` (``-1, 0, 1``
    for the default three classes) and are shifted by +1 so they can be used
    as ``np.bincount`` indices.

    Args:
        labels: Array-like of integer labels.
        num_classes: Number of classes the returned weight vector must cover.
            Defaults to 3, matching the -1/0/1 sentiment labels.

    Returns:
        A ``torch.FloatTensor`` of length ``num_classes`` whose entries sum to
        ``num_classes``. A class with no samples gets weight 0 rather than
        ``inf``.

    Raises:
        ValueError: If ``labels`` is empty or contains a value outside the
            expected range.
    """
    # Map -1, 0, 1 to 0, 1, 2 for bincount
    mapped_labels = np.asarray(labels) + 1
    if mapped_labels.size == 0:
        raise ValueError("compute_class_weights: labels is empty")
    if mapped_labels.min() < 0 or mapped_labels.max() >= num_classes:
        raise ValueError(
            f"compute_class_weights: labels must lie in [-1, {num_classes - 2}]"
        )

    # minlength keeps the vector at num_classes entries even when the
    # highest-numbered class does not occur in `labels`.
    class_counts = np.bincount(mapped_labels, minlength=num_classes)

    # Inverse frequency (rare classes get higher weights). Absent classes stay
    # at 0 so that a zero count never turns into an inf/nan weight.
    present = class_counts > 0
    weights = np.zeros(num_classes, dtype=np.float64)
    weights[present] = 1.0 / class_counts[present]

    # Normalize weights to sum to the number of classes
    weights = weights * num_classes / weights.sum()
    return torch.FloatTensor(weights)

# Calculate class weights from training data
# (only the training split is used; validation and test labels are not read here)
class_weights = compute_class_weights(train[sentiment_column_name].values)

# Display the weights as the cell output (a tuple of caption and tensor)
"Class weights: ", class_weights

('Class weights: ', tensor([1.0569, 1.0804, 0.8626]))

In [11]:
# 2. Model Initialization
# Name of the pretrained BERT checkpoint to start from
model_name = 'bert-base-uncased'

# Prefer the GPU when one is visible to PyTorch; fall back to the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Keep the class weights on the device chosen above
class_weights = class_weights.to(device)

# Custom BERT class that implements weighted loss
class BertWithWeightedLoss(BertForSequenceClassification):
    """BERT sequence classifier that computes a class-weighted cross-entropy loss.

    The parent's built-in loss is bypassed (``labels=None`` is passed to the
    parent ``forward``) and replaced by ``CrossEntropyLoss(weight=class_weights)``.

    Args:
        config: Model configuration forwarded to the parent class.
        class_weights: Optional 1-D tensor with one weight per class. When
            ``None`` (the default) the loss is an unweighted cross-entropy, so
            the class can also be constructed without weights, e.g. for
            inference.
    """

    def __init__(self, config, class_weights=None):
        super().__init__(config)
        # Plain attribute, not a parameter/buffer: it is not moved by
        # `model.to(...)`, so `forward` aligns it with the logits.
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        """Run the classifier and attach the weighted loss when labels are given.

        Args:
            input_ids: Token-id tensor passed to the parent model.
            attention_mask: Optional attention mask passed to the parent model.
            labels: Optional class-index targets (values in ``0 .. num_labels - 1``).
            **kwargs: Any further keyword arguments (e.g. ``token_type_ids``)
                are forwarded unchanged to the parent ``forward``.

        Returns:
            The parent's output object; ``outputs.loss`` is set when ``labels``
            is not ``None``.
        """
        # Get model outputs without letting the parent compute its own loss
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,
            **kwargs,
        )

        # Calculate weighted loss if labels are provided
        if labels is not None:
            logits = outputs.logits
            weight = self.class_weights
            if weight is not None:
                # Match the logits' device and dtype so the loss works even if
                # the weights were created on another device.
                weight = weight.to(device=logits.device, dtype=logits.dtype)
            loss_fct = CrossEntropyLoss(weight=weight)
            loss = loss_fct(
                logits.view(-1, self.num_labels),  # (N, num_labels)
                labels.view(-1),                   # (N,)
            )
            outputs.loss = loss  # Add loss to outputs

        return outputs

# Initialize the custom BERT model
# Load only the configuration (no second copy of the pretrained weights) and
# configure it for three-class classification (-1 / 0 / 1 sentiment).
model_config = BertWithWeightedLoss.config_class.from_pretrained(
    model_name,
    num_labels=3,
    output_attentions=False,  # Don't output attention weights
    output_hidden_states=False,  # Don't output hidden states
)
model = BertWithWeightedLoss.from_pretrained(
    model_name,
    config=model_config,
    class_weights=class_weights
)
# Move model to GPU if available
model.to(device)

# Initialize tokenizer for processing text
tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertWithWeightedLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [17]:
# 3. Data Preparation
# Convert text to BERT input format
# Pull the raw sentences and labels out of the training split
train_texts = train[sentence_column_name].tolist()
train_labels = torch.tensor(train[sentiment_column_name].tolist())

# Tokenize every training sentence in one call and get PyTorch tensors back.
# Sequences are cut at 128 tokens; padding=True pads to the longest sequence
# in this call.
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt'
)

# Bundle token ids, attention masks and labels so they are batched together
train_dataset = torch.utils.data.TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels
)

# Shuffled mini-batches of 16 examples
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

train_loader

<torch.utils.data.dataloader.DataLoader at 0x22f13652250>

In [23]:
# 4. Training Function
def train_model(model, train_loader, val_loader=None, epochs=3, learning_rate=2e-5):
    """Fine-tune ``model`` and optionally validate and checkpoint after each epoch.

    Every batch is unpacked as ``(input_ids, attention_mask, labels)``. Labels
    are stored as -1/0/1 and shifted by +1 to the 0/1/2 class indices the loss
    expects; the same shift is applied in training and in validation.

    Uses the notebook-level ``device`` for tensor placement and the
    notebook-level ``tokenizer`` when saving a checkpoint.

    Args:
        model: Model whose forward returns an object with ``loss`` and ``logits``.
        train_loader: Iterable of training batches.
        val_loader: Optional iterable of validation batches. When given, the
            model and tokenizer are saved to ``'sentiment_bert_model'`` each
            time the average validation loss improves.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to AdamW.

    Returns:
        The model in its state after the last epoch (not necessarily the
        best-validation checkpoint).
    """
    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    best_val_loss = float('inf')  # Track best validation loss

    # Training loop
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        total_loss = 0

        # Wrap the training loader with tqdm to display progress
        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1} Training", leave=False)
        # Process each batch
        for batch in pbar:
            # Move batch to GPU if available
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            # Remap labels from [-1, 0, 1] to [0, 1, 2]
            labels = labels + 1

            optimizer.zero_grad()  # Clear previous gradients
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()  # Accumulate loss

            loss.backward()  # Backward pass
            optimizer.step()  # Update weights

            # Show the current batch loss in the progress bar
            pbar.set_postfix(loss=loss.item())

        # Calculate average loss for epoch
        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}, Average training loss: {avg_train_loss:.4f}')

        # Validation phase
        if val_loader is not None:
            model.eval()  # Set model to evaluation mode
            val_loss = 0
            correct = 0
            total = 0

            # Wrap the validation loop with tqdm:
            val_pbar = tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation", leave=False)
            # No gradient computation needed for validation
            with torch.no_grad():
                for batch in val_pbar:
                    input_ids, attention_mask, labels = [b.to(device) for b in batch]

                    # Same remap as in training: without it, -1 is an invalid
                    # target for the loss and the accuracy below would compare
                    # 0/1/2 predictions against -1/0/1 labels.
                    labels = labels + 1

                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    val_loss += outputs.loss.item()

                    # Calculate accuracy
                    _, predicted = torch.max(outputs.logits, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

                    val_pbar.set_postfix(loss=outputs.loss.item())

            # Calculate validation metrics
            avg_val_loss = val_loss / len(val_loader)
            accuracy = correct / total
            print(f'Validation loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}')

            # Save model if it's the best so far
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                model_path = 'sentiment_bert_model'
                model.save_pretrained(model_path)
                tokenizer.save_pretrained(model_path)
                print(f"Saved model with validation loss: {avg_val_loss:.4f}")

    return model

# Fine-tune with the default number of epochs and learning rate.
# NOTE(review): no val_loader is passed, so train_model skips its validation
# phase and therefore never saves a checkpoint to disk.
model = train_model(model, train_loader)

# Display the trained model's module structure as the cell output
model

                                                                                                                       

Epoch 1, Average training loss: 0.6814


                                                                                                                       

Epoch 2, Average training loss: 0.4427


                                                                                                                       

Epoch 3, Average training loss: 0.2447




BertWithWeightedLoss(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

Predicted class (-1, 0, 1): 1
