# Import Dependencies

In [None]:
import os, copy, torch, itertools
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd
import numpy as np
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

# Set Configurations

In [19]:
# Column names used in the dataset CSV files
sentence_column_name, sentiment_column_name = "Sentences", "Final_Sent"

# Folder that holds the annotated dataset and its train/validation/test splits
final_dataset_folder_name = os.path.join('..', '4) Sentiment Annotation')

# Folder that receives this model's evaluation results; created up front
model_evaluation_result_folder_name = os.path.join("Model Results and Actual Data", "BERT")
os.makedirs(model_evaluation_result_folder_name, exist_ok=True)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

device

device(type='cpu')

# Data Splitting (Keep the Same Data for Both Models)

In [None]:
def split_data():
    """Load the train/validation/test splits, creating them on first use.

    If ``train.csv``, ``validation.csv`` and ``test.csv`` already exist in
    ``final_dataset_folder_name`` they are read back, so repeated runs reuse
    exactly the same rows. Otherwise ``final_dataset.csv`` is split
    70% / 10% / 20% with a fixed random seed and the three splits are saved.

    Returns:
        tuple: ``(train, val, test, df)`` DataFrames. ``df`` is the full
        dataset: the three splits concatenated (with a fresh index) when they
        were loaded from disk, or the contents of ``final_dataset.csv`` when
        the splits were freshly created.
    """
    # Create output directory if it doesn't exist
    os.makedirs(final_dataset_folder_name, exist_ok=True)

    # Define file paths
    train_path = os.path.join(final_dataset_folder_name, 'train.csv')
    val_path = os.path.join(final_dataset_folder_name, 'validation.csv')
    test_path = os.path.join(final_dataset_folder_name, 'test.csv')

    # Reuse existing split files when all three are present
    if all(os.path.exists(path) for path in (train_path, val_path, test_path)):
        train = pd.read_csv(train_path)
        val = pd.read_csv(val_path)
        test = pd.read_csv(test_path)

        # Combine DataFrames vertically. DataFrame.append was removed in
        # pandas 2.0, so pd.concat is used instead.
        df = pd.concat([train, val, test], ignore_index=True)

        return train, val, test, df

    # Load the dataset
    df = pd.read_csv(os.path.join(final_dataset_folder_name, 'final_dataset.csv'))

    # Split data into 80% training+validation and 20% test
    remaining, test = train_test_split(df, test_size=0.2, random_state=42)
    # Split the remaining 80% into 70% training and 10% validation
    # (0.125 of 80% = 10% overall)
    train, val = train_test_split(remaining, test_size=0.125, random_state=42)

    # Save splits so later runs load identical data
    train.to_csv(train_path, index=False)
    val.to_csv(val_path, index=False)
    test.to_csv(test_path, index=False)

    return train, val, test, df

# Load the persisted splits, or create and save them on the first run
train, val, test, full = split_data()

# Show the four DataFrames as the cell output
train, val, test, full

# Computing Class Weights

In [None]:
def compute_class_weights(labels, num_classes=3):
    """Compute inverse-frequency class weights for a weighted loss.

    Args:
        labels: Array-like of integer sentiment labels starting at -1
            (``-1, 0, 1`` for the default three classes).
        num_classes: Number of classes the loss expects. The result always
            has at least this many entries, even when some class does not
            occur in ``labels``.

    Returns:
        torch.FloatTensor: One weight per class, larger for rarer classes,
        normalized so the weights sum to the number of classes.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    # Shift labels for model [-1, 0, 1] to [0, 1, 2]
    mapped_labels = np.asarray(labels) + 1
    if mapped_labels.size == 0:
        raise ValueError("labels must not be empty")

    # Count samples per class. minlength keeps one slot per class even when
    # the highest class is absent from the data.
    class_counts = np.bincount(mapped_labels, minlength=num_classes)
    # Treat an absent class as if it had one sample, so its weight stays
    # finite instead of becoming inf (and turning the normalization into nan).
    safe_counts = np.maximum(class_counts, 1)

    # Give higher weights to classes with fewer samples
    weights = 1.0 / safe_counts
    # Normalize weights to sum to number of classes
    weights = weights * len(safe_counts) / weights.sum()
    return torch.tensor(weights, dtype=torch.float32)

# Calculate weights for each class from the training split only, so the
# validation and test label frequencies never influence the loss
class_weights = compute_class_weights(train[sentiment_column_name].values)
class_weights = class_weights.to(device)  # Move weights to GPU if available

# Show the resulting weight tensor as the cell output
class_weights

# Configure Model and Tokenizer (BERT)

In [7]:
# Name of the pretrained BERT checkpoint used for both model and tokenizer
model_name = 'bert-base-uncased'

# BERT sequence classifier that swaps the built-in loss for a class-weighted one
class BertWithWeightedLoss(BertForSequenceClassification):
    """Sequence classifier whose loss is class-weighted cross-entropy."""

    def __init__(self, config, class_weights):
        super().__init__(config)
        # Stored as a plain attribute (not a registered buffer), so the
        # caller must place the tensor on the model's device itself
        self.class_weights = class_weights

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the classifier and attach a weighted loss when labels are given.

        Args:
            input_ids: Token ID tensor passed to the parent ``forward``.
            attention_mask: Optional attention mask passed to the parent.
            labels: Optional class-index tensor; when given, the returned
                output's ``loss`` is the weighted cross-entropy.

        Returns:
            The parent model's output object, with ``loss`` set only when
            ``labels`` is not None.
        """
        # labels=None keeps the parent from computing its own unweighted loss
        result = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None
        )

        # Without labels there is nothing to score: return the raw outputs
        if labels is None:
            return result

        # Flatten to (N, num_labels) logits and (N,) targets for the loss
        flat_logits = result.logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1)

        weighted_ce = CrossEntropyLoss(weight=self.class_weights)
        result.loss = weighted_ce(flat_logits, flat_labels)

        return result

# Initialize the tokenizer for the same checkpoint as the model; it converts
# text into the token IDs and attention masks that BERT consumes
tokenizer = BertTokenizer.from_pretrained(model_name)

# Show the tokenizer configuration as the cell output
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

# Define Model Evaluation Function

In [None]:
def evaluate_model(model, eval_data, device):
    """Evaluate ``model`` on ``eval_data`` one sentence at a time.

    Args:
        model: Classifier that, called as
            ``model(input_ids, attention_mask, labels=...)``, returns an
            object exposing ``loss`` and ``logits``.
        eval_data: DataFrame with the ``sentence_column_name`` and
            ``sentiment_column_name`` columns. It is not modified.
        device: torch device the inputs are moved to before the forward pass.

    Returns:
        tuple: ``(metrics, results_df)``. ``metrics`` is a dict with the keys
        ``loss`` (mean per-sentence loss), ``accuracy``, ``precision``,
        ``recall`` and ``f1-score`` (the last three weighted by class
        support). ``results_df`` is a copy of ``eval_data`` with an added
        integer ``Predicted_Sent`` column.

    Raises:
        ValueError: If ``eval_data`` has no rows.
    """
    # An empty set would otherwise fail later with an unhelpful error when
    # the metrics and the mean loss are computed
    if len(eval_data) == 0:
        raise ValueError("eval_data must contain at least one row")

    model.eval()

    sentences = eval_data[sentence_column_name].tolist()
    all_labels = eval_data[sentiment_column_name].tolist()

    all_preds = []
    val_loss = 0.0

    with torch.no_grad():
        for sentence, true_label in tqdm(zip(sentences, all_labels), total=len(sentences), desc="Evaluating"):
            # Tokenize single sentence
            encoding = tokenizer(
                sentence,
                truncation=True,
                padding=True,
                max_length=128,
                return_tensors='pt'
            )

            # Move inputs to device
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            # Shift the label by +1 so it becomes a valid class index
            adjusted_label = torch.tensor([true_label]).to(device) + 1

            # Get model predictions
            outputs = model(input_ids, attention_mask, labels=adjusted_label)
            val_loss += outputs.loss.item()

            # Most likely class index, shifted back to the original label range
            predicted = torch.argmax(outputs.logits, dim=1)
            all_preds.append(int(predicted.item()) - 1)

    # Assign predictions by position rather than by index label, so the
    # result is correct even when eval_data's index contains duplicates
    results_df = eval_data.copy()
    results_df['Predicted_Sent'] = all_preds

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    metrics = {
        'loss': val_loss / len(results_df),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1-score': f1
    }

    return metrics, results_df

# Define Model Training and Validation Function

In [12]:
def train_model(model, train_loader, val_data, device, epochs, learning_rate):
    """Fine-tune ``model`` and validate it after every epoch.

    Args:
        model: Model whose forward pass returns an object with a ``loss``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches.
        val_data: Validation data handed to ``evaluate_model`` each epoch.
        device: torch device every batch is moved to.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the AdamW optimizer.

    Returns:
        tuple: ``(model, history)`` where ``history`` maps ``train_loss``,
        ``val_loss`` and ``val_accuracy`` to lists with one value per epoch.
    """
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': []
    }

    for epoch in range(epochs):
        # Switch back to training mode (evaluation puts the model in eval mode)
        model.train()
        total_loss = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} Training")
        for batch in pbar:
            # Move the whole batch to the target device
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            # Reset gradients, then run the forward pass with labels shifted
            # from [-1, 0, 1] to [0, 1, 2]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, labels=labels + 1)

            loss = outputs.loss
            batch_loss = loss.item()
            total_loss += batch_loss

            # Backpropagate and apply the update
            loss.backward()
            optimizer.step()

            # Show the latest batch loss on the progress bar
            pbar.set_postfix(loss=batch_loss)

        # Mean training loss over the batches of this epoch
        current_train_loss = total_loss / len(train_loader)
        history['train_loss'].append(current_train_loss)

        # Validation loss and accuracy after this epoch
        val_metrics, _ = evaluate_model(model, val_data, device)
        history['val_loss'].append(val_metrics['loss'])
        history['val_accuracy'].append(val_metrics['accuracy'])

        print(f'Training loss: {current_train_loss}')
        print(f'Validation metric: {val_metrics}')

    return model, history

# Define Model Training with Hyperparameters Function

In [None]:
# Function to Batch Text Data
def create_data_loader(data, tokenizer, batch_size, shuffle=True, max_length=128):
    """Tokenize a DataFrame and wrap it in a batched ``DataLoader``.

    Args:
        data: DataFrame with the ``sentence_column_name`` and
            ``sentiment_column_name`` columns. It is only read, not modified.
        tokenizer: Callable that takes a list of sentences plus the keyword
            arguments used below and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples every epoch.
            Defaults to True; pass False for a deterministic order.
        max_length: Maximum token length; longer texts are truncated.

    Returns:
        DataLoader: Yields ``(input_ids, attention_mask, labels)`` batches.
        Labels are the raw values of the sentiment column (not shifted).
    """
    # .tolist() already builds new Python lists, so no DataFrame copy is needed
    sentences = data[sentence_column_name].tolist()
    labels = torch.tensor(data[sentiment_column_name].tolist())

    # Convert all sentences to BERT input format in a single call
    encodings = tokenizer(
        sentences,
        truncation=True, # Cut texts longer than max_length
        padding=True, # Pad every text to the longest one in this call
        max_length=max_length, # Maximum sequence length
        return_tensors='pt', # Return PyTorch tensors
        verbose=True # Let the tokenizer print its warnings (not a progress bar)
    )

    # Create dataset by combining inputs and labels
    dataset = torch.utils.data.TensorDataset(
        encodings['input_ids'], # Tokenized text
        encodings['attention_mask'], # Attention mask for padding
        labels
    )

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Function to Train Model with Specific Hyperparameters
def train_model_with_hyperparameters(params, train_data, val_data, device, class_weights):
    """Build a fresh weighted-loss BERT classifier and train it with ``params``.

    Args:
        params: Dict providing ``batch_size``, ``epochs`` and
            ``learning_rate``.
        train_data: DataFrame used to build the training data loader.
        val_data: DataFrame evaluated after every epoch.
        device: torch device the model is moved to and trained on.
        class_weights: Per-class weight tensor for the weighted loss.

    Returns:
        tuple: ``(model, train_metric_seq)`` exactly as returned by
        ``train_model``.
    """
    # Local import: only the configuration class is needed here
    from transformers import BertConfig

    print(f"Parameters: {params}")

    # Create data loaders with current batch size
    train_loader = create_data_loader(train_data, tokenizer, params['batch_size'])

    # Load only the configuration. The previous approach instantiated a full
    # BertForSequenceClassification just to read its ``.config``, which
    # loaded the pretrained weights twice.
    config = BertConfig.from_pretrained(
        model_name,
        num_labels=3,
        output_attentions=False, # Don't output attention weights
        output_hidden_states=False, # Don't output hidden states
    )

    # Initialize the custom BERT model
    model = BertWithWeightedLoss.from_pretrained(
        model_name,
        config=config,
        class_weights=class_weights
    )
    # Move model to GPU if available
    model.to(device)

    # Train model with current parameters
    model, train_metric_seq = train_model(
        model,
        train_loader,
        val_data,
        device,
        params['epochs'],
        params['learning_rate']
    )

    return model, train_metric_seq

# Model Training

In [90]:
# Train the model with one explicitly chosen set of hyperparameters
params = {
    'hyperparameter_id': 1,
    'learning_rate': 2e-5,
    'epochs': 2,
    'batch_size': 16
}
model, train_metric_seq = train_model_with_hyperparameters(params, train, val, device, class_weights)

# Create Hyperparameter Directory.
# Drop a trailing "Hyperparameter-*" component first, so re-running this cell
# does not nest the result folders inside each other.
if os.path.basename(model_evaluation_result_folder_name).startswith('Hyperparameter-'):
    model_evaluation_result_folder_name = os.path.dirname(model_evaluation_result_folder_name)
# Double-quoted f-strings: reusing the enclosing quote character inside the
# braces is a SyntaxError before Python 3.12.
model_evaluation_result_folder_name = os.path.join(
    model_evaluation_result_folder_name,
    f"Hyperparameter-{params['hyperparameter_id']}"
)
os.makedirs(model_evaluation_result_folder_name, exist_ok=True)

# Name of Parameters as Filename
file_name = f"LR {params['learning_rate']}, E {params['epochs']}, BS {params['batch_size']}"

# Save the model together with its tokenizer in one folder
model_folder_name = os.path.join(model_evaluation_result_folder_name, f'{file_name} - Model')
model.save_pretrained(model_folder_name)
tokenizer.save_pretrained(model_folder_name)

# Show the per-epoch training/validation metrics as the cell output
train_metric_seq


Trying parameters: {'learning_rate': 2e-05, 'epochs': 3, 'batch_size': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertWithWeightedLoss were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 Training: 100%|████████████████████████████████████████████████| 252/252 [19:16<00:00,  4.59s/it, loss=0.594]


Epoch 1/3, Average training loss: 0.8366


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 36/36 [00:32<00:00,  1.09it/s]


Validation metrics: {'loss': 0.7976726690928141, 'accuracy': 0.6608695652173913, 'precision': 0.6733385716780164, 'recall': 0.6608695652173913, 'f1': 0.6566915358888604}


Epoch 2/3 Training: 100%|█████████████████████████████████████████████████| 252/252 [19:12<00:00,  4.57s/it, loss=0.34]


Epoch 2/3, Average training loss: 0.5353


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 36/36 [00:32<00:00,  1.12it/s]


Validation metrics: {'loss': 0.7761238022810883, 'accuracy': 0.6852173913043478, 'precision': 0.6838237676923322, 'recall': 0.6852173913043478, 'f1': 0.6834893133780899}


Epoch 3/3 Training: 100%|███████████████████████████████████████████████| 252/252 [19:12<00:00,  4.57s/it, loss=0.0473]


Epoch 3/3, Average training loss: 0.3074


Evaluation: 100%|██████████████████████████████████████████████████████████████████████| 36/36 [00:32<00:00,  1.12it/s]


Validation metrics: {'loss': 0.9791919423474206, 'accuracy': 0.6921739130434783, 'precision': 0.6870751794133023, 'recall': 0.6921739130434783, 'f1': 0.6885514413949225}

Parameters: {'learning_rate': 2e-05, 'epochs': 3, 'batch_size': 16}
Validation metrics: {'loss': 0.9791919423474206, 'accuracy': 0.6921739130434783, 'precision': 0.6870751794133023, 'recall': 0.6921739130434783, 'f1': 0.6885514413949225}


(BertWithWeightedLoss(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((

# Model Validation and Test Results

In [10]:
print(f"Parameters: {params}")

# Evaluation on Test Set
test_metrics, predicted_labels_df = evaluate_model(model, test, device)
# Predictions for the full dataset; its metrics are discarded because `full`
# also contains the rows the model was trained on
_, full_predicted_labels_df = evaluate_model(model, full, device)

# Save the per-epoch training/validation metrics collected during training
metric_results_file_name = os.path.join(
    model_evaluation_result_folder_name,
    f'{file_name} - Validation Metric.csv'
)
pd.DataFrame(train_metric_seq).to_csv(metric_results_file_name, index=False)

# Save Test Metrics (a single-row CSV; note this reuses the variable above)
metric_results_file_name = os.path.join(
    model_evaluation_result_folder_name,
    f'{file_name} - Test Metric.csv'
)
pd.DataFrame([test_metrics]).to_csv(metric_results_file_name, index=False)

# Save Predicted Labels for the test set
sentiment_results_file_name = os.path.join(
    model_evaluation_result_folder_name,
    f'{file_name} - Predicted Dataset.csv'
)
predicted_labels_df.to_csv(sentiment_results_file_name, index=False)

# Save Full Predicted Labels (every row of the dataset)
full_sentiment_results_file_name = os.path.join(
    model_evaluation_result_folder_name,
    f'{file_name} - Full Predicted Dataset.csv'
)
full_predicted_labels_df.to_csv(full_sentiment_results_file_name, index=False)

print(f"Test set metrics: {test_metrics}")
# Show the test-set predictions as the cell output
predicted_labels_df

Evaluating: 100%|██████████████████████████████████████████████████████████████████| 5745/5745 [04:19<00:00, 22.11it/s]
