# BERT Finetuning Tutorial for Sentiment Analysis

In [1]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW  
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.manual_seed(42)
    
# Set device (GPU if available, else CPU)
device = (
    "mps" 
    if torch.backends.mps.is_available() 
    else "cuda" 
    if torch.cuda.is_available() 
    else "cpu"
)
device = torch.device(device)
print(f"Using device: {device}")

Using device: cpu


In [None]:
data = pd.read_csv('compiled_stocks_df_vol.csv', encoding='utf-8', encoding_errors='ignore').drop(columns='Unnamed: 0')
data = data.rename(columns = {'headline': 'text','vol_label':'label'})
data

Unnamed: 0,text,label
0,RPT-FOCUS-Goldman Sachs faces rocky exit from ...,1
1,Newscasts - Wall Street ends down as megacaps ...,0
2,Newscasts - U.S. Day Ahead: Treasury yields r...,0
3,Newscasts - U.S. Morning Call: Elon Musk curse...,1
4,Newscasts - U.S. stocks little changed ahead o...,0
...,...,...
1496,FACTBOX-List of UK competition regulator cases...,1
1497,UPDATE 3-UK boots out antitrust boss for faili...,0
1498,RPT-BREAKINGVIEWS-Mega-merger boom threatens a...,0
1499,BREAKINGVIEWS-Mega-merger boom threatens a sha...,0


In [8]:
# Split data ensuring indices are reset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'].reset_index(drop=True),
    data['label'].reset_index(drop=True),
    test_size=0.3,
    random_state=42
)
# train_labels

In [9]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")

In [10]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Convert to list to ensure sequential indexing
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Create datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [12]:
# Create the BERT-based model class
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes=3):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('ProsusAI/finbert')
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        return self.classifier(output)


In [13]:
# Initialize model
model = SentimentClassifier()
model = model.to(device)

# Initialize optimizer
optimizer = AdamW([
    {'params': model.bert.parameters(), 'lr': 2e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-3}
])

# Training function
def train_epoch(model, data_loader, optimizer, device):
    """
    Trains the model for one epoch and returns the average loss.
    """
    model.train()
    total_loss = 0
    
    # Use tqdm for a nice progress bar
    for batch in tqdm(data_loader, desc="Training"):
        # Move batch to device (CPU/GPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Clear gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Calculate loss
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(outputs, labels)
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        total_loss += loss.item()
    
    # Return average loss
    return total_loss / len(data_loader)

# Evaluation function
def evaluate(model, data_loader, device):
    """
    Evaluates the model on the provided data loader.
    Returns accuracy and average loss.
    """
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(outputs, labels)
            
            _, predictions = torch.max(outputs, dim=1)
            
            total_loss += loss.item()
            correct_predictions += torch.sum(predictions == labels)
            total_predictions += labels.shape[0]
    
    # Use float32 instead of double/float64
    accuracy = (correct_predictions.float() / total_predictions) * 100  
    average_loss = total_loss / len(data_loader)
    
    return accuracy, average_loss

# Main training loop
def train_model(model, train_loader, val_loader, optimizer, device, epochs=3):
    """
    Main training loop that handles the entire training process.
    """
    best_accuracy = 0
    
    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        
        # Train one epoch
        train_loss = train_epoch(model, train_loader, optimizer, device)
        
        # Evaluate
        val_accuracy, val_loss = evaluate(model, val_loader, device)
        
        # Print metrics
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')
        
        # Save best model
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model.pt')
            print('Best model saved!')

In [14]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertModel, get_scheduler

# Initialize the optimizer with different learning rates
optimizer = AdamW([
    {'params': model.bert.parameters(), 'lr': 2e-5},   # Lower LR for BERT layers
    {'params': model.classifier.parameters(), 'lr': 1e-3}  # Higher LR for the classifier
])

# Set up loss function
loss_fn = nn.CrossEntropyLoss()

# Set number of epochs
num_epochs = 3

# Calculate the total number of training steps (batches * epochs)
num_training_steps = len(train_loader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)  # 10% warmup steps

# Set up the learning rate scheduler
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

# ===================== Gradual Unfreezing =====================

# Phase 1: Freeze all layers except the classifier head
for param in model.bert.parameters():
    param.requires_grad = False

# Training Loop for Phase 1
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = loss_fn(outputs, batch['labels'])
        
        # Backward pass
        loss.backward()
        
        optimizer.step()  # Update parameters
        lr_scheduler.step()  # Update learning rate based on scheduler
    
    print(f"Phase 1 - Epoch {epoch+1}/{num_epochs} completed. Loss: {loss.item():.4f}")
    torch.save(model.state_dict(), f'phase1_epoch_{epoch+1}.pth')

# Phase 2: Unfreeze the last BERT layer
for param in model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

# Training Loop for Phase 2
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = loss_fn(outputs, batch['labels'])
        
        # Backward pass
        loss.backward()
        
        optimizer.step()  # Update parameters
        lr_scheduler.step()  # Update learning rate based on scheduler
    
    print(f"Phase 2 - Epoch {epoch+1}/{num_epochs} completed. Loss: {loss.item():.4f}")
    torch.save(model.state_dict(), f'phase2_epoch_{epoch+1}.pth')

# Phase 3: Unfreeze the top N layers (e.g., last 2 layers)
for i in range(-2, 0):  # Unfreeze the last 2 layers
    for param in model.bert.encoder.layer[i].parameters():
        param.requires_grad = True

# Training Loop for Phase 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = loss_fn(outputs, batch['labels'])
        
        # Backward pass
        loss.backward()
        
        optimizer.step()  # Update parameters
        lr_scheduler.step()  # Update learning rate based on scheduler
    
    print(f"Phase 3 - Epoch {epoch+1}/{num_epochs} completed. Loss: {loss.item():.4f}")
    torch.save(model.state_dict(), f'phase3_epoch_{epoch+1}.pth')

print("Training completed.")

Phase 1 - Epoch 1/3 completed. Loss: 0.6981
Phase 1 - Epoch 2/3 completed. Loss: 0.5399
Phase 1 - Epoch 3/3 completed. Loss: 0.4037
Phase 2 - Epoch 1/3 completed. Loss: 0.7062
Phase 2 - Epoch 2/3 completed. Loss: 0.5453
Phase 2 - Epoch 3/3 completed. Loss: 0.4796
Phase 3 - Epoch 1/3 completed. Loss: 0.6937
Phase 3 - Epoch 2/3 completed. Loss: 0.5813
Phase 3 - Epoch 3/3 completed. Loss: 0.7107
Training completed.


In [None]:
# %%time

# # Train the model
# train_model(model, train_loader, val_loader, optimizer, device)


Epoch 1/3


Training: 100%|██████████| 25/25 [07:33<00:00, 18.14s/it]
Evaluating: 100%|██████████| 11/11 [00:49<00:00,  4.51s/it]


Training Loss: 0.6634
Validation Loss: 0.4578
Validation Accuracy: 82.8488
Best model saved!

Epoch 2/3


Training: 100%|██████████| 25/25 [06:01<00:00, 14.46s/it]
Evaluating: 100%|██████████| 11/11 [00:49<00:00,  4.53s/it]


Training Loss: 0.5325
Validation Loss: 0.4761
Validation Accuracy: 82.8488

Epoch 3/3


Training: 100%|██████████| 25/25 [06:13<00:00, 14.95s/it]
Evaluating: 100%|██████████| 11/11 [00:52<00:00,  4.73s/it]

Training Loss: 0.5036
Validation Loss: 0.4730
Validation Accuracy: 82.8488
CPU times: total: 1h 56min 10s
Wall time: 22min 20s



