# BERT fine-tune

Fine-tune BERT on a small set of sentences spoken by Bart and Homer Simpson. Given a sentence, the fine-tuned model predicts which of the two characters said it.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data, keeping only the label column ('Name') and the text column ('Sentence')
data = pd.read_csv('data/simpson/dataset_homer_bart.csv')
data = data[['Name',  'Sentence']]

# Fixed seed so train/val/test membership is identical on every run; without it
# each kernel restart reshuffles the rows and the reported metrics change.
SPLIT_SEED = 42

# Split the data 80/10/10: hold out 20% first, then halve that hold-out into
# validation and test. Stratifying on 'Name' keeps the label ratio equal in every split.
train_data, temp_data = train_test_split(
    data, test_size=0.2, stratify=data['Name'], random_state=SPLIT_SEED
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, stratify=temp_data['Name'], random_state=SPLIT_SEED
)

# Tokenise

In [9]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# Initialize the tokenizer from the same 'bert-base-uncased' checkpoint name that the
# classifier is later loaded from, so both refer to the same pretrained artifacts
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prepare_data(data):
    """Convert a DataFrame of labelled sentences into a ``TensorDataset``.

    Args:
        data: DataFrame with a 'Name' column holding 'Homer' or 'Bart' and a
            'Sentence' column holding the text to classify.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, labels)`` where labels
        are 0 for Homer and 1 for Bart.

    Raises:
        ValueError: if any 'Name' value is missing or is not 'Homer'/'Bart'.
    """
    # Map labels to [0, 1]. Done before tokenization so that a bad label fails
    # immediately instead of surfacing later as NaN/float targets in the loss.
    label_ids = data['Name'].map({'Homer': 0, 'Bart': 1})
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'Name'].astype(str).unique())
        raise ValueError(
            f"Unexpected value(s) in 'Name' column: {unknown}; expected 'Homer' or 'Bart'"
        )
    # Explicit integer dtype: CrossEntropyLoss needs class-index (long) targets
    labels = torch.tensor(label_ids.astype('int64').tolist(), dtype=torch.long)

    # Tokenize sentences and map to token IDs; pad to the longest sentence in
    # this call and truncate anything longer than 512 tokens
    inputs = tokenizer(data['Sentence'].tolist(), padding=True, truncation=True, return_tensors="pt", max_length=512)
    return TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

# Build one TensorDataset per split, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    prepare_data(split) for split in (train_data, val_data, test_data)
)

# Same batch size for all three loaders; only the training loader shuffles
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


# Fine-tune

In [31]:
from transformers import BertForSequenceClassification, AdamW
from torch.nn import CrossEntropyLoss

# Seed PyTorch before the model is built: the classification head is newly
# initialized (it is not part of the pretrained checkpoint), so without a seed
# every run starts from different head weights and reports different losses.
torch.manual_seed(42)

# Initialize model with a two-class head (0 = Homer, 1 = Bart) and put it in training mode
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.train()

# Define loss and optimizer.
# torch.optim.AdamW is used instead of the AdamW class exported by transformers,
# which that library deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the no-decay behaviour of the previous optimizer's
# default (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
loss_function = CrossEntropyLoss()

# Function to compute the loss on validation data
def compute_val_loss(val_loader):
    """Return the mean per-batch loss of the global ``model`` over ``val_loader``.

    The model is switched to evaluation mode for the pass and then returned to
    whichever mode (train or eval) it was in when this function was called,
    even if a batch raises.

    Args:
        val_loader: sized iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if ``val_loader`` contains no batches (previously this
            surfaced as a bare ZeroDivisionError after the loop).
    """
    num_batches = len(val_loader)
    if num_batches == 0:
        raise ValueError("val_loader is empty; cannot compute a validation loss")

    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()  # set the model to evaluation mode
    total_loss = 0
    try:
        with torch.no_grad():  # no gradients are needed for validation
            for input_ids, attention_mask, labels in val_loader:
                outputs = model(input_ids, attention_mask=attention_mask)
                total_loss += loss_function(outputs.logits, labels).item()
    finally:
        model.train(was_training)  # restore the previous mode, not unconditionally "train"
    return total_loss / num_batches

# Training loop: one full pass over train_loader per epoch, followed by a
# validation-loss check and a one-line progress report
epochs = 3
for epoch in range(epochs):
    batch_losses = []
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_function(logits, labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean of the per-batch training losses for this epoch
    avg_train_loss = sum(batch_losses) / len(train_loader)
    val_loss = compute_val_loss(val_loader)
    print(f'Epoch: {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}')



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1, Training Loss: 0.6813, Validation Loss: 0.6658
Epoch: 2, Training Loss: 0.5506, Validation Loss: 0.6191
Epoch: 3, Training Loss: 0.3787, Validation Loss: 0.6420


# Test the model

In [33]:
def evaluate(test_loader):
    """Measure loss and accuracy of the global ``model`` on ``test_loader``.

    Args:
        test_loader: sized iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple ``(avg_test_loss, accuracy)``: the sum of per-batch losses divided
        by the number of batches, and the fraction of examples whose predicted
        class equals the label.
    """
    model.eval()  # evaluation mode; the model is left in this mode on return
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            logits = model(input_ids, attention_mask=attention_mask).logits

            # Accumulate this batch's loss
            loss_sum += loss_function(logits, labels).item()

            # Predicted class is the index of the largest logit in each row
            predicted = logits.argmax(dim=1)
            n_correct += predicted.eq(labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / len(test_loader), n_correct / n_seen

# Score the test split once and report both metrics to four decimals, one per line
test_loss, test_accuracy = evaluate(test_loader)
print(f"Test Loss: {test_loss:.4f}", f"Test Accuracy: {test_accuracy:.4f}", sep="\n")


Test Loss: 0.5806
Test Accuracy: 0.7174


# Save and Load

In [34]:
# Persist the fine-tuned model and the tokenizer side by side in one directory
save_dir = 'model/simpsons'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Reload both from that directory, rebinding the notebook-level names
model = BertForSequenceClassification.from_pretrained(save_dir)
tokenizer = BertTokenizer.from_pretrained(save_dir)

# Inference

In [22]:
# Put the (reloaded) model in evaluation mode before running predictions
model.eval()

def predict(sentence):
    """Predict which character said ``sentence`` using the global model and tokenizer.

    Args:
        sentence: a single string, or a sequence of strings to classify as one batch.

    Returns:
        'Homer' or 'Bart' when given a single string; a list of those names
        (one per input, in input order) when given a sequence of strings.
    """
    is_single = isinstance(sentence, str)
    sentences = [sentence] if is_single else list(sentence)
    if not sentences:
        return []  # nothing to classify

    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # argmax along the class axis: without an explicit dim, argmax flattens the
    # (batch, classes) logits and returns a single index, which is wrong for batches
    label_ids = torch.argmax(outputs.logits, dim=-1).tolist()
    names = ['Homer' if label == 0 else 'Bart' for label in label_ids]
    return names[0] if is_single else names


In [19]:
# Quick check of predict() on a single sentence; prints the predicted speaker
sample_sentence = "Yeah. Skating is cool!"
prediction = predict(sample_sentence)
print(f'Predicted Label: {prediction}')


Predicted Label: Bart


In [21]:
# Second quick check of predict(), this time with a sentence addressed to Marge
sample_sentence = "Marge, where are my shoes?"
prediction = predict(sample_sentence)
print(f'Predicted Label: {prediction}')

Predicted Label: Homer
