In [None]:
# -*- coding: utf-8 -*-
"""Fine-tuning BERT for NER (Named Entity Recognition)"""

# Install necessary libraries for Colab
!pip install transformers torch

# Import libraries
import csv
import os
import numpy as np
import pandas as pd
import torch
import random
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
# Every tensor and the model are moved to this device later in the script.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Download the MIT Movie Corpus train/test files into the working directory.
# The leading `!` is an IPython/Colab shell escape, so these lines only run inside
# a notebook kernel; `-q` silences wget and `-O` sets the local file name.
!wget -q https://sls.csail.mit.edu/downloads/movie/engtrain.bio -O engtrain.bio
!wget -q https://sls.csail.mit.edu/downloads/movie/engtest.bio -O engtest.bio

# NOTE: this message is printed unconditionally; it does not verify that the
# downloads above actually succeeded.
print("Downloaded dataset files.")

# Helper function to load and process data
def get_sentences_and_labels(file_path):
    """Read a tab-separated BIO file into parallel token and label lists.

    Every data line is expected to hold ``label<TAB>token``. Any line with
    fewer than two tab-separated fields (blank lines included) terminates
    the sentence being collected.

    Args:
        file_path: Path to the UTF-8 encoded BIO file.

    Returns:
        A tuple ``(sentences, labels, unique_labels)`` where ``sentences`` is
        a list of token lists, ``labels`` is a list of label lists aligned
        one-to-one with ``sentences``, and ``unique_labels`` is the set of
        all label strings seen in the file.
    """
    sentences = []  # One list of tokens per sentence
    labels = []     # One list of labels per sentence, aligned with `sentences`
    unique_labels = set()

    # Accumulators for the sentence currently being read
    tokens = []
    token_labels = []

    with open(file_path, newline='', encoding='utf-8') as file:
        # QUOTE_NONE: BIO files are not quoted CSV. With the default quoting a
        # token that starts with a double quote would open a quoted field and
        # swallow the following lines, merging tokens and sentences.
        line_reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)

        for line in line_reader:
            # Blank or malformed line: treat it as a sentence boundary
            if len(line) < 2:
                if tokens:
                    sentences.append(tokens)
                    labels.append(token_labels)
                    tokens = []
                    token_labels = []
                continue

            # Column 0 is the label, column 1 is the token
            label, token = line[0], line[1]
            tokens.append(token)
            token_labels.append(label)
            unique_labels.add(label)

    # The file may not end with a blank line; flush the trailing sentence
    if tokens:
        sentences.append(tokens)
        labels.append(token_labels)

    return sentences, labels, unique_labels

# Local file names of the corpus splits fetched earlier
train_file_path = 'engtrain.bio'
test_file_path = 'engtest.bio'

# Parse each split into token lists, label lists and the set of labels it uses
train_sentences, train_labels, unique_labels_train = get_sentences_and_labels(train_file_path)
test_sentences, test_labels, unique_labels_test = get_sentences_and_labels(test_file_path)

# The label inventory is the union over both splits
unique_labels = unique_labels_train | unique_labels_test

# Report basic corpus statistics
print(f"Number of sentences in training data: {len(train_sentences)}")
print(f"Number of sentences in testing data: {len(test_sentences)}")
print(f"Number of unique labels: {len(unique_labels)}")

# Assign consecutive integer ids to the labels in sorted order so that the
# mapping is deterministic from run to run
label_map = dict(zip(sorted(unique_labels), range(len(unique_labels))))
print(f"Label Map: {label_map}")

# WordPiece tokenizer matching the checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Helper function to tokenize and encode sentences
def get_input_ids_and_attention_masks(sentences, max_length=59):
    """Encode pre-split sentences with the module-level ``tokenizer``.

    Each sentence (a list of word strings) is joined with single spaces,
    tokenized, wrapped in the tokenizer's special tokens, then truncated or
    padded to exactly ``max_length`` token ids.

    Args:
        sentences: Iterable of sentences, each a list of word strings.
        max_length: Fixed encoded length, special tokens included. Defaults
            to 59, the value this script was originally hard-coded to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two equally long lists
        holding one tensor per sentence.
    """
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded = tokenizer.encode_plus(
            ' '.join(sentence),
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # With return_tensors='pt' the result carries a leading batch
        # dimension; index 0 selects the single encoded sentence.
        input_ids.append(encoded['input_ids'][0])
        attention_masks.append(encoded['attention_mask'][0])

    return input_ids, attention_masks

# Prepare input IDs and attention masks.
# Each call returns two parallel lists with one tensor per sentence; the lists
# are stacked into batch tensors further down, and are also what pad_labels()
# walks over to align labels with tokens.
train_input_ids, train_attention_masks = get_input_ids_and_attention_masks(train_sentences)
test_input_ids, test_attention_masks = get_input_ids_and_attention_masks(test_sentences)

# Helper function to pad labels to match input length
def pad_labels(input_ids, labels, label_map):
    """Expand word-level labels to one label id per encoded token.

    For every encoded sentence the result has exactly one entry per token id:
    ``-100`` for padding, ``[CLS]``/``[SEP]`` and WordPiece continuation
    pieces (tokens starting with ``##``), and ``label_map[label]`` for each
    remaining token, consuming the sentence's labels from left to right.
    ``-100`` is the value the training loss skips.

    Args:
        input_ids: Sequence of 1-D tensors of token ids, one per sentence.
        labels: Sequence of label-string lists aligned with ``input_ids``.
        label_map: Mapping from label string to integer id.

    Returns:
        A list of lists of ints, each the same length as its sentence's
        token-id tensor.

    Raises:
        IndexError: If a sentence yields more word-initial tokens than it has
            labels (the tokenizer split a word in a way that is not marked
            with ``##``, e.g. around punctuation).
        KeyError: If a label is missing from ``label_map``.
    """
    # Ids that never carry a label; built once instead of once per token.
    ignored_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}
    padded_labels = []

    for sent, orig_labels in zip(input_ids, labels):
        token_ids = sent.tolist()
        # One vocabulary lookup for the whole sentence. This reads the raw
        # WordPiece strings directly rather than decoding every id separately
        # and relying on decode()'s text clean-up to keep the "##" prefix.
        pieces = tokenizer.convert_ids_to_tokens(token_ids)

        curr_labels = []
        label_idx = 0

        for token_id, piece in zip(token_ids, pieces):
            if token_id in ignored_ids or piece.startswith("##"):
                curr_labels.append(-100)
                continue

            if label_idx >= len(orig_labels):
                raise IndexError(
                    f"Token {piece!r} has no matching label: sentence has "
                    f"{len(orig_labels)} labels but more word-initial tokens"
                )
            curr_labels.append(label_map[orig_labels[label_idx]])
            label_idx += 1

        padded_labels.append(curr_labels)

    return padded_labels

# Seed every RNG before anything random happens. The train/validation split,
# the shuffling sampler and the freshly initialised classification head all
# draw random numbers, so seeding only right before the training loop (as this
# script originally did) left them different on every run.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Training hyper-parameters, defined once so the scheduler and the loop agree
epochs = 4
batch_size = 32

# Pad labels: one label id per token id, -100 where the loss must ignore it
train_padded_labels = pad_labels(train_input_ids, train_labels, label_map)
test_padded_labels = pad_labels(test_input_ids, test_labels, label_map)

# Convert to tensors
train_input_ids_tensor = torch.stack(train_input_ids)
train_attention_masks_tensor = torch.stack(train_attention_masks)
train_padded_labels_tensor = torch.tensor(train_padded_labels)

# Create DataLoaders: 90% of the training file for training, 10% for validation
dataset = TensorDataset(train_input_ids_tensor, train_attention_masks_tensor, train_padded_labels_tensor)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Initialize the model. Padding, special and continuation tokens are labelled
# -100 and skipped by the loss, so no extra "padding" class is needed; an extra
# output would only be a class that can never be trained or mapped to a label.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_map)
)
model.to(device)

# Define optimizer and scheduler. torch.optim.AdamW replaces the deprecated
# transformers.AdamW; weight_decay=0.0 keeps the default of that implementation
# (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
loss_values = []  # Average training loss per epoch, used for the plot below

for epoch in range(epochs):
    print(f"======== Epoch {epoch + 1} / {epochs} ========")
    print("Training...")
    total_loss = 0

    model.train()

    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print(f"  Average training loss: {avg_train_loss:.2f}")

    # Evaluate on the held-out 10% so the validation split is actually used
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            total_val_loss += outputs.loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the validation split is empty
    avg_val_loss = total_val_loss / max(len(val_dataloader), 1)
    print(f"  Average validation loss: {avg_val_loss:.2f}")

# Plot training loss
plt.plot(range(1, epochs + 1), loss_values)
plt.xlabel("Epochs")
plt.ylabel("Training Loss")
plt.title("Training Loss Over Epochs")
plt.show()

# Testing the model on a single free-text sentence
test_sentence = "Characterized by its use of Technicolor, fantasy storytelling, musical score, and memorable characters, the film has become an American pop culture icon."
encoded_test = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,
    max_length=59,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt'
)

input_ids = encoded_test['input_ids'].to(device)
attention_mask = encoded_test['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    # Highest-scoring class per token position
    predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()

# Decode predictions.
# argmax returns a class index, never -100, so comparing against -100 cannot
# filter anything. Instead skip [CLS]/[SEP]/[PAD] by id and fold "##"
# continuation pieces into the preceding word: training ignores those positions
# (label -100), so only the first piece of each word has a meaningful prediction.
id_to_label = {idx: label for label, idx in label_map.items()}
special_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}

token_ids = input_ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(token_ids)

word_predictions = []  # [word, label] pairs in sentence order
for token_id, token, label_id in zip(token_ids, tokens, predictions[0]):
    if token_id in special_ids:
        continue
    if token.startswith("##") and word_predictions:
        word_predictions[-1][0] += token[2:]
        continue
    # .get() guards against a class index with no entry in label_map, which
    # is possible if the classification head has more outputs than labels.
    word_predictions.append([token, id_to_label.get(int(label_id), "<unknown>")])

for word, label in word_predictions:
    print(f"{word}: {label}")