In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset

**Model Selection and Hyperparameters**

In [26]:
# Architecture to fine-tune: "BERT" or "Longformer".
# transformer_model = "BERT"
transformer_model = "Longformer"

# Seed PyTorch before loading the model: the classification head is newly
# initialised (see the "newly initialized" warning), so it depends on the RNG.
seed = 42
torch.manual_seed(seed)

# Number of labels (20) corresponds to the 20 newsgroups dataset.
num_labels = 20

if transformer_model == "BERT":
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )

elif transformer_model == "Longformer":
    model_name = 'allenai/longformer-base-4096'
    tokenizer = LongformerTokenizer.from_pretrained(model_name)

    model = LongformerForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

else:
    # Fail here rather than with a NameError on `model` further down.
    raise ValueError(
        f"Unsupported transformer_model {transformer_model!r}; expected 'BERT' or 'Longformer'."
    )

# Preferred GPU index. Fall back to GPU 0 when the host has fewer GPUs than
# that, and to the CPU when CUDA is unavailable, instead of failing with an
# invalid device ordinal.
gpu_index = 3
if torch.cuda.is_available():
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
model.to(device)
max_seq_len = 256
batch_size = 32
num_epochs = 2

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Load every 20 Newsgroups post with headers, footers and quoted replies removed.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data = pd.DataFrame({'text_data': newsgroups.data, 'label': newsgroups.target})

# Optional inspection of a single entry:
# entry_index = 0
# print(f"Text:\n{newsgroups['data'][entry_index]}\n\n")
# print(f"Label index: {newsgroups['target'][entry_index]}")
# print(f"Label name: {newsgroups['target_names'][newsgroups['target'][entry_index]]}")

# Shuffle with a fixed seed: an unseeded shuffle here would make the seeded
# split below non-reproducible across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 20% of the rows for validation.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

def tokenize_data(data, tokenizer, max_seq_len):
    """Encode the ``text_data`` column of ``data`` and collect the matching labels.

    Args:
        data: DataFrame with a ``text_data`` column and a ``label`` column.
        tokenizer: Object exposing ``encode_plus``; each call must return a mapping
            with ``input_ids`` and ``attention_mask`` entries.
        max_seq_len: Passed as ``max_length``; every text is truncated or padded
            to this length.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)`` as ``torch`` tensors, in the
        row order of ``data``.
    """
    input_ids, attention_masks, labels = [], [], []

    # Walk the two columns directly instead of DataFrame.iterrows(), which
    # builds a Series for every row and does not preserve column dtypes.
    rows = zip(data["text_data"], data["label"])

    for text, label in tqdm(rows, total=len(data)):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
        labels.append(label)

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels)

# Encode both splits with the same tokenizer and sequence length (training split first).
(
    (train_input_ids, train_attention_masks, train_labels),
    (val_input_ids, val_attention_masks, val_labels),
) = [tokenize_data(split, tokenizer, max_seq_len) for split in (train_data, val_data)]

100%|██████████| 15076/15076 [00:18<00:00, 793.56it/s] 
100%|██████████| 3770/3770 [00:04<00:00, 810.91it/s] 


In [28]:
# Training split: batches are drawn in random order.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation split: batches are served in their stored order.
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
)

**Model Training**

In [29]:
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs

# Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the deprecated class's defaults
# (PyTorch defaults to eps=1e-8 and weight_decay=0.01) so the optimisation is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Linear decay of the learning rate to zero over total_steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``;
            the first element of its output is used as the loss.
        dataloader: Sized iterable of ``(input_ids, attention_masks, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device each batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    bar = tqdm(dataloader, desc="Training", position=0, leave=True)

    for batch in bar:
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        loss = model(input_ids, attention_mask=attention_masks, labels=labels)[0]
        step_loss = loss.item()
        running_loss += step_loss

        # Backpropagate, then advance both the optimizer and the LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

        bar.set_description(f"Training - Loss: {step_loss:.4f}")

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """Compute the classification accuracy of ``model`` over all examples in ``dataloader``.

    Accuracy is correct predictions divided by examples seen. Averaging the
    per-batch accuracies instead would over-weight a smaller final batch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``; the first
            element of its output is treated as the logits.
        dataloader: Iterable of ``(input_ids, attention_masks, labels)`` batches.
        device: Device each batch tensor is moved to.

    Returns:
        float: Accuracy over every example seen, or ``0.0`` if there were none.
    """
    model.eval()
    correct = 0
    seen = 0

    progress_bar = tqdm(dataloader, desc="Evaluation", position=0, leave=True)

    for batch in progress_bar:
        input_ids, attention_masks, labels = [t.to(device) for t in batch]

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        # Predicted class = index of the largest logit along the last axis.
        predicted = outputs[0].argmax(dim=-1)
        batch_correct = (predicted == labels).sum().item()
        batch_total = labels.numel()

        correct += batch_correct
        seen += batch_total

        # Per-batch accuracy is shown on the progress bar only.
        batch_accuracy = batch_correct / batch_total if batch_total else 0.0
        progress_bar.set_description(f"Evaluation - Batch Accuracy: {batch_accuracy:.4f}")

    return correct / seen if seen else 0.0

# One training pass followed by one validation pass per epoch.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy = evaluate(model, val_dataloader, device)

    print(
        f"\nEpoch {epoch + 1}/{num_epochs}",
        f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}",
        sep="\n",
    )

Training:   0%|          | 0/472 [00:00<?, ?it/s]

Training - Loss: 0.8262: 100%|██████████| 472/472 [11:32<00:00,  1.47s/it]
Evaluation - Batch Accuracy: 0.8077: 100%|██████████| 118/118 [00:43<00:00,  2.71it/s]



Epoch 1/2
Loss: 1.4284 - Validation Accuracy: 0.7041


Training - Loss: 1.0384:  44%|████▍     | 210/472 [05:09<06:26,  1.48s/it]


KeyboardInterrupt: 

**Evaluating the Model Using Performance Metrics**

In [30]:
def get_predictions(model, dataloader, device):
    """Collect the predicted and true label of every example in ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``; the first
            element of its output is treated as the logits.
        dataloader: Iterable of ``(input_ids, attention_masks, labels)`` batches.
        device: Device each batch tensor is moved to.

    Returns:
        tuple: ``(predictions, true_labels)`` as NumPy arrays in dataloader order.
    """
    model.eval()
    predicted, expected = [], []

    for input_ids, attention_masks, labels in tqdm(dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            batch_logits = model(input_ids, attention_mask=attention_masks)[0]

        # Predicted class = index of the largest logit along the last axis.
        scores = batch_logits.detach().cpu().numpy()
        predicted.extend(np.argmax(scores, axis=-1))
        expected.extend(labels.cpu().numpy())

    return np.array(predicted), np.array(expected)

# Score the validation split once, then derive both metrics from the same predictions.
predictions, true_labels = get_predictions(model, val_dataloader, device)

accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
report = classification_report(y_true=true_labels, y_pred=predictions, digits=4)

print(
    f"Validation Accuracy: {accuracy:.4f}",
    "Classification Report:",
    report,
    sep="\n",
)

Evaluating: 100%|██████████| 118/118 [00:43<00:00,  2.71it/s]

Validation Accuracy: 0.7111
Classification Report:
              precision    recall  f1-score   support

           0     0.4171    0.5355    0.4689       155
           1     0.7572    0.6550    0.7024       200
           2     0.5982    0.6700    0.6321       200
           3     0.5372    0.6701    0.5963       194
           4     0.7971    0.5419    0.6452       203
           5     0.7193    0.8410    0.7754       195
           6     0.8830    0.7685    0.8218       216
           7     0.8750    0.7440    0.8042       207
           8     0.7538    0.7737    0.7636       190
           9     0.8289    0.7949    0.8115       195
          10     0.8878    0.9146    0.9010       199
          11     0.7539    0.7742    0.7639       186
          12     0.6589    0.7382    0.6963       191
          13     0.8785    0.8595    0.8689       185
          14     0.6609    0.7755    0.7136       196
          15     0.6667    0.7826    0.7200       207
          16     0.6614    0.7




In [31]:
# Write the fine-tuned model and its tokenizer to the same directory.
output_dir = "./model/"
model.save_pretrained(output_dir)
# Last expression of the cell: its return value (the written tokenizer file paths) is displayed.
tokenizer.save_pretrained(output_dir)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/added_tokens.json')

**Classifying a News Article with the Fine-Tuned Model**

In [32]:
def classify_news_article(model, tokenizer, device, text, max_len=128):
    """Predict the label index of a single text.

    The model is put in evaluation mode for the forward pass and its previous
    train/eval mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids, attention_mask=...)``;
            the first element of its output is treated as the logits.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors="pt"``.
        device: Device the encoded tensors are moved to.
        text: Text to classify.
        max_len: Passed as ``max_length``; the text is truncated or padded to it.

    Returns:
        int: Index of the highest-scoring label.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids, attention_mask = encoded["input_ids"].to(device), encoded["attention_mask"].to(device)

    # Inference must run in eval mode; otherwise layers such as dropout stay
    # active if the caller left the model in training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    predicted_label_index = outputs[0].argmax(-1).item()

    return predicted_label_index


# Classify one hand-written sentence, using the same sequence length as in training.
sample_news_article = "The Orion spacecraft will be launched on a new mission to explore deep space."

predicted_label_index = classify_news_article(
    model=model,
    tokenizer=tokenizer,
    device=device,
    text=sample_news_article,
    max_len=max_seq_len,
)

print(
    f"Predicted Label Index: {predicted_label_index}",
    f"Predicted Class: {newsgroups.target_names[predicted_label_index]}",
    sep="\n",
)

Predicted Label Index: 14
Predicted Class: sci.space
