Imports

In [10]:
!pip install transformers
!pip install torch
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertTokenizer, BertForSequenceClassification, RobertaForSequenceClassification, RobertaTokenizer, XLMForSequenceClassification, XLMTokenizer, AdamW
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# hyperparams
max_len = 256
num_labels = 5

# BBC News Dataset

In [12]:
# load data
bbc = pd.read_csv('/content/drive/MyDrive/bbc-news-data.csv', sep='\t')

# only doing half because of computational constraints
length = len(bbc)
bbc = bbc.iloc[: length // 2]

In [13]:
bbc.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


## BERT

In [None]:
# load the model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = num_labels)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# preprocess texts
texts = (bbc['title'] + bbc['content']).tolist()
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=256, truncation=True) for text in texts]
padded_texts = [tokenized_text + [0] * (max_len - len(tokenized_text)) for tokenized_text in tokenized_texts]

In [None]:
# convert vectors to tensors
labels = bbc['category'].tolist()
input_ids = torch.tensor(padded_texts)

In [None]:
# do the same for the labels
unique_labels = list(set(labels))  # Get unique labels
label_map = {label: idx for idx, label in enumerate(unique_labels)}  # Create a mapping dictionary

# Convert string labels to integers using the mapping
label_ids = [label_map[label] for label in labels]

# Convert label_ids to a PyTorch tensor
labels_tensor = torch.tensor(label_ids)

In [None]:
# Create attention masks
attention_masks = [[1 if token != 0 else 0 for token in text] for text in padded_texts]
attention_masks = torch.tensor(attention_masks)

In [None]:
# make train, test dataloader
dataset = TensorDataset(input_ids, attention_masks, labels_tensor)
train_size = 0.8
train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)



In [None]:
# fine tuning
train_losses = []
val_losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, lbls = batch
        input_ids, attention_mask, lbls = (
            input_ids.to(device),
            attention_mask.to(device),
            lbls.to(device)
        )

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lbls)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # validation
    model.eval()
    total_val_loss = 0.0
    for val_batch in val_dataloader:
        val_input_ids, val_attention_mask, val_labels = val_batch
        val_input_ids, val_attention_mask, val_labels = (
            val_input_ids.to(device),
            val_attention_mask.to(device),
            val_labels.to(device)
        )

        with torch.no_grad():
            outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

        val_loss = outputs.loss
        total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch + 1}: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")


# save the fine-tuned model
torch.save(model.state_dict(), 'fine_tuned_bert_model.pth')

Epoch 1: Training Loss: 1.2062, Validation Loss: 0.7059
Epoch 2: Training Loss: 0.4545, Validation Loss: 0.2207
Epoch 3: Training Loss: 0.1493, Validation Loss: 0.1140


In [None]:
# test on test data
model.eval()
predictions = []
true_labels = []

for test_batch in test_dataloader:
    test_input_ids, test_attention_mask, test_labels = test_batch
    test_input_ids, test_attention_mask, test_labels = (
        test_input_ids.to(device),
        test_attention_mask.to(device),
        test_labels.to(device)
    )

    with torch.no_grad():
        outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask)

    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1)

    predictions.extend(batch_predictions.cpu().detach().numpy())
    true_labels.extend(test_labels.cpu().detach().numpy())

# Calculate F1 score for the test set
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"F1 Score on Test Set: {f1:.4f}")

F1 Score on Test Set: 0.9911


## DistilBERT

In [None]:
# load model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# preprocess texts
texts = (bbc['title'] + bbc['content']).tolist()
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=256, truncation=True) for text in texts]
padded_texts = [tokenized_text + [0] * (max_len - len(tokenized_text)) for tokenized_text in tokenized_texts]

In [None]:
# convert vectors to tensors
labels = bbc['category'].tolist()
input_ids = torch.tensor(padded_texts)

In [None]:
# do the same for the labels
unique_labels = list(set(labels))  # Get unique labels
label_map = {label: idx for idx, label in enumerate(unique_labels)}  # Create a mapping dictionary

# Convert string labels to integers using the mapping
label_ids = [label_map[label] for label in labels]

# Convert label_ids to a PyTorch tensor
labels_tensor = torch.tensor(label_ids)

In [None]:
# Create attention masks
attention_masks = [[1 if token != 0 else 0 for token in text] for text in padded_texts]
attention_masks = torch.tensor(attention_masks)

In [None]:
# make train, test dataloader
dataset = TensorDataset(input_ids, attention_masks, labels_tensor)
train_size = 0.8
train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)



In [None]:
# fine tuning
train_losses = []
val_losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, lbls = batch
        input_ids, attention_mask, lbls = (
            input_ids.to(device),
            attention_mask.to(device),
            lbls.to(device)
        )

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lbls)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # validation
    model.eval()
    total_val_loss = 0.0
    for val_batch in val_dataloader:
        val_input_ids, val_attention_mask, val_labels = val_batch
        val_input_ids, val_attention_mask, val_labels = (
            val_input_ids.to(device),
            val_attention_mask.to(device),
            val_labels.to(device)
        )

        with torch.no_grad():
            outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

        val_loss = outputs.loss
        total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch + 1}: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")


# save the fine-tuned model
torch.save(model.state_dict(), 'fine_tuned_distilbert_model.pth')

Epoch 1: Training Loss: 1.1461, Validation Loss: 0.6189
Epoch 2: Training Loss: 0.3700, Validation Loss: 0.1475
Epoch 3: Training Loss: 0.1296, Validation Loss: 0.0785


In [None]:
# test on test data
model.eval()
predictions = []
true_labels = []

for test_batch in test_dataloader:
    test_input_ids, test_attention_mask, test_labels = test_batch
    test_input_ids, test_attention_mask, test_labels = (
        test_input_ids.to(device),
        test_attention_mask.to(device),
        test_labels.to(device)
    )

    with torch.no_grad():
        outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask)

    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1)

    predictions.extend(batch_predictions.cpu().detach().numpy())
    true_labels.extend(test_labels.cpu().detach().numpy())

# Calculate F1 score for the test set
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"F1 Score on Test Set: {f1:.4f}")

F1 Score on Test Set: 0.9819


## RoBERTa

In [None]:
# load model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# preprocess texts
texts = (bbc['title'] + bbc['content']).tolist()
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, max_length=256, truncation=True) for text in texts]
padded_texts = [tokenized_text + [0] * (max_len - len(tokenized_text)) for tokenized_text in tokenized_texts]

In [None]:
# convert vectors to tensors
labels = bbc['category'].tolist()
input_ids = torch.tensor(padded_texts)

In [None]:
# do the same for the labels
unique_labels = list(set(labels))  # Get unique labels
label_map = {label: idx for idx, label in enumerate(unique_labels)}  # Create a mapping dictionary

# Convert string labels to integers using the mapping
label_ids = [label_map[label] for label in labels]

# Convert label_ids to a PyTorch tensor
labels_tensor = torch.tensor(label_ids)

In [None]:
# Create attention masks
attention_masks = [[1 if token != 0 else 0 for token in text] for text in padded_texts]
attention_masks = torch.tensor(attention_masks)

In [None]:
# make train, test dataloader
dataset = TensorDataset(input_ids, attention_masks, labels_tensor)
train_size = 0.8
train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, random_state=42)
train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)



In [None]:
# fine tuning
train_losses = []
val_losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_mask, lbls = batch
        input_ids, attention_mask, lbls = (
            input_ids.to(device),
            attention_mask.to(device),
            lbls.to(device)
        )

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=lbls)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # validation
    model.eval()
    total_val_loss = 0.0
    for val_batch in val_dataloader:
        val_input_ids, val_attention_mask, val_labels = val_batch
        val_input_ids, val_attention_mask, val_labels = (
            val_input_ids.to(device),
            val_attention_mask.to(device),
            val_labels.to(device)
        )

        with torch.no_grad():
            outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

        val_loss = outputs.loss
        total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch + 1}: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")


# save the fine-tuned model
torch.save(model.state_dict(), 'fine_tuned_distilbert_model.pth')

Epoch 1: Training Loss: 1.0230, Validation Loss: 0.1928
Epoch 2: Training Loss: 0.1366, Validation Loss: 0.0515
Epoch 3: Training Loss: 0.0510, Validation Loss: 0.0501


In [None]:
# test on test data
model.eval()
predictions = []
true_labels = []

for test_batch in test_dataloader:
    test_input_ids, test_attention_mask, test_labels = test_batch
    test_input_ids, test_attention_mask, test_labels = (
        test_input_ids.to(device),
        test_attention_mask.to(device),
        test_labels.to(device)
    )

    with torch.no_grad():
        outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask)

    logits = outputs.logits
    batch_predictions = torch.argmax(logits, dim=1)

    predictions.extend(batch_predictions.cpu().detach().numpy())
    true_labels.extend(test_labels.cpu().detach().numpy())

# Calculate F1 score for the test set
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"F1 Score on Test Set: {f1:.4f}")

F1 Score on Test Set: 0.9866


## SVM

In [21]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [22]:
# data cleaning steps
bbc.dropna(inplace=True)

# remove numbers and special characters and turn to lower case
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()

bbc['title'] = bbc['title'].apply(preprocess_text)
bbc['content'] = bbc['content'].apply(preprocess_text)

# tokenize and lemmatize data
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

bbc['title'] = bbc['title'].apply(tokenize_and_lemmatize)
bbc['content'] = bbc['content'].apply(tokenize_and_lemmatize)


In [23]:
# convert outputs to numerical data
labels = bbc['category'].tolist()
unique_labels = list(set(labels))  # Get unique labels
label_map = {label: idx for idx, label in enumerate(unique_labels)}  # Create a mapping dictionary

# Convert string labels to integers using the mapping
label_ids = [label_map[label] for label in labels]

In [24]:
X = (bbc['title'] + bbc['content']).tolist()
y = label_ids
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [26]:
svm_model = SVC(kernel='linear')

# Fit the SVM model
svm_model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = svm_model.predict(X_test_tfidf)

# Calculate accuracy
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1}")

F1 Score: 0.9731729244982156
