In [1]:
import pandas as pd
import numpy as np
!pip install transformers
!pip install torch

import transformers
import torch

import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from matplotlib import pyplot as plt
from datetime import datetime
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset



In [2]:
data=pd.read_csv('news-dataset.csv',sep='\t')
df=pd.DataFrame(data)
print(df)

           category                              title  \
0          business    UK house prices dip in November   
1          business  LSE 'sets date for takeover deal'   
2             sport    Harinordoquy suffers France axe   
3          business  Barclays shares up on merger talk   
4          politics   Campaign 'cold calls' questioned   
...             ...                                ...   
1552       business  Hariri killing hits Beirut shares   
1553       politics  MPs issued with Blackberry threat   
1554  entertainment  Bollywood DVD fraudster is jailed   
1555          sport                Ireland v USA (Sat)   
1556           tech  Row brewing over peer-to-peer ads   

                                                content  
0      UK house prices dipped slightly in November, ...  
1      The London Stock Exchange (LSE) is planning t...  
2      Number eight Imanol Harinordoquy has been dro...  
3      Shares in UK banking group Barclays have rise...  
4      Labour

In [3]:
# Split the dataset
X = df['content']
y = df['category']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
seq_len = 128
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_tokens = tokenizer(X_train.tolist(), padding='max_length', max_length=seq_len, truncation=True, return_tensors="pt")
val_tokens = tokenizer(X_val.tolist(), padding='max_length', max_length=seq_len, truncation=True, return_tensors="pt")

X_train_ids, X_train_mask = train_tokens['input_ids'], train_tokens['attention_mask']
X_val_ids, X_val_mask = val_tokens['input_ids'], val_tokens['attention_mask']

In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

train_dataset = TensorDataset(X_train_ids, X_train_mask, torch.tensor(y_train_encoded, dtype=torch.long))
val_dataset = TensorDataset(X_val_ids, X_val_mask, torch.tensor(y_val_encoded, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=16, drop_last=True)

In [6]:
config = transformers.DistilBertConfig(dropout=0.2, attention_dropout=0.2)
dbert_pt = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

## Build an 2-layer classifier

In [8]:

class SimpleClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(SimpleClassifier, self).__init__()
       
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()  # Use ReLU activation function
       
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        # Output layer uses Softmax for classification
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)   
        x = self.relu(x)  
        x = self.fc2(x)  
        x = self.softmax(x)  
        return x

embedding_dim = 768  # Embedding dimension from DistilBERT
hidden_dim = 128     # Dimension of the hidden layer, adjustable parameter
num_classes = 5      # Number of classes to predict
classifier = SimpleClassifier(embedding_dim, hidden_dim, num_classes)


## adjuist the Hyperparameter

In [10]:
learning_rate = 1e-4 
batch_size = 16       
epochs = 10

optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss() 

## train the model

In [12]:
epochs = 10
for epoch in range(epochs):
    classifier.train()
    total_train_loss = 0

    for batch in train_loader:
        X_batch, mask_batch, labels_batch = batch
        outputs = dbert_pt(X_batch, attention_mask=mask_batch)
        hidden_X = outputs['last_hidden_state']
    
        input_embedding = hidden_X[:, 0, :]
        input_embedding = input_embedding.float()
        
        optimizer.zero_grad()

        outputs = classifier(input_embedding)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {total_train_loss/len(train_loader):.4f}')

Epoch 1/10, Train Loss: 1.5604
Epoch 2/10, Train Loss: 1.4201
Epoch 3/10, Train Loss: 1.2698
Epoch 4/10, Train Loss: 1.1670
Epoch 5/10, Train Loss: 1.0945
Epoch 6/10, Train Loss: 1.0481
Epoch 7/10, Train Loss: 1.0213
Epoch 8/10, Train Loss: 1.0037
Epoch 9/10, Train Loss: 0.9917
Epoch 10/10, Train Loss: 0.9828


## evaluate the model

In [14]:
classifier.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        inputs, masks, labels = batch
        
        outputs = dbert_pt(inputs, attention_mask=masks)
        hidden_X = outputs['last_hidden_state']
        
        input_embedding = hidden_X[:, 0, :] 
        input_embedding = input_embedding.float()  

        outputs = classifier(input_embedding)
        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

print(classification_report(all_labels, all_preds, target_names=['business', 'entertainment', 'politics', 'sport', 'tech']))

weighted_f1 = f1_score(all_labels, all_preds, average='weighted')
print(f'Weighted F1-score: {weighted_f1:.4f}')


               precision    recall  f1-score   support

     business       0.98      0.91      0.95        68
entertainment       0.88      0.95      0.91        39
     politics       0.94      0.98      0.96        59
        sport       0.96      1.00      0.98        80
         tech       0.96      0.90      0.93        58

     accuracy                           0.95       304
    macro avg       0.95      0.95      0.95       304
 weighted avg       0.95      0.95      0.95       304

Weighted F1-score: 0.9505
