https://www.kaggle.com/datasets/timilsinabimal/newsarticlecategories

In [3]:
# 5b
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset with 'category', 'title', and 'body' columns
df = pd.read_csv('/kaggle/input/newsarticlecategories/news-article-categories.csv')

# Convert categories to numerical labels; `le` is kept so predictions can be
# mapped back to category names later.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

# Ensure the 'body' column contains strings. Missing values are replaced with
# an empty string *before* the cast: a bare astype(str) would turn NaN into the
# literal text "nan", which would then be tokenized as if it were article text.
df['body'] = df['body'].fillna('').astype(str)

# Split the dataset into training (80%) and validation (20%) sets; the fixed
# random_state keeps the split reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['body'],
    df['category'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits, truncating to at most `max_length` tokens and padding
max_length = 128
train_encodings = tokenizer(
    train_texts.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_texts.tolist(),
    max_length=max_length,
    truncation=True,
    padding=True,
)

# Labels: pandas Series -> tensors
train_labels, val_labels = (
    torch.tensor(split.tolist()) for split in (train_labels, val_labels)
)

# Token ids and attention masks: Python lists -> tensors
train_input_ids, train_attention_masks = (
    torch.tensor(train_encodings[field]) for field in ('input_ids', 'attention_mask')
)
val_input_ids, val_attention_masks = (
    torch.tensor(val_encodings[field]) for field in ('input_ids', 'attention_mask')
)

# Bundle (input_ids, attention_mask, label) triples per split
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Batches of 16; only the training data is reshuffled on each pass
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=16)

# Load pre-trained BERT model for sequence classification. The head gets one
# output per class known to the label encoder, so logits line up with
# le.classes_ when predictions are decoded.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# crashing on machines without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer (no learning-rate scheduler is used). torch.optim.AdamW replaces
# the deprecated transformers.AdamW; eps and weight_decay are set explicitly to
# the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Training Loop: fine-tune for a fixed number of epochs and report the mean
# per-batch training loss after each one.
epochs = 3
# Send batches to whichever device the model lives on rather than assuming CUDA.
device = model.device
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # Each batch is an (input_ids, attention_mask, labels) triple.
        input_ids, attention_masks, labels = (b.to(device) for b in batch)

        optimizer.zero_grad()

        # Passing labels makes the model return the training loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_train_loss}')

# Evaluation: collect predictions for the whole validation set with gradients
# disabled, then report accuracy and per-class metrics.
model.eval()
predictions, true_labels = [], []
# Send batches to whichever device the model lives on rather than assuming CUDA.
device = model.device

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_masks, labels = (b.to(device) for b in batch)

        outputs = model(input_ids, attention_mask=attention_masks)
        # Predicted class = index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, predictions)
print(f'Validation Accuracy: {accuracy}')
# Pass the full label set explicitly so target_names always lines up with the
# encoded class ids, even if some class never appears in this validation split.
print(classification_report(
    true_labels,
    predictions,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
))

# Sample input and prediction: classify one hand-written sentence and print
# the decoded category name.
sample_text = "Researchers have discovered a new species of butterfly in the Amazon rainforest."
encoded_sample = tokenizer(sample_text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
# Move the inputs to the model's own device rather than assuming CUDA.
device = model.device
encoded_sample = {key: value.to(device) for key, value in encoded_sample.items()}

# Predict category
model.eval()
with torch.no_grad():
    output = model(**encoded_sample)
    predicted_class = torch.argmax(output.logits, dim=1).item()

# Map the numeric class id back to its original category string.
predicted_category = le.inverse_transform([predicted_class])
print(f'Predicted Category: {predicted_category[0]}')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 1.8205220279998557
Epoch 2, Loss: 0.8643427667922752
Epoch 3, Loss: 0.5662715161262557
Validation Accuracy: 0.7776162790697675
                precision    recall  f1-score   support

ARTS & CULTURE       0.84      0.89      0.87       206
      BUSINESS       0.77      0.65      0.70       116
        COMEDY       0.81      0.74      0.77        73
         CRIME       0.72      0.78      0.75        54
     EDUCATION       0.86      0.77      0.81       101
 ENTERTAINMENT       0.77      0.80      0.79        99
   ENVIRONMENT       0.89      0.76      0.82       110
         MEDIA       0.73      0.71      0.72        63
      POLITICS       0.78      0.66      0.71       102
      RELIGION       0.77      0.85      0.81        93
       SCIENCE       0.69      0.83      0.75        58
        SPORTS       0.95      0.84      0.89       100
          TECH       0.68      0.81      0.74        95
         WOMEN       0.58      0.70      0.64       106

      accuracy  