In [1]:
from torch.utils.data import DataLoader, random_split
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Function to download AG_NEWS dataset explicitly
def download_AG_NEWS():
    """Fetch the AG_NEWS train and test splits, announcing progress on stdout.

    Returns:
        tuple: ``(train_iter, test_iter)`` unpacked from
        ``AG_NEWS(split=('train', 'test'))``.
    """
    print("Downloading AG_NEWS dataset...")
    requested_splits = ('train', 'test')
    train_split, test_split = AG_NEWS(split=requested_splits)
    print("Download complete.")
    return train_split, test_split

# Constants
# BATCH_SIZE is passed as batch_size to the train/val/test DataLoaders below.
BATCH_SIZE = 16
# MAX_LENGTH is the max_length handed to the tokenizer in preprocess(),
# where truncation=True is also set.
MAX_LENGTH = 512
# TRAIN_RATIO and VAL_RATIO size the train and validation subsets below.
# TEST_RATIO is not referenced by the visible code: the test subset is sized
# as whatever remains after the train and validation counts are taken.
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
TEST_RATIO = 0.1

# Download AG_NEWS dataset
# Both returned iterables are materialized into lists further down.
train_iter, test_iter = download_AG_NEWS()

# Tokenizer from transformers
# Used by preprocess() to encode each batch of texts.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Preprocessing function to tokenize and encode the sentences
def preprocess(batch):
    """Collate a batch of samples into model-ready tensors.

    Each sample is indexed as ``sample[0]`` for the label and ``sample[1]``
    for the text. Labels are shifted down by one so they start at 0.

    Args:
        batch: Iterable of samples as described above.

    Returns:
        tuple: ``(input_ids, attention_mask, labels)`` where the first two
        come from the module-level ``tokenizer`` output and ``labels`` is a
        tensor built from the shifted labels.
    """
    texts = []
    shifted_labels = []
    for sample in batch:
        texts.append(sample[1])
        shifted_labels.append(sample[0] - 1)  # Labels are now 0-3

    # Pad to the longest text in the batch and truncate at MAX_LENGTH tokens.
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask, torch.tensor(shifted_labels)

# Create DataLoader
# Materialize the iterables so they can be measured and split by index.
train_list = list(train_iter)
# NOTE: test_list is loaded but not used below; all three loaders are carved
# out of train_list.
test_list = list(test_iter)

# Split dataset into train, val, test
n_total = len(train_list)
n_train = int(n_total * TRAIN_RATIO)
n_val = int(n_total * VAL_RATIO)
# The test subset takes the remainder so the three sizes always sum to n_total
# even when the int() truncations above drop a sample.
n_test = n_total - n_train - n_val

# Seed the split explicitly: without a fixed generator every run draws a
# different train/val/test partition, so validation and test metrics would not
# be comparable (or reproducible) across kernel restarts.
SEED = 42
split_generator = torch.Generator().manual_seed(SEED)

train_data, val_data, test_data = random_split(
    train_list, [n_train, n_val, n_test], generator=split_generator
)

# Only the training loader shuffles; evaluation order is kept fixed.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=preprocess)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=preprocess)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=preprocess)

Downloading AG_NEWS dataset...
Download complete.


In [2]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import torch

# Training hyperparameters.
# BATCH_SIZE, MAX_LENGTH and the split ratios are defined once in the
# data-preparation cell; they are not re-declared here because this cell does
# not use them.
n_epochs = 5
lr = 1e-5

# This cell relies on train_loader, val_loader and val_data from the
# data-preparation cell.

# Initialize model, optimizer, and loss function
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)
optimizer = AdamW(model.parameters(), lr=lr)
# loss_fn is kept for compatibility but is not referenced in this cell: the
# loops below pass labels to the model and read the loss from outputs.loss.
loss_fn = CrossEntropyLoss()

# Resolve the device once and use it for both the model and every batch.
# Sending batches to 'cuda' unconditionally raises on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loop through epochs
for epoch in range(n_epochs):
    # Training loop
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    with torch.no_grad():  # no gradients needed while scoring
        for input_ids, attention_mask, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Predicted class is the index of the largest logit per sample.
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()

    # Loss is averaged over batches; accuracy over individual samples.
    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / len(val_data)
    print(f'Validation Loss: {avg_val_loss}')
    print(f'Validation Accuracy: {val_acc}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Batch: 0, Loss: 1.3898851871490479
Epoch: 0, Batch: 10, Loss: 1.3489104509353638
Epoch: 0, Batch: 20, Loss: 1.261159062385559
Epoch: 0, Batch: 30, Loss: 1.1664700508117676
Epoch: 0, Batch: 40, Loss: 1.0481914281845093
Epoch: 0, Batch: 50, Loss: 0.9107452630996704
Epoch: 0, Batch: 60, Loss: 0.8558134436607361
Epoch: 0, Batch: 70, Loss: 0.680022656917572
Epoch: 0, Batch: 80, Loss: 0.5621520280838013
Epoch: 0, Batch: 90, Loss: 0.3674430251121521
Epoch: 0, Batch: 100, Loss: 0.2757079601287842
Epoch: 0, Batch: 110, Loss: 0.3599507808685303
Epoch: 0, Batch: 120, Loss: 0.5559625029563904
Epoch: 0, Batch: 130, Loss: 0.6676216125488281
Epoch: 0, Batch: 140, Loss: 0.34674960374832153
Epoch: 0, Batch: 150, Loss: 0.3998372256755829
Epoch: 0, Batch: 160, Loss: 0.2914513945579529
Epoch: 0, Batch: 170, Loss: 0.5660255551338196
Epoch: 0, Batch: 180, Loss: 0.4051343500614166
Epoch: 0, Batch: 190, Loss: 0.12345974147319794
Epoch: 0, Batch: 200, Loss: 0.19575443863868713
Epoch: 0, Batch: 210, L

In [3]:
from sklearn.metrics import classification_report
import numpy as np

# Evaluate the model on the test set and print loss, accuracy and a
# per-class classification report.

# Run on whatever device the model already lives on. Hard-coding 'cuda' raises
# on CPU-only machines.
device = next(model.parameters()).device

# Collect per-batch label arrays and join them once after the loop. This keeps
# their integer dtype and avoids re-copying the growing array on every batch.
true_batches = []
pred_batches = []

model.eval()
test_loss = 0.0
correct = 0

with torch.no_grad():  # no gradients needed while scoring
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        test_loss += loss.item()

        # Predicted class is the index of the largest logit per sample.
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()

        # Store true and predicted labels for classification report
        true_batches.append(labels.cpu().numpy())
        pred_batches.append(preds.cpu().numpy())

# np.concatenate rejects an empty list, so fall back to empty integer arrays.
y_true = np.concatenate(true_batches) if true_batches else np.array([], dtype=np.int64)
y_pred = np.concatenate(pred_batches) if pred_batches else np.array([], dtype=np.int64)

# Average test loss
avg_test_loss = test_loss / len(test_loader)
print(f'Test Loss: {avg_test_loss}')

# Test accuracy
test_acc = correct / len(test_data)
print(f'Test Accuracy: {test_acc}')

# Generate the classification report
# Names are listed in label-index order (0..3), i.e. after the -1 shift
# applied to the raw labels in preprocess().
target_names = ['World', 'Sports', 'Business', 'Sci/Tech']
print(classification_report(y_true, y_pred, target_names=target_names))

Test Loss: 0.21353960193061114
Test Accuracy: 0.94225
              precision    recall  f1-score   support

       World       0.95      0.95      0.95      3053
      Sports       0.99      0.98      0.99      2960
    Business       0.92      0.90      0.91      2958
    Sci/Tech       0.91      0.93      0.92      3029

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000



In [4]:
# Save the model
# Only the weights (state_dict) are written, so reloading requires rebuilding
# the same architecture first, including the 4-label classification head.
torch.save(model.state_dict(), 'agnews_distilbert.pth')

# Save the tokenizer
tokenizer.save_pretrained('agnews_tokenizer')
# To load the model and tokenizer later, you can use:
# loaded_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)
# loaded_model.load_state_dict(torch.load('agnews_distilbert.pth'))
# loaded_tokenizer = DistilBertTokenizer.from_pretrained('agnews_tokenizer')

('agnews_tokenizer/tokenizer_config.json',
 'agnews_tokenizer/special_tokens_map.json',
 'agnews_tokenizer/vocab.txt',
 'agnews_tokenizer/added_tokens.json')