In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from itertools import chain


### Load the dataset


In [None]:
# Read the BBC news CSV into a DataFrame. The preview printed below shows
# two columns: `category` (the label) and `text` (the article body).
df = pd.read_csv('bbc-text.csv')  # Replace with your dataset path
print(df.head())

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


### Preprocess the data


In [None]:
# Encode the string category labels as integer class ids (stored in a new
# `category_encoded` column next to the original `category` column).
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Split into training and test sets.
# A fixed random_state keeps the 90/10 split identical across
# "Restart & Run All", so the vocabulary built from the training split and
# the reported metrics are reproducible.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Tokenizer function
def basic_english_tokenizer(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Returns a list of tokens. Punctuation is not separated from the words
    it is attached to.
    """
    lowered = text.lower()
    return lowered.split()

# Build vocabulary manually
def build_vocab(data_iter, tokenizer, specials=("<unk>",)):
    """Build a token -> index mapping from an iterable of texts.

    Args:
        data_iter: Iterable of text strings.
        tokenizer: Callable mapping one text string to a list of tokens.
        specials: Special tokens that receive the lowest indices
            (0, 1, ...) in the order given. Duplicates are ignored.

    Returns:
        dict: Corpus tokens in order of decreasing frequency (ties keep
        first-seen order), numbered from ``len(specials)`` upward, followed
        by the special tokens. The indices are contiguous
        (``0 .. len(vocab) - 1``), so ``len(vocab)`` is a valid embedding
        table size.
    """
    # Materialise once (accepts any iterable) and drop duplicate specials.
    specials = list(dict.fromkeys(specials))
    reserved = set(specials)

    counter = Counter(chain.from_iterable(tokenizer(text) for text in data_iter))

    # A corpus token that is spelled like a special token must not consume a
    # frequency slot: it would be overwritten below and leave a hole in the
    # index range, making the largest index equal to len(vocab).
    corpus_tokens = [token for token, _ in counter.most_common() if token not in reserved]

    vocab = {token: idx for idx, token in enumerate(corpus_tokens, start=len(specials))}
    # Specials are inserted last so the dict still iterates corpus tokens first.
    for idx, special in enumerate(specials):
        vocab[special] = idx
    return vocab

# Build the vocabulary from the training split only; tokens that are not in
# it are mapped to the <unk> index when the dataset looks them up.
train_texts = train_df['text'].tolist()
vocab = build_vocab(train_texts, basic_english_tokenizer)
vocab["<unk>"] = 0  # Redundant: build_vocab already assigns <unk> index 0; kept as an explicit guarantee
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 41741


The `build_vocab` function returns a dictionary (`dict`) where:
- **Keys**: Unique tokens (words) from the training dataset, plus the special tokens.
- **Values**: Integer indices. Special tokens take the lowest indices (here `<unk>` is `0`); the remaining tokens are numbered from there in order of decreasing frequency.


In [None]:
# Peek at the first five entries. The dict keeps insertion order, so these
# are the most frequent tokens in the training texts.
list(vocab.items())[0:5]

[('the', 1), ('to', 2), ('of', 3), ('and', 4), ('a', 5)]

### Custom Dataset Class


In [None]:
class BBCDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors and labels.

    Args:
        dataframe: Frame with a ``text`` column and an integer
            ``category_encoded`` column.
        vocab: Mapping from token to integer index; must contain ``"<unk>"``.
        tokenizer: Callable mapping a text string to a list of tokens.
        max_length: Every sequence is truncated or right-padded to this length.
        pad_index: Index used to fill sequences shorter than ``max_length``.
            Defaults to 0; if ``vocab["<unk>"]`` is also 0, padding and
            unknown tokens share one embedding row.
    """

    def __init__(self, dataframe, vocab, tokenizer, max_length=500, pad_index=0):
        self.dataframe = dataframe
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_index = pad_index

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the row at position ``idx``.

        ``token_ids`` is a ``torch.long`` tensor of length ``max_length``;
        ``label`` is a scalar ``torch.long`` tensor.
        """
        # Select the column first, then the position: this avoids building a
        # full row Series twice per sample.
        text = self.dataframe['text'].iloc[idx]
        label = self.dataframe['category_encoded'].iloc[idx]

        # Out-of-vocabulary tokens fall back to the <unk> index.
        unk_index = self.vocab["<unk>"]
        tokens = [self.vocab.get(token, unk_index) for token in self.tokenizer(text)]

        # Truncate, then right-pad; the pad count is 0 for full-length input.
        tokens = tokens[:self.max_length]
        tokens += [self.pad_index] * (self.max_length - len(tokens))
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

# Prepare Dataloaders
# Both datasets use the same vocabulary and tokenizer, so token ids mean the
# same thing for the training and the test split.
train_dataset = BBCDataset(train_df, vocab, basic_english_tokenizer)
test_dataset = BBCDataset(test_df, vocab, basic_english_tokenizer)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Print vocabulary size
print(f"Vocabulary size: {len(vocab)}")


Vocabulary size: 41741


### Summary of `__getitem__` Method in `BBCDataset`

The `__getitem__` method of the `BBCDataset` class returns a **tuple** containing:
1. **Tokenized and Padded/Truncated Text**:
   - A PyTorch tensor representing the numerical sequence of tokens for the text at a given index.
   - Tokens are mapped to their vocabulary indices.
   - The sequence is padded with `0`s (the same index as `<unk>`) or truncated to a fixed length (`max_length`, 500 by default).

2. **Encoded Label**:
   - A PyTorch tensor representing the encoded category label of the text.




In [None]:
# Inspect one training sample: a (token_ids, label) tuple of tensors.
train_dataset[1]

(tensor([  433,   867,  6943, 16869,   374,   356,   338,   252,    19,   648,
           454,   149,     3,  3236, 16869,    64,   631,    12,   271,   804,
            81, 13751,     1,    75,    19,  1116,    57, 16870,     1,  1278,
           310,    36,  2098,  3466,   804,     6,  7555,  7556,    33,   252,
          2960,     1,   318,  4595,  2510,    11,     9,    28,  8240,    45,
            97,    15, 11677,    11,     7,     1, 13751,   138,   371,   589,
          1200,    17,   618,    38,  1104,     2,    40,  3582,     2,  1384,
             1,  2057,  5039,   391,  1878,   541,   372,  4052,    65,   359,
          1228,     5,   374,   990,     2,   416,     1,   310,    79,    83,
          8240,   271, 10210,  9081,   804,    61,    15, 16871,     1,   707,
          2296,     1,   262,   159,     2,   148,     1,   391,     4,  2724,
          2868,   203,    37,    15,  7557,    18,   143,     3,     5,  1384,
             3,     1,  2057,    14,    91, 11678,  

Each `(tokens, label)` pair returned by `__getitem__` is then collated by a `DataLoader` into batches for training or evaluation.

In [None]:
# Pull a single batch to check its shapes, then stop. `texts` and `labels`
# stay bound to that batch and are reused by the cells below.
for texts, labels in train_loader:
    print(texts.shape)
    print(labels.shape)
    break

torch.Size([32, 500])
torch.Size([32])


In [None]:
# Show the token-id batch captured by the loop above (one row per sample).
texts

tensor([[ 5271,   736,     8,  ...,     0,     0,     0],
        [ 5609,   853,     7,  ...,     0,     0,     0],
        [ 4984,  4286,  2714,  ...,    41,  1530,  7345],
        ...,
        [  200,  6303,  6894,  ...,   262,   159,   454],
        [ 7451,  1153,   142,  ...,     0,     0,     0],
        [12244,  1900,   463,  ...,     0,     0,     0]])

### Define the Model


In [None]:
class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    Token ids are embedded, averaged over the sequence dimension, and the
    resulting vector is projected to ``num_classes`` logits.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
        # Output size 1 collapses the whole sequence axis into its mean.
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Map token ids of shape (batch_size, seq_length) to class logits."""
        # (batch_size, seq_length, embed_dim) -> (batch_size, embed_dim, seq_length):
        # the 1-D pooling layer averages over the last axis.
        channels_first = self.embedding(x).transpose(1, 2)
        # (batch_size, embed_dim, 1) -> (batch_size, embed_dim)
        sentence_vector = self.pool(channels_first).squeeze(-1)
        # (batch_size, num_classes)
        return self.fc(sentence_vector)

In [None]:
# Step-by-step walk-through of the model's forward pass using standalone
# layers. This `embedding` is a separate, untrained layer used only for the
# shape demonstration below.
vocab_size = len(vocab)
embed_dim = 100
num_classes = len(label_encoder.classes_)
embedding = nn.Embedding(vocab_size, embed_dim)

In [None]:
# Look up an embed_dim-sized vector for every token id in the batch:
# (batch_size, seq_length) -> (batch_size, seq_length, embed_dim).
embedded = embedding(texts)
embedded.shape

torch.Size([32, 500, 100])

In [None]:
# Swap the last two axes so the sequence axis comes last, which is the axis
# the 1-D pooling layer reduces. Note: re-running this cell alone swaps the
# axes back, because it overwrites `embedded` with its own output.
embedded = embedded.permute(0,2,1)
embedded.shape

torch.Size([32, 100, 500])

In [None]:
# Average over the sequence axis (output size 1), then drop that
# now-singleton axis: (batch_size, embed_dim, seq_length) -> (batch_size, embed_dim).
pool = nn.AdaptiveAvgPool1d(1)
pooled = pool(embedded).squeeze(2)
pooled.shape

torch.Size([32, 100])

In [None]:
# Project each pooled vector to one score per class:
# (batch_size, embed_dim) -> (batch_size, num_classes).
fc = nn.Linear(embed_dim, num_classes)
output = fc(pooled)
output.shape


torch.Size([32, 5])

### Train model

In [None]:
# Model Parameters
# (same values as in the walk-through cells above, repeated so this cell
# does not depend on them)
vocab_size = len(vocab)
embed_dim = 100
num_classes = len(label_encoder.classes_)

# CrossEntropyLoss takes the raw class scores from the model together with
# integer class labels.
model = TextClassificationModel(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for texts, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')


Epoch 1/20, Loss: 1.5828
Epoch 2/20, Loss: 1.5030
Epoch 3/20, Loss: 1.4481
Epoch 4/20, Loss: 1.3828
Epoch 5/20, Loss: 1.3070
Epoch 6/20, Loss: 1.2106
Epoch 7/20, Loss: 1.1003
Epoch 8/20, Loss: 0.9777
Epoch 9/20, Loss: 0.8540
Epoch 10/20, Loss: 0.7385
Epoch 11/20, Loss: 0.6347
Epoch 12/20, Loss: 0.5460
Epoch 13/20, Loss: 0.4664
Epoch 14/20, Loss: 0.4013
Epoch 15/20, Loss: 0.3460
Epoch 16/20, Loss: 0.3016
Epoch 17/20, Loss: 0.2647
Epoch 18/20, Loss: 0.2313
Epoch 19/20, Loss: 0.2051
Epoch 20/20, Loss: 0.1823


### Evaluation


In [None]:
# Measure classification accuracy on the held-out test split.
model.eval()  # switch the model's modules to evaluation mode
correct = 0
total = 0
with torch.no_grad():  # gradients are not needed for scoring
    for texts, labels in test_loader:
        # One forward pass per batch (the duplicated call was redundant work).
        outputs = model(texts)  # (batch_size, num_classes) class scores
        predicted = outputs.argmax(dim=1)  # index of the highest score per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.9552


### Adaptive Pooling Example

In [None]:
import torch
import torch.nn as nn

# Example input tensor of shape (batch_size=1, channels=1, length=8)
input_tensor = torch.tensor([[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]]])
print("Input tensor:")
print(input_tensor)

# Create an AdaptiveAvgPool1d layer with output size 1, i.e. the whole
# length axis is averaged into a single value (4.5 for the input above)
adaptive_avg_pool = nn.AdaptiveAvgPool1d(1)

# Apply the layer
output_tensor = adaptive_avg_pool(input_tensor)

print("\nOutput tensor after applying AdaptiveAvgPool1d:")
print(output_tensor)


Input tensor:
tensor([[[1., 2., 3., 4., 5., 6., 7., 8.]]])

Output tensor after applying AdaptiveAvgPool1d:
tensor([[[4.5000]]])


In [None]:
# The pooled axis is kept with length 1: (batch_size, channels, 1).
output_tensor.shape

torch.Size([1, 1, 1])

In [None]:
# Drop the singleton pooled axis, as the model's forward pass does.
output_tensor.squeeze(2)

tensor([[4.5000]])