
<div>
<img src="./images/text_embedding.png" width="800"/>
</div>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')
X_news, y_news = newsgroups.data, newsgroups.target

# Preprocess text using CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_news_vec = vectorizer.fit_transform(X_news).toarray()

# Train-test split
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(X_news_vec, y_news, test_size=0.2, random_state=42)



### With embeddings

In [3]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)


# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize and encode the data using torchtext
train_data_tokens = [tokenizer(text) for text in X_train]
test_data_tokens = [tokenizer(text) for text in X_test]

# Build the vocabulary from the training data
vocab = build_vocab_from_iterator(train_data_tokens, specials=["<unk>"])  # Handle OOV tokens with "<unk>"



In [4]:
train_data_tokens[0]

['from',
 'mahan@tgv',
 '.',
 'com',
 '(',
 'patrick',
 'l',
 '.',
 'mahan',
 ')',
 'subject',
 're',
 'is',
 'it',
 'just',
 'me',
 ',',
 'or',
 'is',
 'this',
 'newsgroup',
 'dead',
 '?',
 'organization',
 'the',
 'internet',
 'lines',
 '24',
 'nntp-posting-host',
 'enterpoop',
 '.',
 'mit',
 '.',
 'edu',
 'to',
 'xpert@expo',
 '.',
 'lcs',
 '.',
 'mit',
 '.',
 'edu',
 ',',
 'rlm@helen',
 '.',
 'surfcty',
 '.',
 'com',
 '#',
 '#',
 'i',
 "'",
 've',
 'gotten',
 'very',
 'few',
 'posts',
 'on',
 'this',
 'group',
 'in',
 'the',
 'last',
 'couple',
 'days',
 '.',
 '(',
 'i',
 '#',
 'recently',
 'added',
 'it',
 'to',
 'my',
 'feed',
 'list',
 '.',
 ')',
 'is',
 'it',
 'just',
 'me',
 ',',
 'or',
 'is',
 'this',
 'group',
 '#',
 'near',
 'death',
 '?',
 '#',
 'seen',
 'from',
 'the',
 'mailing',
 'list',
 'side',
 ',',
 'i',
 "'",
 'm',
 'getting',
 'about',
 'the',
 'right',
 'amount',
 'of',
 'traffic',
 '.',
 'patrick',
 'l',
 '.',
 'mahan',
 '---',
 'tgv',
 'window',
 'washer',
 '--

In [5]:
X_train[0] # email one

"From: mahan@TGV.COM (Patrick L. Mahan)\nSubject: Re: Is it just me, or is this newsgroup dead?\nOrganization: The Internet\nLines: 24\nNNTP-Posting-Host: enterpoop.mit.edu\nTo: xpert@expo.lcs.mit.edu, rlm@helen.surfcty.com\n\n#\n# I've gotten very few posts on this group in the last couple days.  (I\n# recently added it to my feed list.)  Is it just me, or is this group\n# near death?\n#\n\nSeen from the mailing list side, I'm getting about the right amount of\ntraffic.\n\nPatrick L. Mahan\n\n--- TGV Window Washer ------------------------------- Mahan@TGV.COM ---------\n\nWaking a person unnecessarily should not be considered  - Lazarus Long\na capital crime.  For a first offense, that is            From the Notebooks of\n\t\t\t\t\t\t\t  Lazarus Long\n\nPatrick L. Mahan\n\n--- TGV Window Washer ------------------------------- Mahan@TGV.COM ---------\n\nWaking a person unnecessarily should not be considered  - Lazarus Long\na capital crime.  For a first offense, that is            From

In [7]:

# Pad sequences to a fixed length
max_seq_length = 100
train_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in train_data_tokens]
test_data_padded = [torch.tensor([vocab[token] if token in vocab else vocab["<unk>"] for token in tokens[:max_seq_length]]) for tokens in test_data_tokens]

X_train_tensor = pad_sequence(train_data_padded, batch_first=True, padding_value=0)
X_test_tensor = pad_sequence(test_data_padded, batch_first=True, padding_value=0)

y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)


In [8]:
X_train_tensor[0]

tensor([   22, 13640,     1,    47,     8,  2323,   367,     1, 10692,     7,
           39,    46,    13,    16,    84,    70,     2,    34,    13,    25,
          985,   684,    15,    41,     3,   378,    40,   368,    93,  5570,
            1,   418,     1,    30,     5,  5549,     1,  1862,     1,   418,
            1,    30,     2, 33149,     1, 29497,     1,    47,   194,   194,
           11,     4,   139,  2104,   129,   250,  1534,    23,    25,   275,
           12,     3,   198,   652,   516,     1,     8,    11,   194,   881,
         1695,    16,     5,    51,  3660,   305,     1,     7,    13,    16,
           84,    70,     2,    34,    13,    25,   275,   194,   941,   625,
           15,   194,   360,    22,     3,  1495,   305,   612,     2,    11])

In [9]:
vocab.lookup_token(22)

'from'

In [10]:
class NewsGroupsDataset(Dataset):
    def __init__(self, data_tokens, targets, vocab, max_seq_length=100):
        self.data_tokens = data_tokens
        self.targets = targets
        self.vocab = vocab
        self.max_seq_length = max_seq_length
    
    def __len__(self):
        return len(self.data_tokens)
    
    def __getitem__(self, idx):
        tokens = self.data_tokens[idx][:self.max_seq_length]
        token_ids = [self.vocab[token] if token in self.vocab else self.vocab["<unk>"] for token in tokens]
        return torch.tensor(token_ids), torch.tensor(self.targets[idx])
        
     

In [11]:
# Create instances of the custom Dataset for training and testing
train_dataset = NewsGroupsDataset(train_data_tokens, y_train, vocab, max_seq_length)
test_dataset = NewsGroupsDataset(test_data_tokens, y_test, vocab, max_seq_length)

In [12]:
train_dataset.__getitem__(0)

(tensor([   22, 13640,     1,    47,     8,  2323,   367,     1, 10692,     7,
            39,    46,    13,    16,    84,    70,     2,    34,    13,    25,
           985,   684,    15,    41,     3,   378,    40,   368,    93,  5570,
             1,   418,     1,    30,     5,  5549,     1,  1862,     1,   418,
             1,    30,     2, 33149,     1, 29497,     1,    47,   194,   194,
            11,     4,   139,  2104,   129,   250,  1534,    23,    25,   275,
            12,     3,   198,   652,   516,     1,     8,    11,   194,   881,
          1695,    16,     5,    51,  3660,   305,     1,     7,    13,    16,
            84,    70,     2,    34,    13,    25,   275,   194,   941,   625,
            15,   194,   360,    22,     3,  1495,   305,   612,     2,    11]),
 tensor(5))

In [13]:

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=lambda batch: (pad_sequence([item[0] for item in batch], batch_first=True, padding_value=0), torch.tensor([item[1] for item in batch])))



# Pretrained Models

In [17]:

from torchtext.vocab import GloVe

# Load pre-trained GloVe embeddings
glove = GloVe(name='6B', dim=50)  # You can choose different dimensions (e.g., 50, 100, 200, 300)

# Create an embedding matrix
pretrained_embeddings = glove.vectors



.vector_cache/glove.6B.zip: 862MB [07:54, 1.82MB/s]                             
100%|███████████████████████████████▉| 399999/400000 [00:08<00:00, 45144.97it/s]


In [22]:

# Define the model
class FFNN(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim, output_dim):
        super(FFNN, self).__init__()
        vocab_size, embedding_dim = pretrained_embeddings.shape
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded_avg = torch.mean(embedded, dim=1)
        x = self.fc1(embedded_avg)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x


In [23]:
y_train_news.max()

19

In [24]:

# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 500
hidden_dim = 128
#output_dim = len(label_encoder.classes_)
output_dim = y_train_news.max() + 1
learning_rate = 0.01
epochs = 10
# Example usage

model = FFNN(pretrained_embeddings, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [25]:


# Training loop
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')


Epoch 1/10, Loss: 0.15729960799217224
Epoch 2/10, Loss: 0.8579656481742859
Epoch 3/10, Loss: 0.0033414126373827457
Epoch 4/10, Loss: 0.0010539244394749403
Epoch 5/10, Loss: 0.0019872228149324656
Epoch 6/10, Loss: 4.663909930968657e-05
Epoch 7/10, Loss: 0.00010102357919095084
Epoch 8/10, Loss: 0.003932616673409939
Epoch 9/10, Loss: 0.00012886294280178845
Epoch 10/10, Loss: 1.2785061699105427e-05


In [26]:

# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(y_test_tensor.numpy(), predicted.numpy())
    print(f'Accuracy on test set: {accuracy}')

Accuracy on test set: 0.8196286472148541
