In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from nltk.stem.snowball import SnowballStemmer

Load data

In [27]:
train = fetch_20newsgroups(subset='train', shuffle=True)
print(dir(train))
print(len(train.data))
print("Target names:", train.target_names)
print("------------------------------------")
for text, label in zip(train.data[:2], train.target[:2]):
    print("Text:", text[:100])
    print("Label:", train.target_names[label])
    print("------------------------------------")

['DESCR', 'data', 'filenames', 'target', 'target_names']
11314
Target names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
------------------------------------
Text: From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.
Label: rec.autos
------------------------------------
Text: From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final ca
Label: comp.sys.mac.hardware
------------------------------------


Preprocessing

In [28]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
preprocess_pipeline = Pipeline([
    ("vect", StemmedCountVectorizer(stop_words="english", max_features=200)),
    ("tfidf", TfidfTransformer()),
])

preprocess_pipeline_svd = Pipeline([
    ("vect", StemmedCountVectorizer(stop_words="english")),
    ("tfidf", TfidfTransformer()),
    ("svd", TruncatedSVD(n_components=200))
])

In [29]:
train_data = preprocess_pipeline.fit_transform(train.data)
train_data_svd = preprocess_pipeline_svd.fit_transform(train.data)

In [68]:
print(f"({len(train.data)}, {len(train.data[0])})")
print(train_data.shape)
print(train_data_svd.shape)
train_labels = train.target
print(train_labels.shape)

(11314, 721)
(11314, 200)
(11314, 200)
(11314,)


Model definition

In [64]:
class DenseTextClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )
        
    def forward(self, x):
        x = self.layers(x)
        return nn.Softmax(dim=1)(x)

Learning

In [79]:
input_size = train_data.shape[1]
hidden_size = 512
output_size = len(train_labels)
batch_size = 16
lr = 0.001
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


train_data_tensor = torch.from_numpy(train_data.toarray()).float().to(device)
train_labels_tensor = torch.from_numpy(train_labels).long().to(device)
train_dataset = TensorDataset(train_data_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = DenseTextClassifier(input_size, hidden_size, output_size).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    print(f"Epoch: {epoch}")
    for inputs, targets in tqdm(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

Epoch: 0


  0%|          | 0/708 [00:00<?, ?it/s]

  4%|▍         | 29/708 [00:02<01:04, 10.57it/s]


KeyboardInterrupt: 