In [146]:
import torch
import pandas as pd
import numpy as np
import os 

#Download to current path 
#current_dir = os.getcwd()
#path = kagglehub.dataset_download("crawford/20-newsgroups")

#print("Path to dataset files:", path)

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [147]:
from pprint import pprint 
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [148]:
#Doing data pre-processing in this block

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

#pprint(newsgroups_train)
train_texts = newsgroups_train.data
train_labels = newsgroups_train.target
test_texts = newsgroups_test.data
test_labels = newsgroups_test.target
print("--------Train---------")
print("Label= ",train_labels[0])
print(train_texts[0])

print("--------Test---------")
print("Label= ",test_labels[0])
print(test_texts[0])
print("---------------------")

#Tokenize the input data 
def tokenize(text):
    return text.lower().split()

#Tokenize all docs
train_tokens = []
for txt in train_texts:
    tokens = tokenize(txt)
    train_tokens.append(tokens)

test_tokens = []
for txt in test_texts:
    tokens = tokenize(txt)
    test_tokens.append(tokens)


#Build vocabulary
word_to_idx = {}
word_to_idx['<PAD>'] = 0 
word_to_idx['<UNK>'] = 1 

#Add words from your tokenized texts 
current_idx = 2 
for tokens in train_tokens: 
    for word in tokens: 
        if word not in word_to_idx:
            word_to_idx[word] = current_idx
            current_idx +=1
vocab_size = len(word_to_idx)

print("Vocabulary size:"+ str(vocab_size))

#Convert tokens to sequences of indices
train_sequences = []
for tokens in train_tokens:
    sequence = []
    for word in tokens:
        #Get the idx of the word, or the UNK token if it's not in val
        sequence.append(word_to_idx.get(word,word_to_idx['<UNK>']))
    train_sequences.append(sequence)
    
#Pad sequences to the same length
max_length = 100 #This max len is random choosen

train_padded = []
for sequence in train_sequences:
    #Truncate the sequence if it's longer than max_length
    if len(sequence) > max_length:
        padded_sequence = sequence[:max_length]
    else:
        padded_sequence = sequence + [word_to_idx['<PAD>']] * (max_length - len(sequence))
    train_padded.append(padded_sequence)

train_sequences_tensor = torch.LongTensor(train_padded)
train_labels_tensor = torch.LongTensor(train_labels)
print("Train Matrix size = " + str(train_sequences_tensor.shape) )






--------Train---------
Label=  7
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
--------Test---------
Label=  7
I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy.

In [149]:
#After get token , do embedding
#For the embedding I choose to use w2v because it's simple to use
import gensim.downloader as api

#Load the word vectors
word_vectors = api.load("word2vec-google-news-300")

embedding_dim = 300
embedding_matrix = np.zeros((len(word_to_idx), embedding_dim))

for word, idx in word_to_idx.items():
    if word in word_vectors:
        embedding_matrix[idx] = word_vectors[word]



In [150]:
#Build RNN
import torch 
import torch.nn as nn
import torch.optim as optim
class SimpleRNN(nn.Module):
    def __init__(self,vocab_size, embedding_dim, hidden_dim, output_dim,embedding_matrix=None):
        super(SimpleRNN, self).__init__()
        #Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #Add a Rnn layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        #Add a fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self,text):
        #Text shape:[Batch_size, seq_length]
        
        #Run throught the embedding layer
        embedded = self.embedding(text)
        #Emvedded sape : [Batck_size, seq_length, embedding_dim]
        
        #Run through RNN
        output,  hidden = self.rnn(embedded)
        #output shape: [batch_size, seq_length, hidden_dim]
        #hidden shape: [1, batch_size, hidden_dim]
        return self.fc(hidden.squeeze(0))
        

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [151]:
#Define model shapes
vocab_size = len(word_to_idx)

hidden_dim = 128  # Size of the RNN hidden state
output_dim = len(newsgroups_train.target_names)  # Number of classes (20 for 20 Newsgroups)

# Create dataset and dataloader
from torch.utils.data import TensorDataset, DataLoader


batch_size = 64  # You can adjust this number
train_dataset = TensorDataset(train_sequences_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


#
# Initialize the model
model = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#Train the rnn
epochs = 1000 


for epoch in range(epochs):
    model.train()
    total_loss = 0 
    for texts,labels in train_loader: 
        #Move to GPU
        texts = texts.to(device)
        labels = labels.to(device)
        #Clear gradients
        optimizer.zero_grad()
        
        #Forward pass 
        predictions = model(texts)
        
        #Calculate loss
        loss = criterion(predictions, labels)
                
        #Backward pass
        loss.backward()
        
        #Update parameters
        optimizer.step()
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

    

Epoch 1 Loss: 3.0674
Epoch 2 Loss: 2.8735
Epoch 3 Loss: 2.6482
Epoch 4 Loss: 2.3450
Epoch 5 Loss: 2.0451
Epoch 6 Loss: 1.8154
Epoch 7 Loss: 1.7431
Epoch 8 Loss: 1.7200
Epoch 9 Loss: 1.7137
Epoch 10 Loss: 1.6999
Epoch 11 Loss: 1.7068
Epoch 12 Loss: 1.8764
Epoch 13 Loss: 1.9027
Epoch 14 Loss: 1.7515
Epoch 15 Loss: 1.7179
Epoch 16 Loss: 1.7119
Epoch 17 Loss: 1.6993
Epoch 18 Loss: 1.7130
Epoch 19 Loss: 1.6969
Epoch 20 Loss: 1.6976
Epoch 21 Loss: 1.6894
Epoch 22 Loss: 1.7982
Epoch 23 Loss: 2.9235
Epoch 24 Loss: 2.8558
Epoch 25 Loss: 2.6485
Epoch 26 Loss: 2.3725
Epoch 27 Loss: 2.1186
Epoch 28 Loss: 1.9494
Epoch 29 Loss: 1.8387
Epoch 30 Loss: 1.7765
Epoch 31 Loss: 1.7340
Epoch 32 Loss: 1.7414
Epoch 33 Loss: 1.7143
Epoch 34 Loss: 1.7060
Epoch 35 Loss: 1.7141
Epoch 36 Loss: 1.7184
Epoch 37 Loss: 1.7208
Epoch 38 Loss: 1.7199
Epoch 39 Loss: 1.8964
Epoch 40 Loss: 1.8134
Epoch 41 Loss: 1.7446
Epoch 42 Loss: 1.7193
Epoch 43 Loss: 1.7143
Epoch 44 Loss: 1.7113
Epoch 45 Loss: 1.7051
Epoch 46 Loss: 1.75

In [152]:
#Evaluation
#Prepair test data

test_sequences = []
for tokens in test_tokens:
    sequence = []
    for word in tokens:
        sequence.append(word_to_idx.get(word, word_to_idx['<UNK>']))
    test_sequences.append(sequence)

#Pad sequences
test_padded = []
for sequence in test_sequences: 
    if len(sequence) > max_length: 
        padded_sequence = sequence[:max_length]
    else:
        padded_sequence = sequence+[word_to_idx['<PAD>']] * (max_length - len(sequence))
    test_padded.append(padded_sequence)


#Convert to tensors
test_sequences_tensor = torch.LongTensor(test_padded)
test_labels_tensor = torch.LongTensor(test_labels)

#Create test dataset and dataloader
test_dataset = TensorDataset(test_sequences_tensor, test_labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


model.eval()
correct =0 
total = 0
    
with torch.no_grad():
    for texts, labels in test_loader:
        #move test to GPU
        texts = texts.to(device)
        labels = labels.to(device)
        
        #Forward pass
        outputs = model(texts)
        
        #get prediction
        _, predicted = torch.max(outputs.data, 1)
        
        #Count correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
#Calculate and print accuracy
accuracy = 100* correct/total
print("Test accuracy= ["+str(accuracy)+ "]%")


        
        
        

Test accuracy= [5.7222517259691985]%
