In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Fetch the 20 newsgroups dataset
ng20 = fetch_20newsgroups(subset='all', categories= ['comp.graphics','sci.space','rec.sport.baseball','talk.politics.guns','alt.atheism'],remove=('headers', 'footers', 'quotes'))

ng_text = ng20.data
ng_labels = ng20.target

In [3]:
# Tokenize the text
ng20_text_tokens = []
for text in ng_text:
  ng20_text_tokens.append(simple_preprocess(text))


In [4]:
glove_path = '/content/glove.6B.100d.txt'

ng_vector_idx = torch.LongTensor([doc2ind(doc) for doc in ng_text])

where `ng_vector_idx` is a `torch.tensor` of integers representing the indices of the GloVe vectors from above, and `doc2ind` is a function you need. Note that you should not form the matrix of word embeddings explicitly, but simply specific vector-indices representing the words in the text (see `torch.Embedding` for more details)

In [5]:
# Load Glove
embeddings = []
word2index = {}

with open(glove_path, 'r') as word_embedding:
  for index, wb in enumerate(word_embedding):
    values = wb.split()
    word = values[0]
    temp = []
    for val in values[1:]:
      temp.append(float(val))
    vector = torch.tensor(temp)
    embeddings.append(vector)
    word2index[word] = index


In [6]:
glove_word2index = word2index

In [23]:
ng20_indexFromGlove = {}
glove_keys = list(glove_word2index.keys())

for doc in ng20_text_tokens:
  for word in doc:
    if word in glove_keys:
      ng20_indexFromGlove[word] = glove_word2index[word]
    else:


In [7]:
embeddings

[tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
         -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
          0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
          0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
          0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
         -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
         -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
          0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
          1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
         -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
          0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
          0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
         -0.5203, -0.1459,  

In [8]:
embedding_dim = len(embeddings[0])
embeddings_tensor = torch.zeros((len(embeddings) + 1, embedding_dim))
embeddings_tensor[1:] = torch.stack(tuple(embeddings))

embedding_layer = nn.Embedding.from_pretrained(embeddings_tensor, freeze=False)

In [12]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from sklearn.metrics import accuracy_score

max_length = 200

def tokens_to_indices(tokens_list, word2index, max_length):
  indices_list = [[word2index[token] if token in word2index else 0 for token in tokens] for tokens in tokens_list]
  return pad_sequence([torch.tensor(seq) for seq in indices_list], batch_first=True, padding_value=0)[:, :max_length]

ng20_indices = tokens_to_indices(ng20_text_tokens, glove_word2index, max_length)

X_train, X_test, y_train, y_test = train_test_split(ng20_indices, torch.tensor(ng_labels), test_size=0.2, random_state = 69)

train_data = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader=DataLoader(train_data, shuffle=True, batch_size=32)
test_loader=DataLoader(test_dataset, batch_size=32)
num_classes = 5

model = nn.Sequential(
    embedding_layer,
    nn.Flatten(),
    nn.Linear(embedding_dim * max_length, 128),
    nn.ReLU(),
    nn.Linear(128, num_classes),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()
optimizer = Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, criterion, optimizer, num_epochs=3):
  for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
      optimizer.zero_grad()
      outputs = model(inputs)
      loss =criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

train_model(model, train_loader, criterion, optimizer, num_epochs=3)

def evaluate_model(model, test_loader):
  model.eval()
  all_preds=[]
  true_labels = []
  with torch.no_grad():
    for inputs, labels in test_loader:
      outputs = model(inputs)
      _, predicted = torch.max(outputs, 1)
      all_preds.extend(predicted.numpy())
      true_labels.extend(labels.numpy())
  accuracy = accuracy_score(true_labels, all_preds)
  print(f"Test Accuracy: {accuracy}")

evaluate_model(model, test_loader)




Epoch 1/3, Loss: 1.1376654617806785
Epoch 2/3, Loss: 0.25507142248316705
Epoch 3/3, Loss: 0.10010167106221883
Test Accuracy: 0.6827438370846731
