<a href="https://colab.research.google.com/github/wizard339/education/blob/main/nlp_classification_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torch
import torchtext

In [2]:
from torch.utils.data import DataLoader

# Fetch both AG_NEWS splits; this pair feeds the vocabulary builder below and
# is re-created in a later cell before the DataLoaders are built.
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(split=('train', 'test'))

In [3]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in 'basic_english' tokenizer; the sample cell further down
# shows it lower-casing the text and splitting punctuation into separate tokens.
tokenizer = get_tokenizer('basic_english')

def build_vocabulary(datasets):
  """Lazily yield the token list of every text contained in ``datasets``.

  Each dataset is iterated as ``(label, text)`` pairs; the label is ignored
  and the text is run through the module-level ``tokenizer``.
  """
  for samples in datasets:
    yield from (tokenizer(text) for _, text in samples)

# Token -> index mapping built from the texts of both splits, with '<UNK>'
# registered as a special token.
vocab = build_vocab_from_iterator(
  build_vocabulary((train_dataset, test_dataset)),
  specials=['<UNK>'],
)
# Tokens that are not in the vocabulary resolve to the '<UNK>' index.
vocab.set_default_index(vocab['<UNK>'])

In [4]:
# Sanity check: tokenize a sample sentence and map its tokens to vocabulary indexes.
tokens = tokenizer('Text for tokenization test...')
indexes = vocab(tokens)

# Shown as the cell result; the out-of-vocabulary word maps to index 0 ('<UNK>').
tokens, indexes

(['text', 'for', 'tokenization', 'test', '.', '.', '.'],
 [4003, 11, 0, 287, 1, 1, 1])

In [5]:
from torch.types import Device
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

# Reload AG_NEWS and turn each split into a map-style dataset for DataLoader.
train_dataset, test_dataset = (
  to_map_style_dataset(split) for split in torchtext.datasets.AG_NEWS()
)

# Display names of the four classes, passed as `target_names` to the report.
# NOTE(review): presumably ordered to match the dataset labels 1..4 — confirm.
target_classes = ['World', 'Sports', 'Business', 'Sci/Tech']

# Every text is truncated or padded to exactly this many tokens.
max_words = 25

def vectorize_batch(batch):
  """Collate ``(label, text)`` samples into a pair of CPU tensors.

  Every text is tokenized, mapped to vocabulary indexes and then cut or
  right-padded with index 0 to exactly ``max_words`` entries.

  Returns:
    ``(features, targets)``: an int32 tensor with one row of ``max_words``
    indexes per sample, and the labels shifted down by one.
  """
  labels, texts = zip(*batch)

  rows = []
  for text in texts:
    ids = vocab(tokenizer(text))[:max_words]      # truncate long texts
    ids = ids + [0] * (max_words - len(ids))      # pad short ones with 0
    rows.append(ids)

  features = torch.tensor(rows, dtype=torch.int32, device='cpu')
  targets = torch.tensor(labels, device='cpu') - 1
  return features, targets


# Batches of 1024 samples, tokenized and padded on the fly by vectorize_batch.
# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=1024, collate_fn=vectorize_batch)

In [6]:
from prompt_toolkit import output
from torch import nn
from torch.nn import functional as F

EMBED_LEN  = 50  # size of each token embedding vector
HIDDEN_DIM = 75  # size of the LSTM hidden state
N_LAYERS   = 1   # number of stacked LSTM layers


class LSTMClassifier(nn.Module):
  """Embedding -> LSTM -> linear head text classifier.

  Layer sizes are taken from the module-level ``vocab``, ``EMBED_LEN``,
  ``HIDDEN_DIM``, ``N_LAYERS`` and ``target_classes``.
  """

  def __init__(self):
    super().__init__()
    self.emb_layer = nn.Embedding(len(vocab), EMBED_LEN)
    self.lstm = nn.LSTM(EMBED_LEN, HIDDEN_DIM, N_LAYERS, batch_first=True)
    self.fc = nn.Linear(HIDDEN_DIM, len(target_classes))

  def forward(self, X_batch):
    """Return one row of class scores for each sequence in ``X_batch``."""
    embedded = self.emb_layer(X_batch)
    # The LSTM also returns its final (hidden, cell) state; it is not used.
    sequence_out, _ = self.lstm(embedded)
    # batch_first=True puts time on axis 1: keep only the last step's output.
    last_step = sequence_out[:, -1]
    return self.fc(last_step)

In [39]:
# Instantiate the baseline model (embeddings learned from scratch).
lstm_classifier = LSTMClassifier()

# Bare expression: the notebook displays the module structure as the cell result.
lstm_classifier

LSTMClassifier(
  (emb_layer): Embedding(98635, 50)
  (lstm): LSTM(50, 75, batch_first=True)
  (fc): Linear(in_features=75, out_features=4, bias=True)
)

In [40]:
# Walk the top-level sub-modules and list the shape of every parameter they own.
for module in lstm_classifier.children():
  print(f'Layer: {module}')
  print('Parameters: ')
  shapes = [weight.shape for weight in module.parameters()]
  for shape in shapes:
    print(shape)
  print()

Layer: Embedding(98635, 50)
Parameters: 
torch.Size([98635, 50])

Layer: LSTM(50, 75, batch_first=True)
Parameters: 
torch.Size([300, 50])
torch.Size([300, 75])
torch.Size([300])
torch.Size([300])

Layer: Linear(in_features=75, out_features=4, bias=True)
Parameters: 
torch.Size([4, 75])
torch.Size([4])



In [7]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
  """Evaluate ``model`` on ``val_loader`` and print validation loss and accuracy.

  Args:
    model: ``nn.Module`` mapping a batch ``X`` to per-class scores.
    loss_fn: Callable ``loss_fn(preds, Y)`` returning a scalar tensor.
    val_loader: Iterable of ``(X, Y)`` tensor batches.

  Returns:
    ``(mean_loss, accuracy)`` as Python floats, i.e. the numbers that are
    printed. ``mean_loss`` is the unweighted mean of the per-batch losses.

  Raises:
    RuntimeError: from ``torch.cat`` when ``val_loader`` yields no batches.
  """
  # Batches are moved to wherever the model lives, so CPU batches coming out
  # of the loader also work with a model that sits on the GPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  was_training = model.training
  model.eval()  # evaluation must not use training-time layer behaviour
  try:
    with torch.no_grad():
      Y_true, Y_preds, losses = [], [], []
      for X, Y in val_loader:
        if device is not None:
          X, Y = X.to(device), Y.to(device)
        preds = model(X)
        losses.append(loss_fn(preds, Y).item())

        Y_true.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

      Y_true = torch.cat(Y_true)
      Y_preds = torch.cat(Y_preds)
  finally:
    # Hand the model back in the mode the caller left it in.
    model.train(was_training)

  mean_loss = torch.tensor(losses).mean().item()
  # Share of exact matches, computed on the tensors' own device.
  accuracy = (Y_preds == Y_true).sum().item() / Y_true.numel()

  print(f'Valid Loss: {mean_loss:.3f}')
  print(f'Valid Acc : {accuracy:.3f}')
  return mean_loss, accuracy


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, device, epochs=10):
  """Train ``model`` for ``epochs`` epochs, validating after each one.

  Args:
    model: ``nn.Module`` to optimise; it is moved to ``device``.
    loss_fn: Callable ``loss_fn(preds, Y)`` returning a scalar tensor.
    optimizer: Optimizer holding the parameters of ``model``.
    train_loader: Iterable of ``(X, Y)`` training batches.
    val_loader: Iterable of ``(X, Y)`` validation batches.
    device: Device the model and every batch are moved to.
    epochs: Number of passes over ``train_loader``.

  Prints the mean training loss of each epoch, followed by whatever
  ``CalcValLossAndAccuracy`` reports for the validation data.
  """
  # nn.Module.to moves the parameters in place, so doing it once is enough.
  model.to(device)

  for _ in range(1, epochs + 1):
    # Validation may leave the model in eval mode; switch back every epoch.
    model.train()
    losses = []
    for X, Y in tqdm(train_loader):
      # Tensor.to returns a new tensor instead of moving the existing one,
      # so the results have to be re-bound.
      X, Y = X.to(device), Y.to(device)

      Y_preds = model(X)

      loss = loss_fn(Y_preds, Y)
      losses.append(loss.item())

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    print(f'Train Loss: {torch.tensor(losses).mean():.3f}')
    # Hand over validation batches that already live on the model's device.
    val_batches = ((X.to(device), Y.to(device)) for X, Y in val_loader)
    CalcValLossAndAccuracy(model, loss_fn, val_batches)

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression: shows the selected device as the cell result.
device

device(type='cpu')

In [59]:
from torch.optim import Adam

# Training configuration for the baseline classifier.
EPOCHS  = 10
LR      = 1e-3
# Adam over the baseline model's parameters.
OPT     = Adam(lstm_classifier.parameters(), lr=LR)
LOSS_FN = nn.CrossEntropyLoss()

# The test split doubles as the validation set reported after every epoch.
TrainModel(model=lstm_classifier, loss_fn=LOSS_FN, optimizer=OPT, train_loader=train_loader, val_loader=test_loader, device=device, epochs=EPOCHS)

100%|██████████| 118/118 [00:53<00:00,  2.22it/s]


Train Loss: 0.290
Valid Loss: 0.352
Valid Acc : 0.875


100%|██████████| 118/118 [00:50<00:00,  2.34it/s]


Train Loss: 0.258
Valid Loss: 0.343
Valid Acc : 0.879


100%|██████████| 118/118 [01:03<00:00,  1.87it/s]


Train Loss: 0.233
Valid Loss: 0.345
Valid Acc : 0.882


100%|██████████| 118/118 [00:43<00:00,  2.74it/s]


Train Loss: 0.211
Valid Loss: 0.342
Valid Acc : 0.882


100%|██████████| 118/118 [00:42<00:00,  2.76it/s]


Train Loss: 0.191
Valid Loss: 0.338
Valid Acc : 0.884


100%|██████████| 118/118 [00:42<00:00,  2.80it/s]


Train Loss: 0.171
Valid Loss: 0.345
Valid Acc : 0.888


100%|██████████| 118/118 [00:43<00:00,  2.74it/s]


Train Loss: 0.155
Valid Loss: 0.352
Valid Acc : 0.887


100%|██████████| 118/118 [00:42<00:00,  2.76it/s]


Train Loss: 0.139
Valid Loss: 0.375
Valid Acc : 0.885


100%|██████████| 118/118 [00:42<00:00,  2.78it/s]


Train Loss: 0.125
Valid Loss: 0.377
Valid Acc : 0.884


100%|██████████| 118/118 [00:45<00:00,  2.59it/s]


Train Loss: 0.113
Valid Loss: 0.404
Valid Acc : 0.885


In [61]:
def MakePredictions(model, loader, device):
  """Run ``model`` over ``loader`` and collect true and predicted labels.

  Args:
    model: ``nn.Module`` mapping a batch ``X`` to per-class scores; it is
      moved to ``device``.
    loader: Iterable of ``(X, Y)`` tensor batches.
    device: Device used for inference.

  Returns:
    ``(Y_actual, Y_preds)``: the true labels as a NumPy array and the
    predicted class indexes as a CPU tensor, both in loader order.
  """
  model.to(device)
  was_training = model.training
  model.eval()

  Y_true, Y_preds = [], []
  try:
    # No autograd graph is needed for inference; building one for every
    # batch would only hold on to memory.
    with torch.no_grad():
      for X, Y in loader:
        # Tensor.to returns a new tensor, so the result must be re-bound.
        X = X.to(device)
        # argmax over raw scores picks the same class as argmax over their
        # softmax, so the softmax is skipped.
        Y_preds.append(model(X).argmax(dim=-1).cpu())
        Y_true.append(Y.cpu())
  finally:
    # Hand the model back in the mode the caller left it in.
    model.train(was_training)

  return torch.cat(Y_true).numpy(), torch.cat(Y_preds)

# True and predicted labels of the baseline model on the test split.
Y_actual, Y_preds = MakePredictions( lstm_classifier, test_loader, device)

In [62]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy, per-class precision/recall/F1 and the confusion matrix
# for the predictions collected above.
print(f'Test Accuracy: {accuracy_score(Y_actual, Y_preds)}')
print('\nClassification report: ')
print(classification_report(Y_actual, Y_preds, target_names=target_classes))
print('\nConfusion Matrix: ')
print(confusion_matrix(Y_actual, Y_preds))

Test Accuracy: 0.8846052631578948

Classification report: 
              precision    recall  f1-score   support

       World       0.89      0.90      0.89      1900
      Sports       0.94      0.95      0.94      1900
    Business       0.84      0.86      0.85      1900
    Sci/Tech       0.87      0.83      0.85      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600


Confusion Matrix: 
[[1717   57   70   56]
 [  39 1796   30   35]
 [  89   24 1641  146]
 [  94   27  210 1569]]


In [None]:
import gensim.downloader

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use — this cell has no
# execution count, and the next cell's NameError suggests it was never run.
w2v = gensim.downloader.load('word2vec-google-news-300')

In [9]:
# Requires the download cell above to have been run in this session.
# gensim.downloader.load hands back the keyed-vectors object itself; only
# older gensim releases expose a (deprecated) `.wv` alias on it, so fall back
# to the object when the attribute is missing.
w2v_vectors = getattr(w2v, 'wv', w2v)

NameError: ignored

In [93]:
class LSTMClassifier2(nn.Module):
  """LSTM classifier whose embedding table is filled from pretrained word vectors.

  Reads the module-level ``vocab``, ``w2v_vectors``, ``HIDDEN_DIM``,
  ``N_LAYERS`` and ``target_classes``. The embedding width follows
  ``w2v_vectors.vector_size``.
  """

  def __init__(self):
    super().__init__()
    # Builds self.emb_layer; called first so the sub-modules are registered
    # in the order emb_layer, lstm, fc.
    self.init_embeds()
    self.lstm = nn.LSTM(input_size=self.emb_layer.embedding_dim, hidden_size=HIDDEN_DIM, num_layers=N_LAYERS, batch_first=True)
    self.fc = nn.Linear(in_features=HIDDEN_DIM, out_features=len(target_classes))

  def init_embeds(self):
    """(Re)build ``self.emb_layer`` so that row ``i`` is the vector of vocab token ``i``.

    The pretrained table is ordered by its own vocabulary, so its rows
    cannot be used directly: each token known to ``vocab`` is looked up by
    its string. Tokens without a pretrained vector keep an all-zero row.
    The resulting embedding is frozen.
    """
    weights = torch.zeros(len(vocab), w2v_vectors.vector_size)
    for index, token in enumerate(vocab.get_itos()):
      if token in w2v_vectors:
        weights[index] = torch.tensor(w2v_vectors[token])
    self.emb_layer = nn.Embedding.from_pretrained(weights, freeze=True)

  def forward(self, X_batch):
    """Return one row of class scores for each sequence in ``X_batch``."""
    emb = self.emb_layer(X_batch)
    # The final (hidden, cell) state returned by the LSTM is not used.
    output, _ = self.lstm(emb)
    # batch_first=True puts time on axis 1: classify from the last step.
    return self.fc(output[:, -1])

In [None]:
# Instantiate the variant that uses pretrained word vectors.
lstm_classifier_2 = LSTMClassifier2()
# Bare expression: the notebook displays the module structure as the cell result.
lstm_classifier_2

In [92]:
from torch.optim import Adam

# Training configuration for the pretrained-embedding classifier.
EPOCHS  = 5
LR      = 1e-3
# The optimizer must hold the parameters of the model being trained here,
# lstm_classifier_2; parameters that do not require gradients are left out.
OPT     = Adam([p for p in lstm_classifier_2.parameters() if p.requires_grad], lr=LR)
LOSS_FN = nn.CrossEntropyLoss()

# The test split doubles as the validation set reported after every epoch.
TrainModel(model=lstm_classifier_2, loss_fn=LOSS_FN, optimizer=OPT, train_loader=train_loader, val_loader=test_loader, device=device, epochs=EPOCHS)

  0%|          | 0/118 [00:00<?, ?it/s]


RuntimeError: ignored