## Importing required libraries

In [1]:
# suppressing warnings
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accept any arguments and do nothing."""
    return None
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import accumulate
from pprint import pprint

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer 
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

from sklearn.manifold import TSNE
import plotly.graph_objs as go
from IPython.display import Markdown as md

#### Checking if CUDA is available

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


#### Defining helper functions

In [4]:
def plot(COST,ACC):
    """Draw the loss and accuracy curves on one figure with two y-axes.

    Args:
        COST: sequence of total-loss values, plotted in red against the left axis.
        ACC: sequence of accuracy values, plotted in blue against the right axis.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax1 = plt.subplots()

    # left axis: loss
    color = 'tab:red'
    ax1.plot(COST, color=color)
    ax1.set_xlabel('epoch', color=color)
    ax1.set_ylabel('total loss', color=color)
    ax1.tick_params(axis='y', color=color)

    # right axis: accuracy. twinx() creates a second y-axis that shares ax1's x-axis.
    ax2 = ax1.twinx()
    color = 'tab:blue'
    ax2.set_ylabel('accuracy', color=color)
    ax2.plot(ACC, color=color)
    ax2.tick_params(axis='y', color=color)

    # keep the right-hand label from being clipped
    fig.tight_layout()

    plt.show()

## The Dataset

#### Printing an example document from the AG NEWS dataset

In [5]:
# Human-readable names for the AG_NEWS class ids (keys start at 1)
ag_news_label = {
    1: "World",
    2: "Sports",
    3: "Business",
    4: "Sci/Tec",
}
num_class = len(ag_news_label)

# Iterate over the training split from the "torchtext" library
train_iter = iter(AG_NEWS(split='train'))

# Take the first (label, text) sample and show it
y, text = next(train_iter)
print(f"Class: {ag_news_label[y]}\nText: {text}")

Class: Business
Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


#### Tokenizing and building the vocabulary

In [6]:
# Re-create the training iterator: the previous cell already consumed its first sample with next()
train_iter = iter(AG_NEWS(split='train'))

# The "basic_english" tokenizer from the "torchtext" library
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    """Lazily yield the token list of each document in `data_iter`.

    `data_iter` must produce (label, text) pairs; the label is ignored and the
    text is split with the module-level `tokenizer`.
    """
    yield from (tokenizer(document) for _label, document in data_iter)

# building the vocabulary using the 'build_vocab_from_iterator' function from the "torchtext" library;
# "<unk>" is registered as a special token
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"]) # This index will be returned when an OOV (out-of-vocabulary) token is queried
vocab_size = len(vocab)
print(f"Vocab Size:- {vocab_size}\n")

# printing the tokenized text and token indices of the 1st example document
# (a fresh iterator is created because the one above was consumed while building the vocabulary)
train_iter = iter(AG_NEWS(split='train'))
tokenized_train_iter = yield_tokens(train_iter)
tokenized_text = next(tokenized_train_iter)
print("Tokenized text:-\n",tokenized_text,"\n")
print("Token indices:-\n", vocab(tokenized_text))


Vocab Size:- 95811

Tokenized text:-
 ['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short-sellers', ',', 'wall', 'street', "'", 's', 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.'] 

Token indices:-
 [431, 425, 1, 1605, 14838, 113, 66, 2, 848, 13, 27, 14, 27, 15, 50725, 3, 431, 374, 16, 9, 67507, 6, 52258, 3, 42, 4009, 783, 325, 1]


#### Splitting dataset into train, validation and test data

In [7]:
# Load the train and test splits of AG_NEWS as iterators.
train_iter, test_iter = AG_NEWS()

# Convert the iterators into map-style datasets using "to_map_style_dataset" from the "torchtext" library
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# 95:5 split of train_dataset for training and validation using "random_split" from the "pytorch" library.
# The generator is seeded so the same documents land in the validation set on every run.
num_train = int(len(train_dataset)*0.95)
num_valid = len(train_dataset) - num_train
split_train_dataset, split_valid_dataset = random_split(
    train_dataset,
    [num_train, num_valid],
    generator=torch.Generator().manual_seed(42),
)

# Print the number of samples in each split
print(f"No.of samples in:-\n\ntrain: {num_train}\nvalidation: {num_valid}\ntest: {len(test_dataset)}")

No.of samples in:-

train: 114000
validation: 6000
test: 7600


#### Pre-processing pipeline

In [8]:
def text_pipeline(x):
    """Convert raw text into a list of token indices.

    The text is split with the module-level `tokenizer` and the resulting
    tokens are looked up in the module-level `vocab`.
    """
    tokens = tokenizer(x)
    return vocab(tokens)

def label_pipeline(x):
    """Shift a label so that class numbering starts at 0 instead of 1."""
    zero_based = int(x) - 1
    return zero_based

def collate_batch(batch):
    """Collate a list of (label, text) samples into the tensors the model consumes.

    Each label goes through `label_pipeline` and each text through `text_pipeline`.

    Returns:
        A tuple (labels, tokens, offsets), each moved to `device`:
        labels  - int64 tensor with one entry per sample;
        tokens  - int64 tensor with every sample's token indices concatenated;
        offsets - start position of each sample inside `tokens`.
    """
    labels, token_chunks, lengths = [], [], [0]

    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(token_ids)
        lengths.append(token_ids.size(0))

    labels = torch.tensor(labels, dtype=torch.int64)
    tokens = torch.cat(token_chunks)
    # Running sum over [0, len_0, ..., len_{n-2}] gives the index at which each sample starts.
    offsets = torch.cumsum(torch.tensor(lengths[:-1]), dim=0)

    return labels.to(device), tokens.to(device), offsets.to(device)

#### Creating dataloaders for ML model

In [9]:
# Mini-batch size used by all three dataloaders
BATCH_SIZE = 64

# Settings shared by every "DataLoader" (from the "pytorch" library)
loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)

train_dataloader = DataLoader(split_train_dataset, **loader_options)
valid_dataloader = DataLoader(split_valid_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

# Pull the 1st batch out of the train dataloader and print its three tensors
first_batch = next(iter(train_dataloader))
label_tensor, text_token_indices_tensor, offsets_tensor = first_batch
print(f"label_tensor:-\n{label_tensor}\n\ntext_tensor:-\n{text_token_indices_tensor}\n\noffsets_tensor:-\n{offsets_tensor}")


label_tensor:-
tensor([3, 1, 0, 2, 3, 1, 2, 0, 1, 1, 2, 3, 3, 2, 1, 3, 1, 0, 3, 1, 2, 1, 2, 1,
        0, 0, 0, 1, 2, 0, 1, 0, 3, 2, 3, 0, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 3, 0,
        2, 1, 0, 3, 0, 0, 0, 3, 3, 2, 3, 0, 0, 1, 2, 1], device='cuda:0')

text_tensor:-
tensor([25366,  3212,   569,  ...,   160,   563,     1], device='cuda:0')

offsets_tensor:-
tensor([   0,   68,  104,  139,  175,  220,  268,  307,  356,  401,  453,  496,
         538,  581,  623,  668,  710,  747,  800,  821,  867,  933,  970, 1006,
        1054, 1108, 1154, 1198, 1243, 1288, 1323, 1380, 1455, 1496, 1539, 1569,
        1623, 1651, 1687, 1713, 1759, 1803, 1844, 1894, 1935, 1979, 2032, 2090,
        2139, 2178, 2227, 2267, 2310, 2346, 2383, 2420, 2487, 2528, 2555, 2591,
        2623, 2661, 2696, 2735], device='cuda:0')


## Neural Network (NN)

#### Defining the NN architecture

In [10]:
class TextClassificationModel(nn.Module):
    """Feed-forward text classifier: an ``nn.EmbeddingBag`` followed by one ``nn.Linear`` layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector (and input width of the linear layer).
        num_class: number of output scores produced per document.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights uniformly from [-0.5, 0.5] and zero the linear bias."""
        initrange = 0.5
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return one row of class scores per bag described by `offsets` within `text`."""
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

In [11]:
# Build the classifier with 64-dimensional embeddings, then move it to the selected device
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class)
model = model.to(device)

#### Example of one forward pass

In [14]:
# Run the two layers by hand on the batch fetched earlier:
# first the embedding bag, then the fully connected linear layer
embedded_tensor = model.embedding(text_token_indices_tensor, offsets_tensor)
nn_output_tensor = model.fc(embedded_tensor)

# The 1st document's tokens end where the 2nd document's offset begins
first_doc_end = offsets_tensor[1]
first_doc_tokens = text_token_indices_tensor[:first_doc_end]

print(f'Input text token indices for the 1st document in the batch:-\n{first_doc_tokens}\n')
print(f'Embeddings of 1st document in the batch:-\n{embedded_tensor[0]}\n')
print(f'NN output of the 1st document in the batch:-\n{nn_output_tensor[0]}\n')
print(f'actual label:- {label_tensor[0]}')

Input text token indices for the 1st document in the batch:-
tensor([25366,  3212,   569,  4123, 11311,   165,   668,     3,   634,    23,
           73,  2675,   578,  4685,    28,  1116,    17,   156, 25366,  3212,
           42,     4, 17945,    10,    32,   150,    24,     2,   997,     6,
          686,    11,     2,  1630,  1933,     3,    49,   104,    19,   582,
          939,   245,    66,  2607,     1,    41,  3394,    41,  1465,  1542,
            3,  1177,  1429,  1547,     3,  1539,     3,  1543,  3011,  1540,
           41,   164,  2798,   144,  2751,   163,    41,  1532],
       device='cuda:0')

Embeddings of 1st document in the batch:-
tensor([-5.5004e-02, -2.4664e-02,  4.7826e-02,  3.1811e-02, -7.1547e-02,
        -4.4972e-02, -9.3846e-03, -5.2800e-05, -5.9693e-04, -1.4254e-01,
        -2.6383e-03,  6.9667e-02,  8.5986e-02, -3.4836e-02, -1.0520e-03,
         8.8616e-03,  2.3099e-02,  5.6930e-02,  7.8598e-02, -6.9795e-02,
         4.1910e-02, -2.8994e-03,  4.5005e-02, 

#### A function to predict the class of new text

In [24]:
def predict(text, text_pipeline):
    """Predict the class name of a single raw text document.

    Args:
        text: the raw document text.
        text_pipeline: callable that maps raw text to a list of token indices.

    Returns:
        The class name looked up in the module-level `ag_news_label`.

    Relies on the module-level `model` and `device`; gradients are disabled.
    """
    with torch.no_grad():
        # Explicit int64 matches collate_batch and keeps this an integer index
        # tensor even when the pipeline yields no tokens (torch.tensor([]) is float32).
        text_token_indices_tensor = torch.tensor(text_pipeline(text), dtype=torch.int64).to(device)
        # A single document, so its only bag starts at position 0.
        offset_tensor = torch.tensor([0]).to(device)
        nn_output_tensor = model(text_token_indices_tensor, offset_tensor)
        # The model's class indices start at 0, the keys of ag_news_label at 1.
        return ag_news_label[nn_output_tensor.argmax(1).item()+1]

# Try the classifier on a short hand-written sentence
print('New input text doc:- "I like sports"')
predicted_class = predict("I like sports", text_pipeline)
print(f'Models prediction: - {predicted_class}')

New input text doc:- "I like sports"
Models prediction: - Business


#### A function to evaluate the model's performance

In [None]:
def evaluate(dataloader):
    """Compute the accuracy of the module-level `model` over `dataloader`.

    Args:
        dataloader: iterable yielding (labels, token indices, offsets) batches,
            in the order returned by `collate_batch`.

    Returns:
        The fraction of correctly classified samples as a float, or 0.0 when
        the dataloader yields no samples.

    Side effects:
        Switches `model` to evaluation mode via ``model.eval()``.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label_tensor, text_token_indices_tensor, offsets_tensor in dataloader:
            predicted_label_tensor = model(text_token_indices_tensor, offsets_tensor)
            # argmax over the class dimension gives the predicted class per document
            total_acc += (predicted_label_tensor.argmax(1) == label_tensor).sum().item()
            total_count += label_tensor.size(0)

    # Guard against an empty dataloader to avoid dividing by zero.
    return total_acc / total_count if total_count else 0.0

In [26]:
# two small integer tensors that agree at positions 0, 1 and 3;
# used below to illustrate element-wise comparison
t1 = torch.tensor([0, 1, 2, 4, 5])
t2 = torch.tensor([0, 1, 3, 4, 6])

In [30]:
# element-wise equality gives a boolean tensor
t1 == t2

tensor([ True,  True, False,  True, False])

In [31]:
# summing the boolean tensor counts the matching positions (result is still a tensor)
(t1 == t2).sum()

tensor(3)

In [33]:
# .item() turns the single-element tensor into a plain Python number, the same
# expression `evaluate` uses when accumulating total_acc
(t1 == t2).sum().item()

3