## Importing required libraries

In [1]:
# Silence every warning for the rest of the session.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    return None


# Install a blanket "ignore" filter, and also replace ``warnings.warn`` with the
# no-op above so that later ``warnings.warn(...)`` calls are discarded.
warnings.filterwarnings('ignore')
warnings.warn = warn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from tqdm import tqdm
from itertools import accumulate

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer 
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

from sklearn.manifold import TSNE
import plotly.graph_objs as go
from IPython.display import Markdown as md

#### Checking if CUDA is available

In [3]:
# Use the GPU when PyTorch reports CUDA support, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device} device")

Using cuda device


#### Defining helper functions

In [5]:
def plot(COST,ACC):
    """Draw loss and accuracy curves on one figure with two y-axes.

    Args:
        COST: sequence of loss values, drawn in red against the left y-axis
            (presumably one value per epoch, going by the x-axis label).
        ACC: sequence of accuracy values, drawn in blue against the right
            y-axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.plot(COST, color=color)
    ax1.set_xlabel('epoch', color=color)
    ax1.set_ylabel('total loss', color=color)
    ax1.tick_params(axis='y', color=color)

    # twinx() returns a second Axes sharing the same x-axis, so accuracy gets
    # its own y-scale on the right-hand side.
    ax2 = ax1.twinx()
    color = 'tab:blue'
    ax2.set_ylabel('accuracy', color=color)
    ax2.plot(ACC, color=color)
    ax2.tick_params(axis='y', color=color)
    fig.tight_layout()

    plt.show()

## The Dataset

#### Printing an example document from the AG NEWS dataset

In [6]:
# Human-readable names for the integer class labels (1-4) used to index this dict below.
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

# Iterator over the training split of the torchtext AG_NEWS dataset.
train_iter = iter(AG_NEWS(split='train'))

# Pull the first example; each item unpacks into a (label, text) pair.
y, text = next(train_iter)
print(f"Class: {ag_news_label[y]}\nText: {text}")

Class: Business
Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


#### Tokenizing and building the vocabulary

In [7]:
# Fresh iterator over the AG_NEWS training split; it is consumed further down
# when the vocabulary is built.
train_iter = iter(AG_NEWS(split='train'))

# The "basic_english" tokenizer from torchtext; it is called on raw text and its
# result is fed to the vocabulary.
tokenizer = get_tokenizer("basic_english")

# Generator that tokenizes a dataset one document at a time.
def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every item in ``data_iter``.

    Each item must unpack into a ``(label, text)`` pair; the label is ignored
    and ``text`` is passed to the module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for _, text in data_iter)

# Build the vocabulary from the tokenized training documents using torchtext's
# 'build_vocab_from_iterator', registering "<unk>" as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"]) # index returned when an out-of-vocabulary (OOV) token is queried

# Look up a few example tokens to show their indices.
print(vocab(["age","hello","<unk>","vijayabalan"]))

[2120, 12544, 0, 0]


#### Splitting dataset into train, validation and test data

In [8]:
# Load the train and test splits as iterators.
train_iter, test_iter = AG_NEWS()

# Convert the iterators into map-style datasets using torchtext's "to_map_style_dataset".
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# 95:5 split of train_dataset into training and validation subsets using
# PyTorch's "random_split". A seeded generator keeps the partition identical on
# every run; without it the validation set changes each time the notebook is
# re-executed, so results are not reproducible.
SPLIT_SEED = 42
num_train = int(len(train_dataset)*0.95)
num_valid = len(train_dataset) - num_train
split_train_dataset, split_valid_dataset = random_split(
    train_dataset,
    [num_train, num_valid],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Report the number of samples in each subset.
print(f"No.of samples in each:-\n\ntrain: {num_train}\nvalidation: {num_valid}\ntest: {len(test_dataset)}")

No.of samples in each:-

train: 114000
validation: 6000
test: 7600


#### Pre-processing pipeline

In [14]:
# Pipeline converting raw text into token indices via the "tokenizer" and "vocab" objects defined above.
def text_pipeline(x):
    """Tokenize ``x`` and return what ``vocab`` gives back for those tokens."""
    tokens = tokenizer(x)
    return vocab(tokens)

# Pipeline shifting label values so that they start at 0 instead of 1.
def label_pipeline(x):
    """Convert ``x`` to ``int`` and subtract one."""
    zero_based = int(x)
    zero_based -= 1
    return zero_based

# Collate function for the dataloaders: turns one batch of raw (label, text) samples into tensors.
def collate_batch(batch):
    """Encode a batch of ``(label, text)`` pairs as three tensors on ``device``.

    Every label goes through ``label_pipeline`` and every text through
    ``text_pipeline``.

    Returns:
        A tuple ``(labels, token_ids, offsets)``:
        labels    - int64 tensor with one entry per sample;
        token_ids - the int64 index tensors of all texts concatenated with
                    ``torch.cat``;
        offsets   - start position of each sample inside ``token_ids``.
    """
    # Encode label and text together, sample by sample.
    encoded = [
        (label_pipeline(label), torch.tensor(text_pipeline(text), dtype=torch.int64))
        for label, text in batch
    ]

    labels = torch.tensor([lab for lab, _ in encoded], dtype=torch.int64)
    token_ids = torch.cat([toks for _, toks in encoded])

    # A sample starts where all previous samples end: cumulative sum over
    # [0, len_0, ..., len_{n-2}] (the last length is not needed).
    preceding_lengths = [0] + [toks.size(0) for _, toks in encoded[:-1]]
    starts = torch.tensor(preceding_lengths).cumsum(dim=0)

    return labels.to(device), token_ids.to(device), starts.to(device)

#### Creating dataloaders for ML model

In [15]:
# Build the dataloaders with PyTorch's "DataLoader"; all three use the same settings.
BATCH_SIZE = 64

# Keyword arguments shared by every loader: batch size, shuffling and the custom collate function.
_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# train dataloader
train_dataloader = DataLoader(split_train_dataset, **_loader_kwargs)

# validation dataloader
valid_dataloader = DataLoader(split_valid_dataset, **_loader_kwargs)

# test dataloader
test_dataloader = DataLoader(test_dataset, **_loader_kwargs)

## Neural Network (NN)

#### Defining the NN architecture

In [None]:
#a feed forward 2 layer NN implemented using "nn.EmbeddingBag", "nn.Linear" functions from "pytorch" library
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weights.data.uniform_(-initrange, initrange)
        self.fc.weights.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
