In [0]:
"""
Load the AG_NEWS dataset in bi-gram features format.
"""
!pip install torchtext==0.4
import torch
import torchtext
from torchtext.datasets import text_classification
import os

NGRAMS = 2

if not os.path.isdir('./.data'):
    os.mkdir('./.data')

train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)

BATCH_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

#This is Task 2, here we have implemented an LSTM Model
class TextClassifier(nn.Module):
  def __init__(self, vocab_size, dim_embedd, dim_hidden, num_class):
    super().__init__()
    self.dim_embedd = dim_embedd
    self.dim_hidden = dim_hidden
    self.vocab_size = vocab_size
    self.embedding = nn.EmbeddingBag(vocab_size, dim_embedd)
    self.lstm = nn.LSTM(dim_embedd, dim_hidden, num_layers=1)
    
    self.hidden2out = nn.Linear(dim_hidden, num_class)
    self.softmax = nn.LogSoftmax()

    self.embedding.weight.data.uniform_(-0.5, 0.5)


  def forward(self, text, lengths):
    offsets=(torch.tensor([0]+list(lengths)))[:-1].cumsum(dim=0)

    print("DEBUG POINT 1:Successfully initialized the hidden units from The Model")
    print("text =",text)
    print("Lengths =",lengths)
    print("offsets =",offsets)

    #self.hidden = self.init_hidden(batch.size(-1))

    embeds = self.embedding(text, offsets)
    print("DEBUG POINT 2:No probem with the embedding layer")
    print("embeds =",embeds)
    print("embeds shape =",embeds.shape)

    h0 = torch.zeros(1, BATCH_SIZE, self.dim_hidden).to(device)
    c0 = torch.zeros(1, BATCH_SIZE, self.dim_hidden).to(device)

    #packed_input = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
    #print("DEBUG POINT 3:There shouldn't be any problem now.")
    #print("packed input =",packed_input)

    output, _ = self.lstm(embeds.view(len(embeds),1,-1))
    print("DEBUG POINT 4:AFTER LSTM")
    print("Output of LSTM =",output)
    print("Output of LSTM =",output.shape)

    output = (output.view(BATCH_SIZE,self.dim_hidden))
    print("output before SOFTMAX =",output)

    #output = self.hidden2out(output[:, -1, :])
    output = self.softmax(output)
    return output

In [0]:
VOCAB_SIZE = len(train_dataset.get_vocab())
NUM_CLASS = len(train_dataset.get_labels())
LABELS = train_dataset.get_labels()

print(VOCAB_SIZE)
print(NUM_CLASS)
print(LABELS)

1308844
4
{0, 1, 2, 3}


In [0]:
'''
Paramters and model instance creation.
'''

# TODO: Instantiate the Vocabulary size and the number of classes
# from the training dataset that we loaded for you.

# Hint: Remember that these are PyTorch datasets. So, there should be 
# readily available functions that you can use to save time. ;)

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(train_dataset.get_labels())
HID_DIM = 4

# TODO: Instantiate the model with the parameters you defined above. 
# Remember to allocate it to your 'device' variable.


# TASK 1
#model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

#TASK 2
model = TextClassifier(VOCAB_SIZE, EMBED_DIM, HID_DIM, NUM_CLASS).to(device)

In [0]:
# TODO: Finish the function definition.

def generate_batch(batch):
    
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    #FOR TASK1
    #offsets = [0] + [len(entry) for entry in text]

    #FOR TASK 2
    offsets = [0] + [len(entry) for entry in text]
    lengths = torch.tensor([len(entry) for entry in text])

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    
    return text, lengths, label

In [0]:
from torch.utils.data import DataLoader

def train(train_data):

    # Initial values of training loss and training accuracy
    
    train_loss = 0
    train_acc = 0

    # TODO: Use the PyTorch DataLoader class to load the data 
    # into shuffled batches of appropriate sizes into the variable 'data'.
    # Remember, this is the place where you need to generate batches.
    
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    
    for i, (text, lengths, cls) in enumerate(data):
        # TODO: What do you need to do in order to perform backprop on the optimizer?
        #print(len(offsets))
        #print("From Train function, length of text =",len(text))
        #print("text =",len(text))
        #print("text =",text)
        #print("lengths =",len(offsets))
        #print("lengths =",offsets)
        #print("lengths =",sum(offsets))
        #print("cls =",len(cls))
        #print("cls =",cls)
        optimizer.zero_grad()
        
        text, lengths, cls = text.to(device), lengths.to(device), cls.to(device)
        
        # TODO: Store the output of the model in variable 'output'
        output = model(text, lengths)
        #print("From train function",output,type(output))
        # TODO: Define the 'loss' variable (with respect to 'output' and 'cls').
        # Also calculate the total loss in variable 'train_loss'
        loss = criterion(output, cls)
        train_loss += loss.item()
        
        # TODO: Perform the backward propagation on 'loss' and 
        # optimize it through the 'optimizer' step
        loss.backward()
        optimizer.step()
        
        # TODO: Calculate and store the total training accuracy
        # in the variable 'total_acc'.
        # Remember, you need to find the 
        train_acc = train_acc + (output.argmax(1) == cls).sum().item()
        

    # TODO: Adjust the learning rate here using the scheduler step
    scheduler.step()
    
    avg_loss = train_loss / len(train_data)
    avg_acc = train_acc / len(train_data)

    return avg_loss, avg_acc

In [0]:
def test(test_data):
    
    # Initial values of test loss and test accuracy
    
    test_loss = 0
    test_acc = 0
    
    # TODO: Use DataLoader class to load the data
    # into non-shuffled batches of appropriate sizes.
    # Remember, you need to generate batches here too.
    data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    
    for text, offsets, cls in data:
        
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        
        # Hint: There is a 'hidden hint' here. Let's see if you can find it :)
        with torch.no_grad():
 
            # TODO: Get the model output
            output = model(text, offsets)
            
            # TODO: Calculate and add the loss to find total 'loss'
            test_loss = criterion(output, cls)
            test_loss += test_loss.item()
            
            # TODO: Calculate the accuracy and store it in the 'acc' variable
            test_acc = test_acc + (output.argmax(1) == cls).sum().item()
            
    avg_loss = test_loss / len(test_data)
    avg_acc = test_acc / len(test_data)
            
    return avg_loss, avg_acc

In [0]:
import time
from torch.utils.data.dataset import random_split

# TODO: Set the number of epochs and the learning rate to 
# their initial values here

N_EPOCHS = 5
LEARNING_RATE = 4.0
TRAIN_RATIO = 0.9

# TODO: Set the intial validation loss to positive infinity
valid_loss_min = float('inf')

# TODO: Use the appropriate loss function
criterion = torch.nn.CrossEntropyLoss().to(device)

# TODO: Use the appropriate optimization algorithm with parameters (Suggested: SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# TODO: Use a scheduler function
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# TODO: Split the data into train and validation sets using random_split()
train_len = int(len(train_dataset) * TRAIN_RATIO)
train_data, test_data = random_split(train_dataset, [train_len, len(train_dataset) - train_len])

# TODO: Finish the rest of the code below

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(train_data)
    valid_loss, valid_acc = test(test_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

DEBUG POINT 1:Successfully initialized the hidden units from The Model
text = tensor([    38,   4263,    290,  ..., 247336,  86856, 116044])
Lengths = tensor([ 83, 103,  95,  85, 101,  65,  55,  89, 111, 133,  79,  91,  93, 101,
         65,  75])
offsets = tensor([   0,   83,  186,  281,  366,  467,  532,  587,  676,  787,  920,  999,
        1090, 1183, 1284, 1349])
DEBUG POINT 2:No probem with the embedding layer
embeds = tensor([[ 4.7097e-03,  1.3811e-03,  4.6784e-02, -2.7026e-02,  4.4589e-02,
          1.4234e-02, -2.6971e-02, -1.1700e-02,  1.7154e-02, -3.0876e-02,
         -2.0685e-02,  1.2867e-03,  1.7332e-02, -2.7873e-02,  1.0785e-02,
         -4.5182e-02,  8.7253e-03, -3.4995e-02,  1.5521e-02, -5.3446e-02,
         -1.6377e-02,  5.5877e-02, -2.5817e-03,  3.3367e-02,  8.1712e-03,
         -1.9763e-02,  2.0135e-02,  8.6350e-05, -1.1080e-02,  1.0803e-02,
         -9.5773e-03,  4.9702e-02],
        [ 2.1970e-02,  7.6243e-03,  2.2098e-02, -4.2491e-02,  6.0779e-02,
          4.2268e



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         -2.7481e-02, -1.8263e-02,  4.6369e-02,  1.5929e-02, -3.7190e-02,
         -9.6155e-03,  1.0426e-01,  1.1568e-01,  1.7868e-03, -2.4366e-02,
         -1.3392e-02, -9.1543e-02,  1.4341e-02, -8.6632e-02, -1.4088e-03,
         -2.9805e-02, -7.5250e-02, -9.6306e-02, -3.5837e-02, -1.0084e-01,
         -5.3773e-02,  7.6109e-02],
        [-1.4771e-01,  1.1052e-01, -4.0171e-02, -9.1116e-02,  1.4776e-01,
         -5.1379e-02, -6.8849e-02, -8.7652e-02,  5.6671e-02,  1.6553e-02,
          1.4179e-01,  3.1023e-02,  9.7950e-02, -4.9003e-02,  9.0382e-02,
         -1.2834e-01, -1.3288e-01, -8.2701e-02,  6.3252e-02,  8.8950e-03,
          1.0752e-01, -4.4427e-02,  7.7344e-02,  4.6989e-02,  8.6212e-02,
          5.1428e-02,  2.0388e-02,  6.5697e-02, -1.3553e-01, -2.6983e-02,
          2.3554e-01, -8.4934e-02],
        [-5.8653e-02,  1.1110e-01,  1.0419e-02, -2.7614e-03, -3.0261e-03,
         -1.1820e-01,  1.7651e-03, -3.7128e-02,  

## Let's  check the test loss and test accuracy

So you have trained your model and seen how well it performs on the training and validation datasets. Now, you need to check your model's performance against the test dataset. Using the test dataset as input, report the test loss and test accuracy scores of your model.

In [0]:
# TODO: Compete the code below to find 
# the results (loss and accuracy) on the test data

print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0003(test)	|	Acc: 91.7%(test)


In [0]:
# importing necessary libraries

import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

# labels for the AG_NEWS dataset

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

# TODO: Predict the topic of the above given random text (use bigrams)
# Use the proper paramters in the predict() function

print("This is a '%s' news" % ag_news_label[predict(ex_text_str, model, vocab, 2)])

# If you have done everything correctly in this task,
# then the output of this cell should be - "This is a 'Sports' news".

This is a 'Sports' news


# Congratulations! You just designed your first neural classifier!

And probably you have achieved a good accuracy score too. Great job!

## Question 2:
You just tested your model with a new sample text. Try to feed some more random examples of similar text (which you think are related to at least one of the four topics _"World", "Sports", "Business", "Sci/Tec"_ of our problem) to the model and see how your model reacts. Give at least 3 such examples (You are free to include more examples if you wish to).

## Answer 2:

## Question 3:
Okay, probably the model still works great with the examples you fed to it in the previous question. How about a twist in the plot? Let's feed it some more random text data from completely different genres/topics (not belonging to the 4 topics which we talk about the in the first question). How does your model react now? Give at least 3 such examples (You are free to include more examples if you wish to).

Of course the predictions will be limited to the four class labels that your model is trained on. Can you somehow justify the labels that your model predicted now for the given text inputs?

## Answer 3:

## Question 4:
Your model probably has achieved a good accuracy score. However, there may be lots of things that you could still try to do to improve your classifier model. Can you try to list down some improvements that you think would be able to improve the above model's performance?

_(Hint: Maybe think about alternate architectures, #layers, hyper-paramters, etc..., but try not to come up with too complex stuff! :) )_

## Answer 4:

# Task 2: Try the better option that you proposed

In Question 4, you have proposed some alternate solution that you think will be able to somehow improve your model. Following one of the options below, try to build and train a new model, and report the new loss and accuracy scores. Is it better than your initial classifier model for the same data?

For your reference, here are some neural models using which researchers have tried to classify text before:

* Recurrent Neural Networks (RNNs)
* Long-Short Term Memory (LSTM)
* Bi-directional LSTM (BiLSTM)
* Gated Recurrent Units (GRUs)

# Task 3: Let your creativity flow!

As discussed earlier, you are free to come up with anything in task 3. Think and try to model unique (not too complex!) neural architecture on your own. Remember that this model has to be novel as much as possible, so try not to copy other people's existing work. Using the same data, train the new model, and report the accuracy scores. How much better/worse is this model than the previous two models? Why do you think this is better/worse?

# Important Notes

## NOTE 1:
If you want, you can try out the models on other datasets too for comparisons. Although this is not mandatory, it would be really interesting to see how your model performs for data from different domains maybe. Note that you may need to tweak the code a little bit when you are considering other datasets and formats. 

## NOTE 2:
Any form of plagiarism is strictly prohibited. If it is found that you have copied sample code from the internet, the entire team will be penalized.

## NOTE 3:
Often Jupyter Notebooks tend to stop working or crash due to overload of memory (lot of variables, big neural models, memory-intensive training of models, etc...). Moreover, with more number of tasks, the number of variables that you will be using will surely incerase. Therefore, it is recommended that you use separate notebooks for each _Task_ in this project.

## NOTE 4:
You are expected to write well-documented code, that is, with proper comments wherever you think is needed. Make sure you write a comprehensive report for the entire project consisting of data analysis, your model architecture, methods used, discussing and comparing the models against the accuracy and loss metrics, and a final conslusion. If you want to prepare separate reports for each _Task_, you could do this in the Jupyter Notebook itself using $Mardown$ and $\LaTeX$ code if needed. If you want to submit a single report for the entire project, you could submit a PDF file in that case (Word or $\LaTeX$).

All the very best for project 2. Wishing you happy holidays and a very happy new year in advance! :)