In [499]:
!pip install torchtext==0.6.0
!pip install torchviz
!pip install graphviz



In [462]:
import matplotlib.pyplot as plt
import nltk
import numpy as np 
from numpy.random import  RandomState
import os
import pandas as pd
import pickle
import random
import re
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchviz import make_dot

torch.manual_seed(1)

<torch._C.Generator at 0x7fa82f8aa410>

In [463]:
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/Colab Notebooks/cnn/')
os.getcwd()
pd.set_option('display.max_colwidth', -1)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  """


# **CONVOLUTIONAL NEURAL NETWORK WITH EMBEDDINGS ON AG_NEWS** <br/>

**Note:** TREC question classification dataset contains 5500 labeled questions in training set and another 500 for test set. The dataset has 6 labels, 50 level-2 labels. Average length of each sentence is 10, vocabulary size of 8700.

1. Predicting call intent by analyzing phone calls: https://arxiv.org/pdf/1907.03715.pdf
2. BEN TREVETT THE OG BEAST: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/5%20-%20Multi-class%20Sentiment%20Analysis.ipynb
3. Another implementation: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
4. Prakash Pandey's implementation: https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/CNN.py


In [464]:
import torchtext #provides classes for useful NLP tasks in Pytorch
from torchtext import data
from torchtext import datasets
from torchtext import vocab
word = re.compile(r'\w+')

seed = 1234

def clean(txt):
    txt = re.sub(r'[^A-Za-z0-9]+', ' ', txt) # remove non-alphanumeric
    return txt.strip()

def tokenize(txt):
    txt = clean(txt)
    tokens = word.findall(txt)
    return tokens

#text define fixed length of 25
text = data.Field(tokenize=tokenize,batch_first = True)
label = data.LabelField()

# make splits for data
train, test = datasets.TREC.splits(text, label,fine_grained=False)
train, val = train.split(random_state = random.seed(seed))

print('training data size',len(train))
print('test data size',len(test))
print('validation data size',len(val))

training data size 3816
test data size 500
validation data size 1636


In [465]:
ex = train[15]
print(train.fields.items(),'\n',
      ex.text,'\n',
      ex.label)

dict_items([('text', <torchtext.data.field.Field object at 0x7fa8245ad208>), ('label', <torchtext.data.field.LabelField object at 0x7fa8245ad320>)]) 
 ['What', 'kind', 'of', 'greeting', 'is', 'appropriate', 'to', 'send', 'on', 'Yom', 'Kippur'] 
 DESC


In [0]:
max_vocab = 32000 #want to limit the size otherwise training will take too long
# load in pretrained word vectors locally and map vocabulary to unique integer values and assign pre-trained glove vectors
text.build_vocab(train, 
                 max_size = max_vocab,
                 vectors="glove.6B.300d", 
                 unk_init = torch.Tensor.normal_)
# specifying vocab for labels
label.build_vocab(train)    

In [467]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Cuda is available:',torch.cuda.is_available())

batch_size = 64

train_dl, valid_dl, test_dl = data.BucketIterator.splits(
    (train,val,test), 
    batch_size = batch_size, 
    device = device)

Cuda is available: True


# **DEFINE MODEL**

In [0]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filters, output_dim, 
                 dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.conv0 = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                               kernel_size=(filters[0],embedding_dim))
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                               kernel_size=(filters[1],embedding_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=n_filters, 
                               kernel_size=(filters[2],embedding_dim))
        #Our linear layer's input dim is concatenated of all our convolutions concatenated
        self.fc = nn.Linear(len(filters)*n_filters,output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self,text):
        
        embedded = self.embedding(text)     #->(batch size, sentence length, embedding dimension)
        embedded = embedded.unsqueeze(1)    #->(batch size, 1, sentence length, embedding dimension)

        activation0 = F.relu(self.conv0(embedded).squeeze(3))
        activation1 = F.relu(self.conv0(embedded).squeeze(3))
        activation2 = F.relu(self.conv0(embedded).squeeze(3))
        '''DONT UNDERSTAND BELOWS PORTION'''
        pool0 = F.max_pool1d(activation0, activation0.size()[2]).squeeze(2)
        pool1 = F.max_pool1d(activation1, activation1.size()[2]).squeeze(2)
        pool2 = F.max_pool1d(activation2, activation2.size()[2]).squeeze(2)
        concat = self.dropout(torch.cat((pool0,pool1,pool2),dim=1)) # concatenate all of them

        # '''Printing shapes'''
        # print('--(FORWARD) EMBEDDING BEFORE UNSQUEEZE(1)',self.embedding(text).shape)
        # print('--(FORWARD) EMBEDDING AFTER UNSQUEEZE(1)',embedded.shape)
        # print('--(FORWARD) CONVOLUTION',self.conv0(embedded).shape)
        # print('--(FORWARD) CONVOLUTION AFTER SQUEEZE(3)',self.conv0(embedded).squeeze(3).shape)
        # print('--(FORWARD) ACTIVATION',activation.shape)
        
        return self.fc(concat)

In [0]:
vocab_size = len(text.vocab)
embedding_dim = 300 # glove embeddings of size 300
n_filters = 100
filters = [2,3,4]
output_dim = len(label.vocab)
dropout = 0.5
pad_idx = text.vocab.stoi[text.pad_token]

model = CNN(vocab_size, embedding_dim, n_filters, filters, output_dim, dropout, pad_idx)

In [470]:
# Loading pre-trained embeddings into model :)
pretrained_embeddings = text.vocab.vectors
# Replace initial weights of embedding layer with pre-trained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-1.5256e+00, -7.5023e-01, -6.5398e-01,  ...,  7.9927e-01,
         -2.6190e-01,  1.5133e-01],
        [-2.2775e-01, -8.6759e-01,  3.3981e-01,  ..., -2.6398e-01,
         -1.2449e+00,  1.1790e+00],
        [ 4.6560e-02,  2.1318e-01, -7.4364e-03,  ...,  9.0611e-03,
         -2.0989e-01,  5.3913e-02],
        ...,
        [ 2.8848e-01,  6.1863e-01,  6.8970e-02,  ..., -3.5764e-01,
          1.9337e-01,  6.4179e-02],
        [ 6.7435e-01,  3.0655e-02, -3.4893e-01,  ...,  6.6987e-02,
          3.2624e-01,  1.2104e-01],
        [ 3.3716e-02, -1.3948e-04,  9.1735e-01,  ...,  9.6732e-01,
          2.5507e-01,  1.7801e+00]])

In [0]:
'0 out the intiial weights of unkown and padding tokens. irrelevant for determining sentiment'
unk_idx = text.vocab.stoi[text.unk_token]

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

In [472]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,504,106 trainable parameters


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def get_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [504]:
from graphviz import Digraph
# '''Try out torchviz'''
# mytensor = torch.randn(1,dtype=torch.LongTensor)
# # mytensor = torch.randn(64,100)
# model(mytensor)
torchviz_model = model
for i in train_dl:
    y = torchviz_model(i.text)
    make_dot(y).render('attached',format='png')
    sys.exit()

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# **TRAIN MODEL**

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    iter_size = 0 #variable to track percentage complete
    total_size = len(iterator)
    model.train()
    for batch in iterator: 
        optimizer.zero_grad()
        # print('***SHAPES AND SIZES***')
        # print('--(TRAIN) BATCH OF TEXT SIZE:',batch.text.size())
        print(batch.text.shape)
        predictions = model(batch.text).squeeze(1) #tensor of predictions had to squeeze along 1 dimension
        # print('--(TRAIN) PREDICTION SIZE:',predictions.shape)

        #Calculate loss of predictions vs actual label which the loss function expects [batch_size,n_classes] and [batch_size]
        loss = criterion(predictions, batch.label)
        acc = get_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() #extracting value from the full tensor to update loss and accuracy
        epoch_acc += acc.item()

        iter_size += batch_size
        if iter_size%(batch_size*200) == 0:
            print(f'training {iter_size/(total_size*batch_size):.2%} completed')

        # #Print statements to check
        # print('-shape of predictions and batch label',predictions.shape,batch.label.shape)
        # print('-types of predictions and batch label:',type(predictions),type(batch.label.shape))
        # print('\nEXITING...\n')
    return epoch_loss/total_size, epoch_acc/total_size #average of total epoch training loss and accuracy

In [0]:
# run code on validation dataset
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text)
            
            loss = criterion(predictions, batch.label)
            acc = get_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss/len(iterator), epoch_acc/len(iterator) #average of total epoch validation loss and accuracy

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
'''Running no epochs and only running on one batch in iterator'''
# Run code on next(iter(train_dl))
EPOCHS = 25
train_graph = [] #initializing list to store training accuracy metrics
val_graph = []

best_model = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_dl, optimizer, criterion)
    train_graph.append([train_loss,train_acc])
    valid_loss, valid_acc = evaluate(model, valid_dl, criterion)
    val_graph.append([valid_loss,valid_acc])
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_model:
        best_model = valid_loss
        torch.save(model.state_dict(),'cnn_trec_model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [461]:
model.load_state_dict(torch.load('cnn_trec_model.pt'))

test_loss, test_acc = evaluate(model, test_dl, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

RuntimeError: ignored