In [1]:
import torch
import torch.nn as nn
from torchtext import datasets, data
import torchtext
import random
import torch.optim as optim
import time
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
import pandas as pd
from torchtext.data import TabularDataset
import os
from tqdm.notebook import tqdm
#from lime.lime_text import LimeTextExplainer
#from lime import lime_text
import numpy as np
import spacy
import re
from functools import partial

In [2]:
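# One-time dataset setup: uncomment and run both lines on first use to download
# and unpack the Yelp Review Polarity CSVs into .data/ (later cells read them from there).
#torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])
#!tar -C .data -xvf .data/yelp_review_polarity_csv.tar.gz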

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device  # echo the selected device as the cell output

device(type='cuda')

In [4]:
# from fastai code
def top_k_accuracy(input, targs, k=1):
    """Top-k accuracy: share of samples whose target is among the k best-scored classes.

    Args:
        input: score tensor whose last dimension indexes the classes.
        targs: tensor of target class indices, one per row of `input`.
        k: how many of the highest-scoring classes count as a hit.

    Returns:
        A 0-dim float tensor holding the mean hit rate.
    """
    # Only the class indices of the k largest scores are needed, not the scores.
    _, best_classes = input.topk(k=k, dim=-1)
    # Repeat every target k times so it lines up with its k candidate classes.
    wanted = targs.unsqueeze(dim=-1).expand_as(best_classes)
    # A row is a hit when at least one of its candidates equals the target.
    row_hit = best_classes.eq(wanted).max(dim=-1).values
    return row_hit.float().mean()

In [5]:
class AverageMeter:
    """Keeps a running, weighted mean of the values passed to `update`."""

    def __init__(self):
        # sum: weighted total so far; count: total weight; avg: sum / count.
        self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Add `val` with weight `n` (e.g. a batch mean and its batch size) and refresh `avg`."""
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [6]:
# Tokenizer handed to the TEXT field below; 'basic_english' selects torchtext's
# built-in English tokenizer (see torchtext.data.utils.get_tokenizer).
tokenizer = get_tokenizer('basic_english')

In [7]:
# LABEL: a single non-sequential target value per example, with no <unk> entry.
LABEL = data.Field(sequential=False, is_target=True, unk_token=None)

# TEXT: tokenised review text. include_lengths=True makes each batch carry the
# sequence lengths alongside the token ids; the training loop unpacks that pair
# straight into the model call.
TEXT = data.Field(tokenize=tokenizer, lower=False, include_lengths=True)

# Column order of the CSV files: label first, then the review text.
datafields = [("label", LABEL), ("text", TEXT)]



In [8]:
# Load both CSV files in one call. The held-out file is passed through the
# `validation` slot and `test` is disabled, so exactly two datasets come back.
train, test = TabularDataset.splits(
    path='.data/yelp_review_polarity_csv',
    format='csv',
    fields=datafields,
    skip_header=False,
    train='train.csv',
    validation='test.csv',
    test=None,
)



In [9]:
# Hyperparameters
bs = 256                # mini-batch size used by the iterators
min_freq = 5            # minimum token frequency passed to TEXT.build_vocab
vocab_max_size = 25000  # max_size passed to TEXT.build_vocab
embedding_size = 300    # dimensionality of the pretrained GloVe vectors

# Reproducibility: seed the RNGs before the iterators, vocabulary vectors and
# model are created, so that Restart & Run All repeats the same run.
# NOTE(review): torchtext's batch shuffling presumably draws from Python's
# `random` state at iterator creation - confirm for the installed version.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [10]:
def _text_length(example):
    """Sort key for the bucket iterators: number of tokens in the example's text."""
    return len(example.text)


# No `device` is passed here, so batches are produced without an explicit target
# device; the model and the training loop move tensors to `device` themselves.
train_dataloader, valid_dataloader = data.BucketIterator.splits(
    (train, test),
    batch_size=bs,
    sort_key=_text_length,
    sort_within_batch=True,
    shuffle=True,
)



In [11]:
# Pretrained GloVe "6B" vectors of width `embedding_size`, cached on disk under
# .data/pretrained_vectors.
glove_vectors = torchtext.vocab.GloVe(
    name='6B',
    dim=embedding_size,
    cache='.data/pretrained_vectors',
)

In [12]:
# Global statistics over every entry of the GloVe matrix. They parameterise the
# normal distribution used to initialise vectors for words GloVe does not cover.
_glove_matrix = glove_vectors.vectors
glove_mean = _glove_matrix.mean()
glove_std = _glove_matrix.std()
glove_init_unk = partial(nn.init.normal_, mean=glove_mean, std=glove_std)

In [13]:
# Build the vocabularies from the training split only. The text vocabulary is
# capped by `vocab_max_size` / `min_freq` and gets GloVe vectors attached, with
# `glove_init_unk` supplying vectors for tokens GloVe does not contain.
TEXT.build_vocab(
    train,
    max_size=vocab_max_size,
    min_freq=min_freq,
    vectors=glove_vectors,
    unk_init=glove_init_unk,
)
LABEL.build_vocab(train)

In [None]:
# Persist the fitted TEXT field (with the vocabulary built above) so the same
# token-to-index mapping can be reused outside this notebook.
# NOTE(review): this stores the whole Python object, so loading it presumably
# needs a compatible torchtext installation - confirm before relying on it.
torch.save(TEXT, "text_field.ptz")

In [14]:
# fit/test function
def single_epoch(dataloader, model, loss_function, optimizer=None):
    """Run one pass over `dataloader`: train if `optimizer` is given, otherwise evaluate.

    Args:
        dataloader: iterable of batches exposing `.text` (unpacked into the
            model call) and `.label` (target class indices).
        model: module invoked as `model(*batch.text)`, returning class scores.
        loss_function: callable `(scores, targets) -> scalar loss tensor`.
        optimizer: optimizer used to update `model` after every batch. When
            None the model is switched to eval mode and autograd is disabled.

    Returns:
        Tuple `(mean_loss, mean_top1_accuracy)`, both weighted by batch size.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()

    losses = AverageMeter()
    accuracy = AverageMeter()
    # Track gradients only while training; evaluation no longer depends on the
    # caller remembering to wrap the call in torch.no_grad().
    with torch.set_grad_enabled(training):
        for batch in tqdm(dataloader, leave=False):
            y_hat = model(*batch.text)
            # Put the targets wherever the model produced its predictions.
            y = batch.label.to(y_hat.device)
            loss = loss_function(y_hat, y)

            if training:
                # Clear gradients *before* backward so anything left on the
                # parameters (e.g. from an interrupted run) cannot leak into
                # this update.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = len(y)
            losses.update(loss.item(), batch_size)
            accuracy.update(top_k_accuracy(y_hat, y, 1).item(), batch_size)

    return losses.avg, accuracy.avg


def fit(epochs, model, train_dl, valid_dl, loss_func, optimizer, scheduler=None):
    """Train `model` for `epochs` epochs, validating and printing metrics after each one.

    Args:
        epochs: number of passes over `train_dl`.
        model: model handed to `single_epoch`.
        train_dl: training batches.
        valid_dl: validation batches, evaluated under `torch.no_grad()`.
        loss_func: loss callable handed to `single_epoch`.
        optimizer: optimizer used for the training pass.
        scheduler: optional learning-rate scheduler, stepped once per epoch
            after the training pass. May be None, in which case the learning
            rate is read from the optimizer and never changed here.
    """
    for epoch in range(epochs):
        # Learning rate in effect during this epoch (first parameter group),
        # read before the scheduler advances.
        if scheduler is not None:
            lr = scheduler.get_last_lr()[0]
        else:
            lr = optimizer.param_groups[0]['lr']

        start_time = time.time()
        train_loss, train_acc = single_epoch(train_dl, model, loss_func, optimizer)
        if scheduler is not None:
            scheduler.step()
        with torch.no_grad():
            valid_loss, valid_acc = single_epoch(valid_dl, model, loss_func)
        # Elapsed time covers both the training and the validation pass.
        secs = int(time.time() - start_time)

        print(f'Epoch {epoch} {secs}[sec] lr={lr:.5f}',end=' ')
        print(f'Train: loss {train_loss:.4f} accuracy {train_acc:.4f}',end='\t')
        print(f'Validation: loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

In [15]:
def bn_dropout_fc(in_features, out_features, dropout_p=0.5):
    """Build one classifier stage as a list: BatchNorm1d -> Dropout -> Linear.

    The list is meant to be unpacked into an `nn.Sequential`.
    """
    norm = nn.BatchNorm1d(in_features)
    drop = nn.Dropout(dropout_p)
    linear = nn.Linear(in_features, out_features)
    return [norm, drop, linear]

class NNet(nn.Module):
    """Text classifier: pretrained embeddings -> 2-layer bidirectional LSTM -> MLP head.

    Args:
        embeddings: pretrained embedding matrix; it is fine-tuned (freeze=False).
        embedding_dim: width of one embedding vector (the LSTM input size).
        output_dim: number of output classes.
        pad_idx: index of the padding token in the embedding matrix.
        lstm_hidden_size: hidden size of each LSTM direction.
        fc_hidden: two-element sequence with the widths of the hidden FC layers.
    """

    def __init__(self,embeddings, embedding_dim, output_dim, pad_idx, lstm_hidden_size, fc_hidden):
        super().__init__()
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False, padding_idx=pad_idx)
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=lstm_hidden_size,
                            num_layers=2, batch_first=False,
                            bidirectional=True, dropout=0.0)

        # The head consumes the concatenated forward/backward final states,
        # hence the input width of 2 * lstm_hidden_size.
        self.fc = nn.Sequential(
            *bn_dropout_fc(lstm_hidden_size * 2, fc_hidden[0]),
            nn.ReLU(),
            *bn_dropout_fc(fc_hidden[0], fc_hidden[1]),
            nn.ReLU(),
            *bn_dropout_fc(fc_hidden[1], output_dim)
        )

    def forward(self, input, input_lengths):
        """Score a padded batch of token ids.

        Args:
            input: token-id tensor laid out (seq_len, batch), since the LSTM is
                built with batch_first=False.
            input_lengths: true length of every sequence in the batch (tensor
                or sequence of ints); the batch need not be sorted by length.

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised class scores.
        """
        # Follow the module's own placement instead of a notebook-level global,
        # so the model works wherever it has been moved with .to(...).
        module_device = self.embeddings.weight.device
        embedded = self.dropout(self.embeddings(input.to(module_device)))

        # pack_padded_sequence expects the lengths on the CPU.
        lengths = torch.as_tensor(input_lengths).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)

        # Only the final hidden state is used, so the per-step outputs are not
        # unpacked (the original padded them back out and discarded the result).
        _, (hidden, _) = self.lstm(packed)

        # The last two entries of `hidden` are the top layer's forward and
        # backward final states; join them into one vector per sample.
        hidden = self.dropout(torch.cat((hidden[-2], hidden[-1]), dim=1))
        return self.fc(hidden)

In [22]:
# Model dimensions
lstm_hidden = 200      # hidden size of each LSTM direction
fc_hidden = [100, 50]  # widths of the two hidden fully connected layers

# Build the classifier on top of the vocabulary's pretrained vectors, with one
# output per label, then move it to the selected device.
model = NNet(
    embeddings=TEXT.vocab.vectors,
    embedding_dim=embedding_size,
    output_dim=len(LABEL.vocab),
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token],
    lstm_hidden_size=lstm_hidden,
    fc_hidden=fc_hidden,
).to(device)

In [23]:
# Optimisation setup
epochs = 10
wd = 5e-5  # weight decay passed to Adam
loss_function = nn.CrossEntropyLoss().to(device)
opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=wd)

# The learning rate is cut 10x at each milestone. With 10 epochs the second
# milestone is only reached after the last epoch, so it never affects training
# (the log below shows 1e-3 for epochs 0-4 and 1e-4 for epochs 5-9).
scheduler = optim.lr_scheduler.MultiStepLR(opt, milestones=[5, 10], gamma=0.1)

fit(
    epochs,
    model,
    train_dataloader,
    valid_dataloader,
    loss_function,
    opt,
    scheduler=scheduler,
)

HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 0 203[sec] lr=0.00100 Train: loss 0.2624 accuracy 0.8888	Validation: loss 0.1710 accuracy 0.9288


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 1 203[sec] lr=0.00100 Train: loss 0.1539 accuracy 0.9444	Validation: loss 0.1258 accuracy 0.9495


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 2 203[sec] lr=0.00100 Train: loss 0.1347 accuracy 0.9516	Validation: loss 0.1464 accuracy 0.9450


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 3 202[sec] lr=0.00100 Train: loss 0.1253 accuracy 0.9551	Validation: loss 0.1165 accuracy 0.9564


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 4 203[sec] lr=0.00100 Train: loss 0.1194 accuracy 0.9573	Validation: loss 0.1220 accuracy 0.9556


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 5 203[sec] lr=0.00010 Train: loss 0.0955 accuracy 0.9668	Validation: loss 0.1045 accuracy 0.9623


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 6 203[sec] lr=0.00010 Train: loss 0.0890 accuracy 0.9693	Validation: loss 0.1018 accuracy 0.9625


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 7 204[sec] lr=0.00010 Train: loss 0.0861 accuracy 0.9702	Validation: loss 0.1042 accuracy 0.9624


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 8 204[sec] lr=0.00010 Train: loss 0.0824 accuracy 0.9716	Validation: loss 0.1025 accuracy 0.9645


HBox(children=(FloatProgress(value=0.0, max=2188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

Epoch 9 203[sec] lr=0.00010 Train: loss 0.0803 accuracy 0.9725	Validation: loss 0.1005 accuracy 0.9641


In [24]:
# Save only the learned parameters (the state_dict), not the model object.
# To reload, rebuild NNet with the same constructor arguments and then call
# load_state_dict on it.
torch.save(model.state_dict(), 'rnn_model.pt')