In [8]:
import torch
import torch.nn as nn
from torchtext import datasets, data
import torchtext
import random
import torch.optim as optim
import time
from sklearn.metrics import accuracy_score
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
import pandas as pd
from torchtext.data import TabularDataset
import os

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
# Some hyperparams
bs = 256
min_freq = 5
vocab_max_size = 25000
embedding_size = 100
lstm_hidden=200

In [11]:
TEXT = data.Field(lower=True,
                  include_lengths=True,
                  tokenize=get_tokenizer('basic_english'))
LABEL = data.Field(sequential=False,is_target=True,unk_token=None)

In [12]:
torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])
!tar -C .data -xvf .data/yelp_review_polarity_csv.tar.gz

yelp_review_polarity_csv/
yelp_review_polarity_csv/readme.txt
yelp_review_polarity_csv/test.csv
yelp_review_polarity_csv/train.csv


In [13]:
glove_vectors = torchtext.vocab.GloVe(name='6B', dim=embedding_size,cache='.cache/embeddings')
glove_mean, glove_std = glove_vectors.vectors.mean(), glove_vectors.vectors.std()

In [14]:
datafields = [('label', LABEL), 
              ('text', TEXT)]

train, valid = TabularDataset.splits(
               path='.data/yelp_review_polarity_csv',
               train='train.csv', validation='test.csv',test=None,
               format='csv',
               skip_header=False,
               fields=datafields)

In [15]:
# Build vocab
TEXT.build_vocab(train, vectors = glove_vectors, max_size=vocab_max_size,min_freq=min_freq)
LABEL.build_vocab(train)

# init oov embeddings with same distribution as glove
# skip <unk> and padding
for i in range(2, len(TEXT.vocab.vectors)):
    if len(TEXT.vocab.vectors[i, :].nonzero()) == 0:
        nn.init.normal_(TEXT.vocab.vectors[i], mean=glove_mean, std=glove_std)

In [16]:
# create iterator
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid),
                                                            batch_size=bs,
                                                            device=device,
                                                            shuffle=True,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch = True)

In [17]:
# fit/test functions
def fit(iterator, model, optimizer, criterion):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    for batch in iterator:
        optimizer.zero_grad()
        y = batch.label
        text, text_lengths = batch.text
        y_hat = model(text, text_lengths)
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / len(iterator.dataset), acc

def test(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for batch in iterator:
        y = batch.label                       
        with torch.no_grad():
            text, text_lengths = batch.text
            y_hat = model(text, text_lengths)
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / len(iterator.dataset), acc

In [18]:
def train_n_epochs(model, n, optimizer, scheduler):

    criterion = nn.CrossEntropyLoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_acc = fit(train_iterator, model, optimizer, criterion)
        scheduler.step()
        valid_loss, valid_acc = test(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tTrain Loss: {train_loss:.4f}\t|\tAccuracy: {train_acc :.6f}')
        print(f'\tValidation Loss: {valid_loss:.4f}\t|\tAccuracy: {valid_acc:.6f}') 

In [21]:
class SimpleNet(nn.Module):
    def __init__(self,embeddings, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False, padding_idx=pad_idx)
        self.dropout = nn.Dropout(0.5) 

        self.fc = nn.Sequential(
            nn.BatchNorm1d(embedding_dim),
            nn.Dropout(0.3),
            nn.Linear(embedding_dim, output_dim))

    def forward(self, input, input_lengths):
        emb = self.dropout(self.embeddings(input))
        # using avg pooling will count padding tokens
        emb = nn.utils.rnn.pack_padded_sequence(emb, input_lengths)
        emb, _ = nn.utils.rnn.pad_packed_sequence(emb)
        avg = emb.sum(dim=0)/input_lengths.unsqueeze(1).to(device)
        return self.fc(avg)

simple_model = SimpleNet(TEXT.vocab.vectors,
                         embedding_size,
                         len(LABEL.vocab),
                         TEXT.vocab.stoi[TEXT.pad_token]).to(device)
wd=0.00000
lr=0.01
epochs=5
optimizer = optim.Adam(simple_model.parameters(), lr=lr, weight_decay=wd)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2,6], gamma=0.1)
train_n_epochs(simple_model,epochs,optimizer,scheduler)

Epoch: 0  | time in 0 minutes, 45 seconds
	Train Loss: 0.0009	|	Accuracy: 0.910343
	Validation Loss: 0.0007	|	Accuracy: 0.935816
Epoch: 1  | time in 0 minutes, 44 seconds
	Train Loss: 0.0007	|	Accuracy: 0.925675
	Validation Loss: 0.0007	|	Accuracy: 0.939368
Epoch: 2  | time in 0 minutes, 45 seconds
	Train Loss: 0.0007	|	Accuracy: 0.933975
	Validation Loss: 0.0007	|	Accuracy: 0.940026
Epoch: 3  | time in 0 minutes, 48 seconds
	Train Loss: 0.0007	|	Accuracy: 0.934766
	Validation Loss: 0.0007	|	Accuracy: 0.934368
Epoch: 4  | time in 0 minutes, 45 seconds
	Train Loss: 0.0007	|	Accuracy: 0.935737
	Validation Loss: 0.0007	|	Accuracy: 0.940026
