In [1]:
import os
import json
from collections import Counter
from functools import partial

import numpy as np
import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.vocab import Vocab

## read data

In [2]:
df = pd.read_csv('data/Reviews.csv')
df = df[['Score', 'Text']]
df = df.dropna()
df = df.drop_duplicates('Text')

In [3]:
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [4]:
df['Text'].describe()

count                                                393579
unique                                               393579
top       I have loved this gum since the first time I t...
freq                                                      1
Name: Text, dtype: object

In [5]:
df['Score'].unique()

array([5, 1, 4, 2, 3])

## build dataset

In [6]:
labels, reviews = df['Score'].to_numpy(), df['Text'].to_numpy()

## make splits

In [7]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    data_len = arrays[0].shape[0]
    assert all(a.shape[0] == data_len for a in arrays[1:])
    sizes = [r / sum(ratios) for r in ratios]
    sizes = [int(s * data_len) for s in sizes[:-1]]
    sizes.append(data_len - sum(sizes))
    start = 0
    finish = 0
    splits = []
    for s in sizes:
        finish += s
        splits.append([a[start:finish] for a in arrays])
        start += s
    return splits

train, valid, test = split_data((labels, reviews), (0.7, 0.2, 0.1))

In [8]:
def data_merge(data):
    labels, text = data
    return [{'label': int(l), 'text': row} for l, row in zip(labels, text)]

train = data_merge(train)
valid = data_merge(valid)
test = data_merge(test)

In [9]:
del df

## tokenize text in datasets, add bigrams

In [10]:
def data_tokenize(data, tokenizer, lower, ngrams, cache=True):
    tokenizer = get_tokenizer(tokenizer)
    for entry in tqdm.tqdm(data, 'lines', len(data)):
        if lower:
            entry['text'] = entry['text'].lower()
        entry['text'] = tokenizer(entry['text'])
        entry['text'] = list(ngrams_iterator(entry['text'], ngrams))
    return data

tokenizer = 'spacy'
lower = True
ngrams = 2

train = data_tokenize(train, tokenizer, lower, ngrams)

lines: 100%|██████████| 275505/275505 [01:24<00:00, 3255.49it/s]


In [11]:
valid = data_tokenize(valid, tokenizer, lower, ngrams)
test = data_tokenize(test, tokenizer, lower, ngrams)

lines: 100%|██████████| 78715/78715 [00:24<00:00, 3256.86it/s]
lines: 100%|██████████| 39359/39359 [00:11<00:00, 3292.76it/s]


## save all data to json

In [12]:
def save_tokenized(data, filename):
    with open(filename, 'wt') as f:
        f.writelines(json.dumps(l) + '\n' for l in tqdm.tqdm(data))

save_tokenized(train, 'train_tokenized.json')

100%|██████████| 275505/275505 [00:06<00:00, 42017.81it/s]


In [13]:
save_tokenized(valid, 'valid_tokenized.json')
save_tokenized(test, 'test_tokenized.json')

100%|██████████| 78715/78715 [00:01<00:00, 43057.98it/s]
100%|██████████| 39359/39359 [00:00<00:00, 42034.53it/s]


## load train data from json if available

In [14]:
def load_tokenized(filename):
    with open(filename, 'rt') as f:
        return [json.loads(l) for l in tqdm.tqdm(f.readlines())]

try:
    if train:
        pass
except NameError:
    train = load_tokenized('train_tokenized.json')

## create vocabulary

In [15]:
def build_vocab(data,
                max_size=30000,  # x 100 emb_dim = about 3M model parameters
                ):
    counter = Counter()
    for entry in tqdm.tqdm(data):
        counter.update(entry['text'])
    return Vocab(counter, max_size)

vocab = build_vocab(train)

100%|██████████| 275505/275505 [00:09<00:00, 27597.26it/s]


## create torch datasets

In [16]:
class JsonDataset(Dataset):
    def __init__(self, filename, vocab, preload):
        self.filename = filename
        self.vocab = vocab
        self.label_dict = {i + 1: i for i in range(5)}
        self.data_len = None
        self.preload = preload
        if self.preload:
            self._preload_data()
            
    def _process_line(self, l):
        l = json.loads(l)
        label = l['label']
        label = self.label_dict[label]
        label = torch.tensor(label)
        text = l['text']
        text = torch.tensor(np.fromiter((self.vocab[token] for token in text),
                            dtype='int'))
        return label, text

    def _preload_data(self):
        self.data = []
        with open(self.filename, 'rt') as f:
            for l in tqdm.tqdm(f.readlines(), 'loading'):
                self.data.append(self._process_line(l))
        self.data_len = len(self.data)

    def __getitem__(self, index):
        if self.preload:
            return self.data[index]

        with open(self.filename, 'rt') as f:
            for i, l in enumerate(f):
                if i == index:
                    break
        return self._process_line(l)

    def __len__(self):
        if not self.data_len:
            with open(self.filename) as f:
                for i, l in enumerate(f):
                    pass
            self.data_len = i + 1
        return self.data_len

train_dataset = JsonDataset('train_tokenized.json', vocab, preload=True)

loading: 100%|██████████| 275505/275505 [00:36<00:00, 7496.79it/s]


In [17]:
valid_dataset = JsonDataset('valid_tokenized.json', vocab, True)
test_dataset = JsonDataset('test_tokenized.json', vocab, True)

loading: 100%|██████████| 78715/78715 [00:09<00:00, 8378.36it/s]
loading: 100%|██████████| 39359/39359 [00:05<00:00, 6723.67it/s]


In [18]:
assert len(train_dataset) == len(train)

In [19]:
train_dataset[0]

(tensor(4),
 tensor([    5,    23,   188,   509,    12,     3,     0,  1101,   168,    85,
           309,     6,    23,   173,    41,    57,     9,    45,    12,    43,
           272,     2,     3,    52,  1204,    68,    35,     7,  8561,    78,
             7,  2457,   833,     6,     8,  1236,   134,     2,    18, 26061,
            13,  6014,     6,   152,     0,    14,    52,   134,    78,    10,
           213,     2,    73,  3763, 18810,  7085,    58,     0,     0, 28801,
          1013, 11736,  4716,   641,  1355,  3619,  2450, 16407,   157, 19358,
          5927,  3090,  2636,    99,   381,     0,     0,  1932,   474,     0,
             0,  1422,     0,     0,  8115,   161,  3327,     0,  1193,   428,
             0,     0,     0,     0,  1237,     0,     0,   137,     0,   496,
             0,  5994, 12876]))

## create dataloaders for padded sequences

In [20]:
def padded_collate(batch, padding):
    labels, texts = zip(*batch)
    labels = torch.tensor(labels)
    texts = pad_sequence(texts, padding_value=padding)
    return labels, texts

padding = vocab['<pad>']
collate = partial(padded_collate, padding=padding)

In [21]:
batch_size = 256

train_iter = DataLoader(train_dataset,
                        batch_size,
                        shuffle=True,
                        collate_fn=collate,
                        num_workers=4)
valid_iter = DataLoader(valid_dataset,
                        batch_size,
                        shuffle=False,
                        collate_fn=collate,
                        num_workers=4)

In [22]:
test_iter  = DataLoader(test_dataset,
                        batch_size,
                        shuffle=False,
                        collate_fn=collate,
                        num_workers=0)

## add fasttext model

In [23]:
class TextModel(nn.Module):
    def __init__(self, vocab_len, embed_dim, n_classes, padding):
        super().__init__()
        self.embedding = nn.Embedding(vocab_len, embed_dim, padding)
        self.fc = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        x = self.embedding(x)
        # shape = [seq_dim, batch_dim, embed_dim]
        x = x.mean(0)
        return self.fc(x)

vocab_len = len(vocab)
embed_dim = 100
n_classes = 5
model = TextModel(vocab_len, embed_dim, n_classes, padding)

number of parameters and output shape - sanity check

In [24]:
print('total number of learnable parameters: {}'.format(
      sum(p.numel() for p in model.parameters() if p.requires_grad)))

total number of learnable parameters: 3000705


In [25]:
for l, t in train_iter:
    break
output = model(t)
print('model output shape with batch size {}: {}'.format(
    batch_size, output.shape))

model output shape with batch size 256: torch.Size([256, 5])


## train model

In [26]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def train(train_iter, model, optimizer, criterion, device):
    total_loss = 0
    total_acc = 0
    total = 0

    model.train()

    for labels, texts in tqdm.tqdm(train_iter, 'train_batch', leave=False, position=0):
        labels, texts = labels.to(device), texts.to(device)
        optimizer.zero_grad()

        preds = model(texts)
        loss = criterion(preds, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += (preds.argmax(1) == labels).sum().item()
        total += labels.shape[0]

    return total_loss / total, total_acc / total

In [28]:
def evaluate(valid_iter, model, criterion, device):
    total_loss = 0
    total_acc = 0
    total = 0

    model.eval()

    with torch.no_grad():
        for labels, texts in tqdm.tqdm(valid_iter, 'valid_batch', leave=False, position=0):
            labels, texts = labels.to(device), texts.to(device)

            preds = model(texts)
            loss = criterion(preds, labels)

            total_loss += loss.item()
            total_acc += (preds.argmax(1) == labels).sum().item()
            total += labels.shape[0]

    return total_loss / total, total_acc / total

In [29]:
epochs = 25
best_loss = None
best_path = None

for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(train_iter, model, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(valid_iter, model, criterion, device)

    print('Epoch: {:03d}'.format(epoch))
    print('Training loss    : {:.5f}'.format(train_loss))
    print('Training acc     : {:.2f}'.format(train_acc))
    print('Validation loss  : {:.5f}'.format(valid_loss))
    print('Validation acc   : {:.2f}'.format(valid_acc))

    if best_loss is None or valid_loss < best_loss:
        print('New best validation loss, saving model weights')
        best_loss = valid_loss
        old_path = best_path
        best_path = 'textmodel5-epoch{:02d}-acc{:.2e}.pt'.format(epoch, valid_acc)
        torch.save(model.state_dict(), best_path)
        if old_path:
            os.remove(old_path)

train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 001
Training loss    : 0.00476
Training acc     : 0.62
Validation loss  : 0.00387
Validation acc   : 0.65
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 002
Training loss    : 0.00362
Training acc     : 0.66
Validation loss  : 0.00332
Validation acc   : 0.69
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 003
Training loss    : 0.00322
Training acc     : 0.69
Validation loss  : 0.00307
Validation acc   : 0.71
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 004
Training loss    : 0.00302
Training acc     : 0.71
Validation loss  : 0.00294
Validation acc   : 0.72
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 005
Training loss    : 0.00289
Training acc     : 0.72
Validation loss  : 0.00287
Validation acc   : 0.73
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 006
Training loss    : 0.00280
Training acc     : 0.73
Validation loss  : 0.00282
Validation acc   : 0.73
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 007
Training loss    : 0.00274
Training acc     : 0.73
Validation loss  : 0.00279
Validation acc   : 0.74
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 008
Training loss    : 0.00268
Training acc     : 0.74
Validation loss  : 0.00277
Validation acc   : 0.74
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 009
Training loss    : 0.00263
Training acc     : 0.74
Validation loss  : 0.00275
Validation acc   : 0.74
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 010
Training loss    : 0.00260
Training acc     : 0.75
Validation loss  : 0.00274
Validation acc   : 0.74
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 011
Training loss    : 0.00256
Training acc     : 0.75
Validation loss  : 0.00273
Validation acc   : 0.74
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 012
Training loss    : 0.00253
Training acc     : 0.76
Validation loss  : 0.00273
Validation acc   : 0.75
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 013
Training loss    : 0.00249
Training acc     : 0.76
Validation loss  : 0.00273
Validation acc   : 0.75
New best validation loss, saving model weights


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 014
Training loss    : 0.00246
Training acc     : 0.76
Validation loss  : 0.00273
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 015
Training loss    : 0.00244
Training acc     : 0.76
Validation loss  : 0.00273
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 016
Training loss    : 0.00241
Training acc     : 0.77
Validation loss  : 0.00274
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 017
Training loss    : 0.00239
Training acc     : 0.77
Validation loss  : 0.00274
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 018
Training loss    : 0.00236
Training acc     : 0.77
Validation loss  : 0.00274
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 019
Training loss    : 0.00234
Training acc     : 0.77
Validation loss  : 0.00275
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 020
Training loss    : 0.00232
Training acc     : 0.78
Validation loss  : 0.00276
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 021
Training loss    : 0.00229
Training acc     : 0.78
Validation loss  : 0.00277
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 022
Training loss    : 0.00227
Training acc     : 0.78
Validation loss  : 0.00278
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 023
Training loss    : 0.00225
Training acc     : 0.78
Validation loss  : 0.00278
Validation acc   : 0.75


train_batch:   0%|          | 0/1077 [00:00<?, ?it/s]           

Epoch: 024
Training loss    : 0.00223
Training acc     : 0.79
Validation loss  : 0.00280
Validation acc   : 0.75


                                                                

Epoch: 025
Training loss    : 0.00221
Training acc     : 0.79
Validation loss  : 0.00281
Validation acc   : 0.75




## evaluate model

In [30]:
test_loss, test_acc = evaluate(test_iter, model, criterion, device)
print('Test loss: {:.5f}, test acc: {:.2f}'.format(test_loss, test_acc))

                                                               

Test loss: 0.00281, test acc: 0.74




## try to predict scores for arbitrary reviews

In [31]:
def predict_score(text, tokenizer, ngrams, vocab, device):
    tokens = text.lower()
    tokenizer = get_tokenizer(tokenizer)
    tokens = tokenizer(tokens)
    tokens = ngrams_iterator(tokens, ngrams)
    tokens = (vocab[t] for t in tokens)
    tokens = torch.tensor(np.fromiter(tokens, dtype='int'))
    
    model.eval()
    preds = model(tokens.to(device))
    probs = nn.functional.softmax(preds, dim=0)
    
    print('Input review: {}'.format(text))
    print('Predicted probas: {}'.format(probs.data))
    print('Predicted score : {} / 5'.format(preds.argmax() + 1))

In [32]:
score_fn = partial(predict_score, tokenizer='spacy', ngrams=2, vocab=vocab, device=device)

In [33]:
score_fn('Nice product!')              

Input review: Nice product!
Predicted probas: tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 4.0393e-24, 1.0000e+00],
       device='cuda:0')
Predicted score : 5 / 5


In [34]:
score_fn('I was hoping for more, but I guess it\'s ok')

Input review: I was hoping for more, but I guess it's ok
Predicted probas: tensor([2.2421e-44, 6.5471e-11, 1.0000e+00, 0.0000e+00, 0.0000e+00],
       device='cuda:0')
Predicted score : 3 / 5


In [35]:
score_fn('Terrible handling, not usable')

Input review: Terrible handling, not usable
Predicted probas: tensor([1.0000e+00, 9.8091e-45, 0.0000e+00, 0.0000e+00, 0.0000e+00],
       device='cuda:0')
Predicted score : 1 / 5


In [36]:
score_fn('Color almost makes up for a really mediocre material')

Input review: Color almost makes up for a really mediocre material
Predicted probas: tensor([4.3744e-12, 1.0000e+00, 1.3811e-13, 1.1236e-20, 6.6669e-33],
       device='cuda:0')
Predicted score : 2 / 5


In [37]:
score_fn('It seems perfect, but the price is too steep for my taste')

Input review: It seems perfect, but the price is too steep for my taste
Predicted probas: tensor([4.9174e-35, 2.5035e-14, 9.9802e-01, 1.9841e-03, 7.0943e-25],
       device='cuda:0')
Predicted score : 3 / 5
