In [1]:
import json
from collections import Counter
from functools import partial

import numpy as np
import tqdm
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.vocab import Vocab


## read data

In [2]:
# Load the reviews, keep only the rating and the review body, then drop rows
# with missing values and rows whose review text repeats an earlier one.
df = (
    pd.read_csv('data/Reviews.csv')[['Score', 'Text']]
    .dropna()
    .drop_duplicates('Text')
)

In [3]:
# Preview the first rows of the two retained columns.
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [4]:
# Summary of the review text column; `count` should equal `unique` because
# duplicate texts were dropped when the frame was loaded.
df['Text'].describe()

count                                                393579
unique                                               393579
top       This is the best cat litter box next to the Li...
freq                                                      1
Name: Text, dtype: object

In [5]:
# Distinct rating values present in the data (these become the class labels).
df['Score'].unique()

array([5, 1, 4, 2, 3])

## build dataset

In [6]:
# Pull the two columns out as parallel NumPy arrays (same row order).
labels = df['Score'].to_numpy()
reviews = df['Text'].to_numpy()

Make train / validation / test splits (70% / 20% / 10%, taken in row order without shuffling)

In [7]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    """Cut parallel arrays into consecutive, non-overlapping chunks.

    The chunks are taken positionally (first rows go to the first chunk, and
    so on); nothing is shuffled.

    Args:
        arrays: sequence of arrays that all have the same length along axis 0.
        ratios: relative chunk sizes; they are normalised by their sum, so
            they do not have to add up to 1.

    Returns:
        A list with one item per ratio. Each item is a list holding the
        matching slice of every array in ``arrays``, in the same order.

    Raises:
        AssertionError: if the arrays differ in length along axis 0.
    """
    n_rows = arrays[0].shape[0]
    assert all(arr.shape[0] == n_rows for arr in arrays[1:])

    total = sum(ratios)
    fractions = [r / total for r in ratios]
    # Every chunk except the last is rounded down; the last one takes
    # whatever is left so the chunk sizes always add up to n_rows.
    chunk_sizes = [int(frac * n_rows) for frac in fractions[:-1]]
    chunk_sizes.append(n_rows - sum(chunk_sizes))

    # Turn sizes into cumulative boundaries: [0, s0, s0 + s1, ...].
    bounds = [0]
    for size in chunk_sizes:
        bounds.append(bounds[-1] + size)

    return [[arr[lo:hi] for arr in arrays]
            for lo, hi in zip(bounds, bounds[1:])]

# 70% / 20% / 10% train / validation / test split.
# NOTE(review): split_data slices the arrays positionally (no shuffling), so
# the three subsets follow the row order of the CSV -- confirm that this order
# is not correlated with the score before relying on these splits.
train, valid, test = split_data((labels, reviews), (0.7, 0.2, 0.1))

In [8]:
def data_merge(data):
    """Zip a ``(labels, texts)`` pair into a list of per-example dicts.

    Args:
        data: two parallel sequences, labels first and texts second.

    Returns:
        A list of ``{'label': int, 'text': <text>}`` dicts, one per example,
        with every label converted to a built-in ``int``.
    """
    scores, review_texts = data
    merged = []
    for score, review in zip(scores, review_texts):
        merged.append({'label': int(score), 'text': review})
    return merged

# Turn each (labels, texts) split into a list of {'label', 'text'} dicts.
train, valid, test = map(data_merge, (train, valid, test))

In [9]:
# The DataFrame is not referenced again in the cells below; drop the name.
del df

Tokenize the text of every split (lowercased, spaCy tokenizer) and add bigrams to each token list

In [10]:
def data_tokenize(data, tokenizer, lower, ngrams, cache=True):
    """Tokenize the ``'text'`` of every entry in place and expand it to n-grams.

    Each entry's ``'text'`` string is replaced by the list produced by
    ``ngrams_iterator`` over its tokens. Because the entries are modified in
    place, calling this twice on the same data will fail once ``'text'`` is
    no longer a string.

    Args:
        data: list of dicts with a string under ``'text'``.
        tokenizer: tokenizer name passed to torchtext's ``get_tokenizer``.
        lower: when truthy, lowercase the text before tokenizing.
        ngrams: value forwarded to ``ngrams_iterator``.
        cache: accepted but not used by this function.

    Returns:
        The same ``data`` list that was passed in.
    """
    tokenize = get_tokenizer(tokenizer)
    progress = tqdm.tqdm(data, 'lines', len(data))
    for entry in progress:
        raw_text = entry['text'].lower() if lower else entry['text']
        entry['text'] = list(ngrams_iterator(tokenize(raw_text), ngrams))
    return data

# Tokenization settings reused for all three splits:
# the tokenizer name given to get_tokenizer, whether to lowercase the text
# first, and the value forwarded to ngrams_iterator.
tokenizer = 'spacy'
lower = True
ngrams = 2

# Tokenize the training split (entries are rewritten in place).
train = data_tokenize(train, tokenizer, lower, ngrams)

lines: 100%|██████████| 275505/275505 [01:24<00:00, 3243.43it/s]


In [11]:
# Apply the same tokenization settings to the validation and test splits.
valid = data_tokenize(valid, tokenizer, lower, ngrams)
test = data_tokenize(test, tokenizer, lower, ngrams)

lines: 100%|██████████| 78715/78715 [00:24<00:00, 3154.55it/s]
lines: 100%|██████████| 39359/39359 [00:12<00:00, 3069.20it/s]


save data

In [13]:
def save_tokenized(data, filename):
    """Write ``data`` to ``filename`` with one JSON document per line.

    Args:
        data: iterable of JSON-serialisable entries.
        filename: path of the text file to create or overwrite.
    """
    with open(filename, 'wt') as out_file:
        for entry in tqdm.tqdm(data):
            out_file.write(json.dumps(entry) + '\n')

# Persist the tokenized training split, one JSON entry per line.
save_tokenized(train, 'train_tokenized.json')

100%|██████████| 275505/275505 [00:06<00:00, 45648.26it/s]


In [14]:
# Persist the tokenized validation and test splits in the same format.
save_tokenized(valid, 'valid_tokenized.json')
save_tokenized(test, 'test_tokenized.json')

100%|██████████| 78715/78715 [00:01<00:00, 45678.30it/s]
100%|██████████| 39359/39359 [00:00<00:00, 42899.17it/s]


Load the tokenized training split from disk unless `train` is already defined in the kernel

In [2]:
def load_tokenized(filename):
    """Read a file holding one JSON document per line.

    The whole file is read into memory first, then each line is parsed.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one parsed entry per line, in file order.
    """
    with open(filename, 'rt') as in_file:
        raw_lines = in_file.readlines()
        entries = []
        for raw_line in tqdm.tqdm(raw_lines):
            entries.append(json.loads(raw_line))
        return entries

# Reuse `train` when it already exists in the kernel; otherwise (for example
# after a restart) reload the tokenized training split from disk.
try:
    train
except NameError:
    train = load_tokenized('train_tokenized.json')

100%|██████████| 275505/275505 [00:08<00:00, 33676.79it/s]


create vocabulary

In [3]:
def build_vocab(data,
                max_size=30000,  # x 100 emb_dim = about 3M model parameters
                ):
    """Count the tokens of every entry and build a torchtext ``Vocab``.

    Args:
        data: iterable of dicts whose ``'text'`` value is a list of tokens.
        max_size: passed to ``Vocab`` as its second positional argument.

    Returns:
        The ``Vocab`` built from the token counts.
    """
    token_counts = Counter()
    for tokens in (entry['text'] for entry in tqdm.tqdm(data)):
        token_counts.update(tokens)
    return Vocab(token_counts, max_size)

# The vocabulary is built from the training split only.
vocab = build_vocab(train)

100%|██████████| 275505/275505 [00:10<00:00, 25736.63it/s]


create torch datasets

In [4]:
class JsonDataset(Dataset):
    """Map-style dataset over a file holding one JSON document per line.

    Each line must decode to a dict with a ``'label'`` in ``1..5`` and a
    ``'text'`` list of tokens. Items are returned as
    ``(class_index, token_ids)`` where ``class_index`` is ``label - 1`` and
    ``token_ids`` is a 1-D ``torch.long`` tensor of ``vocab[token]`` values.

    The file is not kept in memory. The first call to ``len()`` or
    ``__getitem__`` scans it once to record the byte offset of every line;
    later lookups seek straight to the requested line instead of re-reading
    the file from the top.

    Args:
        filename: path of the JSON-lines file.
        vocab: object supporting ``vocab[token] -> int``.
    """

    def __init__(self, filename, vocab):
        self.filename = filename
        self.vocab = vocab
        self.label_dict = {i + 1: i for i in range(5)}
        self.data_len = None
        # Byte offset of the start of each line; filled in lazily.
        self._offsets = None

    def _line_offsets(self):
        """Return the per-line byte offsets, scanning the file on first use."""
        if self._offsets is None:
            offsets = []
            position = 0
            # Binary mode so that offsets are exact byte positions for seek().
            with open(self.filename, 'rb') as f:
                for raw_line in f:
                    offsets.append(position)
                    position += len(raw_line)
            self._offsets = offsets
            self.data_len = len(offsets)
        return self._offsets

    def __getitem__(self, index):
        """Return ``(class_index, token_ids)`` for the line at ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
            KeyError: if the stored label is not one of ``1..5``.
        """
        offsets = self._line_offsets()
        if not -len(offsets) <= index < len(offsets):
            raise IndexError(
                'index {} out of range for dataset of size {}'.format(
                    index, len(offsets)))
        with open(self.filename, 'rb') as f:
            f.seek(offsets[index])
            line = json.loads(f.readline())
        label = self.label_dict[line['label']]
        # Use the vocabulary given to this instance (not a module-level one)
        # and pin the dtype so the ids are int64 on every platform.
        text = torch.tensor([self.vocab[token] for token in line['text']],
                            dtype=torch.long)
        return label, text

    def __len__(self):
        """Number of lines in the file (0 for an empty file)."""
        return len(self._line_offsets())

# File-backed dataset over the tokenized training split.
train_dataset = JsonDataset('train_tokenized.json', vocab)

In [5]:
# Validation and test datasets encode tokens with the same vocabulary object.
valid_dataset = JsonDataset('valid_tokenized.json', vocab)
test_dataset = JsonDataset('test_tokenized.json', vocab)

In [6]:
# Sanity check: the file-backed dataset has as many rows as the in-memory list.
assert len(train_dataset) == len(train)

In [7]:
# Inspect one encoded example: (class index, tensor of vocabulary ids).
train_dataset[0]

(4,
 tensor([    5,    23,   188,   509,    12,     3,     0,  1101,   168,    85,
           309,     6,    23,   173,    41,    57,     9,    45,    12,    43,
           272,     2,     3,    52,  1204,    68,    35,     7,  8561,    78,
             7,  2457,   833,     6,     8,  1236,   134,     2,    18, 26061,
            13,  6014,     6,   152,     0,    14,    52,   134,    78,    10,
           213,     2,    73,  3763, 18810,  7085,    58,     0,     0, 28801,
          1013, 11736,  4716,   641,  1355,  3619,  2450, 16407,   157, 19358,
          5927,  3090,  2636,    99,   381,     0,     0,  1932,   474,     0,
             0,  1422,     0,     0,  8115,   161,  3327,     0,  1193,   428,
             0,     0,     0,     0,  1237,     0,     0,   137,     0,   496,
             0,  5994, 12876]))

In [9]:
def padded_collate(batch, padding):
    """Collate ``(label, token_ids)`` pairs into two batch tensors.

    Args:
        batch: non-empty sequence of ``(label, 1-D tensor)`` pairs.
        padding: value used to fill sequences shorter than the longest one.

    Returns:
        ``(labels, texts)`` where ``labels`` is a 1-D tensor with one entry
        per example and ``texts`` is the ``pad_sequence`` result, laid out as
        ``[longest_sequence, batch]`` (``batch_first`` is left at its
        default).
    """
    targets, sequences = zip(*batch)
    padded = pad_sequence(sequences, padding_value=padding)
    return torch.tensor(targets), padded

# Id that the vocabulary assigns to the '<pad>' token; it is bound into the
# collate function so it can be handed to a DataLoader as a one-argument callable.
padding = vocab['<pad>']
collate = partial(padded_collate, padding=padding)

In [16]:
batch_size = 32

# One loader per split with identical settings; only the training loader
# shuffles its examples.
train_iter, valid_iter, test_iter = (
    DataLoader(dataset,
               batch_size,
               shuffle=shuffle,
               collate_fn=collate,
               num_workers=4)
    for dataset, shuffle in ((train_dataset, True),
                             (valid_dataset, False),
                             (test_dataset, False))
)

In [22]:
class TextModel(nn.Module):
    """Bag-of-embeddings classifier: embed, average over the sequence, project.

    Args:
        vocab_len: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        n_classes: number of output logits.
        padding: id of the padding token (used as ``padding_idx`` of the
            embedding), or ``None`` if the input contains no padding.
    """

    def __init__(self, vocab_len, embed_dim, n_classes, padding):
        super().__init__()
        self.padding_idx = padding
        self.embedding = nn.Embedding(vocab_len, embed_dim, padding)
        self.fc = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Return logits of shape ``[batch, n_classes]``.

        Args:
            x: integer tensor of token ids laid out as ``[seq, batch]``.
        """
        embedded = self.embedding(x)
        # shape = [seq_dim, batch_dim, embed_dim]
        if self.padding_idx is None:
            pooled = embedded.mean(0)
        else:
            # Average over real tokens only. A plain mean over the sequence
            # axis also divides by the padded positions, which makes an
            # example's output depend on the longest sequence in its batch.
            mask = (x != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 for a column made of padding only
            token_counts = mask.sum(0).clamp(min=1)
            pooled = (embedded * mask).sum(0) / token_counts
        return self.fc(pooled)

# Model hyper-parameters: one embedding row per vocabulary entry and one
# output logit per class index produced by the datasets (0..4).
vocab_len = len(vocab)
embed_dim = 100
n_classes = 5
model = TextModel(vocab_len, embed_dim, n_classes, padding)

In [24]:
# Total number of trainable parameters in the model.
sum(p.numel() for p in model.parameters() if p.requires_grad)

3000705