In [4]:
import json
import gc
from collections import Counter

import numpy as np
import tqdm
import pandas as pd
import torch
from torch.utils.data import IterableDataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import TextClassificationDataset
from torchtext.vocab import Vocab

## read data

In [2]:
df = pd.read_csv('data/Reviews.csv')
df = df[['Score', 'Text']]
df = df.dropna()
df = df.drop_duplicates('Text')

In [3]:
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [4]:
df['Text'].describe()

count                                                393579
unique                                               393579
top       Recently purchased several cans of this for in...
freq                                                      1
Name: Text, dtype: object

In [5]:
df['Score'].unique()

array([5, 1, 4, 2, 3])

## build dataset

In [6]:
labels, reviews = df['Score'].to_numpy(), df['Text'].to_numpy()

make splits

In [7]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    data_len = arrays[0].shape[0]
    assert all(a.shape[0] == data_len for a in arrays[1:])
    sizes = [r / sum(ratios) for r in ratios]
    sizes = [int(s * data_len) for s in sizes[:-1]]
    sizes.append(data_len - sum(sizes))
    start = 0
    finish = 0
    splits = []
    for s in sizes:
        finish += s
        splits.append([a[start:finish] for a in arrays])
        start += s
    return splits

train, valid, test = split_data((labels, reviews), (0.7, 0.2, 0.1))

In [12]:
def data_merge(data):
    labels, text = data
    return [{'label': int(l), 'text': row} for l, row in zip(labels, text)]

train = data_merge(train)
valid = data_merge(valid)
test = data_merge(test)

ValueError: too many values to unpack (expected 2)

In [20]:
del df

tokenize text in datasets, add bigrams

In [None]:
! python -m spacy download en

In [13]:
def data_tokenize(data, tokenizer, lower, ngrams, cache=True):
    tokenizer = get_tokenizer(tokenizer)
    for entry in tqdm.tqdm(data, 'lines', len(data)):
        if lower:
            entry['text'] = entry['text'].lower()
        entry['text'] = tokenizer(entry['text'])
        entry['text'] = list(ngrams_iterator(entry['text'], ngrams))
    return data

tokenizer = 'spacy'
lower = True
ngrams = 2

train = data_tokenize(train, tokenizer, lower, ngrams)

lines: 100%|██████████| 275505/275505 [01:21<00:00, 3400.75it/s]


In [15]:
valid = data_tokenize(valid, tokenizer, lower, ngrams)
test = data_tokenize(test, tokenizer, lower, ngrams)

lines: 100%|██████████| 78715/78715 [00:23<00:00, 3420.01it/s]
lines: 100%|██████████| 39359/39359 [00:11<00:00, 3367.05it/s]


save just in case

In [18]:
with open('train_tokenized.json', 'wt') as file:
    json.dump(train, file)

with open('valid_tokenized.json', 'wt') as file:
    json.dump(valid, file)

with open('test_tokenized.json', 'wt') as file:
    json.dump(test, file)

load data if available

In [2]:
with open('train_tokenized.json', 'rt') as file:
    train = json.load(file)

with open('valid_tokenized.json', 'rt') as file:
    valid = json.load(file)

with open('test_tokenized.json', 'rt') as file:
    test = json.load(file)

create vocabulary

In [3]:
def build_vocab(data, max_size=30000):
    counter = Counter()
    for entry in tqdm.tqdm(data):
        counter.update(entry['text'])
    return Vocab(counter, max_size)

vocab = build_vocab(train)

100%|██████████| 275505/275505 [00:10<00:00, 26624.58it/s]


create torch datasets

In [16]:
def setup_dataset(data, vocab):
    result = []
    label_dict = {i + 1: i for i in range(5)}
    for entry in tqdm.tqdm(data, 'lines', len(data)):
        label = label_dict[entry['label']]
        row = [torch.tensor(vocab[token]) for token in entry['text']]
        result.append((label, row))
    return TextClassificationDataset(vocab, data, set(range(5)))

# train = setup_dataset(train, vocab)
test = setup_dataset(test, vocab)

lines: 100%|██████████| 39359/39359 [00:37<00:00, 1037.45it/s]


In [19]:
gc.collect()

78742

In [None]:
class IterableTextDataset(IterableDataset):
    def __init__(self):
        super.__init__()