In [2]:
from collections import Counter

import numpy as np
import tqdm
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import TextClassificationDataset
from torchtext.vocab import Vocab

## read data

In [3]:
# Load the raw reviews table from the project-relative data directory.
df = pd.read_csv('data/Reviews.csv')

In [4]:
# Keep only the two columns used below: the rating and the review text.
df = df[['Score', 'Text']]

In [5]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [6]:
# Drop rows whose review text repeats an earlier row
# (pandas keeps the first occurrence by default).
df = df.drop_duplicates('Text')

In [7]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [8]:
# Sanity check on the text column: in the recorded output `count` equals
# `unique`, i.e. no duplicate review texts remain.
df['Text'].describe()

count                                                393579
unique                                               393579
top       I realize all at once that I've been eating (a...
freq                                                      1
Name: Text, dtype: object

In [9]:
# Distinct rating values; the recorded output shows the five integer scores 1-5.
df['Score'].unique()

array([5, 1, 4, 2, 3])

## build dataset

In [10]:
# Pull both columns out as NumPy arrays; split_data slices them positionally.
labels, reviews = df['Score'].to_numpy(), df['Text'].to_numpy()

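Make splits: partition the labels and reviews into train / validation / test sets (70% / 20% / 10%), keeping the rows in their existing order (no shuffling).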

In [11]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    """Cut every array into consecutive chunks whose sizes follow ``ratios``.

    The arrays are sliced along their first axis at the same offsets, so
    row ``i`` of each input always lands in the same chunk. Rows are taken
    in their existing order; nothing is shuffled.

    Args:
        arrays: Sequence of objects exposing ``.shape[0]`` and supporting
            slice indexing; all must have the same first-axis length.
        ratios: Relative chunk sizes. They are normalised by their sum, so
            they need not add up to 1.

    Returns:
        A list with one entry per ratio; each entry is a list holding the
        corresponding slice of every input array, in input order.

    Raises:
        AssertionError: If the arrays differ in first-axis length.
    """
    n_rows = arrays[0].shape[0]
    assert all(arr.shape[0] == n_rows for arr in arrays[1:])

    ratio_total = sum(ratios)
    fractions = [ratio / ratio_total for ratio in ratios]
    # Every chunk but the last gets its share rounded towards zero; the last
    # chunk absorbs whatever is left so the sizes always add up to n_rows.
    chunk_sizes = [int(fraction * n_rows) for fraction in fractions[:-1]]
    chunk_sizes.append(n_rows - sum(chunk_sizes))

    splits = []
    lower = 0
    for size in chunk_sizes:
        upper = lower + size
        splits.append([arr[lower:upper] for arr in arrays])
        lower = upper
    return splits

# Consecutive 70/20/10 split; each of train/valid/test is a [labels, reviews] pair.
train, valid, test = split_data((labels, reviews), (0.7, 0.2, 0.1))

In [12]:
def data_merge(data):
    """Pair each label with its text.

    Args:
        data: A two-item sequence ``(labels, text)`` of iterables.

    Returns:
        list: ``(label, text)`` tuples, with every label converted via
        ``int``. If the two inputs differ in length, the surplus items of
        the longer one are dropped (``zip`` semantics).

    A list is returned rather than a bare ``zip`` object: a ``zip`` iterator
    can be consumed only once, so a second pass over the result (or
    re-running a notebook cell that reads it) would silently see no data.
    """
    labels, text = data
    int_labels = [int(label) for label in labels]
    return list(zip(int_labels, text))

# Turn each split's (labels, texts) pair into an iterable of (int label, text) pairs.
train = data_merge(train)
valid = data_merge(valid)
test = data_merge(test)

Tokenize the text in each split (lowercased, spaCy tokenizer) and append the bigrams after the unigram tokens.

In [None]:
# One-off setup: download the spaCy English model needed by the 'spacy' tokenizer used below.
! python -m spacy download en

In [13]:
def data_tokenize(data, tokenizer, lower, ngrams):
    """Tokenize the text of every ``(label, text)`` pair and add n-grams.

    Args:
        data: Iterable of ``(label, text)`` pairs, where ``text`` is a string.
            It may be a one-shot iterator; it is consumed exactly once.
        tokenizer: Tokenizer specification passed to torchtext's
            ``get_tokenizer`` (e.g. ``'spacy'``).
        lower: If true, lowercase each text before tokenizing.
        ngrams: Maximum n-gram order passed to ``ngrams_iterator``.

    Returns:
        list: ``(label, tokens)`` tuples in input order, where ``tokens`` is
        the list produced by ``ngrams_iterator`` for that text. An empty
        input yields an empty list.

    A list is returned rather than a bare ``zip`` object so the result can be
    iterated more than once (e.g. once to build a vocabulary and again to
    build a dataset); a ``zip`` iterator would be empty on the second pass.
    """
    # Materialise first: `data` may itself be a one-shot iterator.
    pairs = list(data)
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpacking below would
        # raise ValueError; an empty split simply has no rows.
        return []

    tokenize = get_tokenizer(tokenizer)
    labels, text = zip(*pairs)
    if lower:
        text = (row.lower() for row in text)
    text = (tokenize(row) for row in text)
    # The generators above are lazy; all the work happens while tqdm
    # drives this comprehension, so the progress bar reflects real progress.
    text = [list(ngrams_iterator(row, ngrams)) for row in
            tqdm.tqdm(text, desc='lines', total=len(labels))]
    return list(zip(labels, text))

# Tokenization settings shared by all three splits.
tokenizer = 'spacy'  # tokenizer name handed to torchtext's get_tokenizer
lower = True         # lowercase the text before tokenizing
ngrams = 2           # passed to ngrams_iterator; recorded output shows unigrams followed by bigrams

# Tokenize the training split (about 80 s in the recorded run).
train = data_tokenize(train, tokenizer, lower, ngrams)

lines: 100%|██████████| 275505/275505 [01:18<00:00, 3512.75it/s]


In [15]:
# Apply the same tokenization settings to the validation and test splits.
valid = data_tokenize(valid, tokenizer, lower, ngrams)
test = data_tokenize(test, tokenizer, lower, ngrams)

lines: 100%|██████████| 78715/78715 [00:22<00:00, 3453.42it/s]
lines: 100%|██████████| 39359/39359 [00:11<00:00, 3470.43it/s]


Create the vocabulary from the token and bigram counts of the training split only.

In [18]:
def build_vocab(data, max_size=10000):
    """Count the tokens of every row and wrap the counts in a ``Vocab``.

    Args:
        data: Iterable of two-item ``(label, tokens)`` pairs. It is iterated
            exactly once, so a one-shot iterator is exhausted afterwards.
        max_size: Passed to ``Vocab`` as its second positional argument.

    Returns:
        The ``Vocab`` built from the accumulated token counts.
    """
    token_counts = Counter()
    # Labels play no part in the vocabulary; only the token lists are counted.
    for _label, tokens in data:
        token_counts.update(tokens)
    vocabulary = Vocab(token_counts, max_size)
    return vocabulary

# Build the vocabulary from the training split only (default max_size of 10000).
vocab = build_vocab(train)

create torch datasets

['this',
 'saltwater',
 'taffy',
 'had',
 'great',
 'flavors',
 'and',
 'was',
 'very',
 'soft',
 'and',
 'chewy',
 '.',
 ' ',
 'each',
 'candy',
 'was',
 'individually',
 'wrapped',
 'well',
 '.',
 ' ',
 'none',
 'of',
 'the',
 'candies',
 'were',
 'stuck',
 'together',
 ',',
 'which',
 'did',
 'happen',
 'in',
 'the',
 'expensive',
 'version',
 ',',
 'fralinger',
 "'s",
 '.',
 ' ',
 'would',
 'highly',
 'recommend',
 'this',
 'candy',
 '!',
 ' ',
 'i',
 'served',
 'it',
 'at',
 'a',
 'beach',
 '-',
 'themed',
 'party',
 'and',
 'everyone',
 'loved',
 'it',
 '!',
 'this saltwater',
 'saltwater taffy',
 'taffy had',
 'had great',
 'great flavors',
 'flavors and',
 'and was',
 'was very',
 'very soft',
 'soft and',
 'and chewy',
 'chewy .',
 '.  ',
 '  each',
 'each candy',
 'candy was',
 'was individually',
 'individually wrapped',
 'wrapped well',
 'well .',
 '.  ',
 '  none',
 'none of',
 'of the',
 'the candies',
 'candies were',
 'were stuck',
 'stuck together',
 'together ,',
 ', 