In [16]:
from collections import Counter

import tqdm
import pandas as pd
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.vocab import Vocab

## read data

In [17]:
# Load the raw reviews table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/Reviews.csv')

In [18]:
# Keep only the rating column and the review-text column.
df = df[['Score', 'Text']]

In [19]:
# Drop every row that has a missing value in any of its columns.
df = df.dropna()

In [20]:
# Remove rows whose review text repeats an earlier row (pandas keeps the first occurrence by default).
df = df.drop_duplicates('Text')

In [21]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [22]:
# Sanity check after de-duplication: `count` should equal `unique`.
df['Text'].describe()

count                                                393579
unique                                               393579
top       Product wasn't packaged well, the candy just r...
freq                                                      1
Name: Text, dtype: object

In [23]:
# List the distinct rating values; these are used as the labels below.
df['Score'].unique()

array([5, 1, 4, 2, 3])

## build dataset

In [24]:
# Extract labels and review text as two parallel NumPy arrays (same row order),
# so that identical slices of both stay aligned.
labels, reviews = df['Score'].to_numpy(), df['Text'].to_numpy()

Make train/validation/test splits (70% / 20% / 10%), taken in row order without shuffling.

In [25]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    """Cut several equally long arrays into consecutive, aligned chunks.

    The rows are not shuffled: the first chunk is the leading slice, the
    next chunk starts where the previous one ended, and so on.

    Args:
        arrays: Sequence of array-likes exposing ``.shape[0]`` and slicing.
            All of them must have the same first-dimension length.
        ratios: Relative chunk sizes. They are normalised by their sum, so
            they do not have to add up to 1.

    Returns:
        A list with one entry per ratio. Each entry is a list holding the
        matching slice of every input array, in the order of ``arrays``.

    Raises:
        AssertionError: If the arrays differ in first-dimension length.
    """
    n_rows = arrays[0].shape[0]
    for other in arrays[1:]:
        assert other.shape[0] == n_rows

    total = sum(ratios)
    fractions = [r / total for r in ratios]

    # Every chunk but the last is truncated to a whole number of rows;
    # the last chunk takes whatever is left so nothing is dropped.
    counts = [int(frac * n_rows) for frac in fractions[:-1]]
    counts.append(n_rows - sum(counts))

    chunks = []
    lower = 0
    for count in counts:
        upper = lower + count
        chunks.append([a[lower:upper] for a in arrays])
        lower = upper
    return chunks

# 70/20/10 train/validation/test split. split_data slices in row order and
# does not shuffle, so each split is a contiguous block of the original rows.
train, valid, test = split_data((labels, reviews), (0.7, 0.2, 0.1))

Create the vocabulary from the training text, counting both single tokens and bigrams.

In [26]:
# Unpack the training split; the order matches the (labels, reviews) tuple given to split_data.
train_labels, train_text = train

In [29]:
def build_vocab(text,
                tokenizer='basic_english',
                max_size=30000,  # 3M parameters with emb_dim=100
                ngrams=2,
                ):
    """Count n-grams over a corpus and wrap the counts in a torchtext ``Vocab``.

    Args:
        text: Iterable of raw strings, one document per item. Anything
            iterable works (NumPy array, pandas Series, list, generator);
            if it has a length, that length is used as the progress-bar
            total, otherwise the bar runs without a known total.
        tokenizer: Forwarded unchanged to torchtext's ``get_tokenizer``
            (e.g. ``'basic_english'`` or ``'spacy'``).
        max_size: Passed to ``Vocab`` as its second positional argument
            (the size cap in the legacy torchtext API).
        ngrams: Forwarded to ``ngrams_iterator`` as the n-gram order. With
            the default of 2, both single tokens and bigrams are counted.

    Returns:
        The ``Vocab`` built from the accumulated counter.
    """
    tokenize = get_tokenizer(tokenizer)

    # The total only feeds the progress bar, so unsized inputs need not be
    # rejected; tqdm accepts ``total=None``.
    total = len(text) if hasattr(text, '__len__') else None

    counter = Counter()
    for row in tqdm.tqdm(text, desc='lines', total=total):
        # One document at a time, so the tokenised corpus is never held in memory.
        counter.update(ngrams_iterator(tokenize(row), ngrams))
    return Vocab(counter, max_size)

# Build the vocabulary from the training split only, overriding the default
# 'basic_english' tokenizer with spaCy's. The recorded run took about two minutes.
vocab = build_vocab(train_text, tokenizer='spacy')

lines: 100%|██████████| 275505/275505 [01:54<00:00, 2412.11it/s]


In [31]:
# Vocabulary entries in descending lexicographic (code-point) order, not by
# frequency, which is why non-ASCII tokens come first.
sorted(vocab.freqs, reverse=True)

['être fort',
 'être',
 'çelem )',
 'çelem',
 'çay !',
 'çay',
 '×piperita L.',
 '×piperita',
 "Ît 's",
 'Ît',
 'Île de',
 'Île',
 'Çaykur teas',
 'Çaykur',
 'Â  ',
 'Â',
 '¾ water',
 '¾ tsp',
 '¾ ounces',
 '¾ or',
 '¾ of',
 '¾ cup',
 '¾ "',
 '¾',
 '½-ounce pouches',
 '½-ounce',
 '½-inch pieces',
 '½-inch',
 '½ years',
 '½ year',
 '½ worked',
 '½ with',
 '½ weeks',
 '½ tsp',
 '½ the',
 '½ teaspoon',
 '½ tablespoons',
 '½ stick',
 '½ star',
 '½ pounds',
 '½ pack',
 '½ ounces',
 '½ or',
 '½ of',
 '½ minutes',
 '½ lbs',
 '½ lb',
 '½ inch',
 '½ in',
 '½ gallon',
 '½ fat',
 '½ cups',
 '½ cup',
 '½ c',
 '½ an',
 '½ a',
 '½ Yorkie',
 '½ Pomeranian',
 '½ -1',
 '½ -',
 '½ ,',
 '½ )',
 '½ &',
 '½ "',
 '½',
 '¼ the',
 '¼ teaspoon',
 '¼ pound)<br',
 '¼ package',
 '¼ of',
 '¼ inch',
 '¼ cups',
 '¼ cup.<br',
 '¼ cup',
 '¼ c.',
 '¼ TBSP',
 '¼ "',
 '¼',
 'º',
 '· Sweetener',
 '· Sucralose',
 '· Saccharin',
 '· Cyclamates',
 '· Aspartame',
 '· Acesulfame',
 '·  ',
 '·',
 'µg per',
 'µg daily',
 'µg THC