In [63]:
from fractions import Fraction

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.experimental.datasets.text_classification\
    import TextClassificationDataset

## read data

In [64]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/Reviews.csv')

In [65]:
# Keep only the Score and Text columns; every other column is discarded.
df = df[['Score', 'Text']]

In [66]:
# Drop every row that contains a missing value.
df = df.dropna()

In [67]:
# Keep one row per distinct review text (pandas retains the first occurrence
# by default).
df = df.drop_duplicates('Text')

In [68]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [69]:
# Sanity check: after de-duplicating on Text, `count` and `unique` should match.
df['Text'].describe()

count                                                393579
unique                                               393579
top       I tried these cookies at a GF expo.  The are d...
freq                                                      1
Name: Text, dtype: object

In [70]:
# List the distinct Score values; Score is used as the label column below.
df['Score'].unique()

array([5, 1, 4, 2, 3])

## define dataset

In [71]:
# Pull the two columns out as parallel NumPy arrays (same row order).
labels = df['Score'].to_numpy()
reviews = df['Text'].to_numpy()

In [73]:
def split_data(arrays, ratios=(0.7, 0.2, 0.1)):
    """Cut parallel arrays into consecutive, non-overlapping splits.

    The arrays are sliced along their first axis in their existing order.
    Nothing is shuffled, so shuffle beforehand if the row order carries
    information.

    Args:
        arrays: Sequence of array-likes exposing ``.shape[0]`` and slice
            indexing; all must have the same length along the first axis.
        ratios: Non-negative relative weights, one per split. They need not
            sum to 1, because each weight is divided by their total.

    Returns:
        A list with one entry per ratio. Each entry is a list holding the
        matching slice of every input array, in the order of ``arrays``.
        Every split but the last gets ``floor(weight / total * n)`` rows;
        the last split takes whatever remains, so no row is lost. An empty
        ``ratios`` yields a single split containing everything.

    Raises:
        AssertionError: If the arrays differ in length.
        ValueError: If a ratio is negative, or if all ratios are zero.
    """
    data_len = arrays[0].shape[0]
    assert all(a.shape[0] == data_len for a in arrays[1:]), \
        'all arrays must have the same length along axis 0'

    # Parse each weight from its decimal text so that 0.29 means exactly
    # 29/100. Plain float maths gives 0.29 * 100 == 28.999999999999996,
    # which int() truncates to 28 and pushes one row into the last split.
    weights = [Fraction(str(r)) for r in ratios]
    if any(w < 0 for w in weights):
        # A negative size would turn into a negative slice bound and
        # silently select the wrong rows.
        raise ValueError(f'ratios must be non-negative, got {ratios!r}')
    total = sum(weights)
    if weights and total == 0:
        raise ValueError(f'ratios must not all be zero, got {ratios!r}')

    # Exact rational arithmetic; int() floors because the value is >= 0.
    sizes = [int(w * data_len / total) for w in weights[:-1]]
    # The last split absorbs the rounding remainder.
    sizes.append(data_len - sum(sizes))

    splits = []
    start = 0
    for size in sizes:
        stop = start + size
        splits.append([a[start:stop] for a in arrays])
        start = stop
    return splits

# 70 % train / 20 % validation / 10 % test, taken in the frame's row order.
train, valid, test = split_data(
    arrays=(labels, reviews),
    ratios=(0.7, 0.2, 0.1),
)

In [74]:
# Each split holds one slice per input array, in the order they were passed
# to split_data: labels first, then reviews.
train_labels, train_text = train

In [75]:
# Inspect the training reviews.
train_text

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

In [76]:
# The training split must hold the floor of 70 % of all rows.
assert train_text.shape[0] == int(len(df) * 0.7)

In [77]:
# Inspect the training labels.
train_labels

array([5, 1, 4, ..., 3, 5, 1])

In [78]:
# The three splits together must account for every row of the frame.
assert sum(part[0].shape[0] for part in (train, valid, test)) == df.shape[0]

In [79]:
def to_tuple_list(arrays):
    """Pair up two parallel sequences element by element.

    Args:
        arrays: An iterable of exactly two sequences, labels first and
            reviews second.

    Returns:
        A list of ``(label, review)`` tuples. Pairing stops at the end of
        the shorter sequence.
    """
    label_seq, review_seq = arrays
    return list(zip(label_seq, review_seq))

In [80]:
# Print the class documentation to check the constructor signature.
# NOTE(review): the recorded output below names the module
# torchtext.datasets.text_classification, whereas the import cell pulls the
# class from torchtext.experimental.datasets.text_classification -- confirm
# which implementation is actually in use after a fresh kernel run.
help(TextClassificationDataset)

Help on class TextClassificationDataset in module torchtext.datasets.text_classification:

class TextClassificationDataset(torch.utils.data.dataset.Dataset)
 |  TextClassificationDataset(vocab, data, labels)
 |  
 |  Defines an abstract text classification datasets.
 |  Currently, we only support the following datasets:
 |  
 |        - AG_NEWS
 |        - SogouNews
 |        - DBpedia
 |        - YelpReviewPolarity
 |        - YelpReviewFull
 |        - YahooAnswers
 |        - AmazonReviewPolarity
 |        - AmazonReviewFull
 |  
 |  Method resolution order:
 |      TextClassificationDataset
 |      torch.utils.data.dataset.Dataset
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, i)
 |  
 |  __init__(self, vocab, data, labels)
 |      Initiate text-classification dataset.
 |      
 |      Arguments:
 |          vocab: Vocabulary object used for dataset.
 |          data: a list of label/tokens tuple. tokens are a tensor after
 |              numerica