In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_df = pd.read_csv('../../Reviews.csv')
raw_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
processed_df = raw_df.dropna()
len(processed_df)

568412

In [10]:
from sklearn.model_selection import train_test_split

texts = {}
summaries = {}

texts['train'], texts['eval'], summaries['train'], summaries['eval'] = train_test_split(
    processed_df.Text.values, processed_df.Summary.values, train_size=0.8
)
texts['decode'], texts['eval'], summaries['decode'], summaries['eval'] = train_test_split(
    texts['eval'], summaries['eval'], test_size=0.5
)

In [4]:
import nltk

In [5]:
# Special tokens
PARAGRAPH_START = '<p>'
PARAGRAPH_END = '</p>'
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
UNKNOWN_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
DOCUMENT_START = '<d>'
DOCUMENT_END = '</d>'

In [6]:
def process_text(text):
    return '{} {} {} {} {} {} {}'.format(DOCUMENT_START, PARAGRAPH_START, SENTENCE_START,
                                         (SENTENCE_END + ' ' + SENTENCE_START).join(text.split('.')),
                                         SENTENCE_END, PARAGRAPH_END, DOCUMENT_END)

In [7]:
import tensorflow
from tensorflow.core.example import example_pb2
import struct

In [11]:
from tqdm import tqdm

for mode in ['train', 'eval', 'decode']:
    with open('data/' + mode + '_food', 'wb') as data_file:
        for i in tqdm(range(texts[mode].shape[0])):
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [process_text(texts[mode][i]).encode()]
            )
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [process_text(summaries[mode][i]).encode()]
            )
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            data_file.write(struct.pack('q', str_len))
            data_file.write(struct.pack('%ds' % str_len, tf_example_str))


100%|██████████| 454729/454729 [02:53<00:00, 2618.65it/s]
100%|██████████| 56842/56842 [00:21<00:00, 2637.60it/s]
100%|██████████| 56841/56841 [00:21<00:00, 2637.34it/s]


In [12]:
token_counter = nltk.Counter()

for text in tqdm(processed_df.Text):
    token_counter.update(nltk.word_tokenize(process_text(text)))
for summary in tqdm(processed_df.Summary):
    token_counter.update(nltk.word_tokenize(process_text(summary)))


100%|██████████| 568412/568412 [08:43<00:00, 1086.49it/s]
100%|██████████| 568412/568412 [02:29<00:00, 3795.48it/s]


In [13]:
token_count = sum(token_counter.values())

In [16]:
vocab_size = 10000

with open('data/vocab_food', 'w') as vocab_file:
    most_common_token_count = 0
    for token, num in token_counter.most_common(vocab_size):
        print(token, num, file=vocab_file)
        most_common_token_count += num
    print(UNKNOWN_TOKEN, token_count - most_common_token_count, file=vocab_file)
    print(PAD_TOKEN, 5, file=vocab_file)
    for token in [PARAGRAPH_END, PARAGRAPH_START, DOCUMENT_END, DOCUMENT_START]:
        print(token, len(processed_df), file=vocab_file)
    for token in [SENTENCE_END, SENTENCE_START]:
        print(token, len(processed_df) + token_counter['.'], file=vocab_file)