In [1]:
import numpy as np
import pandas as pd

In [2]:
reviews = pd.read_csv('../../Reviews.csv')
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
reviews = reviews[['Summary', 'Text']]

reviews = reviews.dropna()
reviews.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [8]:
# Special tokens
PARAGRAPH_START = '<p>'
PARAGRAPH_END = '</p>'
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
UNKNOWN_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
DOCUMENT_START = '<d>'
DOCUMENT_END = '</d>'

In [50]:
import nltk, re

def clean_text(text, remove_stopwords=True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
        
    text = re.sub('\.{3}', '. ', text)
        
    # Add special tokens at both ends of the text
    text = " ".join([DOCUMENT_START, PARAGRAPH_START, SENTENCE_START, text, SENTENCE_END, PARAGRAPH_END, DOCUMENT_END])
    
    # Replace '.' with sentence start/end tokens
    text = (" ".join(['', SENTENCE_END, SENTENCE_START])).join(text.split('.'))
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    #text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]]', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(nltk.corpus.stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
        
    return text

In [56]:
from tqdm import tqdm

# Clean the summaries and texts
clean_summaries = []
for summary in tqdm(reviews.Summary):
    clean_summaries.append(clean_text(summary, remove_stopwords=False))

clean_texts = []
for text in tqdm(reviews.Text):
    clean_texts.append(clean_text(text, remove_stopwords=True))

100%|██████████| 568428/568428 [00:09<00:00, 57753.32it/s]
100%|██████████| 568428/568428 [03:55<00:00, 2411.25it/s]


In [52]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(5):
    print("Review #", i+1)
    print(reviews.Summary[i], '-->', clean_summaries[i])
    print(reviews.Text[i], '-->', clean_texts[i])
    print()

Review # 1
Good Quality Dog Food --> <d> <p> <s> good quality dog food </s> </p> </d>
I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. --> <d> <p> <s> bought several vitality canned dog food products found good quality </s> <s> product looks like stew processed meat smells better </s> <s> labrador finicky appreciates product better </s> <s> </s> </p> </d>

Review # 2
Not as Advertised --> <d> <p> <s> not as advertised </s> </p> </d>
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". --> <d> <p> <s> product arrived labeled jumbo salted peanuts </s> <s> peanuts actually small sized unsalted </s> <s> sure error vendor intended represent pro

In [57]:
from sklearn.model_selection import train_test_split

texts = {}
summaries = {}

texts['train'], texts['eval'], summaries['train'], summaries['eval'] = train_test_split(
    clean_texts, clean_summaries, train_size=0.8
)
texts['decode'], texts['eval'], summaries['decode'], summaries['eval'] = train_test_split(
    texts['eval'], summaries['eval'], test_size=0.5
)

In [59]:
from tensorflow.core.example import example_pb2
import struct

for mode in ['train', 'eval', 'decode']:
    with open('data/' + mode + '_food', 'wb') as data_file:
        for i in tqdm(range(len(texts[mode]))):
            tf_example = example_pb2.Example()
            tf_example.features.feature['text'].bytes_list.value.extend(
                [texts[mode][i].encode()]
            )
            tf_example.features.feature['summary'].bytes_list.value.extend(
                [summaries[mode][i].encode()]
            )
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            data_file.write(struct.pack('q', str_len))
            data_file.write(struct.pack('%ds' % str_len, tf_example_str))



  0%|          | 0/454742 [00:00<?, ?it/s][A
  0%|          | 262/454742 [00:00<02:53, 2617.03it/s][A
  0%|          | 519/454742 [00:00<02:54, 2600.90it/s][A
  0%|          | 776/454742 [00:00<02:55, 2590.59it/s][A
  0%|          | 1034/454742 [00:00<02:55, 2585.29it/s][A
  0%|          | 1294/454742 [00:00<02:55, 2587.53it/s][A
  0%|          | 1550/454742 [00:00<02:55, 2578.09it/s][A
  0%|          | 1808/454742 [00:00<02:55, 2576.07it/s][A
  0%|          | 2065/454742 [00:00<02:55, 2572.98it/s][A
  1%|          | 2308/454742 [00:00<02:59, 2518.91it/s][A
  1%|          | 2563/454742 [00:01<02:58, 2527.59it/s][A
  1%|          | 2822/454742 [00:01<02:57, 2543.72it/s][A
  1%|          | 3078/454742 [00:01<02:57, 2546.73it/s][A
  1%|          | 3335/454742 [00:01<02:56, 2552.59it/s][A
  1%|          | 3591/454742 [00:01<02:56, 2553.23it/s][A
  1%|          | 3851/454742 [00:01<02:55, 2565.55it/s][A
  1%|          | 4110/454742 [00:01<02:55, 2571.94it/s][A
  1%|       

In [60]:
token_counter = nltk.Counter()

for text in tqdm(clean_texts):
    token_counter.update(text.split())
for summary in tqdm(clean_summaries):
    token_counter.update(summary.split())


100%|██████████| 568428/568428 [00:10<00:00, 55230.41it/s]
100%|██████████| 568428/568428 [00:04<00:00, 138868.43it/s]


In [62]:
token_count = sum(token_counter.values())
print(token_count, len(token_counter))

37186025 191791


In [67]:
token_counter.most_common(10000)[-10:]

[('iowa', 111),
 ('nicest', 111),
 ('piping', 111),
 ('jordan', 111),
 ('obligate', 111),
 ('emphasize', 111),
 ('brag', 111),
 ('creatures', 111),
 ('bridal', 110),
 ('orderd', 110)]

In [69]:
vocab_size = 10000

with open('data/vocab_food', 'w') as vocab_file:
    most_common_token_count = 0
    for _, num in token_counter.most_common(vocab_size):
        most_common_token_count += num
    print(UNKNOWN_TOKEN, token_count - most_common_token_count, file=vocab_file)
    for token, num in token_counter.most_common(vocab_size):
        print(token, num, file=vocab_file)
    print(PAD_TOKEN, 5, file=vocab_file)
    #for token in [PARAGRAPH_END, PARAGRAPH_START, DOCUMENT_END, DOCUMENT_START]:
    #    print(token, len(processed_df), file=vocab_file)
    #for token in [SENTENCE_END, SENTENCE_START]:
    #    print(token, len(processed_df) + token_counter['.'], file=vocab_file)

In [81]:
for mode in ['train', 'eval', 'decode']:
    with open('data/' + mode + '_food_test', 'wb') as data_file:
        for i in tqdm(range(len(texts[mode]))):
            tf_example = example_pb2.Example()
            test_input = texts[mode][i].split()
            test_sentence = ' '.join(test_input[:6] + test_input[-3:])
            #print(test_sentence)
            tf_example.features.feature['text'].bytes_list.value.extend(
                [test_sentence.encode()]
            )
            tf_example.features.feature['summary'].bytes_list.value.extend(
                [test_sentence.encode()]
            )
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            data_file.write(struct.pack('q', str_len))
            data_file.write(struct.pack('%ds' % str_len, tf_example_str))


100%|██████████| 454742/454742 [02:55<00:00, 2587.00it/s]
100%|██████████| 56843/56843 [00:21<00:00, 2590.03it/s]
100%|██████████| 56843/56843 [00:21<00:00, 2587.69it/s]


In [73]:
print(texts['eval'][123].split()[4])
print(summaries['eval'][123].split()[4])

creating
secret
