In [13]:
import torch

import pandas as pd
import numpy as np
from pathlib import Path
import random

import math
import json
import spacy
import pickle

from collections import Counter

In [2]:
from tqdm.notebook import tqdm

In [3]:
# change your path here
PATH = Path("/data2/yinterian/Amazon_review_2014")

## Data download and pre-processing

**Step 0**: Get the data. The compressed file is 18G, so the download may take 20 minutes or so.

In [4]:
def unpack_dataset():
    # Uses IPython `!` shell escapes, so this only runs inside a notebook/IPython session.
    # Download the gzipped review dump; `-P` tells wget which directory to save into.
    # NOTE(review): `$PATH` is presumably expanded by IPython to the notebook's PATH variable -- confirm.
    ! wget http://snap.stanford.edu/data/amazon/productGraph/aggressive_dedup.json.gz -P $PATH
    # Decompress the archive in place, leaving aggressive_dedup.json next to where it was downloaded.
    # NOTE(review): the `ls` cells in this notebook look under $PATH/data/ -- confirm the file is moved there.
    ! gunzip $PATH/aggressive_dedup.json.gz 
#unpack_dataset()   

In [5]:
# the file is 55G
!ls -lhS  $PATH/data/aggressive_dedup.json

-rw-rw-r-- 1 yinterian yinterian 55G Feb 17  2016 /data2/yinterian/Amazon_review_2014/data/aggressive_dedup.json


**Step 1**: Split file into smaller chunks:

I split the file into 42 chunks of around 1.3G each. This is very slow, so you should run it in your terminal. You need to install `jq` first.

`jq -c < $PATH/aggressive_dedup.json | split -l 2000000 `

Here are the files:

In [6]:
!ls  $PATH/data/x*

/data2/yinterian/Amazon_review_2014/data/xaa
/data2/yinterian/Amazon_review_2014/data/xab
/data2/yinterian/Amazon_review_2014/data/xac
/data2/yinterian/Amazon_review_2014/data/xad
/data2/yinterian/Amazon_review_2014/data/xae
/data2/yinterian/Amazon_review_2014/data/xaf
/data2/yinterian/Amazon_review_2014/data/xag
/data2/yinterian/Amazon_review_2014/data/xah
/data2/yinterian/Amazon_review_2014/data/xai
/data2/yinterian/Amazon_review_2014/data/xaj
/data2/yinterian/Amazon_review_2014/data/xak
/data2/yinterian/Amazon_review_2014/data/xal
/data2/yinterian/Amazon_review_2014/data/xam
/data2/yinterian/Amazon_review_2014/data/xan
/data2/yinterian/Amazon_review_2014/data/xao
/data2/yinterian/Amazon_review_2014/data/xap
/data2/yinterian/Amazon_review_2014/data/xaq
/data2/yinterian/Amazon_review_2014/data/xar
/data2/yinterian/Amazon_review_2014/data/xas
/data2/yinterian/Amazon_review_2014/data/xat
/data2/yinterian/Amazon_review_2014/data/xau
/data2/yinterian/Amazon_review_201

**Step 2**: To preprocess the data, I saved the following code in a file called `pre_process_amz_reviews.py` (find it in the repo) and ran it on each part of the original file independently. Here is an example of how to run it on the first part. This is also slow because we are running spaCy.

`python pre_process_amz_reviews.py --input_file data/xaa --output_file amazon_reviews-01.json > out1 &`

If you have many CPUs you can run a few in parallel. 

In [7]:
import html
def load_data(datafile):
    """Read a JSON-lines review file and return its text fields and ratings.

    Each non-blank line of ``datafile`` must be a JSON object with the keys
    ``reviewText``, ``summary`` and ``overall``.

    Parameters
    ----------
    datafile : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    dict
        ``'review'``  -- list of HTML-unescaped review texts,
        ``'summary'`` -- list of HTML-unescaped summaries,
        ``'rating'``  -- numpy array of the ``overall`` values,
        all in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks one of the expected keys.
    """
    reviews, summaries, ratings = [], [], []
    # Stream the file line by line: the chunks are large, and the context
    # manager guarantees the handle is closed even if parsing fails.
    with open(datafile) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            sample = json.loads(line)
            reviews.append(html.unescape(sample['reviewText']))
            summaries.append(html.unescape(sample['summary']))
            ratings.append(sample['overall'])
    return {'review': reviews, 'summary': summaries, 'rating': np.array(ratings)}

def get_clean_review(review, summ, rating):
    """Turn one parsed review/summary pair into a plain, JSON-serializable record.

    ``review`` must expose ``.sents`` (an iterable of token sequences) and
    ``summ`` must be iterable over tokens; every token needs ``.is_stop`` and
    ``.text``. Stop-words and whitespace-only tokens are dropped, the remaining
    token texts are lower-cased, and sentences left empty are discarded.

    Returns a dict with keys ``'review'`` (list of sentences, each a list of
    strings), ``'summary'`` (list of strings) and ``'rating'`` (``int(rating)``).
    """
    def keep(tok):
        # A token survives unless it is a stop-word or consists only of whitespace.
        return not (tok.is_stop or tok.text.strip() == '')

    cleaned_sents = []
    for sent in review.sents:
        words = [tok.text.lower() for tok in sent if keep(tok)]
        if words:
            # Sentences with no surviving tokens are skipped entirely.
            cleaned_sents.append(words)

    summary_words = [tok.text.lower() for tok in summ if keep(tok)]

    return {
        'review': cleaned_sents,
        'summary': summary_words,
        'rating': int(rating),
    }

def dump_dataset(raw_data, outfile, summary=True):
    """Tokenize reviews with spaCy and write one cleaned JSON record per line.

    Parameters
    ----------
    raw_data : dict
        Mapping with the keys ``'review'``, ``'summary'`` and ``'rating'``
        holding parallel sequences (the structure returned by ``load_data``).
    outfile : str or path-like
        Destination file; it is overwritten.
    summary : bool
        Currently unused; kept so existing callers keep working.

    Each record is produced by ``get_clean_review`` and serialized with
    ``json.dumps`` followed by a newline.
    """
    # Load the model before opening the output so a missing model does not
    # leave a truncated, empty output file behind.
    nlp = spacy.load('en_core_web_sm')
    n = len(raw_data['rating'])
    # nlp.pipe is lazy: documents are parsed as the loop below consumes them.
    review_docs = nlp.pipe(raw_data['review'])
    summ_docs = nlp.pipe(raw_data['summary'])
    # Both the file and the progress bar are context-managed so they are
    # closed even if processing stops part-way through.
    with open(outfile, 'w') as outf, tqdm(total=n) as pbar:
        for review, summ, rating in zip(review_docs, summ_docs, raw_data['rating']):
            sample = get_clean_review(review, summ, rating)
            outf.write(json.dumps(sample) + '\n')
            pbar.update()

**Step 3**: Split in train/ valid/ test. 

In [8]:
def load_data_from_json(datafile):
    """Load a JSON-lines file into a list with one parsed object per line.

    Parameters
    ----------
    datafile : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    list
        The parsed records, in file order. Blank lines are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Iterate the handle directly instead of readlines(), and let the context
    # manager close the file.
    with open(datafile) as f:
        return [json.loads(line) for line in f if line.strip()]

# here is a sample of 3  files
#L1 = load_data_from_json(PATH/"amazon_reviews-01.json")
#L2 = load_data_from_json(PATH/"amazon_reviews-02.json")
#L3 = load_data_from_json(PATH/"amazon_reviews-03.json")

In [9]:
def split_data(Ls):
    """Shuffle each list and split it 80/10/10 into train/valid/test.

    Parameters
    ----------
    Ls : iterable of sequences
        One sequence of samples per pre-processed file. The inputs are copied
        before shuffling, so the caller's lists are left untouched.

    Returns
    -------
    (train, valid, test) : tuple of numpy arrays
        The per-list portions concatenated in the order the lists were given.

    Notes
    -----
    The global NumPy random state is re-seeded with 3 on every call, so the
    split is reproducible.
    """
    np.random.seed(seed=3)
    # Start each accumulator with an empty array so that an empty ``Ls`` still
    # returns three (empty) arrays.
    train_parts = [np.array([])]
    valid_parts = [np.array([])]
    test_parts = [np.array([])]
    for L in Ls:
        L = np.array(L)
        np.random.shuffle(L)
        n1 = int(0.8 * len(L))
        n2 = int(0.9 * len(L))
        # All three slices must come from the current list ``L``; the previous
        # version sliced the global ``L1`` for the valid/test portions.
        train_parts.append(L[:n1])
        valid_parts.append(L[n1:n2])
        test_parts.append(L[n2:])
    # Concatenate once at the end rather than re-copying the arrays per list.
    train = np.concatenate(train_parts)
    valid = np.concatenate(valid_parts)
    test = np.concatenate(test_parts)
    return train, valid, test

In [19]:
# this is a subset of the data
#train, valid, test = split_data([L1, L2, L3])

In [22]:
train.shape, valid.shape, test.shape

((4800000,), (600000,), (600000,))

In [23]:
train[0]

{'review': [['inexpensive',
   'method',
   'club',
   'face',
   'clean',
   'golf',
   'course',
   '.'],
  ['buy', '2', '"', 'wear', '"', 'easily', '.']],
 'summary': ['handy', 'tool'],
 'rating': 5}

In [30]:
# save your files
#pickle.dump(train, open(PATH/"train123.pickle", 'wb'))
#pickle.dump(valid, open(PATH/"valid123.pickle", 'wb'))
#pickle.dump(test, open(PATH/"test123.pickle", 'wb'))

## Building vocabulary

In [8]:
# Reload the tokenized splits from disk. Context managers close each file
# as soon as it has been read instead of leaving the handles open.
# NOTE: only unpickle files you produced yourself -- pickle can run arbitrary code.
with open(PATH/"train123.pickle", "rb") as f:
    train = pickle.load(f)
with open(PATH/"valid123.pickle", "rb") as f:
    valid = pickle.load(f)
with open(PATH/"test123.pickle", "rb") as f:
    test = pickle.load(f)

In [9]:
train.shape, valid.shape, test.shape

((4800000,), (600000,), (600000,))

In [19]:
reviews = [[word for sent in sample['review'] for word in sent] + sample['summary'] for sample in train]

In [20]:
reviews 

[['inexpensive',
  'method',
  'club',
  'face',
  'clean',
  'golf',
  'course',
  '.',
  'buy',
  '2',
  '"',
  'wear',
  '"',
  'easily',
  '.',
  'handy',
  'tool'],
 ['cateye',
  'mc100w',
  'nice',
  'computer',
  ',',
  'suffers',
  'poor',
  'mounting',
  'system',
  '.',
  'mounting',
  'bracket',
  'attached',
  'handlebar',
  'tie',
  'wraps',
  '.',
  'way',
  'tighten',
  'computer',
  'stable',
  '.',
  'result',
  'pushing',
  'button',
  'ofter',
  'requires',
  'hands',
  ',',
  'hold',
  'computer',
  'push',
  'button',
  '.',
  'poor',
  'mounting',
  'system'],
 ['love',
  'shoes',
  '!',
  'adorable',
  '-',
  '.',
  'exactly',
  'look',
  'like',
  'pictures',
  '.',
  'definitely',
  'infants',
  '/',
  'toddlers',
  'walking',
  '.',
  'sturdy',
  'soles',
  'good',
  'tread',
  'grip',
  'floor',
  '/',
  'ground',
  '!',
  'true',
  'size',
  '.',
  'son',
  'growing',
  'size',
  '3',
  'size',
  '4',
  'slightly',
  'big',
  '.',
  'adorable'],
 ['truly',
 

In [21]:
def build_dict(train, max_words=50000):
    """
        Build a dictionary for the words in reviews.
        Only the max_words ones are kept and the remaining will be mapped to <UNK>.

        Parameters:
            train: iterable of samples, each a dict with 'review' (list of
                sentences, each a list of words) and 'summary' (list of words).
            max_words: number of most frequent words that get their own index.

        Returns:
            dict mapping word -> index. Index 0 is "<pad>", index 1 is "<unk>",
            and the most frequent words get 2, 3, ... in decreasing frequency
            (ties keep first-seen order).
    """
    word_count = Counter()
    # Count straight from the samples instead of first building a flattened
    # copy of every review; the counts and their first-seen order are the same.
    for sample in train:
        for sent in sample['review']:
            word_count.update(sent)
        word_count.update(sample['summary'])

    # Indices start at 2 because 0 and 1 are reserved for the special tokens.
    word_dict = {
        word: rank + 2
        for rank, (word, _) in enumerate(word_count.most_common(max_words))
    }
    word_dict["<pad>"] = 0
    word_dict["<unk>"] = 1
    return word_dict

In [22]:
word_dict = build_dict(train)

In [24]:
# Persist the vocabulary; the context manager flushes and closes the file
# before the next cell reads it back.
vocab_file = PATH/"vocab123.pickle"
with open(vocab_file, 'wb') as f:
    pickle.dump(word_dict, f)

In [25]:
word_dict = pickle.load(open(PATH/"vocab123.pickle", "rb"))

In [26]:
len(word_dict)

50002

In [32]:
def words2index(data, word_dict):
    """Encode tokenized samples as integer indices and return them as a DataFrame.

    Every word is looked up in ``word_dict``; words that are missing map to 1.
    Empty sentences inside a review are dropped. The result has the columns
    ``review`` (list of index lists), ``summary`` (index list) and ``rating``.
    """
    unk_index = 1  # fallback index for out-of-vocabulary words
    encoded_reviews = []
    encoded_summaries = []
    ratings = []
    for sample in data:
        encoded_reviews.append([
            [word_dict.get(token, unk_index) for token in sentence]
            for sentence in sample['review']
            if len(sentence) > 0
        ])
        encoded_summaries.append(
            [word_dict.get(token, unk_index) for token in sample['summary']]
        )
        ratings.append(sample['rating'])
    return pd.DataFrame({
        "review": encoded_reviews,
        "summary": encoded_summaries,
        "rating": ratings,
    })

In [33]:
df = words2index(train, word_dict)

In [34]:
df.head()

Unnamed: 0,review,summary,rating
0,"[[1345, 2050, 1471, 362, 284, 3294, 288, 2], [...","[926, 534]",5
1,"[[38160, 1, 32, 343, 3, 4892, 407, 2662, 299, ...","[407, 2662, 299]",2
2,"[[13, 352, 4], [1604, 6, 2], [220, 63, 10, 415...",[1604],5
3,"[[336, 140, 7, 2], [14, 47, 977, 3724, 5477, 1...","[8, 1157, 3, 1662, 15037]",5
4,"[[1879, 67, 49660, 2040, 1811, 3, 184, 2], [30...",[85],3


In [35]:
pickle.dump(df, open(PATH/"Train123.pickle", 'wb'))

In [36]:
# Index the validation and test splits with the same vocabulary and save them.
# Context managers make sure each pickle is flushed and closed once written.
df = words2index(valid, word_dict)
with open(PATH/"Valid123.pickle", 'wb') as f:
    pickle.dump(df, f)
df = words2index(test, word_dict)
with open(PATH/"Test123.pickle", 'wb') as f:
    pickle.dump(df, f)

## Filtering empty reviews

In [None]:
# Reload the indexed DataFrames; context managers close each file after reading.
# NOTE: only unpickle files you produced yourself -- pickle can run arbitrary code.
with open(PATH/"Train123.pickle", "rb") as f:
    train_df = pickle.load(f)
with open(PATH/"Valid123.pickle", "rb") as f:
    valid_df = pickle.load(f)
with open(PATH/"Test123.pickle", "rb") as f:
    test_df = pickle.load(f)

In [None]:
def filter_fn(row):
    """Return True when the row has at least one token in its review or summary."""
    # Total token count: every sentence of the review plus the summary.
    n_tokens = sum(len(sent) for sent in row["review"]) + len(row["summary"])
    return n_tokens > 0

# Build a boolean mask per split (filter_fn is called once per row because of axis=1)
# and keep only the rows for which it returned True.
m = train_df.apply(filter_fn, axis=1)
train_df = train_df[m]

# Same filtering for the validation split; `m` is reused as the mask name.
m = valid_df.apply(filter_fn, axis=1)
valid_df = valid_df[m]

# Same filtering for the test split.
m = test_df.apply(filter_fn, axis=1)
test_df = test_df[m]

In [None]:
train_df.shape, valid_df.shape, test_df.shape

In [12]:
train_df.shape, valid_df.shape, test_df.shape

((4799998, 3), (600000, 3), (599997, 3))

In [13]:
train_df.head()

Unnamed: 0,review,summary,rating
0,"[[1237, 2102, 1463, 377, 286, 3089, 304, 2], [...","[842, 485]",5
1,"[[37311, 0, 25, 347, 3, 4987, 332, 2681, 303, ...","[332, 2681, 303]",2
2,"[[13, 318, 4], [1370, 6, 2], [207, 64, 10, 418...",[1370],5
3,"[[336, 143, 8, 2], [14, 50, 1018, 3686, 5491, ...","[7, 1123, 3, 1633, 15383]",5
4,"[[1920, 63, 49809, 2055, 1841, 3, 193, 2], [35...",[87],3


In [14]:
pickle.dump(train_df, open(PATH/"Train123.pickle", 'wb'))

In [15]:
pickle.dump(valid_df, open(PATH/"Valid123.pickle", 'wb'))

In [16]:
# Save the filtered *test* split. This cell previously dumped valid_df into
# Test123.pickle, which made the saved test set a copy of the validation set.
with open(PATH/"Test123.pickle", 'wb') as f:
    pickle.dump(test_df, f)

## References
* https://github.com/Shivanshu-Gupta/hierarchical-attention-network <br>
