In [1]:
import torch

import pandas as pd
import numpy as np
from pathlib import Path
import random

import math
import json
import spacy
import pickle

from torchtext.data import Field

In [2]:
from tqdm.notebook import tqdm

In [3]:
# Root directory of the dataset -- change this to your own location.
# Later cells look for the raw dump and its chunks under PATH/"data" and for
# the pickled splits and vocabulary directly under PATH.
PATH = Path("/data2/yinterian/Amazon_review_2014")

## Data download and pre-processing

**Step 0**: Get the data. The compressed file is 18G; downloading it may take 20 minutes or so.

In [4]:
def unpack_dataset():
    # Download the compressed review dump with wget into PATH, then decompress
    # it in place with gunzip. Both lines are IPython shell escapes, so this
    # only works inside a notebook / IPython session.
    # NOTE(review): `$PATH` is presumably expanded by IPython to the notebook's
    # Python variable PATH (not the shell's PATH) -- confirm.
    # NOTE(review): the file lands in PATH itself, while later cells list
    # PATH/data/aggressive_dedup.json -- confirm where the file should live.
    ! wget http://snap.stanford.edu/data/amazon/productGraph/aggressive_dedup.json.gz -P $PATH
    ! gunzip $PATH/aggressive_dedup.json.gz 
#unpack_dataset()   

In [5]:
# the file is 55G
!ls -lhS  $PATH/data/aggressive_dedup.json

-rw-rw-r-- 1 yinterian yinterian 55G Feb 17  2016 /data2/yinterian/Amazon_review_2014/data/aggressive_dedup.json


**Step 1**: Split file into smaller chunks:

I split it into 42 files of around 1.3G each. This is very slow, so you should run it in your terminal. You need to install `jq` first.

`jq -c < $PATH/aggressive_dedup.json | split -l 2000000 `

Here are the files:

In [6]:
!ls  $PATH/data/x*

/data2/yinterian/Amazon_review_2014/data/xaa
/data2/yinterian/Amazon_review_2014/data/xab
/data2/yinterian/Amazon_review_2014/data/xac
/data2/yinterian/Amazon_review_2014/data/xad
/data2/yinterian/Amazon_review_2014/data/xae
/data2/yinterian/Amazon_review_2014/data/xaf
/data2/yinterian/Amazon_review_2014/data/xag
/data2/yinterian/Amazon_review_2014/data/xah
/data2/yinterian/Amazon_review_2014/data/xai
/data2/yinterian/Amazon_review_2014/data/xaj
/data2/yinterian/Amazon_review_2014/data/xak
/data2/yinterian/Amazon_review_2014/data/xal
/data2/yinterian/Amazon_review_2014/data/xam
/data2/yinterian/Amazon_review_2014/data/xan
/data2/yinterian/Amazon_review_2014/data/xao
/data2/yinterian/Amazon_review_2014/data/xap
/data2/yinterian/Amazon_review_2014/data/xaq
/data2/yinterian/Amazon_review_2014/data/xar
/data2/yinterian/Amazon_review_2014/data/xas
/data2/yinterian/Amazon_review_2014/data/xat
/data2/yinterian/Amazon_review_2014/data/xau
/data2/yinterian/Amazon_review_201

**Step 2**: To preprocess the data, I saved the following code in a file called `pre_process_amz_reviews.py` (find it in the repo) and ran it on each part of the original file independently. Here is an example of how to run it on the first part. This is also slow because we are running spaCy.

`python pre_process_amz_reviews.py --input_file data/xaa --output_file amazon_reviews-01.json > out1 &`

If you have many CPUs you can run a few in parallel. 

In [7]:
import html
def load_data(datafile):
    """Read a JSON-lines review file into parallel columns.

    Args:
        datafile: path to a text file with one JSON object per line; each
            object must have the keys 'reviewText', 'summary' and 'overall'.

    Returns:
        dict with
          'review'  -- list of review texts, HTML entities unescaped,
          'summary' -- list of summary texts, HTML entities unescaped,
          'rating'  -- numpy array of the 'overall' values.

    Raises:
        KeyError: if a record lacks one of the expected keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Use a context manager so the handle is closed deterministically, and
    # stream the file line by line instead of materialising it with readlines().
    # Blank lines (e.g. a trailing newline) are skipped rather than crashing.
    with open(datafile, encoding="utf-8") as f:
        samples = [json.loads(line) for line in f if line.strip()]
    data = {}
    data['review'] = [html.unescape(sample['reviewText']) for sample in samples]
    data['summary'] = [html.unescape(sample['summary']) for sample in samples]
    data['rating'] = np.array([sample['overall'] for sample in samples])
    return data

def get_clean_review(review, summ, rating):
    """Turn a parsed review/summary pair into a plain, JSON-serialisable dict.

    Args:
        review: object exposing `.sents`, an iterable of sentences, each an
            iterable of tokens with `.is_stop` and `.text` attributes.
        summ: iterable of tokens with the same two attributes.
        rating: numeric rating; stored as int(rating).

    Returns:
        dict with 'review' (list of sentences, each a list of lower-cased
        token strings; sentences left empty after filtering are dropped),
        'summary' (list of lower-cased token strings) and 'rating' (int).
    """
    def _is_content(tok):
        # keep tokens that are neither stop-words nor pure whitespace
        return not tok.is_stop and tok.text.strip() != ''

    sentences = []
    for sent in review.sents:
        words = [tok.text.lower() for tok in sent if _is_content(tok)]
        if len(words) != 0:
            sentences.append(words)

    summary_words = [tok.text.lower() for tok in summ if _is_content(tok)]

    return {
        'review': sentences,
        'summary': summary_words,
        'rating': int(rating),
    }

def dump_dataset(raw_data, outfile, summary=True):
    """Run spaCy over every review/summary and write one cleaned JSON object per line.

    Args:
        raw_data: dict with 'review' and 'summary' (sequences of strings) and
            'rating' (sequence of numbers), all of the same length -- the
            structure returned by load_data.
        outfile: path of the JSON-lines file to write (opened with mode 'w').
        summary: unused; kept so existing callers keep working.
    """
    # Load the model before opening the output, so a missing model does not
    # leave a freshly truncated, empty output file behind.
    nlp = spacy.load('en_core_web_sm')
    review_docs = nlp.pipe(raw_data['review'])
    summ_docs = nlp.pipe(raw_data['summary'])
    n = len(raw_data['rating'])
    # Both the file and the progress bar are context-managed so they are
    # closed even if processing fails part-way through.
    with open(outfile, 'w') as outf, tqdm(total=n) as pbar:
        for review, summ, rating in zip(review_docs, summ_docs, raw_data['rating']):
            sample = get_clean_review(review, summ, rating)
            outf.write(json.dumps(sample) + '\n')
            pbar.update()

**Step 3**: Split into train / valid / test sets.

In [8]:
def load_data_from_json(datafile):
    """Load a JSON-lines file and return its records as a list.

    Args:
        datafile: path to a text file with one JSON value per line.

    Returns:
        list of the decoded JSON values, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Close the file deterministically and stream it rather than calling
    # readlines(); blank lines are skipped instead of crashing json.loads.
    with open(datafile) as f:
        samples = [json.loads(line) for line in f if line.strip()]
    return samples

# here is a sample of 3  files
#L1 = load_data_from_json(PATH/"amazon_reviews-01.json")
#L2 = load_data_from_json(PATH/"amazon_reviews-02.json")
#L3 = load_data_from_json(PATH/"amazon_reviews-03.json")

In [9]:
def split_data(Ls):
    """Shuffle each input list and split it 80/10/10 into train/valid/test.

    Every list in `Ls` is shuffled and split independently; the pieces are
    then concatenated across lists.

    Args:
        Ls: iterable of sequences of samples (e.g. one list per input file).

    Returns:
        (train, valid, test): three 1-D numpy arrays holding the first 80%,
        the next 10% and the last 10% of each shuffled input.

    Side effects:
        Re-seeds numpy's global random generator with 3 so the split is
        reproducible.
    """
    np.random.seed(seed=3)
    # Each list starts with an empty array so np.concatenate also works when
    # `Ls` is empty, matching the original's accumulation from np.array([]).
    train_parts = [np.array([])]
    valid_parts = [np.array([])]
    test_parts = [np.array([])]
    for L in Ls:
        L = np.array(L)
        np.random.shuffle(L)
        n1 = int(0.8*len(L))
        n2 = int(0.9*len(L))
        # Slice the *current* array for all three pieces (the valid/test
        # slices previously referenced the global `L1` by mistake).
        train_parts.append(L[:n1])
        valid_parts.append(L[n1:n2])
        test_parts.append(L[n2:])
    # Concatenate once at the end instead of re-copying the growing arrays
    # on every iteration.
    train = np.concatenate(train_parts)
    valid = np.concatenate(valid_parts)
    test = np.concatenate(test_parts)
    return train, valid, test

In [19]:
# this is a subset of the data
#train, valid, test = split_data([L1, L2, L3])

In [22]:
train.shape, valid.shape, test.shape

((4800000,), (600000,), (600000,))

In [23]:
train[0]

{'review': [['inexpensive',
   'method',
   'club',
   'face',
   'clean',
   'golf',
   'course',
   '.'],
  ['buy', '2', '"', 'wear', '"', 'easily', '.']],
 'summary': ['handy', 'tool'],
 'rating': 5}

In [30]:
# save your files
#pickle.dump(train, open(PATH/"train123.pickle", 'wb'))
#pickle.dump(valid, open(PATH/"valid123.pickle", 'wb'))
#pickle.dump(test, open(PATH/"test123.pickle", 'wb'))

## Building vocabulary

In [10]:
# Reload the pickled train/valid/test splits from PATH.
# Context managers close each file handle as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open(PATH/"train123.pickle", "rb") as f:
    train = pickle.load(f)
with open(PATH/"valid123.pickle", "rb") as f:
    valid = pickle.load(f)
with open(PATH/"test123.pickle", "rb") as f:
    test = pickle.load(f)

In [11]:
train.shape, valid.shape, test.shape

((4800000,), (600000,), (600000,))

In [37]:
def build_comb_vocab(train, vocab_file):
    """Build one vocabulary over reviews and summaries and pickle it.

    Args:
        train: iterable of samples, each a dict with 'review' (list of
            sentences, each a list of tokens) and 'summary' (list of tokens).
        vocab_file: path the resulting `Field.vocab` object is pickled to.
    """
    # Flatten every review into a single token list. The summary tokens are
    # appended twice (`* 2`), so each summary word counts double toward the
    # frequencies seen by build_vocab.
    reviews = [[word for sent in sample['review'] for word in sent] + sample['summary'] * 2 for sample in train]
    review_field = Field()
    review_field.build_vocab(reviews, min_freq=5, vectors="glove.6B.200d")
    print("Dumping vocab to {}".format(vocab_file))
    # Close the output file explicitly so the pickle is fully flushed to disk.
    with open(vocab_file, 'wb') as f:
        pickle.dump(review_field.vocab, f)
    
#build_comb_vocab(train, PATH/"vocab123.pickle")

.vector_cache/glove.6B.zip: 862MB [06:27, 2.22MB/s]                              
100%|█████████▉| 398721/400000 [00:28<00:00, 13645.56it/s]

Dumping vocab to /data2/yinterian/Amazon_review_2014/vocab123.pickle


100%|█████████▉| 398721/400000 [00:42<00:00, 13645.56it/s]

In [14]:
# Reload the pickled vocabulary; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open(PATH/"vocab123.pickle", "rb") as f:
    vocab = pickle.load(f)

In [17]:
len(vocab)

282174

In [23]:
def words2index(data, vocab, filename):
    """Map every token to its vocabulary index and pickle the result as a DataFrame.

    Args:
        data: iterable of samples, each a dict with 'review' (list of
            sentences, each a list of tokens), 'summary' (list of tokens)
            and 'rating'.
        vocab: object whose `stoi` attribute maps token -> integer index.
            NOTE(review): presumably a torchtext Vocab, whose `stoi` returns
            the unknown-token index for unseen words -- with a plain dict an
            unseen word raises KeyError. Confirm.
        filename: path the DataFrame (columns 'review', 'summary', 'rating')
            is pickled to.
    """
    stoi = vocab.stoi
    # Empty sentences are dropped from each review.
    Reviews = [[[stoi[w] for w in sent] for sent in sample['review'] if len(sent) > 0] for sample in data]
    Summaries = [[stoi[w] for w in sample['summary']] for sample in data]
    ratings = [sample['rating'] for sample in data]
    df = pd.DataFrame({"review": Reviews, "summary": Summaries, "rating": ratings})
    # Close the output file explicitly so the pickle is fully flushed to disk.
    with open(filename, 'wb') as f:
        pickle.dump(df, f)

In [24]:
# Convert each split to vocabulary indices and pickle the resulting DataFrames.
# NOTE(review): the output names differ from the token-level pickles
# (train123.pickle, ...) only by a capital first letter; on a case-insensitive
# file system they would presumably overwrite those inputs -- confirm.
words2index(train, vocab, PATH/"Train123.pickle")
words2index(valid, vocab, PATH/"Valid123.pickle")
words2index(test, vocab, PATH/"Test123.pickle")

## References
* https://github.com/Shivanshu-Gupta/hierarchical-attention-network <br>
* Hierarchical Attention Networks for Document Classification. Zichao Yang, Diyi Yang, Chris Dyer, Xiaodong He, Alex Smola, Eduard Hovy
