### Overview

Decimate reviews by category.  The notebook reads each original category data
file, randomly chooses at most `MAX_PER_CATEGORY` (currently 100,000) reviews
in the file that actually have review text (a small number do not), then
writes those reviews to a new file.

#### Input Files

The input files should be the original data files as downloaded, stored in the subdirectory `original/`.  Example `original/Kindle_Store_5.json.gz`.

#### Output Files

The output files will be placed in a subdirectory called `decimated/` with the
same basename as the original file, and the extension `.json`.  Example:
`decimated/Kindle_Store_5.json`.  Note the output files are not compressed.  

#### Shuffling

The random generator used to decimate and shuffle the reviews is derived from
`DECIMATE_SEED`.  Because the notebook shuffles the output reviews, one can
choose training and validation sets for any category just be splitting reviews
in the corresponding file.

In [1]:
import gzip
import json
import random

BASE_NAMES = [
    "AMAZON_FASHION_5",
    "All_Beauty_5",
    "Appliances_5",
    "Arts_Crafts_and_Sewing_5",
    "Automotive_5",
    "Books_5",
    "CDs_and_Vinyl_5",
    "Cell_Phones_and_Accessories_5",
    "Clothing_Shoes_and_Jewelry_5",
    "Digital_Music_5",
    "Electronics_5",
    "Gift_Cards_5",
    "Grocery_and_Gourmet_Food_5",
    "Home_and_Kitchen_5",
    "Industrial_and_Scientific_5",
    "Kindle_Store_5",
    "Luxury_Beauty_5",
    "Magazine_Subscriptions_5",
    "Movies_and_TV_5",
    "Musical_Instruments_5",
    "Office_Products_5",
    "Patio_Lawn_and_Garden_5",
    "Pet_Supplies_5",
    "Prime_Pantry_5",
    "Software_5",
    "Sports_and_Outdoors_5",
    "Tools_and_Home_Improvement_5",
    "Toys_and_Games_5",
    "Video_Games_5",
]

DECIMATE_SEED = 1892

MAX_PER_CATEGORY = 100_000

In [2]:
def original_filename(basename:str):
    return f"original/{basename}.json.gz"

def decimated_filename(basename:str):
    return f"decimated/{basename}.json"

In [3]:
#
# Return the zero-based indices of reviews in an original file that have
# non-empty string reviews.
#

def good_line_indices(basename:str):
    result = []
    with gzip.open(original_filename(basename)) as inputfile:
        for index, line in enumerate(inputfile):
            review = json.loads(line)
            text = review.get('reviewText')
            if type(text) is str and text and not text.isspace():
                result.append(index)
    return result

#
# Return a randomly selected subset of indices as a list in random order.  The
# size of the subset is min(len(indices),max_reviews).  The random number
# generator used is seeded with the given seen.
#

def decimate_indices(indices:list[int],max_reviews:int,seed:int):
    randgen = random.Random(seed)
    indices = list(indices)
    randgen.shuffle(indices)
    return indices[:max_reviews]


In [4]:
def decimate_reviews(basename:str,indices:list[int]):
    index_map = { indices[i]:i for i in range(len(indices)) }
    survivors:list[bytes|None] = [ None ] * len(indices)
    with gzip.open(original_filename(basename)) as inputfile:
        for index, line in enumerate(inputfile):
            if index in index_map:
                assert survivors[index_map[index]] is None
                survivors[index_map[index]] = line
    with open(decimated_filename(basename),"wb") as outputfile:
        for line in survivors:
            assert line is not None
            outputfile.write(line)

In [5]:
!mkdir original
!mv *.json.gz original

mkdir: original: File exists
mv: rename *.json.gz to original/*.json.gz: No such file or directory


In [6]:
DECIMATE_FILES = True

if DECIMATE_FILES:
    for k, basename in enumerate(BASE_NAMES):
        print(f"Processing {basename}")
        print("    ...scanning")
        indices = good_line_indices(basename)
        print("    ...decimating")
        indices = decimate_indices(indices,MAX_PER_CATEGORY,DECIMATE_SEED+k)
        decimate_reviews(basename,indices)

Processing AMAZON_FASHION_5
    ...scanning
    ...decimating
Processing All_Beauty_5
    ...scanning
    ...decimating
Processing Appliances_5
    ...scanning
    ...decimating
Processing Arts_Crafts_and_Sewing_5
    ...scanning
    ...decimating
Processing Automotive_5
    ...scanning
    ...decimating
Processing Books_5
    ...scanning
    ...decimating
Processing CDs_and_Vinyl_5
    ...scanning
    ...decimating
Processing Cell_Phones_and_Accessories_5
    ...scanning
    ...decimating
Processing Clothing_Shoes_and_Jewelry_5
    ...scanning
    ...decimating
Processing Digital_Music_5
    ...scanning
    ...decimating
Processing Electronics_5
    ...scanning
    ...decimating
Processing Gift_Cards_5
    ...scanning
    ...decimating
Processing Grocery_and_Gourmet_Food_5
    ...scanning
    ...decimating
Processing Home_and_Kitchen_5
    ...scanning
    ...decimating
Processing Industrial_and_Scientific_5
    ...scanning
    ...decimating
Processing Kindle_Store_5
    ...scanning
  