In [2]:
#
# Get the web page for the Amazon data referenced by the LIN127 Project
# description.  Note we need to convert it from a byte sequence to a string.
#

import urllib.request

PAGE_URL = "https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/"

# Use the response as a context manager so the HTTP connection is always
# closed, and pass a timeout so a stalled server cannot hang the kernel.
# read() returns bytes; decode once so `content` is only ever a str.
with urllib.request.urlopen(PAGE_URL, timeout=60) as response:
    content = response.read().decode("utf-8")

# Preview the first 10 lines to confirm we fetched the expected page.
for line in content.split('\n')[:10]:
    print(line)


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
  <head>
    <title>Amazon review data</title>
    <meta name="author" content="Jianmo Ni">
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

    <script type="application/ld+json">


In [3]:
#
# Most links in a web page look something like
#
#   <a href="https://example.com/foo.txt">Foo</a>
#
# Instead of parsing the html, we do something quick and dirty: use a regular
# expression to pull out the URLs referenced from the page that include the
# path segment /categoryFilesSmall/ and end with _5.json.gz.  That corresponds
# to the links in the "Small subsets" table.  (Both filtering elements are
# required.)
#

import re

# Capture the href target of links whose path contains /categoryFilesSmall/ and
# whose name ends in _5.json.gz.  The dots are escaped so they match a literal
# "." -- unescaped, "." matches any character and e.g. "_5xjson.gz" would pass.
SMALL_LINKS_RE = re.compile(r'href\s*=\s*"([^"]+/categoryFilesSmall/[^"]+_5\.json\.gz)"')
# The pattern has exactly one capture group, so findall() returns the captured
# URL strings directly, in document order.
small_links = SMALL_LINKS_RE.findall(content)
print(f"There are {len(small_links)} small links.")
small_links

There are 29 small links.


['https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Appliances_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Automotive_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Books_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/CDs_and_Vinyl_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz',
 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Clothing_Shoes_and_Jewelry_5.json.gz',
 'https://datarepo.eng.ucs

In [4]:
import os

def wget_if_needed(url:str, localname:str|None=None, force=False):
    if localname is None:
        localname = os.path.basename(url)
    if force or not os.path.exists(localname):
        print(f"Fetching {url}...")
        os.system(f"wget -q -O {localname} {url}")
    else:
        print(f"[Already have {url}]")

# Fetch each small-subset archive; wget_if_needed skips files that already
# exist locally, so re-running this cell is cheap.
for link in small_links:
    wget_if_needed(url=link)

print("Done")

Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Appliances_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Automotive_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Books_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/CDs_and_Vinyl_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz...
Fetching https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles

In [5]:
# Long-format listing of the working directory, which is where wget_if_needed
# saves each archive by default; the size column shows what was downloaded.
!ls -l

total 29024512
-rw-r--r--@ 1 tscreven  staff      287013 Apr 28  2023 AMAZON_FASHION_5.json.gz
-rw-r--r--@ 1 tscreven  staff      633507 Apr 28  2023 All_Beauty_5.json.gz
-rw-r--r--@ 1 tscreven  staff       72998 Apr 28  2023 Appliances_5.json.gz
-rw-r--r--@ 1 tscreven  staff    53965563 Apr 28  2023 Arts_Crafts_and_Sewing_5.json.gz
-rw-r--r--@ 1 tscreven  staff   202890818 Apr 28  2023 Automotive_5.json.gz
-rw-r--r--@ 1 tscreven  staff  7096438325 Apr 28  2023 Books_5.json.gz
-rw-r--r--@ 1 tscreven  staff   484144315 Apr 28  2023 CDs_and_Vinyl_5.json.gz
-rw-r--r--@ 1 tscreven  staff   169071325 Apr 28  2023 Cell_Phones_and_Accessories_5.json.gz
-rw-r--r--@ 1 tscreven  staff  1262892731 Apr 28  2023 Clothing_Shoes_and_Jewelry_5.json.gz
-rw-r--r--@ 1 tscreven  staff    19408584 Apr 28  2023 Digital_Music_5.json.gz
-rw-r--r--@ 1 tscreven  staff  1251876861 Apr 28  2023 Electronics_5.json.gz
-rw-r--r--@ 1 tscreven  staff      176952 Apr 28  2023 Gift_Cards_5.json.gz
-rw-r--r--@ 1 tscreven