- https://nijianmo.github.io/amazon/index.html
- https://jmcauley.ucsd.edu/data/amazon/ 
- https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV

# get data

    download from http://localhost:8080/notebooks/git/product-category/notebooks/secrets_get_amazon_data.ipynb

In [1]:
import pandas as pd
import gzip
import json

In [8]:
pdata = "/data/git/product-category/data/amazon_meta_data"
!ls -hl {pdata}
fnms = !ls {pdata}
fnms = [o for o in fnms if o.endswith('.json.gz')]
domains = [o.strip('meta_').strip('.json.gz') for o in fnms]
dmn2fnm = dict(zip(domains, fnms))
dmn2fnm

total 13G
-rw-rw-r-- 1 ubuntu ubuntu   30M Dec  2 00:07 meta_AMAZON_FASHION.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  9.6M Aug  7  2020 meta_All_Beauty.json.gz
-rw-rw-r-- 1 ubuntu ubuntu   57M Aug  7  2020 meta_Appliances.json.gz
-rw-rw-r-- 1 ubuntu ubuntu   57M Aug  7  2020 meta_Appliances.json.gz.1
-rw-rw-r-- 1 ubuntu ubuntu  125M Aug  7  2020 meta_Arts_Crafts_and_Sewing.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  1.5G Aug  7  2020 meta_Automotive.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  1.2G Aug  7  2020 meta_Books.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  1.2G Aug  7  2020 meta_Books.json.gz.1
-rw-rw-r-- 1 ubuntu ubuntu  153M Aug  7  2020 meta_CDs_and_Vinyl.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  333M Aug  7  2020 meta_Cell_Phones_and_Accessories.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  1.5G Dec  2 00:18 meta_Clothing_Shoes_and_Jewelry.json.gz
-rw-rw-r-- 1 ubuntu ubuntu   12M Aug  7  2020 meta_Digital_Music.json.gz
-rw-rw-r-- 1 ubuntu ubuntu  1.2G Aug  7  2020 meta_Electronics.json.gz
-rw-rw-r-- 1 ubu

{'AMAZON_FASHION': 'meta_AMAZON_FASHION.json.gz',
 'All_Beauty': 'meta_All_Beauty.json.gz',
 'Appliance': 'meta_Appliances.json.gz',
 'Arts_Crafts_and_Sewi': 'meta_Arts_Crafts_and_Sewing.json.gz',
 'Automotive': 'meta_Automotive.json.gz',
 'Book': 'meta_Books.json.gz',
 'CDs_and_Vinyl': 'meta_CDs_and_Vinyl.json.gz',
 'Cell_Phones_and_Accessorie': 'meta_Cell_Phones_and_Accessories.json.gz',
 'Clothing_Shoes_and_Jewelry': 'meta_Clothing_Shoes_and_Jewelry.json.gz',
 'Digital_Music': 'meta_Digital_Music.json.gz',
 'Electronic': 'meta_Electronics.json.gz',
 'Gift_Card': 'meta_Gift_Cards.json.gz',
 'Grocery_and_Gourmet_Food': 'meta_Grocery_and_Gourmet_Food.json.gz',
 'Home_and_Kitche': 'meta_Home_and_Kitchen.json.gz',
 'Industrial_and_Scientific': 'meta_Industrial_and_Scientific.json.gz',
 'Kindle_Store': 'meta_Kindle_Store.json.gz',
 'Luxury_Beauty': 'meta_Luxury_Beauty.json.gz',
 'Magazine_Subscripti': 'meta_Magazine_Subscriptions.json.gz',
 'Movies_and_TV': 'meta_Movies_and_TV.json.gz',
 

In [35]:
KEYS2USE = set(['category', 'description', 'title', 'brand', 'feature', 'asin'])

In [48]:
# https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV#scrollTo=7igYuRaV4bF7
    
### load the meta data
def get_meta_data(domain, nrows=None):
    keys2use = KEYS2USE
    fnm = dmn2fnm[domain]
    data = []
    with gzip.open(f'{pdata}/{fnm}') as f:
        for i,l in enumerate(f):
            dat = json.loads(l.strip())
            dat = {k:v for k,v in dat.items() if k in keys2use}
            data.append(dat)
            if nrows and i>nrows: break

    # total length of list, this number equals total number of products
#     print("len(data)", len(data))

    # convert list into pandas dataframe
    df = pd.DataFrame.from_dict(data)
#     print("len(df)", len(df))

    ### remove rows with unformatted title (i.e. some 'title' may still contain html style content)
    df3 = df.fillna('')
    df4 = df3[df3.title.str.contains('getTime')] # unformatted rows
    df5 = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows
#     print("len(df4)", len(df4))
#     print("len(df5)", len(df5))

    return df5

In [53]:
df = get_meta_data(domains[3], nrows=10000)
df.head(3)

Unnamed: 0,category,description,title,brand,feature,asin
0,"[Arts, Crafts & Sewing, Sewing, Trim & Embelli...",[The patch features the sweaty masculine hands...,You Son of a Bitch! 1987 Embroidered Patch,Honchosfx,[You son of a bitch patch - exclusive to Honch...,6665560953
1,"[Arts, Crafts & Sewing, Crafting, Paper & Pape...","[With 5 packs stars folding paper, each pack a...",Origami Stars Papers Package 1H (5 packs),,[],7000000376
2,"[Arts, Crafts & Sewing, Painting, Drawing & Ar...",[],Yi De Ge Chinese Calligraphy Sumi Drawing Blac...,MasterChinese,[],7000001089


# preprocess data

In [54]:
dfs = []
for dmn in domains:
    df = get_meta_data(dmn, nrows=1000)
    if set(df.columns)!= KEYS2USE: continue
    df = df[df.category.apply(len)>0]
    if len(df)==0: continue
    print(dmn, df.shape)
    df['domain'] = dmn
    display(df.sample(2))
    dfs.append(df)

Appliance (989, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
854,"[Appliances, Parts &amp; Accessories, Refriger...","[, <br /><strong>Replace Every Six Months</str...","ALPINEWATER 4396841, 4396710, Kenmore 9030, 90...",ALPINEWATER,[The real filter life span depends on water qu...,B000GJYMJK,Appliance
422,"[Appliances, Parts & Accessories, Food Waste D...",[The EZ-FLO garbage disposer splash guard is d...,EZ-FLO 30102 Garbage Disposer Splash Guard,EZ-Flo,"[Splash guard, Fits *insinkerator, Black]",B00068UTY8,Appliance


Arts_Crafts_and_Sewi (971, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
812,"[Arts, Crafts & Sewing, Sewing, Sewing Notions...",[Ideal for carrying bulky items! Heavy duty fu...,VELCRO Brand All-Purpose Strap with Handle | L...,VELCRO Brand,[NOTICE: Amazon is a trusted seller of Velcro ...,B00006IC2S,Arts_Crafts_and_Sewi
831,"[Arts, Crafts & Sewing, Crafting, Paper & Pape...",[Express your creativity! Create announcements...,"Geographics Design Paper, Scroll, 24 lb, 8.5 x...",Geographics,"[Express your creativity!, Create announcement...",B00006IDX0,Arts_Crafts_and_Sewi


Automotive (945, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
528,"[Automotive, Exterior Accessories, Body Armor]",[],Megaman X Ultimate Armor Kit - Force Armor,,[],B00009WNYB,Automotive
116,"[Automotive, Exterior Accessories, Bumper Stic...",[If it doesn't have a Flag It header - its an ...,Flag It California Heavy Duty Vinyl Bumper Sti...,Flag It,"[Great for windows, cars, travel trailers, boa...",9539751322,Automotive


Book (842, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
253,"[Books, Children's Books]",[],Has Winter Come?,Visit Amazon's Wendy Watson Page,[],1953184,Book
257,"[Books, Literature & Fiction, World Literature]",[],VANCOUVER A Novel (Signed),"David & Griffiths, Alison Cruise",[],2006588,Book


CDs_and_Vinyl (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
23,"[CDs & Vinyl, New Age, Meditation]",[Meditation for Success by Michael Midkiff ena...,"<span class=""a-size-medium a-color-secondary a...",Michael Midkiff,[],0615205399,CDs_and_Vinyl
364,"[CDs & Vinyl, Rock, Progressive, Progressive R...",[Yes Spin-Off],"Anderson, Bruford, Wakeman &amp; Howe VHS",,[],630154224X,CDs_and_Vinyl


Cell_Phones_and_Accessorie (978, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
470,"[Cell Phones & Accessories, Cases, Holsters & ...",[],3 Items Combo For Nokia Lumia 822 (Verizon) Bl...,,[],9862165677,Cell_Phones_and_Accessorie
581,"[Cell Phones & Accessories, Accessories, Acces...",[],Motorola Electrify M XT901 (US Cellular) Color...,,[],9868874505,Cell_Phones_and_Accessorie


Clothing_Shoes_and_Jewelry (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
940,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ri...",[This fabulous channel-set diamond ring is sub...,14k White Gold Channel-set Princess Cut Diamon...,Amazon Collection,[All our diamond suppliers confirm that they c...,B0000CEMGJ,Clothing_Shoes_and_Jewelry
227,"[Clothing, Shoes & Jewelry, Men, Clothing, Shi...","[Pour some martinis, put on your favorite jazz...",Tommy Bahama Geo-Tastic Silk Camp Shirt,,"[Clothing, Shoes & Jewelry, Men, Clothing, Shi...",9789898282,Clothing_Shoes_and_Jewelry


Electronic (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
99,"[Electronics, eBook Readers & Accessories, Cov...",[],NOOK HD 7&quot; Cover - Lautner Cover (Dark Fu...,Nook,[],594513189,Electronic
310,"[Electronics, Computers &amp; Accessories, Tab...",[PREMIUM USB Adapter Power Kit for Acer Iconia...,PREMIUM USB Adapter Power Kit for Acer Iconia ...,Factory Direct,[This all in one accessory kit includes everyt...,1060119811,Electronic


Gift_Card (997, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
840,"[Gift Cards, Gift Cards, No expiration, no fee...",[],Amazon eGift Card - Cake It (Animated) [Hallm...,,"[<span class=""a-size-base a-color-secondary"">\...",B00H5BM5VE,Gift_Card
396,"[Gift Cards, Gift Cards]",[],Amazon eGift Card,,[],B00BWDHT62,Gift_Card


Grocery_and_Gourmet_Food (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
93,"[Grocery & Gourmet Food, Breakfast Foods, Brea...","[ZonePerfect Nutrition Bars, Fudge Graham/Choc...","ZonePerfect Nutrition Bars, Fudge Graham/Choco...",Zone Perfect,[],B0000AA8UL,Grocery_and_Gourmet_Food
486,"[Grocery & Gourmet Food, Meat & Seafood, Bacon]","[Pancetta is a traditional Italian specialty, ...",Pancetta by Beretta (14 ounce),Fratelli Beretta,[],B0000D9NCI,Grocery_and_Gourmet_Food


Home_and_Kitche (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
42,"[Home & Kitchen, Bedding, Bed Pillows & Positi...",[We don't know when or if this item will be ba...,My Sticker Book: Princess Swing &ndash; Reusab...,Five Star,[We don't know when or if this item will be ba...,0983124248,Home_and_Kitche
928,"[Home & Kitchen, Kitchen & Dining, Bakeware, P...",[Heavy-gauge steel crisper is perfect for crea...,Chicago Metallic Commercial Pizza Crisper,Chicago Metallic,"[Perfect for frozen, leftover, or homemade piz...",B00004R91W,Home_and_Kitche


Industrial_and_Scientific (981, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
430,"[Industrial & Scientific, Tapes, Adhesives & S...","[3M 198 Scotch 1"" x 36"" Reflective Tape, White...",3M 198 Scotch 1&quot; x 36&quot; Reflective Ta...,3M,"[3M #198 1x40 Silver Grey Refle Tape, 3M COMPANY]",B00004Z49Q,Industrial_and_Scientific
779,"[Industrial & Scientific, Material Handling Pr...",[Tough molded rubber wheel and heavy steel mou...,Steelex D2622 2-Inch 65-Pound Swivel Rubber Pl...,Steelex,"[2-Inch size, 2-3/8-Inch by 2-Inch plate, Swiv...",B0000DD1FC,Industrial_and_Scientific


Kindle_Store (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
643,"[Kindle Store, Kindle eBooks, Health, Fitness ...",[],,Visit Amazon's Sylvia K. Blood Page,[],B000OI18EA,Kindle_Store
304,"[Kindle Store, Kindle eBooks, Literature &amp;...",[],,Public Domain Books,[],B000JMKWAW,Kindle_Store


Magazine_Subscripti (946, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
771,"[Magazine Subscriptions, Children & Teen, Chil...",[Magazine for girls of ages 6-12 filled with e...,"<span class=""a-size-medium a-color-secondary""",Bluffton News Publishing Co,[],B00007AXWU,Magazine_Subscripti
512,"[Magazine Subscriptions, Sports, Recreation & ...",[Contains information of interest to individua...,"<span class=""a-size-medium a-color-secondary""",Harris Publishing Inc,[],B00006KX8R,Magazine_Subscripti


Movies_and_TV (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
281,"[Movies & TV, Studio Specials, Sony Pictures H...","[Monty Python And The Holy Grail, Could this b...",Monty Python and the Holy Grail,Graham Chapman,[],767824571,Movies_and_TV
692,"[Movies & TV, Studio Specials, MGM Home Entert...","[Hot stars James Marsden (""Bella Mafia""), Kati...",Disturbing Behavior VHS,James Marsden,[],792840119,Movies_and_TV


Musical_Instrument (952, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
182,"[Musical Instruments, Studio Recording Equipme...",[],MixMan DJ Megamix,Mixman Technologies,[Easily create professional quality music on y...,B00004VWIF,Musical_Instrument
96,"[Musical Instruments, Amplifiers & Effects, Gu...",[Pedal is green in color. Maximum effect contr...,Boss Ph-2 Super Phaser,BOSS,[UPC of this product is G60172171098. It is ba...,1925542475,Musical_Instrument


Office_Product (940, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
719,"[Office Products, Office &amp; School Supplies...",[],2012 Switched on Schoolhouse French 1 Elective...,Alpha Omega Publications,[],740327003,Office_Product
80,"[Office Products, Office &amp; School Supplies...",[Reproducible pages on the back of each chart!],Scholastic Ocean Adventure! Incentive Chart (T...,Scholastic,"[Measures 17"" x 22""., Plastic-coated for color...",439505747,Office_Product


Patio_Lawn_and_Garde (1002, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
959,"[Patio, Lawn & Garden, Pest Control, Traps]","[RIGID PLASTIC; NON-TIPPING WITH 2 CLAMPS\, Th...",LINT TRAP W/VINYL -,Unknown,"[""PROFLEX"" INDOOR DRYER VENT KIT, 4""x5', White...",B00004YWK5,Patio_Lawn_and_Garde
48,"[Patio, Lawn & Garden, Outdoor Dcor, Garden Sc...",[],Japanese Koinobori Koi Nobori NYLON Blue Carp ...,Yokohama Gifts,[This listing is for 1 Set of Japanese Koinobo...,9622447279,Patio_Lawn_and_Garde


Pet_Supplie (939, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
688,"[Pet Supplies, Dogs, Treats, Cookies, Biscuits...",[<b>?Why do you need anti-bark collar StopWoof...,[NEW 2019] Humane Bark collar-Dog bark Collar-...,Purina,[? EFFECTIVE: Studies show that 80-85% of do...,B0000BXJG1,Pet_Supplie
529,"[Pet Supplies, Small Animals, Food]",[Your guinea pig or rabbit wont be able to res...,Oxbow Orchard Grass Hay,Oxbow Animal Health,[],B00008DFPK,Pet_Supplie


Software (776, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
528,"[Software, Education &amp; Reference, Languages]",[This used set includes 1 available French Lev...,Rosetta Stone Homeschool French Level 1-3 Set ...,Rosetta Stone,[Interactive software; Parent's guide; Supplem...,1607179342,Software
702,"[Software, Education & Reference, Test Prepara...","[Affordable, current, and complete study guide...","Essentials of Nursing Care Health Safety, N104...",MyStudyGroup101 LLC,[],1936452006,Software


Sports_and_Outdoor (916, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
219,"[Sports & Outdoors, Fan Shop, Cell Phone Acces...",[Protect and personalize your Verizon Samsung ...,Black Spade Skull Poker Card Design Rubberized...,Handheldfashion,[High Quality Super Clear LCD Screen Protector...,9861051139,Sports_and_Outdoor
350,"[Sports & Outdoors, Outdoor Recreation, Cyclin...",[U-Lock Padlocks Keyed Sports Series 8-1/4-inc...,"Master Lock 8184DSG U-Lock with Bracket, 8-1/4...",Master Lock,[8-1/4-inch (21cm) wide steel lock body for st...,B00004SQMD,Sports_and_Outdoor


Tools_and_Home_Improvement (951, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
900,"[Tools & Home Improvement, Power & Hand Tools,...",[Precision Miter Box With Saw - PRECISION MITE...,Jorgensen 64016 Precision Miter Saw,Adjustable Clamp Co,[Miter saw for use in various carpentry projec...,B00002244D,Tools_and_Home_Improvement
606,"[Tools & Home Improvement, Power & Hand Tools,...",[],Bosch 1275DVS 3-Inch x 24-Inch Variable Speed ...,Bosch,"[10.5 Amps, 1,150 SFPM power for professional ...",B0000223H8,Tools_and_Home_Improvement


Toys_and_Game (870, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
590,"[Toys & Games, Learning & Education, Reading &...",[<b>After</b> your children play with this awe...,Magnetic Before and After Activity Game - Supe...,Super Duper Publications,"[Sturdy, Colorful, Magnetic Game Board (14"" x ...",1586503839,Toys_and_Game
277,"[Toys & Games, Games, Card Games]",[],Duh: It Will Make You Feel Smart Game,,[],974889148,Toys_and_Game


Video_Game (982, 6)


Unnamed: 0,category,description,title,brand,feature,asin,domain
125,"[Video Games, PC]",[The Limited Edition game features six special...,Battlefield Bad Company 2 Limited Edition,Battlefield Bad Company 2 Limited Edition,[],8990289440,Video_Game
345,"[Video Games, Retro Gaming & Microconsoles, Ni...","[Nintendo 64, <i>Star Wars: Episode 1 Racer</i...",Star Wars - Episode I - Racer,Nintendo,[lose. not with box],B00000J2OO,Video_Game


In [57]:
df = pd.concat(dfs)

df.shape

(22991, 7)

In [59]:
df.sample(3)

Unnamed: 0,category,description,title,brand,feature,asin,domain
185,"[Industrial & Scientific, Tapes, Adhesives & S...",[America's #1 selling latex caulk. ALEXPLUS is...,DAP INC 18152 10.1oz White Alex Plus Acrylic L...,DAP,"[Sturdy and secure construction, Perfect for y...",B00002ND6L,Industrial_and_Scientific
868,"[Office Products, Office &amp; School Supplies...",[This Lang Heart &amp; Home 2016 Wall Calendar...,The Lang Heart &amp; Home 2016 Wall Calendar f...,Lang,"[Artwork from hand painted originals, Brass gr...",0741255162,Office_Product
491,"[Grocery & Gourmet Food, Olives, Pickles & Rel...",[This is an authentic German sauerkraut and is...,Kuehne former Gundelsheim Barrel Sauerkraut (1...,Gundelsheim,[],B0000D9MT2,Grocery_and_Gourmet_Food
