In [2]:
import gzip
path = "/Users/yaohanli/Downloads/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"
f = gzip.open(path, 'rt')

In [3]:
import csv
reader = csv.reader(f, delimiter = '\t')

In [4]:
header = next(reader)

In [5]:
dataset = []
for line in reader:
    d = dict(zip(header, line))
    for field in ['helpful_votes', 'star_rating', 'total_votes']:
        d[field] = int(d[field])
    for field in ['verified_purchase', 'vine']:
        if d[field] == 'Y':
            d[field] = True
        else:
            d[field] = False
        dataset.append(d)

In [6]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '24371595',
 'review_id': 'R27ZP1F1CD0C3Y',
 'product_id': 'B004LLIL5A',
 'product_parent': '346014806',
 'product_title': 'Amazon eGift Card - Celebrate',
 'product_category': 'Gift Card',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': False,
 'verified_purchase': True,
 'review_headline': 'Five Stars',
 'review_body': 'Great birthday gift for a young adult.',
 'review_date': '2015-08-31'}

In [8]:
ratings = [d['star_rating'] for d in dataset]
sum(ratings)/len(ratings) ##average rating

4.731333018677096

In [9]:
## Rating distribution
ratingCounts = {1: 0, 2: 0, 3: 0, 4:0, 5:0}
for d in dataset:
    ratingCounts[d['star_rating']] += 1

In [10]:
ratingCounts

{1: 9532, 2: 3120, 3: 6294, 4: 19616, 5: 258058}

In [11]:
from collections import defaultdict

In [12]:
ratingCounts = defaultdict(int)
for d in dataset:
    ratingCounts[d['star_rating']] += 1

In [13]:
ratingCounts

defaultdict(int, {5: 258058, 1: 9532, 4: 19616, 2: 3120, 3: 6294})

In [15]:
verifiedCounts = defaultdict(int)
for d in dataset:
    verifiedCounts[d['verified_purchase']] += 1

In [16]:
verifiedCounts

defaultdict(int, {True: 270578, False: 26042})

In [17]:
verifiedCounts = {True: 0, False: 0}
for d in dataset:
    verifiedCounts[d['verified_purchase']] += 1

In [18]:
verifiedCounts

{True: 270578, False: 26042}

In [19]:
## We can use defaultdict to determine product popularity (count which products appear most in the dataset)
productCounts = defaultdict(int)
for d in dataset:
    productCounts[d['product_id']] += 1

In [20]:
counts = [(productCounts[p], p) for p in productCounts]

In [21]:
counts.sort()

In [25]:
counts[-10:]

[(4076, 'B004KNWWO0'),
 (4346, 'B0066AZGD4'),
 (5260, 'BT00DDC7CE'),
 (5286, 'B004LLIKY2'),
 (6814, 'BT00DDC7BK'),
 (6880, 'BT00CTOUNS'),
 (8566, 'B00IX1I3G6'),
 (10068, 'BT00DDVMVQ'),
 (12074, 'B00A48G0D4'),
 (57410, 'B004LLIKVU')]

In [27]:
## Compute the average rating for each product, which requires that we first construct the list of rating for each product
## This can also be done using defaultdict, with the "list" subclass
ratingsPerProduct = defaultdict(list)
for d in dataset:
    ratingsPerProduct[d['product_id']].append(d['star_rating'])

In [28]:
averageRatingPerProduct = {}
for p in ratingsPerProduct:
    averageRatingPerProduct[p] = sum(ratingsPerProduct[p]) / len(ratingsPerProduct[p])

In [29]:
averageRatingPerProduct

{'B004LLIL5A': 4.811214953271028,
 'B004LLIKVU': 4.802891482320153,
 'B00IX1I3G6': 4.734064907774924,
 'B005ESMGV4': 4.076923076923077,
 'B004KNWWU4': 4.759784075573549,
 'BT00CTP2EE': 4.323076923076923,
 'B004W8D102': 4.717703349282297,
 'B00H5BNLUS': 4.884615384615385,
 'B004KNWX6C': 4.836893203883495,
 'BT00CTOYC0': 4.691860465116279,
 'B00H5BMH44': 4.8065454545454545,
 'B005ESMMKE': 4.75,
 'B00EPLT448': 4.85,
 'B005ESMKMO': 4.8,
 'B00PG40PAK': 5.0,
 'B00G4IWEZG': 4.7207207207207205,
 'B007V6EVY2': 4.753396029258099,
 'B007V6ETDK': 4.783359497645212,
 'B00A48G0D4': 4.764286897465628,
 'B00E1QB6ZC': 4.731182795698925,
 'B004W8D0Y4': 4.8175487465181055,
 'B005Z3D5OU': 4.225,
 'B00BWDHGFG': 4.7272727272727275,
 'B004LLIL4G': 4.73156089193825,
 'B00Q5BNBC2': 4.833333333333333,
 'B00CHSWJPC': 4.854251012145749,
 'B00BWDH368': 4.824273072060683,
 'B004LLILGO': 4.745469522240527,
 'B005FGQIL4': 4.546052631578948,
 'B00BWDH3VS': 4.762342135476464,
 'B00CHQKIDE': 4.521739130434782,
 'B00CHQK

In [31]:
## Sort by ratings, and also filter to only include resonably popular products:
topRated = [(averageRatingPerProduct[p], p) for p in averageRatingPerProduct if len(ratingsPerProduct[p]) > 50] 
## Only products with more than 50 reviews

In [32]:
topRated.sort()

In [33]:
topRated[-10:]

[(4.9423076923076925, 'B00SNMPQYC'),
 (4.944444444444445, 'B007V6EWKK'),
 (4.947368421052632, 'B004LLIL5K'),
 (4.955882352941177, 'B00H5BNKYA'),
 (4.959183673469388, 'B00Q5BOQVC'),
 (4.961538461538462, 'B00AF0KAWI'),
 (4.962962962962963, 'B00CRQ4XA8'),
 (4.966101694915254, 'B00P8N49M4'),
 (4.967741935483871, 'B00I542D5I'),
 (4.975609756097561, 'B00CT78POK')]