In [2]:
import gzip
path = "/mnt/c/Users/user/Downloads/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"
# calling this path from within linux and not windows
f = gzip.open(path,'rt')

In [3]:
import csv
reader = csv.reader(f, delimiter='\t')

In [4]:
header = next(reader) # get the header(fields) of the tsv file as a list

In [5]:
dataset = [] # we want to create a list of dictionaries
for line in reader:
    d = dict(zip(header,line))
    for field in ['helpful_votes', 'star_rating', 'total_votes']:
        d[field] = int(d[field]) # convert string to number for number fields
    for field in ['verified_purchase', 'vine']:
        if d[field] == 'Y':
            d[field] = True # convert string to boolean for boolean fields
        else:
            d[field] = False
    dataset.append(d) # dataset is a list of dictionaries

In [10]:
# Average rating = get all ratings / number of ratings
ratings = [d['star_rating'] for d in dataset]
print(f"Average Gift Card Rating: {sum(ratings) / len(ratings):2f}")

Average Gift Card Rating: 4.731333


In [12]:
# Rating distribution e.g how many ratings for 1 are there, for 2, for 3 and so on.
ratingsCount = {1:0, 2:0, 3:0 , 4:0 , 5:0}
for d in dataset:
    ratingsCount[d['star_rating']] +=1
    

In [13]:
ratingsCount

{1: 4766, 2: 1560, 3: 3147, 4: 9808, 5: 129029}

In [14]:
# The above solution works if there are few possibilities
from collections import defaultdict
ratingsCount = defaultdict(int)
for d in dataset:
    ratingsCount[d['star_rating']] +=1

In [21]:
ratingsCount

defaultdict(int, {5: 129029, 1: 4766, 4: 9808, 2: 1560, 3: 3147})

In [22]:
verifiedCounts = defaultdict(int)
for d in dataset:
    verifiedCounts[d['verified_purchase']] +=1

In [23]:
verifiedCounts

defaultdict(int, {True: 135289, False: 13021})

In [25]:
productCounts = defaultdict(int)
for d in dataset:
    productCounts[d['product_id']] +=1

counts = [(productCounts[p],p) for p in productCounts] 
# p=key i.e {product_id:count} counts= (count,product_id)
 # printing print(dict) will print the key
 # we are sorting by the first value in tuple - the counts that is why we made counts first value in list
 # the last 10 items in list - the 10 highest
counts.sort()
counts[-10:]

[(2038, 'B004KNWWO0'),
 (2173, 'B0066AZGD4'),
 (2630, 'BT00DDC7CE'),
 (2643, 'B004LLIKY2'),
 (3407, 'BT00DDC7BK'),
 (3440, 'BT00CTOUNS'),
 (4283, 'B00IX1I3G6'),
 (5034, 'BT00DDVMVQ'),
 (6037, 'B00A48G0D4'),
 (28705, 'B004LLIKVU')]

In [30]:
# average rating for each product
# get the ratings for each product
# divide ratings by the number of ratings
ratingsPerProduct = defaultdict(list)

In [34]:
for d in dataset:
    ratingsPerProduct[d['product_id']].append(d['star_rating'])

In [35]:
averageRatingsPerProduct = {}
for p in ratingsPerProduct:
    averageRatingsPerProduct[p] = sum(ratingsPerProduct[p]) / len(ratingsPerProduct[p])

In [36]:
topRated  = [(averageRatingsPerProduct[p],p) for p in averageRatingsPerProduct if len(ratingsPerProduct[p]) > 50 ]

In [37]:
topRated.sort()
topRated[-10:]

[(4.9423076923076925, 'B00SNMPQYC'),
 (4.944444444444445, 'B007V6EWKK'),
 (4.947368421052632, 'B004LLIL5K'),
 (4.955882352941177, 'B00H5BNKYA'),
 (4.959183673469388, 'B00Q5BOQVC'),
 (4.961538461538462, 'B00AF0KAWI'),
 (4.962962962962963, 'B00CRQ4XA8'),
 (4.966101694915254, 'B00P8N49M4'),
 (4.967741935483871, 'B00I542D5I'),
 (4.975609756097561, 'B00CT78POK')]

In [58]:
# Most active users
activeUserCount = defaultdict(int)

In [59]:
for d in dataset:
    activeUserCount[int(d['customer_id'])] +=1

In [76]:
counts = [(activeUserCount[u],u) for u in activeUserCount if activeUserCount[u] > 3]
counts.sort()
counts[-10:]
#activeUserCount

[(5, 41920477),
 (5, 42184434),
 (5, 45298444),
 (5, 45359231),
 (5, 50442527),
 (5, 50822336),
 (5, 51046621),
 (6, 30058414),
 (6, 52166758),
 (7, 9374792)]

In [84]:
# most commonly used words in the reviews
# split the words and count the words
reviewWords = defaultdict(int)
for d in dataset:
    for word in d['review_body'].split():
        word.replace('!','').replace('.','').replace('!','').strip().replace("'","").replace("&","")
        if len(word)>3: # can have a list of words to filter out here
            reviewWords[word]+=1

counts = [ (reviewWords[word], word) for word in reviewWords]
counts.sort()
counts[-10:]

[(16553, 'cards'),
 (18130, 'they'),
 (20647, 'have'),
 (21871, 'with'),
 (22705, 'easy'),
 (27034, 'this'),
 (27037, 'that'),
 (30710, 'Amazon'),
 (47077, 'card'),
 (91436, 'gift')]

In [88]:
# difference in average rating between  verified and non-verified purchase
# get average rating for verified  and non-verified purchases
purchasesRatings = defaultdict(list)

In [90]:
for d in dataset:
    if d['verified_purchase']:
        purchasesRatings['verified'].append(d['star_rating'])
    else:
        purchasesRatings['non-verified'].append(d['star_rating'])

for r in purchasesRatings:
    print("{} average ratings: {:2f} ".format(r.title(),sum(purchasesRatings[r])/len(purchasesRatings[r]) ))

Verified average ratings: 4.746077 
Non-Verified average ratings: 4.578143 
