In [26]:
import pandas as pd
import numpy as np

# Gzipped, tab-separated Amazon US "Electronics" review dump hosted on S3.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz'

# Only the first 809815 rows are read. Lines with an unexpected number of
# fields are skipped instead of aborting the load (see the "Skipping line ..."
# messages this cell prints).
# NOTE(review): newer pandas releases replace `error_bad_lines` with
# `on_bad_lines` -- confirm against the pandas version this notebook targets.
url_data = pd.read_table(
    url,
    sep='\t',
    nrows=809815,
    error_bad_lines=False,
)

b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

In [27]:
# Cleaning date data: skipped, because the raw data provided doesn't span a wide enough time range.
# import datetime
# url_data['review_date'] = pd.to_datetime(url_data['review_date'])

In [28]:
# Re-index every review row by the title of the product it belongs to.
url_product = url_data.set_index('product_title')

# One row per product: mean star rating and number of ratings, relabelled
# with display-friendly column names.
by_ptitle = (
    url_product
    .groupby('product_title')['star_rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Avg Rating', 'count': '# Of Reviews'})
)

In [29]:
def weighted_rank(df):
    """Add popularity-weighted ranking columns to ``df`` in place.

    ``df`` must have one row per product with the columns ``'Avg Rating'``
    and ``'# Of Reviews'``. Four columns are written:

    * ``count_rank``  -- rank by number of reviews (1 = most reviewed).
    * ``rating_rank`` -- rank by average rating (1 = best rated). It is
      stored for inspection only and does not feed into the score.
    * ``rank_score``  -- ``Avg Rating / 5`` plus a popularity term
      ``(total_count - count_rank) / total_count``. Products whose review
      count is below ``threshold`` (mean + standard deviation of the review
      counts) get that popularity term scaled down by
      ``1 - (threshold - reviews) / threshold``.
    * ``rank``        -- rank by ``rank_score`` (1 = best).

    Returns
    -------
    None
        The frame is modified in place.
    """
    reviews = df['# Of Reviews']
    total_count = reviews.count()

    # With a single product the standard deviation is undefined (NaN), which
    # used to turn threshold, rank_score and rank into NaN. Treat "no spread"
    # as zero so a lone product is simply ranked first.
    spread = reviews.std()
    if np.isnan(spread):
        spread = 0.0
    threshold = reviews.mean() + spread

    df['count_rank'] = reviews.rank(ascending=False)
    df['rating_rank'] = df['Avg Rating'].rank(ascending=False)

    rating_part = df['Avg Rating'] / 5
    rank_gap = total_count - df['count_rank']
    # Fraction of the popularity term kept by a product below the threshold.
    damping = 1 - ((threshold - reviews) / threshold)

    # np.where evaluates both branches for every row and then picks per row.
    df['rank_score'] = np.where(
        reviews >= threshold,
        rating_part + rank_gap / total_count,
        rating_part + damping * rank_gap / total_count,
    )
    df['rank'] = df['rank_score'].rank(ascending=False)

In [30]:
def keyword_review(keyword, case=True, regex=True):
    """Rank products using only the reviews whose body matches ``keyword``.

    Parameters
    ----------
    keyword : str
        Text searched for in the ``review_body`` column.
    case : bool, default True
        Match case-sensitively. The default keeps the previous behaviour;
        pass ``False`` to also catch e.g. a capitalised keyword.
    regex : bool, default True
        Interpret ``keyword`` as a regular expression. The default keeps the
        previous behaviour; pass ``False`` to search for literal text that
        contains regex metacharacters (e.g. ``'c++'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``product_title`` with columns ``'Avg Rating'``,
        ``'# Of Reviews'`` and ``'rank'``, sorted by ``rank`` ascending.
        Both aggregates are computed from the matching reviews only.
    """
    # Reviews with a missing body count as "no match" (na=False).
    matches = url_product['review_body'].str.contains(
        keyword, case=case, regex=regex, na=False
    )
    select_df = url_product[matches]
    # Date-range filter is currently disabled:
    #select_df = select_df[(select_df['review_date'] >= review_start) & (select_df['review_date'] <= review_end)]

    ranked_select_df = select_df.groupby('product_title')['star_rating'].agg(['mean', 'count'])
    ranked_select_df.columns = ['Avg Rating', '# Of Reviews']

    # weighted_rank writes its ranking columns into the frame in place.
    weighted_rank(ranked_select_df)

    summary = ranked_select_df[['Avg Rating', '# Of Reviews', 'rank']]
    return summary.sort_values('rank', ascending=True)

In [31]:
# Rank every product in the loaded sample; weighted_rank writes its columns
# into by_ptitle in place.
weighted_rank(by_ptitle)

# Keep only the presentation columns, best-ranked product first.
clean_df = by_ptitle.loc[:, ['Avg Rating', '# Of Reviews', 'rank']].sort_values(by='rank')
clean_df.head(10)

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Caseling Premium Hard EVA Case Travel Bag Pouch for Bose Soundlink Mini Bluetooth Speaker,4.88853,619,1.0
"Monoprice Multi Media Desktop Stand 22"" x 9.5"" (109434)",4.887805,205,2.0
"Etekcity 10 Pack Power Extension Cord Cable, Outlet Saver, 3 Prong, 16AWG-13A, UL Listed (Black, 1-Foot)",4.879464,224,3.0
AmazonBasics High-Speed HDMI Cable and Digital Audio Optical Cable 2-Pack - 6-Feet (1.8 Meters),4.919643,112,4.0
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.846682,437,5.0
"Cable Management Sleeve, JOTO Cable Sleeve, [Set of 4] 19"" Neoprene Cable Sleeves for PC / TV / Home Theater / Speaker, Flexible Cable Wrap, Cable Cover, Cable Organizer, Cords Management (4 Piece)",4.881988,161,6.0
iXCC Stereo cable Male to Male 3.5mm,4.839844,256,7.0
"ECHOGEAR Full Motion Articulating TV Wall Mount Bracket for most 37-70 inch LED, LCD, OLED and Plasma Flat Screen TVs w/ VESA patterns up to 600 x 400 - 16"" Extension - EGLF1-BK",4.840708,226,8.0
HDMI-DVI Cables,4.810784,1020,9.0
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.828467,274,10.0


In [32]:
# Ten best-ranked products among reviews whose body contains "work".
keyword_review('work').head(10)

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sewell Direct SW-29863-12 Deadbolt Banana Plugs, 12-Pair",4.892857,84,1.0
Twisted Veins ACHLA3 Three (3) Pack of HDMI 270 Degree/Right Angle Connectors/Adapters,4.897436,39,2.0
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.807692,104,3.0
Panasonic eneloop AA New 2100 Cycle Ni-MH Pre-Charged Rechargeable Batteries,4.790698,172,4.0
HDMI-DVI Cables,4.766187,278,5.0
Underwater Audio Waterproof iPod Shuffle,4.755,200,6.0
KabelDirekt Male Stereo Audio Cable - PRO Series,4.82,50,7.0
iXCC Stereo cable Male to Male 3.5mm,4.8,60,8.0
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.779221,77,9.0
"ECHOGEAR Full Motion Articulating TV Wall Mount Bracket for most 37-70 inch LED, LCD, OLED and Plasma Flat Screen TVs w/ VESA patterns up to 600 x 400 - 16"" Extension - EGLF1-BK",4.813953,43,10.0
