In [33]:
import pandas as pd
import numpy as np

# Gzipped TSV of Amazon customer reviews (per the file name: US marketplace, Electronics).
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz'
# Read only the first 809,815 rows. Rows with the wrong number of fields are
# skipped with a warning. `on_bad_lines='warn'` replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
url_data = pd.read_table(url, sep='\t', nrows=809815, on_bad_lines='warn')

b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

In [34]:
# Cleaning date data: turned out not to be worthwhile, because the raw data provided doesn't span a wide enough time range
# import datetime
# url_data['review_date'] = pd.to_datetime(url_data['review_date'])

In [35]:
# Re-index the reviews by product title (the column is moved into the index)
# so that titles can be searched through `url_product.index.str`.
url_product = url_data.set_index(keys='product_title', drop=True)

In [36]:
def weighted_rank(df):
    """Add review-count-weighted ranking columns to ``df`` in place.

    ``df`` must contain the columns ``'Avg Rating'`` and ``'# Of Reviews'``
    (one row per product). The following columns are added:

    * ``count_rank``  -- rank by number of reviews (1 = most reviewed).
    * ``rating_rank`` -- rank by average rating (1 = best rated).
    * ``rank_score``  -- ``Avg Rating / 5`` plus a popularity bonus
      ``(total - count_rank) / total``. Rows whose review count is below
      ``mean + std`` of all review counts get the bonus scaled down by
      ``1 - (threshold - count) / threshold``.
    * ``rank``        -- rank by ``rank_score`` (1 = best).

    Nothing is returned; callers read the new columns from ``df``.
    """
    reviews = df['# Of Reviews']
    total_count = reviews.count()

    # The sample standard deviation is NaN when there are fewer than two rows.
    # A NaN threshold would turn every score (and therefore every rank) into
    # NaN, so treat the spread as zero in that case.
    spread = reviews.std()
    if np.isnan(spread):
        spread = 0.0
    threshold = reviews.mean() + spread

    df['count_rank'] = reviews.rank(ascending=False)
    df['rating_rank'] = df['Avg Rating'].rank(ascending=False)

    rating_part = df['Avg Rating'] / 5
    # Fraction by which a below-threshold product's popularity bonus is kept.
    damping = 1 - ((threshold - reviews) / threshold)
    df['rank_score'] = np.where(
        reviews >= threshold,
        rating_part + (total_count - df['count_rank']) / total_count,
        rating_part + (damping * (total_count - df['count_rank']) / total_count),
    )
    df['rank'] = df['rank_score'].rank(ascending=False)

In [37]:
def get_filtered_chart(df, item_per_page_number=10):
    """Return the top-ranked products found in ``df``.

    ``df`` is grouped by ``product_title``; the mean and count of
    ``star_rating`` per product become ``'Avg Rating'`` and
    ``'# Of Reviews'``. ``weighted_rank`` then adds a ``rank`` column, and
    the first ``item_per_page_number`` rows in ascending ``rank`` order are
    returned with just those three columns.
    """
    per_product = (
        df.groupby('product_title')['star_rating']
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'Avg Rating', 'count': '# Of Reviews'})
    )
    # weighted_rank works in place and adds the 'rank' column used below.
    weighted_rank(per_product)
    chart = per_product[['Avg Rating', '# Of Reviews', 'rank']]
    chart = chart.sort_values('rank', ascending=True)
    return chart.head(item_per_page_number)

In [38]:
def search_item(user_search_term):
    """Return the ranked chart of products whose title matches a search query.

    The query is tokenized with NLTK, punctuation-only tokens and English
    stop words are dropped, and a title matches when it contains any of the
    remaining words (case-insensitive, literal match).

    If the query consists only of stop words, those words are searched for
    as-is. A query with no usable tokens at all (e.g. an empty string)
    matches every title, so the overall chart is returned.

    Reads the module-level ``url_product`` frame and returns the result of
    ``get_filtered_chart`` on the matching rows.
    """
    import re

    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    # Load the stop words once, as a set. Calling stopwords.words() inside the
    # filter re-reads the corpus for every token, and with no argument it
    # returns every language's list, which drops ordinary English search
    # terms such as "die".
    stop_words = set(stopwords.words('english'))

    # Punctuation-only tokens (",", "(", ...) carry no search intent.
    tokens = [tok for tok in word_tokenize(user_search_term)
              if any(ch.isalnum() for ch in tok)]
    keywords = [tok for tok in tokens if tok.lower() not in stop_words]
    if not keywords:
        # Only stop words were typed: search for them rather than building an
        # empty pattern that would match everything.
        keywords = tokens

    # Escape each word so regex metacharacters in the query are matched
    # literally instead of raising or over-matching.
    search_term = '|'.join(re.escape(tok) for tok in keywords)

    search_df = url_product[url_product.index.str.contains(search_term, na=False, case=False)]
    return get_filtered_chart(search_df)

In [39]:
def keyword_review(user_review_term, regex=False):
    """Return the ranked chart built from reviews that mention a keyword.

    Parameters
    ----------
    user_review_term : str
        Text to look for in ``review_body`` (case-sensitive).
    regex : bool, default False
        When False the term is matched literally, so input such as ``"c++"``
        or ``"("`` cannot raise a regex error and ``"."`` does not match every
        review. Pass True to have the term interpreted as a regular
        expression.

    Reads the module-level ``url_product`` frame; reviews with a missing
    body never match. Returns the result of ``get_filtered_chart`` on the
    matching reviews.
    """
    mentions_term = url_product['review_body'].str.contains(
        user_review_term, na=False, regex=regex
    )
    review_df = url_product[mentions_term]
    # No date-range filter is applied here.
    return get_filtered_chart(review_df)

In [40]:
# Overall chart: aggregate every review per product title, rank the products,
# and show the ten best-ranked ones.
by_ptitle = (
    url_product.groupby('product_title')['star_rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Avg Rating', 'count': '# Of Reviews'})
)
weighted_rank(by_ptitle)  # adds the ranking columns to by_ptitle in place
clean_df = by_ptitle[['Avg Rating', '# Of Reviews', 'rank']]
clean_df = clean_df.sort_values('rank', ascending=True)
clean_df.head(10)

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Caseling Premium Hard EVA Case Travel Bag Pouch for Bose Soundlink Mini Bluetooth Speaker,4.88853,619,1.0
"Monoprice Multi Media Desktop Stand 22"" x 9.5"" (109434)",4.887805,205,2.0
"Etekcity 10 Pack Power Extension Cord Cable, Outlet Saver, 3 Prong, 16AWG-13A, UL Listed (Black, 1-Foot)",4.879464,224,3.0
AmazonBasics High-Speed HDMI Cable and Digital Audio Optical Cable 2-Pack - 6-Feet (1.8 Meters),4.919643,112,4.0
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.846682,437,5.0
"Cable Management Sleeve, JOTO Cable Sleeve, [Set of 4] 19"" Neoprene Cable Sleeves for PC / TV / Home Theater / Speaker, Flexible Cable Wrap, Cable Cover, Cable Organizer, Cords Management (4 Piece)",4.881988,161,6.0
iXCC Stereo cable Male to Male 3.5mm,4.839844,256,7.0
"ECHOGEAR Full Motion Articulating TV Wall Mount Bracket for most 37-70 inch LED, LCD, OLED and Plasma Flat Screen TVs w/ VESA patterns up to 600 x 400 - 16"" Extension - EGLF1-BK",4.840708,226,8.0
HDMI-DVI Cables,4.810784,1020,9.0
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.828467,274,10.0


In [41]:
# Example: chart of the best-ranked products whose title matches "headphones".
search_item('headphones')

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.846682,437,1.0
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.828467,274,2.0
"Anker 3.5mm Nylon Braided Auxiliary Audio Cable (4ft / 1.2m) Tangle-Free AUX Cable for Headphones, iPods, iPhones, iPads, Home / Car Stereos and More (Red)",4.811966,117,3.0
Status Audio HD One Headphones - Noise isolating. Matte finish. Foldable. 2 cables. Mic.,4.692661,218,4.0
"Brainwavz Replacement Memory Foam Earpads - Suitable For Many Other Large Over The Ear Headphones - AKG, HifiMan, ATH, Philips, Fostex",4.676329,207,5.0
AudioFlood Waterproof iPod Shuffle with True Short Cord Headphones,4.624161,447,6.0
Sony MDR-V6 Monitor Series Headphones with CCAW Voice Coil,4.610345,290,7.0
"Philips X2/27 Fidelio Premium Headphones, Black",4.664122,131,8.0
Sennheiser Headphones,4.543198,1007,9.0
"V-MODA XL-GREY Memory Cushions for Over-Ear Headphones, Grey",4.607955,176,10.0


In [42]:
# Example: chart computed only from reviews whose body contains "better".
keyword_review('better')

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
VIZIO XRT302 KE Internet Qwerty Keyboard Remote for Select VIZIO TV'S,5.0,16,1.0
Underwater Audio Waterproof iPod Shuffle,4.888889,36,2.0
"Screen Cleaner Kit - Natural, Streak-Free, Antibacterial - For Phones, LED/LCD TVs, Computers, Laptops, Optical Devices, ... - Includes Spray + Microfiber Cloth (washable) - Made in Germany",4.95,20,3.0
HDMI-High-Speed,4.882353,34,4.5
Status Audio HD One Headphones - Noise isolating. Matte finish. Foldable. 2 cables. Mic.,4.882353,34,4.5
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.892857,28,6.0
"Cable Management Sleeve, JOTO Cable Sleeve, [Set of 4] 19"" Neoprene Cable Sleeves for PC / TV / Home Theater / Speaker, Flexible Cable Wrap, Cable Cover, Cable Organizer, Cords Management (4 Piece)",4.941176,17,7.0
BIC America F12 12-Inch 475-Watt Front Firing Powered Subwoofer,4.904762,21,8.0
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.862069,29,9.0
ED 4 Pack CINEMA 3D GLASSES For LG 3D TVs – Adult Sized Passive Circular Polarized 3D Glasses,4.888889,18,10.0


In [43]:
def program():
    
    user_search_term = input('What items are you looking for?: ')
    user_review_term = input('What do you want to look for in reviews?: ')
    
    