In [2]:
import pandas as pd
import numpy as np

url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz'
url_data = pd.read_table(url, sep='\t', nrows=809815, error_bad_lines=False)

b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

In [34]:
#Cleaning Date Data: Turns out it's not appropriate because the raw data provided doesn't span enough time range
# import datetime
# url_data['review_date'] = pd.to_datetime(url_data['review_date'])
with open('electronics_raw_data.csv', encoding='utf-8') as fp:
    df = pd.read_csv(fp)

In [3]:
url_product = df.set_index('product_title')
url_product = url_product[['star_rating', 'review_body']]

In [None]:
url_product.head()

In [None]:
review_df = url_product.groupby('product_title')['review_body'].apply(lambda x: '||'.join(x.dropna())).reset_index()


In [None]:
ovr_review_body_list = review_df['review_body'].tolist()
ovr_pos_review_list = []
ovr_neg_review_list = []
for x in ovr_review_body_list:
    if len(x) <= 10:
        ovr_pos_review_list.append('None Available')
        ovr_neg_review_list.append('None Available')
    else:
        ovr_score, ovr_pos_perc, ovr_neg_perc = pos_neg_analyzer(x)
        ovr_num_words, ovr_char_per_word, ovr_word_per_sent, ovr_unique_word_perc = get_complexity(x)
        prod_review_body = x.split('||')
        pos_composite_score = 0
        neg_composite_score = 0
        for y in prod_review_body:
            review_score = 0
            score, pos_perc, neg_perc = pos_neg_analyzer(y)
            num_words, char_per_word, word_per_sent, unique_word_perc = get_complexity(y)
            review_score = (num_words/ovr_num_words) + (char_per_word/ovr_char_per_word)
            + (word_per_sent/ovr_word_per_sent) + (unique_word_perc/ovr_unique_word_perc)
            if pos_perc > ovr_pos_perc and score > ovr_score:
                if review_score > pos_composite_score:
                    top_positive_review = y
                    pos_composite_score = review_score
            if neg_perc > ovr_neg_perc and score < ovr_score:
                if review_score > neg_composite_score:
                    top_critical_review = y
                    neg_composite_score = review_score
        ovr_pos_review_list.append(top_positive_review)
        ovr_neg_review_list.append(top_critical_review)
review_df['Top Positive Review'] = pd.Series(ovr_pos_review_list).values
review_df['Top Critical Review'] = pd.Series(ovr_neg_review_list).values

In [4]:
def weighted_rank(df):
    total_count = df['# Of Reviews'].count()
    threshold = df['# Of Reviews'].mean() + df['# Of Reviews'].std()
    df['count_rank'] = df['# Of Reviews'].rank(ascending=False)
    df['rating_rank'] = df['Avg Rating'].rank(ascending=False)
    df['rank_score'] = np.where(df['# Of Reviews'] >= threshold,
                                       df['Avg Rating']/5 + (total_count-df['count_rank'])/total_count,
                                       (df['Avg Rating']/5 + ((1-((threshold-df['# Of Reviews'])/threshold))*(total_count-df['count_rank'])/total_count)))
    df['rank'] = df['rank_score'].rank(ascending=False)

In [5]:
def get_filtered_chart(df, item_per_page_number=10):
    filtered_df = df.groupby('product_title')['star_rating'].agg(['mean', 'count'])
    filtered_df.columns = ['Avg Rating', '# Of Reviews']
    weighted_rank(filtered_df)
    filtered_df = filtered_df[['Avg Rating', '# Of Reviews', 'rank']].sort_values('rank', ascending=True)
    return filtered_df.head(n=item_per_page_number)

In [6]:
def search_item(user_search_term, user_review_term = ''):
    
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    import difflib
    
    #Get tokenized search term to check for in product name
    words = word_tokenize(user_search_term)
    words_list = [word for word in words
                      if word not in stopwords.words()]
    search = '|'.join(words_list)
    
    #Filter out peripheral products from search query
    peripheral_terms = ['cable', 'cord', 'case', 'cover', 'sleeve', 'mount', 'stand', 'wire']
   
    for x in words_list:
        match = difflib.get_close_matches(x, peripheral_terms)
        if match:
            peripheral_terms.remove(match)
    peripheral_search = '|'.join(peripheral_terms)
    
    #Search filtered query from database
    if user_review_term:
        search_df = url_product[url_product.index.str.contains(search, na=False, case=False)
                           & ~url_product.index.str.contains(peripheral_search, na=False, case=False)
                            & url_product['review_body'].str.contains(user_review_term, na=False, case=False)]
    else:
        search_df = url_product[url_product.index.str.contains(search, na=False, case=False)
                           & ~url_product.index.str.contains(peripheral_search, na=False, case=False)]
                        
    return get_filtered_chart(search_df)
        

In [8]:
by_ptitle = url_product.groupby('product_title')['star_rating'].agg(['mean', 'count'])
by_ptitle.columns = ['Avg Rating', '# Of Reviews']
weighted_rank(by_ptitle)
clean_df = by_ptitle[['Avg Rating', '# Of Reviews', 'rank']].sort_values('rank', ascending=True)
clean_df.head(n=10)

Unnamed: 0_level_0,Avg Rating,# Of Reviews,rank
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Caseling Premium Hard EVA Case Travel Bag Pouch for Bose Soundlink Mini Bluetooth Speaker,4.88853,619,1.0
"Monoprice Multi Media Desktop Stand 22"" x 9.5"" (109434)",4.887805,205,2.0
"Etekcity 10 Pack Power Extension Cord Cable, Outlet Saver, 3 Prong, 16AWG-13A, UL Listed (Black, 1-Foot)",4.879464,224,3.0
AmazonBasics High-Speed HDMI Cable and Digital Audio Optical Cable 2-Pack - 6-Feet (1.8 Meters),4.919643,112,4.0
"Anker 3.5mm Premium Auxiliary Audio Cable (4ft / 1.2m) AUX Cable for Beats Headphones, iPods, iPhones, iPads, Home / Car Stereos and More",4.846682,437,5.0
"Cable Management Sleeve, JOTO Cable Sleeve, [Set of 4] 19"" Neoprene Cable Sleeves for PC / TV / Home Theater / Speaker, Flexible Cable Wrap, Cable Cover, Cable Organizer, Cords Management (4 Piece)",4.881988,161,6.0
iXCC Stereo cable Male to Male 3.5mm,4.839844,256,7.0
"ECHOGEAR Full Motion Articulating TV Wall Mount Bracket for most 37-70 inch LED, LCD, OLED and Plasma Flat Screen TVs w/ VESA patterns up to 600 x 400 - 16"" Extension - EGLF1-BK",4.840708,226,8.0
HDMI-DVI Cables,4.810784,1020,9.0
"Swimbuds SPORT Waterproof Headphones - See below under ""Special Offers and Product Promotions"" for discounts on this Headphone",4.828467,274,10.0


In [9]:
search_item('headphones')

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  Searched in:
    - '/Users/zhen/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/zhen/anaconda3/nltk_data'
    - '/Users/zhen/anaconda3/share/nltk_data'
    - '/Users/zhen/anaconda3/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
def program():
    
    user_search_term = input('What items are you looking for?: ')
    user_review_term = input('What do you want to look for in reviews?: ')
    
    

In [None]:
def output('product_title')
    