In [1]:
import os, pandas as pd, colorcet as cc
from bokeh.transform import factor_cmap
from bokeh.plotting import save, output_file, figure, show
from bokeh.models import ColumnDataSource, HoverTool, Range1d, Legend, LegendItem

In [152]:
# Load the labeled grocery sample used by the plotting cells below.
labeled_sample_path = 'labeled_samples/grocery_labeled_sample.parquet'
df = pd.read_parquet(labeled_sample_path)

In [134]:
# Scatter plot of the two embedding coordinates (c2 on x, c1 on y), written to
# grocery_embeddings.html. Hovering a point shows its index, coordinates and
# the product title followed by the review text.
output_file(filename='grocery_embeddings.html', title='Grocery Embeddings')

list_x = df['c2'].tolist()
list_y = df['c1'].tolist()
labels_ = df['label'].tolist()
desc = (df['product_title'].astype(str) + ' ' + df['review']).to_list()

source = ColumnDataSource(
    data={'x': list_x, 'y': list_y, 'desc': desc, 'clustering': labels_}
)

hover = HoverTool(
    tooltips=[
        ('index', '$index'),
        ('(x,y)', '(@x, @y)'),
        ('desc', '@desc'),
    ]
)

f = figure(
    width=800,
    height=800,
    tools=[hover, 'pan, wheel_zoom, reset'],
    title='UMAP Applied to a Random Sample of 100,000 Grocery Reviews',
)
f.title.text_font_size = '15px'
f.title.align = 'center'

# All points share one colour here; 'clustering' is carried in the source but
# is not used for styling in this plot.
f.scatter('x', 'y', source=source, size=1, color='navy', alpha=0.5)

# Last expression: save() returns the path of the written HTML file.
save(f)

'c:\\Users\\E079051\\Desktop\\Product_Reviews_Analysis\\grocery_embeddings.html'

In [22]:
# legend_rank = number of rows sharing the row's label (i.e. the cluster size).
df['legend_rank'] = df['label'].map(df['label'].value_counts())

In [28]:
# Largest clusters first. Rebinding instead of inplace=True keeps the cell a
# plain "new frame from old frame" step and avoids mutating an object that
# other names may still reference.
df = df.sort_values('legend_rank', ascending=False)

In [171]:
# Scatter plot of the embedding coloured by cluster label, written to
# grocery_clusters.html and opened with show(). The legend sits to the right
# of the plot area.
labels = df.label.tolist()
desc = (df.product_title.astype(str) + ' ' + df.review).to_list()

output_file(filename='grocery_clusters.html', title='Grocery Clusters')

source = ColumnDataSource(
    data=dict(
        x=df.c2.tolist(),
        y=df.c1.tolist(),
        desc=desc,
        labels=labels
        )
    )

hover = HoverTool(
    tooltips=[
        ('index', '$index'),
        ('(x,y)', '(@x, @y)'),
        ('desc', '@desc')
        ]
    )

f = figure(width=935, height=800, tools=[hover, 'pan, wheel_zoom, reset'])
f.title.text = 'HDBSCAN Applied to UMAP Embeddings. Clusters Labeled Using YAKE'
f.title.text_font_size = '15px'
f.title.align = 'center'

# Distinct labels in order of first appearance in the data. Unlike set(labels),
# this order is identical on every run, so each cluster keeps the same colour.
# 'unclustered' is excluded here because it is prepended explicitly below;
# leaving it in would list it twice in `factors` and waste a palette entry.
cluster_names = [name for name in dict.fromkeys(labels) if name != 'unclustered']

mapper = factor_cmap(
    field_name='labels',
    # Light grey for unclustered points, then one glasbey colour per cluster.
    palette=['#EEEEEE'] + cc.glasbey[:len(cluster_names)],
    factors=['unclustered'] + cluster_names
    )

# Add an empty legend on the right first; the legend_group entries created by
# scatter() are then placed in it rather than inside the plot area.
f.add_layout(Legend(), 'right')
f.scatter('x', 'y', legend_group='labels', source=source, size=1, color=mapper, alpha=0.7)

show(f)

In [2]:
# os.listdir returns entries in arbitrary order; sort so the per-file loop
# below processes (and prints) categories in the same order on every run.
files = sorted(os.listdir('data'))

In [80]:
def get_verified_purchase_stats(file, helpful_ratio=.8, data_dir='data'):
    """Count total, verified, and mostly-helpful verified reviews in one file.

    Parameters
    ----------
    file : str
        Name of a parquet file inside ``data_dir``.
    helpful_ratio : float, default 0.8
        A verified review is counted as helpful when
        ``helpful_votes / total_votes`` is strictly greater than this value.
    data_dir : str, default 'data'
        Directory that contains ``file``.

    Returns
    -------
    tuple
        ``(total_count, verified_count, helpful_count)`` where
        ``total_count`` is the number of rows in the file, ``verified_count``
        the rows with ``verified_purchase == 'Y'``, and ``helpful_count`` the
        verified rows whose helpful-vote ratio exceeds ``helpful_ratio``.
    """
    # Only these three columns are used, so avoid loading the rest of the file.
    reviews = pd.read_parquet(
        f'{data_dir}/{file}',
        columns=['verified_purchase', 'helpful_votes', 'total_votes'],
    )
    total_count = reviews.shape[0]

    is_verified = reviews.verified_purchase == 'Y'
    verified_count = is_verified.sum()
    verified = reviews[is_verified]

    # Reviews with no votes give 0/0 = NaN, which never compares greater than
    # the threshold, so they are not counted as helpful.
    vote_ratio = verified.helpful_votes / verified.total_votes
    helpful_count = (vote_ratio > helpful_ratio).sum()
    return total_count, verified_count, helpful_count

In [76]:
import re

In [81]:
# Map each category name (parsed from the file name, lower-cased) to the
# (total, verified, helpful) counts returned by get_verified_purchase_stats.
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
# The dot before 'parq' is escaped so it only matches a literal '.'.
name_pattern = re.compile(r'(?<=reviews_)[.\w]*(?=\.parq)')

dfs = {}
for file in files:
    match = name_pattern.search(file)
    if match is None:
        # Not a "reviews_<name>.parq*" file; skip it instead of failing with
        # an IndexError on the first stray directory entry.
        continue
    name = match.group(0).lower()
    dfs[name] = get_verified_purchase_stats(file)
    print(name)

apparel
automotive
baby
beauty
books
camera
digital_ebook_purchase
digital_music_purchase
digital_software
digital_video_download
digital_video_games
electronics
furniture
gift_card
grocery
health_personal_care
major_appliances
mobile_apps
mobile_electronics
multilingual
music
musical_instruments
office_products
outdoors
pc
personal_care_appliances
pet_products
shoes
software
sports
tools
toys
video
video_dvd
video_games
watches
wireless


In [115]:
# One row per category, with the three counts as named columns.
new_df = pd.DataFrame.from_dict(
    dfs,
    orient='index',
    columns=['total', 'verified', 'helpful'],
)

In [120]:
# Display the per-category total / verified / helpful counts.
new_df

Unnamed: 0,total,verified,helpful
apparel,5906333,5312781,1009348
automotive,3514942,3228457,719089
baby,1752932,1392128,236822
beauty,5115666,4230268,994353
books,3105520,229346,130004
camera,1801974,1494403,369056
digital_ebook_purchase,5101693,3942399,1051017
digital_music_purchase,1688884,1256444,151599
digital_software,102084,70860,15516
digital_video_download,4057147,2704502,272787


In [121]:
# Mean, across categories, of the percent change from the total count to the
# helpful count (negative = reduction).
new_df['helpful'].sub(new_df['total']).div(new_df['total']).mul(100).mean()

-84.03867906193258

In [17]:
# Five rows with the most helpful votes.
df.sort_values(by='helpful_votes', ascending=False).head(n=5)

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review
2344167,47364506,RXXPVOUH9NLL3,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",5,23755,24170,N,N,2008-07-08,Make this your only stock and store Once upon ...
923600,29857793,R2DQNJRV27D3P0,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",5,11045,11901,N,N,2014-10-22,I have seen the face of God. I didn't feel the...
2393030,49234072,RKT07YYORZMZE,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",3,9986,10304,N,N,2006-10-30,"One Friday, Without the Milk He always brought..."
1756395,9286343,R3SC3RFGX29KPJ,B00012182G,554396858,Fresh Whole Rabbit,1,7886,8076,N,N,2013-05-10,Not like Game of Thrones Brad and I were very ...
249057,809523,R2JGNJ5ZPJT4YC,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",1,6201,6900,N,N,2015-06-05,"See you in hell, Haribo Sugar-Free Gummi Bears..."


In [125]:
# Sanity check of the category-name pattern on a made-up file name.
# Raw string avoids the invalid '\w' escape; '\.' matches only a literal dot.
re.findall(r'(?<=reviews_)[.\w]*(?=\.parq)', 'reviews_this_is_a_test.parquet')[0].lower()

'this_is_a_test'

In [90]:
# NOTE: rebinds `df` to the gift-card reviews; the cells below operate on this frame.
gift_card_path = 'data/amazon_reviews_gift_card.parquet'
df = pd.read_parquet(gift_card_path)

In [112]:
# Verified purchases whose helpful/total vote ratio is above 0.8.
df.loc[
    df['verified_purchase'].eq('Y')
    & df['helpful_votes'].div(df['total_votes']).gt(0.8)
]

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review
11,23413911,R1XZHS8M1GCGI7,B004KNWWU4,326384774,Amazon Gift Card - Print - Happy Birthday (Birds),5,1,1,N,Y,2015-08-31,Always good Easy to print from email. I love ...
12,2026222,R1DAI0N03SKRJN,B004LLIKVU,473048287,Amazon.com eGift Cards,5,1,1,N,Y,2015-08-31,Five Stars Amazing with 10 dollar
14,20241560,RIBOP6OEAZA47,B00H5BNLUS,637715957,Amazon eGift Card - Hoops and Yoyo Thank You V...,5,1,1,N,Y,2015-08-31,Five Stars good
19,14222739,R196T0NZ73TYIM,B005ESMMKE,459536087,"Amazon.com Gift Cards, Pack of 10 (Various Car...",5,1,1,N,Y,2015-08-31,Five Stars nice
32,4788872,R3MHW0ENYJ5FKI,B004LLIKVU,473048287,Amazon.com eGift Cards,5,1,1,N,Y,2015-08-31,Five Stars Good
...,...,...,...,...,...,...,...,...,...,...,...,...
149006,15348738,R2KPA5J1MXBJSD,B001GXRQW0,246986987,Amazon.com Gift Cards,1,186,222,N,Y,2008-12-22,Card didn't work After I ordered a gift card a...
149017,41021473,R3M9UKKKPVQVF0,B001H53QDK,246986987,Amazon.com Gift Cards,1,576,688,N,Y,2008-12-15,Prime Members BEWARE I am a dedicated Amazon.c...
149019,52540230,R1XNO68OHJM5SW,B001H53QDK,246986987,Amazon.com Gift Cards,1,452,545,N,Y,2008-12-11,Horrible experience Bought three $50 gift card...
149036,46747390,R1GLUPO4UO93OS,B000LGKQHU,941684275,Apple iTunes $25 Music Card,5,5,5,N,Y,2007-03-08,Apple iTunes $25 Music card Easy to use. As yo...


In [91]:
# Keep verified purchases only.
dff = df.loc[df['verified_purchase'].eq('Y')]
dff.shape

(136042, 12)

In [98]:
# Verified purchases that received more than two votes in total.
df3 = dff.loc[dff['total_votes'].gt(2)]
df3.shape

(1350, 12)

In [92]:
# Verified purchases whose helpful/total vote ratio is above 0.8.
df4 = dff.loc[dff['helpful_votes'].div(dff['total_votes']).gt(.8)]
df4.shape

(3723, 12)

In [63]:
# Relative change in row count between pairs of frames, as percent strings.
# NOTE(review): df4 is filtered from dff, not from df3, so the last figure is
# not a step-over-step reduction — confirm that comparison is intended.
tuple(
    f'{(after.shape[0]-before.shape[0])/before.shape[0]:,.2%}'
    for before, after in ((df, dff), (dff, df3), (df3, df4))
)

('-30.59%', '-92.47%', '-41.86%')

In [69]:
# Helpful/total vote ratios of verified purchases, restricted to ratios above 0.9.
dff['helpful_votes'].div(dff['total_votes']).loc[lambda ratio: ratio > .9]

15        1.000000
27        1.000000
36        1.000000
65        1.000000
76        1.000000
            ...   
101933    1.000000
101944    1.000000
101968    0.976744
101986    1.000000
101989    1.000000
Length: 14450, dtype: float64

In [73]:
# Number of verified purchases with a helpful/total vote ratio above 0.9.
dff['helpful_votes'].div(dff['total_votes']).gt(.9).sum()

14450

In [101]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

In [131]:
# Scratch categorical column. The Series is built on df.index explicitly:
# column assignment aligns on index labels, so a default 0..n-1 index would
# leave NaN wherever df's index is not a plain RangeIndex.
df['test1'] = pd.Series(['hello'] * len(df), index=df.index, dtype='category')

In [138]:
# Convert the scratch column from category to str.
df['test1'] = df['test1'].astype(str)

In [141]:
import numpy as np

In [176]:
# Index labels of the five-star reviews.
df.index[df['star_rating'].eq(5)]

Int64Index([    0,     2,     3,     4,     6,     7,     8,     9,    10,
               14,
            ...
            99987, 99989, 99990, 99991, 99992, 99994, 99996, 99997, 99998,
            99999],
           dtype='int64', length=69266)

In [197]:
# One document per review: product title, a space, then the review text.
titles = df['product_title'].str.slice()
reviews = df['review'].str.slice()
docs = titles + ' ' + reviews

# Reproducible random sample of at most 100,000 documents.
sample = docs.sample(n=min(100000, len(docs)), random_state=1729)

In [198]:
# Turn the sampled Series into a one-column frame named 'cluster'.
# (The Series is unnamed, so to_frame() alone would call the column 0.)
sample = sample.to_frame(name='cluster')

In [199]:
# Display the sampled documents (single 'cluster' column).
sample

Unnamed: 0,cluster
68037,Lipton Tea Bags Good Just the right type of te...
344,"Mad Monkey Coffee Capsules, 48 Count Five Star..."
68833,"Annie's Organic Bunny Fruit Snacks, Variety Pa..."
69327,"MarketSpice Teabags, box of 24 (Market Spice T..."
22061,Suddenly Pasta Salad Yummy This mix is excelle...
...,...
88921,"Heinz Reduced Sugar Tomato Ketchup, 13 oz (Pac..."
54821,"Cadbury Easter Creme Egg, 1.2-Ounce Eggs (Pack..."
66803,CedarHouse Ultra-Premium Cedar Paper Grilling ...
68710,Nutiva Hi Fiber Hemp Protein Powder - 16 OZ Go...


In [204]:
# Scratch column of random integers in [0, 100). Drawn in one vectorised call
# from a seeded generator, so re-running the cell gives the same values
# (1729 matches the seed used for the sample itself).
sample['test'] = np.random.default_rng(1729).integers(0, 100, size=len(sample))
sample

Unnamed: 0,cluster,test
68037,Lipton Tea Bags Good Just the right type of te...,43
344,"Mad Monkey Coffee Capsules, 48 Count Five Star...",33
68833,"Annie's Organic Bunny Fruit Snacks, Variety Pa...",88
69327,"MarketSpice Teabags, box of 24 (Market Spice T...",28
22061,Suddenly Pasta Salad Yummy This mix is excelle...,10
...,...,...
88921,"Heinz Reduced Sugar Tomato Ketchup, 13 oz (Pac...",87
54821,"Cadbury Easter Creme Egg, 1.2-Ounce Eggs (Pack...",6
66803,CedarHouse Ultra-Premium Cedar Paper Grilling ...,53
68710,Nutiva Hi Fiber Hemp Protein Powder - 16 OZ Go...,59
