In [1]:
import os, pandas as pd, colorcet as cc
from bokeh.transform import factor_cmap
from bokeh.plotting import save, output_file, figure, show
from bokeh.models import ColumnDataSource, HoverTool, Range1d, Legend, LegendItem

In [152]:
df = pd.read_parquet('labeled_samples/grocery_labeled_sample.parquet')

In [134]:
# Scatter plot of the 2-D embedding coordinates, written to a standalone
# HTML file. Hovering a point shows its index, coordinates and review text.
output_file(filename='grocery_embeddings.html', title='Grocery Embeddings')

# Column c2 is drawn on the x axis and c1 on the y axis.
list_x, list_y = df.c2.tolist(), df.c1.tolist()
labels_ = df.label.tolist()
desc = (df.product_title.astype(str) + ' ' + df.review).to_list()

source = ColumnDataSource(
    data={'x': list_x, 'y': list_y, 'desc': desc, 'clustering': labels_}
)

hover = HoverTool(tooltips=[('index', '$index'), ('(x,y)', '(@x, @y)'), ('desc', '@desc')])

f = figure(width=800, height=800, tools=[hover, 'pan, wheel_zoom, reset'])
f.title.align = 'center'
f.title.text_font_size = '15px'
f.title.text = 'UMAP Applied to a Random Sample of 100,000 Grocery Reviews'

f.scatter('x', 'y', source=source, size=1, color='navy', alpha=0.5)

# save() returns the path of the written file, which the cell displays.
save(f)

'c:\\Users\\E079051\\Desktop\\Product_Reviews_Analysis\\grocery_embeddings.html'

In [22]:
df['legend_rank'] = df.label.map(df.label.value_counts().to_dict())

In [28]:
df.sort_values('legend_rank', ascending=False, inplace=True)

In [171]:
# Scatter plot of the embeddings coloured by cluster label, with a legend
# placed to the right of the plot area.
labels = df.label.tolist()
desc = (df.product_title.astype(str) + ' ' + df.review).to_list()

output_file(filename='grocery_clusters.html', title='Grocery Clusters')

source = ColumnDataSource(
    data=dict(
        x=df.c2.tolist(),
        y=df.c1.tolist(),
        desc=desc,
        labels=labels
        )
    )

hover = HoverTool(
    tooltips=[
        ('index', '$index'),
        ('(x,y)', '(@x, @y)'),
        ('desc', '@desc')
        ]
    )

f = figure(width=935, height=800, tools=[hover, 'pan, wheel_zoom, reset'])
f.title.text = 'HDBSCAN Applied to UMAP Embeddings. Clusters Labeled Using YAKE'
f.title.text_font_size = '15px'
f.title.align = 'center'

# Build the factor list in first-appearance order rather than from a set:
# set iteration order for strings differs between kernel sessions, which made
# the colour given to each cluster change from run to run. 'unclustered' is
# taken out of the label list so it appears exactly once, pinned to grey.
cluster_labels = [label for label in dict.fromkeys(labels) if label != 'unclustered']

mapper = factor_cmap(
    field_name='labels',
    palette=['#EEEEEE'] + cc.glasbey[:len(cluster_labels)],
    factors=['unclustered'] + cluster_labels
    )

# The empty Legend added on the right is the one legend_group populates.
f.add_layout(Legend(), 'right')
f.scatter('x', 'y', legend_group='labels', source=source, size=1, color=mapper, alpha=0.7)

show(f)

In [2]:
files = [file for file in os.listdir('data')]

In [80]:
def get_verified_purchase_stats(file):
    """Count total, verified, and mostly-helpful verified reviews in one file.

    Parameters
    ----------
    file : str
        Name of a parquet file inside the ``data/`` directory.

    Returns
    -------
    tuple
        ``(total_count, verified_count, helpful_count)`` where
        ``total_count`` is the number of rows, ``verified_count`` is the
        number of rows with ``verified_purchase == 'Y'``, and
        ``helpful_count`` is the number of verified rows whose
        ``helpful_votes / total_votes`` ratio is above 0.8.
    """
    # Only these three columns feed the counts, so the rest of the file
    # is not loaded.
    df1 = pd.read_parquet(
        f'data/{file}',
        columns=['verified_purchase', 'helpful_votes', 'total_votes'],
    )
    total_count = df1.shape[0]

    is_verified = df1.verified_purchase == 'Y'
    verified_count = is_verified.sum()
    df2 = df1[is_verified]

    # A 0/0 ratio is NaN, which compares False, so reviews that received no
    # votes at all are not counted as helpful.
    helpful_count = (df2.helpful_votes / df2.total_votes > .8).sum()
    return total_count, verified_count, helpful_count

In [76]:
import re

In [81]:
# Map each category name (taken from the file name) to the
# (total, verified, helpful) counts returned for that file.
dfs = {}
for file in files:
    # Raw string so \w is a regex class rather than a string escape, and the
    # dot before "parq" is escaped so it only matches a literal ".".
    name = re.findall(r'(?<=reviews_)[.\w]*(?=\.parq)', file)[0].lower()
    dfs[name] = get_verified_purchase_stats(file)
    print(name)

apparel
automotive
baby
beauty
books
camera
digital_ebook_purchase
digital_music_purchase
digital_software
digital_video_download
digital_video_games
electronics
furniture
gift_card
grocery
health_personal_care
major_appliances
mobile_apps
mobile_electronics
multilingual
music
musical_instruments
office_products
outdoors
pc
personal_care_appliances
pet_products
shoes
software
sports
tools
toys
video
video_dvd
video_games
watches
wireless


In [115]:
new_df = pd.DataFrame(dfs).T.rename(columns={0:'total',1:'verified',2:'helpful'})

In [120]:
new_df

Unnamed: 0,total,verified,helpful
apparel,5906333,5312781,1009348
automotive,3514942,3228457,719089
baby,1752932,1392128,236822
beauty,5115666,4230268,994353
books,3105520,229346,130004
camera,1801974,1494403,369056
digital_ebook_purchase,5101693,3942399,1051017
digital_music_purchase,1688884,1256444,151599
digital_software,102084,70860,15516
digital_video_download,4057147,2704502,272787


In [121]:
((new_df.helpful-new_df.total)/new_df.total*100).mean()

-84.03867906193258

In [17]:
df.sort_values('helpful_votes', ascending=False).head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review
2344167,47364506,RXXPVOUH9NLL3,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",5,23755,24170,N,N,2008-07-08,Make this your only stock and store Once upon ...
923600,29857793,R2DQNJRV27D3P0,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",5,11045,11901,N,N,2014-10-22,I have seen the face of God. I didn't feel the...
2393030,49234072,RKT07YYORZMZE,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",3,9986,10304,N,N,2006-10-30,"One Friday, Without the Milk He always brought..."
1756395,9286343,R3SC3RFGX29KPJ,B00012182G,554396858,Fresh Whole Rabbit,1,7886,8076,N,N,2013-05-10,Not like Game of Thrones Brad and I were very ...
249057,809523,R2JGNJ5ZPJT4YC,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",1,6201,6900,N,N,2015-06-05,"See you in hell, Haribo Sugar-Free Gummi Bears..."


In [125]:
re.findall('(?<=reviews_)[.\w]*(?=.parq)', 'reviews_this_is_a_test.parquet')[0].lower()

'this_is_a_test'

In [90]:
df = pd.read_parquet('data/amazon_reviews_gift_card.parquet')

In [112]:
df[(df.verified_purchase == 'Y') & (df.helpful_votes/df.total_votes > 0.8)]

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review
11,23413911,R1XZHS8M1GCGI7,B004KNWWU4,326384774,Amazon Gift Card - Print - Happy Birthday (Birds),5,1,1,N,Y,2015-08-31,Always good Easy to print from email. I love ...
12,2026222,R1DAI0N03SKRJN,B004LLIKVU,473048287,Amazon.com eGift Cards,5,1,1,N,Y,2015-08-31,Five Stars Amazing with 10 dollar
14,20241560,RIBOP6OEAZA47,B00H5BNLUS,637715957,Amazon eGift Card - Hoops and Yoyo Thank You V...,5,1,1,N,Y,2015-08-31,Five Stars good
19,14222739,R196T0NZ73TYIM,B005ESMMKE,459536087,"Amazon.com Gift Cards, Pack of 10 (Various Car...",5,1,1,N,Y,2015-08-31,Five Stars nice
32,4788872,R3MHW0ENYJ5FKI,B004LLIKVU,473048287,Amazon.com eGift Cards,5,1,1,N,Y,2015-08-31,Five Stars Good
...,...,...,...,...,...,...,...,...,...,...,...,...
149006,15348738,R2KPA5J1MXBJSD,B001GXRQW0,246986987,Amazon.com Gift Cards,1,186,222,N,Y,2008-12-22,Card didn't work After I ordered a gift card a...
149017,41021473,R3M9UKKKPVQVF0,B001H53QDK,246986987,Amazon.com Gift Cards,1,576,688,N,Y,2008-12-15,Prime Members BEWARE I am a dedicated Amazon.c...
149019,52540230,R1XNO68OHJM5SW,B001H53QDK,246986987,Amazon.com Gift Cards,1,452,545,N,Y,2008-12-11,Horrible experience Bought three $50 gift card...
149036,46747390,R1GLUPO4UO93OS,B000LGKQHU,941684275,Apple iTunes $25 Music Card,5,5,5,N,Y,2007-03-08,Apple iTunes $25 Music card Easy to use. As yo...


In [91]:
dff = df[df.verified_purchase == 'Y']
dff.shape

(136042, 12)

In [98]:
df3 = dff[dff.total_votes > 2]
df3.shape

(1350, 12)

In [92]:
df4 = dff[(dff.helpful_votes / dff.total_votes) > .8]
df4.shape

(3723, 12)

In [63]:
f'{(dff.shape[0]-df.shape[0])/df.shape[0]:,.2%}',\
f'{(df3.shape[0]-dff.shape[0])/dff.shape[0]:,.2%}',\
f'{(df4.shape[0]-df3.shape[0])/df3.shape[0]:,.2%}'

('-30.59%', '-92.47%', '-41.86%')

In [69]:
(dff.helpful_votes/dff.total_votes)[dff.helpful_votes/dff.total_votes > .9]

15        1.000000
27        1.000000
36        1.000000
65        1.000000
76        1.000000
            ...   
101933    1.000000
101944    1.000000
101968    0.976744
101986    1.000000
101989    1.000000
Length: 14450, dtype: float64

In [73]:
(dff.helpful_votes/dff.total_votes > .9).sum()

14450

In [101]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

In [131]:
df['test1'] = pd.Series(['hello']*df.shape[0], dtype='category')

In [138]:
df.test1 = df.test1.astype(str)

In [141]:
import numpy as np

In [176]:
df[df.star_rating==5].index

Int64Index([    0,     2,     3,     4,     6,     7,     8,     9,    10,
               14,
            ...
            99987, 99989, 99990, 99991, 99992, 99994, 99996, 99997, 99998,
            99999],
           dtype='int64', length=69266)

In [197]:
docs = df.product_title.str[:].copy() + ' ' + df.review.str[:].copy()
sample = docs.sample(min(100000, docs.shape[0]), random_state=1729)

In [198]:
sample = sample.to_frame().rename(columns={0:'cluster'})

In [199]:
sample

Unnamed: 0,cluster
68037,Lipton Tea Bags Good Just the right type of te...
344,"Mad Monkey Coffee Capsules, 48 Count Five Star..."
68833,"Annie's Organic Bunny Fruit Snacks, Variety Pa..."
69327,"MarketSpice Teabags, box of 24 (Market Spice T..."
22061,Suddenly Pasta Salad Yummy This mix is excelle...
...,...
88921,"Heinz Reduced Sugar Tomato Ketchup, 13 oz (Pac..."
54821,"Cadbury Easter Creme Egg, 1.2-Ounce Eggs (Pack..."
66803,CedarHouse Ultra-Premium Cedar Paper Grilling ...
68710,Nutiva Hi Fiber Hemp Protein Powder - 16 OZ Go...


In [204]:
sample['test'] = [np.random.randint(0,100) for i in range(sample.shape[0])]
sample

Unnamed: 0,cluster,test
68037,Lipton Tea Bags Good Just the right type of te...,43
344,"Mad Monkey Coffee Capsules, 48 Count Five Star...",33
68833,"Annie's Organic Bunny Fruit Snacks, Variety Pa...",88
69327,"MarketSpice Teabags, box of 24 (Market Spice T...",28
22061,Suddenly Pasta Salad Yummy This mix is excelle...,10
...,...,...
88921,"Heinz Reduced Sugar Tomato Ketchup, 13 oz (Pac...",87
54821,"Cadbury Easter Creme Egg, 1.2-Ounce Eggs (Pack...",6
66803,CedarHouse Ultra-Premium Cedar Paper Grilling ...,53
68710,Nutiva Hi Fiber Hemp Protein Powder - 16 OZ Go...,59


In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet('samples/key_topics.parquet')

In [3]:
df

Unnamed: 0,labels,bad,good,ranking,product_category,bad_set,good_set
0,baby,430,1461,0.294319,apparel,poorly rough ripped thinner stitching suspende...,amazing carters family careful thinking beauti...
1,bag,198,603,0.328358,apparel,ripped falling useless coming broke poor broke...,amazing idea easier contents construction plan...
2,belt,591,1138,0.519332,apparel,digs fray pin sending home crap disappointing ...,amazing personally plenty surprise cell catch ...
3,bra,1623,3184,0.509736,apparel,ahh twisted madonna lying meh factor beware un...,blouses heaven duty yay subtle smart expectati...
4,costume,1088,2590,0.420077,apparel,practically refund junk sending sags god edge ...,expectations dirty warned preferred nicer gart...
...,...,...,...,...,...,...,...
910,stylus,275,1340,0.205224,wireless,usable poor falls broken angle version middle ...,amazing smartphone process purchases stuff ini...
911,unclustered,7634,16791,0.454648,wireless,buzzing ending garbled eligible beta deceived ...,stumbled lightly cleaner bows jackery replacea...
912,usb,445,594,0.749158,wireless,noticed recognized wouldn compatible picture e...,fantastic amazing highly contacted home meant ...
913,watch,543,649,0.836672,wireless,crap disappointing desired returning lock phot...,scratch professional messaging surprised singl...


In [4]:
dff = pd.read_parquet('samples/grocery_key_topics.parquet')

In [6]:
dff

Unnamed: 0,labels,bad,good,ranking,product_category,bad_set,good_set
0,baby,158,416,0.379808,grocery,items description content couldn reading expir...,eaten entire local bowl chips pure house regul...
1,bars,343,1018,0.336935,grocery,properly picture mold spit description worse m...,selection pleased sweetened spirulina fills wa...
2,beans,126,538,0.234201,grocery,crunchy severely change badly bent goods facto...,serving hot ship healthy prime veggies goodnes...
3,bread,101,613,0.164763,grocery,horrible bite terrible packaging,easiest hot set baking eaten son scratch effor...
4,butter,212,988,0.214575,grocery,bitter beware upset description maranatha real...,rice peanutbutter alternative pleased hope bow...
5,candy,747,2081,0.358962,grocery,beware smashed bitter upset receiving arcor st...,coming kinds tend sick easter hooked colorful ...
6,cereal,166,1062,0.156309,grocery,overpriced strange changed shipment waste upda...,purchasing total packages charges sweetened tr...
7,cheese,171,757,0.225892,grocery,description rate wrong supposed threw changed ...,total set purchasing wine craving chips goodne...
8,chia,117,426,0.274648,grocery,chinese wouldn selling supposed tasted dark gu...,makes minutes addition daily difference tapioc...
9,chips,311,957,0.324974,grocery,total refund bother tiny throw bits waste mult...,cooked school sauce knew veggies goodness stan...


In [8]:
pd.concat([df,dff]).to_parquet('samples/reviews_key_topics.parquet', index=False)

In [2]:
ktops = pd.read_parquet('samples/reviews_key_topics.parquet')

In [40]:
import os, calendar, pandas as pd, dask
from dask.distributed import Client
client = Client()
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.72 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:59870,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.72 GiB

0,1
Comm: tcp://127.0.0.1:59901,Total threads: 2
Dashboard: http://127.0.0.1:59903/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:59876,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-8yt8lwnl,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-8yt8lwnl

0,1
Comm: tcp://127.0.0.1:59907,Total threads: 2
Dashboard: http://127.0.0.1:59908/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:59877,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-i2etu4e1,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-i2etu4e1

0,1
Comm: tcp://127.0.0.1:59897,Total threads: 2
Dashboard: http://127.0.0.1:59899/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:59874,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-wf8zarh0,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-wf8zarh0

0,1
Comm: tcp://127.0.0.1:59902,Total threads: 2
Dashboard: http://127.0.0.1:59905/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:59875,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-i_gnhutj,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-i_gnhutj


In [5]:
files = [i for i in os.listdir('samples/labeled_samples_v2/')]

In [10]:
get_monthly_review_counts('apparel_clustered_embeddings.parquet')

Unnamed: 0,labels,January,February,March,April,May,June,July,August,September,October,November,December,product_category
0,baby,171,168,177,155,166,165,169,151,92,145,155,177,apparel
1,bag,81,71,76,62,76,55,82,93,44,59,51,51,apparel
2,belt,165,137,170,156,161,156,161,166,103,105,105,144,apparel
3,bra,399,386,441,437,449,536,530,489,285,276,269,310,apparel
4,costume,277,196,191,185,148,159,164,218,230,834,623,453,apparel
5,dress,633,683,912,1073,1031,979,980,978,435,469,480,624,apparel
6,girls',98,81,89,73,81,55,79,72,38,70,73,119,apparel
7,gloves,241,180,125,84,64,43,49,39,32,75,131,209,apparel
8,hat,585,468,434,415,394,462,434,404,231,307,372,559,apparel
9,hoodie,90,62,63,33,50,38,35,40,25,54,61,110,apparel


In [9]:
def get_monthly_review_counts(file):
    """Count reviews per cluster label and calendar month for one sample file.

    Parameters
    ----------
    file : str
        Name of a parquet file inside ``samples/labeled_samples_v2/``. The
        category name is the file name with the
        ``_clustered_embeddings.parquet`` suffix removed and underscores
        replaced by spaces.

    Returns
    -------
    pandas.DataFrame
        One row per label with a ``labels`` column, twelve integer count
        columns named after the months (January..December), and a
        ``product_category`` column. A label with no reviews in a month gets
        0 rather than NaN, and all twelve month columns are always present.
    """
    name = file.replace('_clustered_embeddings.parquet', '').replace('_', ' ')
    # Only the label and the date are needed for the counts.
    reviews = pd.read_parquet(
        f'samples/labeled_samples_v2/{file}',
        columns=['labels', 'review_date'],
    )
    months = pd.to_datetime(reviews['review_date']).dt.month

    counts = (
        reviews.assign(month=months)
        .groupby(['labels', 'month'])
        .size()
        .unstack('month', fill_value=0)
        # Guarantee columns 1..12 even when a month never occurs in the file.
        .reindex(columns=range(1, 13), fill_value=0)
    )
    # calendar.month_name[0] is an empty placeholder, so skip it.
    counts.columns = list(calendar.month_name[1:])
    counts.index.name = 'labels'
    counts = counts.reset_index()
    counts['product_category'] = name
    return counts

In [11]:
tasks = [dask.delayed(get_monthly_review_counts)(file) for file in files]

In [12]:
tasks_list = dask.delayed()(tasks)

In [13]:
results = tasks_list.compute()

In [15]:
pd.concat(results).to_parquet('samples/monthly_review_counts.parquet', index=False)

  partials[d] = partials_get(d, 0) + n


In [4]:
df = pd.read_parquet('samples/labeled_samples_v3/apparel_clustered_embeddings.parquet')

In [5]:
df.head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review,e1,e2,clusters,labels,product_category
0,15235042,R21ENC85KLYCRX,B000MX94XE,908407169,Bali Women's Skimp Skamp Brief Panty Number 2633,5,1,1,N,Y,2011-08-26,Best granny panties ever These panties are so ...,-3.855251,-0.202959,19,women,apparel
1,44050726,RZ3AYDOBNL9UF,B00IFE539G,802216157,PLUS SIZE MAXI PAISLEY SURPLICE DRESS,3,3,4,N,Y,2014-06-14,Okay This is a nice dress. This just is not fo...,0.464726,-0.65583,14,dress,apparel
2,13234601,R2UR25346UG5M1,B00NBFGR8U,727214648,EASY Women's Retro Vintage Punk plus size Ripp...,1,1,1,N,Y,2015-06-23,sizes to do match i ordered a large and got a ...,-3.098939,2.540366,-1,unclustered,apparel
3,3377308,R8OKJGCCG0GQT,B00KFEFK8S,223324092,Vikoros Waist Tummy Slimming Body Shapewear Be...,5,2,2,N,Y,2015-04-10,Perfect fit give you shape flatten your tummy ...,-3.208839,-2.01431,18,waist,apparel
4,6438794,RXSGXNTAR5Y7M,B00368CL9Q,387289369,Shock Absorber Women's Ultimate Run Bra,5,1,1,N,Y,2015-01-29,Great bra for exercise. This is a great bra! I...,-5.507154,-1.545273,4,bra,apparel


In [6]:
apparel = ktops[ktops.product_category=='apparel']

In [11]:
apparel

Unnamed: 0,labels,bad,good,ranking,product_category,bad_set,good_set
0,baby,430,1461,0.294319,apparel,poorly rough ripped thinner stitching suspende...,amazing carters family careful thinking beauti...
1,bag,198,603,0.328358,apparel,ripped falling useless coming broke poor broke...,amazing idea easier contents construction plan...
2,belt,591,1138,0.519332,apparel,digs fray pin sending home crap disappointing ...,amazing personally plenty surprise cell catch ...
3,bra,1623,3184,0.509736,apparel,ahh twisted madonna lying meh factor beware un...,blouses heaven duty yay subtle smart expectati...
4,costume,1088,2590,0.420077,apparel,practically refund junk sending sags god edge ...,expectations dirty warned preferred nicer gart...
5,dress,2719,6558,0.414608,apparel,deceiving frayed greatest poor oddly practical...,sunday fullness proper familiar slutty early o...
6,girls',216,712,0.303371,apparel,description poor return waste ripped paid seam...,amazing awesome monster summer durable person ...
7,gloves,379,893,0.424412,apparel,poorly rip refund mail poor ripped holding thi...,dirty stuff bunch arrive clothing tablet sligh...
8,hat,1193,3872,0.30811,apparel,alright paint uneven weren poorly envelope sta...,dirty surgery silk activities faux purchases w...
9,hoodie,162,499,0.324649,apparel,return xxl poor completely wouldn child wrong ...,amazing higher shipped chest live looked satis...


In [26]:
ktops[ktops.product_category=='apparel'].labels.unique()

array(['baby', 'bag', 'belt', 'bra', 'costume', 'dress', "girls'",
       'gloves', 'hat', 'hoodie', 'jacket', 'jeans', 'leggings', 'men',
       'pajamas', 'pants', 'robe', 'scarf', 'scrub', 'shirt', 'skirt',
       'socks', 'swimsuit', 'tie', 'unclustered', 'waist', 'wallet',
       'wig', 'women'], dtype=object)

In [19]:
df.product_category.unique()[0]

'apparel'

In [18]:
def get_bad_subset(df, label):
    """Return low-rated reviews of one cluster that mention a "bad" keyword.

    The keywords come from the ``bad_set`` column of the module-level
    ``ktops`` frame. When both ``df`` and ``ktops`` have a
    ``product_category`` column, the keyword row is taken from the category
    present in ``df``, so a label shared between categories (for example
    ``unclustered``) no longer picks up another category's words. If no
    keyword row matches that category, the first row for the label is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with ``review``, ``star_rating`` and ``labels`` columns.
    label : str
        Cluster label to select.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``labels == label``, ``star_rating <= 3`` and a
        review containing at least one keyword. Keywords are matched with
        ``in``, i.e. as substrings of the review text.

    Raises
    ------
    IndexError
        If ``ktops`` has no row for ``label``.
    """
    keyword_rows = ktops[ktops.labels == label]
    if 'product_category' in df.columns and 'product_category' in ktops.columns:
        same_category = keyword_rows[
            keyword_rows.product_category.isin(df.product_category.unique())]
        if not same_category.empty:
            keyword_rows = same_category
    bad_words = keyword_rows.bad_set.iloc[0].split()

    # Apply the cheap rating/label filters first so the per-row keyword scan
    # only runs on the rows that can end up in the result.
    candidates = df[(df.star_rating <= 3) & (df.labels == label)]
    # astype(bool) keeps the mask boolean even when there are no candidates.
    has_bad_word = candidates.review.apply(
        lambda text: any(word in text for word in bad_words)).astype(bool)
    return candidates.loc[has_bad_word]

In [19]:
%%time
dfs = []
for label in df.labels.unique().tolist():
    dfs.append(get_bad_subset(df, label))

CPU times: total: 1min 7s
Wall time: 1min 8s


In [20]:
bad_reviews = pd.concat(dfs)

In [21]:
def get_good_subset(df, label):
    """Return high-rated reviews of one cluster that mention a "good" keyword.

    The keywords come from the ``good_set`` column of the module-level
    ``ktops`` frame, looked up for ``label`` (previously the first row of
    ``ktops`` was used for every label). When both ``df`` and ``ktops`` have
    a ``product_category`` column, the keyword row is taken from the category
    present in ``df``; if no keyword row matches that category, the first
    row for the label is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with ``review``, ``star_rating`` and ``labels`` columns.
    label : str
        Cluster label to select.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``labels == label``, ``star_rating > 3`` and a
        review containing at least one keyword. Keywords are matched with
        ``in``, i.e. as substrings of the review text.

    Raises
    ------
    IndexError
        If ``ktops`` has no row for ``label``.
    """
    keyword_rows = ktops[ktops.labels == label]
    if 'product_category' in df.columns and 'product_category' in ktops.columns:
        same_category = keyword_rows[
            keyword_rows.product_category.isin(df.product_category.unique())]
        if not same_category.empty:
            keyword_rows = same_category
    # Split once here instead of once per review inside the lambda.
    good_words = keyword_rows.good_set.iloc[0].split()

    # Apply the cheap rating/label filters first so the per-row keyword scan
    # only runs on the rows that can end up in the result.
    candidates = df[(df.star_rating > 3) & (df.labels == label)]
    # astype(bool) keeps the mask boolean even when there are no candidates.
    has_good_word = candidates.review.apply(
        lambda text: any(word in text for word in good_words)).astype(bool)
    return candidates.loc[has_good_word]

In [22]:
%%time
dfs2 = []
for label in df.labels.unique().tolist():
    dfs2.append(get_good_subset(df, label))

CPU times: total: 1min 6s
Wall time: 1min 7s


In [23]:
good_reviews = pd.concat(dfs2)

In [28]:
bad_reviews[bad_reviews.review.duplicated(keep=False)].review.iloc[0]



In [17]:
ktops.good_set.iloc[0]

'amazing carters family careful thinking beautiful future told superb husband guy excellent customer parents nights target pool chest sweet continue prefer strong comfy hit children slightly functional fantastic scratch walk'

In [28]:
# Build one lazy task per cluster label; nothing is computed at this point.
# NOTE(review): `get_subset` is not defined in the visible part of this
# notebook (only get_bad_subset / get_good_subset are) -- confirm where it
# comes from before computing these tasks.
tasks = [dask.delayed(get_subset)(df,'apparel', label) for label in df.labels.unique().tolist()]

In [None]:
tasks_list = dask.delayed()(tasks)

In [None]:
results = tasks_list.compute()

In [35]:
'Rating: ' + bad_reviews.star_rating.astype(str) + ' ' + \
    'Title: ' + bad_reviews.product_title.astype(str) + ' ' + \
        'Review: ' + bad_reviews.review

161      Rating: 1 Title: Underworks Post Delivery Belt...
165      Rating: 2 Title: Fruit of the Loom Women's 6-P...
274      Rating: 2 Title: Women's Daywear Stretch Slim ...
827      Rating: 1 Title: Hanes Women's Cotton Brief Pa...
1070     Rating: 1 Title: Fruit of the Loom Women's 3 P...
                               ...                        
93715    Rating: 1 Title: Grey & White Plaid Flannel Me...
95710    Rating: 1 Title: LED Light Review: Absolute Ga...
96132    Rating: 1 Title: Carter's Watch the Wear Hoode...
97316    Rating: 1 Title: Nickelodeon Paw Patrol Little...
99627    Rating: 1 Title: Cookie Monster Cookie Power S...
Length: 9456, dtype: object

In [29]:
bad_reviews.to_parquet('apparel_bad_reviews.parquet', index=False)
good_reviews.to_parquet('apparel_good_reviews.parquet', index=False)

In [41]:
def add_cats(file):
    """Write a copy of one labeled sample with a ``product_category`` column.

    The file is read from ``samples/labeled_samples_v2/`` and written under
    the same name to ``samples/labeled_samples_v3/`` without the index. The
    category value is the file name with the
    ``_clustered_embeddings.parquet`` suffix removed and underscores replaced
    by spaces. Returns ``None``.
    """
    category = file.replace('_clustered_embeddings.parquet', '').replace('_', ' ')
    source_path = f'samples/labeled_samples_v2/{file}'
    target_path = f'samples/labeled_samples_v3/{file}'
    tagged = pd.read_parquet(source_path).assign(product_category=category)
    tagged.to_parquet(target_path, index=False)

In [43]:
files = [i for i in os.listdir('samples/labeled_samples_v3/')]

In [44]:
tasks = [dask.delayed(add_cats)(file) for file in files]

In [50]:
dask.delayed()(tasks).compute()

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [53]:
dff = pd.read_parquet('samples/labeled_samples_v3/wireless_clustered_embeddings.parquet')

In [54]:
dff

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review,e1,e2,clusters,labels,product_category
0,26129696,RN1NJ4UU01IEY,B0085EXLWW,269963285,Apple 30 Pin USB Cable EZOPower CERTIFIED 6 Fe...,5,1,1,N,Y,2014-09-23,"Five Stars Works well so far, cheaper then app...",12.904141,5.880183,26,cable,wireless
1,12852326,R16Y9IZE9UHBTL,B001S2R8TU,964300755,Sony 4/3/2-Channel ZR-Series Amplifier,5,1,1,N,Y,2012-01-17,Great for 4 6x9's This amp sounded so good tha...,17.273903,10.866034,7,speakers,wireless
2,51557291,R364C9RG4C582B,B00QMPY77E,709352186,"iPhone 6 Battery Case , Maxboost VIVID iPhone ...",5,1,1,N,Y,2015-07-14,I've ordered battery cases before and have bee...,9.899235,4.354083,21,case,wireless
3,33833896,R1HDB0MSI8MB7Y,B00NGNQ47G,419068635,"iPhone 6S Plus Case, Terrapin Trendy [Studded]...",5,3,5,N,Y,2014-11-26,Absolutely fantastic!! I am so thrilled to hav...,5.748475,11.434567,33,case,wireless
4,22125670,RBGN2ZEND2PYT,B00C96CMDG,124792632,Samsung Galaxy S4 Case - OtterBox Defender Ser...,5,1,1,N,Y,2013-12-07,Excellent protection for your Galaxy S4 The Ot...,3.808937,9.236429,30,case,wireless
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,12306293,R1L8HVLZE2G8KT,B00MOEFZQO,133246919,"iPhone SE Case, iPhone 5S Case, Shieldon Genui...",5,1,1,N,Y,2015-01-07,Just what I was looking for Loving the case so...,5.980806,11.558444,33,case,wireless
99996,31869696,R1ULPKRSE7DPI8,B00SXFEA9Q,438471763,UAG Samsung Galaxy S6 [5.1-inch screen] Feathe...,5,30,32,N,Y,2015-04-11,Another great UAG case Typical UAG case perfec...,2.869984,10.604469,-1,unclustered,wireless
99997,1684377,R3IV7OIU1V447E,B00EC4WJPY,348670033,Hot Sale! Z-Design Beautiful Personality& Crea...,2,1,1,N,Y,2014-06-10,Fair The case is nice but it's starting to com...,5.443238,10.432199,39,case,wireless
99998,16773961,R2I3Y050XE096Y,B0099I81IM,98128275,URBAN ARMOR GEAR Case for iPhone 5/5S,4,2,3,N,Y,2013-02-02,GREAT CUSTOMER SERVICE Loved everything about ...,3.921398,9.913481,-1,unclustered,wireless


---

In [27]:
from get_reviews_deprecated import get_reviews_having_keywords

In [28]:
bad_reviews, good_reviews = get_reviews_having_keywords('apparel_clustered_embeddings.parquet')

KeyboardInterrupt: 

In [6]:
def _sample_with_text(reviews, max_rows, random_state):
    """Sample up to ``max_rows`` reviews and add the combined ``text`` column."""
    picked = reviews.sample(min(max_rows, reviews.shape[0]), random_state=random_state)
    text = 'Rating: ' + picked.star_rating.astype(str) + ', ' + \
        'Title: ' + picked.product_title.astype(str) + ', ' + \
        'Review: ' + picked.review
    return picked.assign(text=text)[['product_category', 'labels', 'text']]


def take_sample(file, max_rows=5000, random_state=1729):
    """Sample low- and high-rated clustered reviews from one labeled file.

    Parameters
    ----------
    file : str
        Name of a parquet file inside ``samples/labeled_samples_v3/``.
    max_rows : int, default 5000
        Upper bound on the number of rows sampled for each of the two groups.
    random_state : int, default 1729
        Seed passed to ``DataFrame.sample`` for both groups.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bad, good)``: reviews with ``star_rating <= 3`` and reviews with
        ``star_rating > 3``. Each has the columns ``product_category``,
        ``labels`` and ``text``, where ``text`` joins rating, product title
        and review into one string. Rows with ``clusters == -1`` are
        excluded from both.
    """
    df = pd.read_parquet(f'samples/labeled_samples_v3/{file}')
    clustered = df[df.clusters != -1]
    bad = _sample_with_text(clustered[clustered.star_rating <= 3], max_rows, random_state)
    good = _sample_with_text(clustered[clustered.star_rating > 3], max_rows, random_state)
    return bad, good

In [1]:
import os, calendar, pandas as pd, dask
from dask.distributed import Client
client = Client()
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.72 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61311,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.72 GiB

0,1
Comm: tcp://127.0.0.1:61338,Total threads: 2
Dashboard: http://127.0.0.1:61339/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:61315,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-u11a01j0,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-u11a01j0

0,1
Comm: tcp://127.0.0.1:61341,Total threads: 2
Dashboard: http://127.0.0.1:61342/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:61314,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-e4tqwtis,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-e4tqwtis

0,1
Comm: tcp://127.0.0.1:61354,Total threads: 2
Dashboard: http://127.0.0.1:61356/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:61317,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-1_vdr2mc,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-1_vdr2mc

0,1
Comm: tcp://127.0.0.1:61355,Total threads: 2
Dashboard: http://127.0.0.1:61358/status,Memory: 3.93 GiB
Nanny: tcp://127.0.0.1:61316,
Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-e1e332vu,Local directory: C:\Users\E079051\AppData\Local\Temp\dask-worker-space\worker-e1e332vu


In [3]:
files = [i for i in os.listdir('samples/labeled_samples_v3/')]

In [7]:
tasks = [dask.delayed(take_sample)(file) for file in files]

In [8]:
results = dask.delayed()(tasks).compute()

In [12]:
pd.concat([x[0] for x in results]).to_parquet('samples/bad_reviews.parquet', index=False)
pd.concat([x[1] for x in results]).to_parquet('samples/good_reviews.parquet', index=False)

In [11]:
pd.concat([x[0] for x in results])

Unnamed: 0,product_category,labels,text
59154,apparel,shirt,"Rating: 2, Title: Star Trek Starfleet Uniform ..."
37170,apparel,jacket,"Rating: 2, Title: Baihong Men's Military Style..."
72333,apparel,dress,"Rating: 1, Title: Women's 2-in-1 Dress with bu..."
5495,apparel,wallet,"Rating: 1, Title: YCM0201 Mens Magic Wallet Cr..."
7341,apparel,dress,"Rating: 3, Title: Women's Plus Size Off Should..."
...,...,...,...
89036,wireless,battery,"Rating: 2, Title: Zerolemon Travel External Ba..."
1494,wireless,charger,"Rating: 2, Title: Anker 40W 5-Port Family-Size..."
3282,wireless,usb,"Rating: 1, Title: kwmobile micro USB dockingst..."
69880,wireless,speakers,"Rating: 2, Title: Boss AR2600.2 ARMOR 2-Channe..."
