In [2]:
import pandas as pd
import nltk
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import time
import warnings
warnings.filterwarnings("ignore")
import re

In [358]:
df = pd.read_csv('meta_and_review_original.csv')
df = df.rename(columns={'Unnamed: 0':'ori_index'})

In [359]:
df.dropna(subset=['review'], axis=0,inplace=True)
df=df.reset_index(drop=True)

In [360]:
df.shape

(133159, 15)

## Replace apostrophe/short words in python

In [361]:
df['review']=df['review'].apply(lambda x: x.lower())

In [362]:
contractions = {
"i'm":"i am",
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": " he will",
"he'll've": " he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "I would",
"i'd've": "I would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it shall have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she shall",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": " what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who shall have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you shall have",
"you're": "you are",
"you've": "you have"
}

In [363]:
def replace_apostrophe(review):
    for word in review.split():
        if word.lower() in contractions: 
            review = review.replace(word, contractions[word.lower()])
    return review.lower()

In [364]:
%%time
df['review']=df['review'].apply(replace_apostrophe)

CPU times: user 1.94 s, sys: 32.1 ms, total: 1.97 s
Wall time: 1.99 s


## aspect-based sentiment

In [365]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [366]:
def contains_keyword(s,target_list):
    for target in target_list:
        if target in s:
            return True
    return False

In [367]:
def get_sentiment(review,target_list):
    contain_key = False
    pos,neg,neu,polar=[],[],[],[]
    for sentence in nltk.sent_tokenize(review):
        if contains_keyword(sentence,target_list):
            contain_key = True
            score=analyzer.polarity_scores(sentence)
            pos.append(score['pos'])
            neg.append(score['neg'])
            neu.append(score['neu'])
            polar.append(score['compound'])
    if contain_key:
        n=len(pos)
        return sum(pos)/n, sum(neg)/n, sum(neu)/n, sum(polar)/n
    else:
        return 0, 0, 0, 0        

In [368]:
delivery_keyword_list=['delivery','shipment']
price_keyword_list=['price', 'pricing', 'cost', 'charge']
service_keyword_list=['return', 'exchange', 'service', 'support']
package_keyword_list=['pack', 'package']

In [369]:
def add_columns(df,aspect,target_list):
    df1=df.copy()
    pos = 'pos_'+ str(aspect)
    neg = 'neg_'+ str(aspect)
    neu = 'neu_'+ str(aspect)
    polar = 'polarity_'+ str(aspect)
    sentiment_result = df1['review'].apply(lambda x: get_sentiment(x,target_list))
    df1[pos]=sentiment_result.map(lambda x: x[0])
    df1[neg]=sentiment_result.map(lambda x: x[1])
    df1[neu]=sentiment_result.map(lambda x: x[2])
    df1[polar]=sentiment_result.map(lambda x: x[3])
    return df1

In [370]:
%%time
df1= add_columns(df,'delivery',delivery_keyword_list)
df2= add_columns(df1,'price',price_keyword_list)
df3= add_columns(df2,'service',service_keyword_list)
df_full= add_columns(df3,'package',package_keyword_list)

CPU times: user 1min 14s, sys: 300 ms, total: 1min 14s
Wall time: 1min 14s


In [371]:
df_full['review'][3]

'this is a wonderful shampoo. does not make your hair too soft, does not make it dandruffy, your scalp does not itch afterwards, hair is glossy etc. is not cheap, but really good. just compare the prices online and pick the cheapest with free delivery.'

In [372]:
df_full.iloc[3,:]

ori_index                                                            3
asin                                                        5357954771
review               this is a wonderful shampoo. does not make you...
overall                                                              5
helpful                                                         [1, 1]
summary                                             The shampoo to go!
unixReviewTime                                              1270252800
reviewTime                                                  04 3, 2010
reviewerID                                              A36ZKFLSDE8WX5
title                Moisture Repair Conditioner by KMS for Unisex ...
brand                                                              KMS
price                                                            10.79
salesRank                                                       108278
net                                                            ['8.5']
net_le

In [373]:
df_full['review'][4]

'i have a very dry, frizzy hair. i have tried many hair products and cannot make it look sleek. i bought kms product through my friend recommendation. it was not overnight change, but after a month, my hair looks much sleek. the conditioner is very rich and i do like the light smell a lot.the product arrives on time although the package looks different from what is on the picture.'

In [374]:
df_full.iloc[4,:]

ori_index                                                            4
asin                                                        5357954771
review               i have a very dry, frizzy hair. i have tried m...
overall                                                              5
helpful                                                         [0, 0]
summary                                      KMS conditioner is great.
unixReviewTime                                              1341532800
reviewTime                                                  07 6, 2012
reviewerID                                               AVCSVH9L0BPIF
title                Moisture Repair Conditioner by KMS for Unisex ...
brand                                                              KMS
price                                                            10.79
salesRank                                                       108278
net                                                            ['8.5']
net_le

In [375]:
df_full.to_csv('sentiment_by_aspect_xy.csv')

### merge

In [408]:
df2=pd.read_csv('sentiment_by_aspect_xy.csv',index_col=0)
senti_features=['ori_index','pos_delivery', 'neg_delivery', 'neu_delivery',
       'polarity_delivery', 'pos_price', 'neg_price', 'neu_price',
       'polarity_price', 'pos_service', 'neg_service', 'neu_service',
       'polarity_service', 'pos_package', 'neg_package', 'neu_package',
       'polarity_package','review']
df2=df2[senti_features]

In [409]:
df1=pd.read_csv('meta_and_review_with_functions.csv')
df1 =df1.rename(columns={'Unnamed: 0':'ori_index'})
df1=df1.dropna(subset=['review'], axis=0).reset_index(drop=True)

In [395]:
def identify_diff(s1,s2):
    result=[]
    s2_set=s2.unique()
    for i in range(len(s1)):
        if s1[i] not in s2_set:
            result.append(s1[i])
    return result

print(identify_diff(df1['ori_index'],df2['ori_index']))
print(identify_diff(df2['ori_index'],df1['ori_index']))

[92285]
[37819]


In [386]:
print(df1.shape,df2.shape)

(133159, 31) (133159, 17)


In [410]:
df3=df1.merge(df2, how='inner',on='ori_index').drop(columns=['ori_index'])

In [411]:
print(df3.shape)

(133158, 47)


In [423]:
df3.loc[37819,['review_x','review_y']]

review_x    this shampooconditioner takes some getting use...
review_y    this shampoo/conditioner takes some getting us...
Name: 37819, dtype: object

In [389]:
df3.columns

Index(['asin', 'review', 'overall', 'helpful', 'summary', 'unixReviewTime',
       'reviewTime', 'reviewerID', 'title', 'brand', 'price', 'salesRank',
       'net', 'net_len', 'helpful_overall', 'helpful_positive',
       't_func_cleansing', 'r_func_cleansing', 't_func_dandruff',
       'r_func_dandruff', 't_func_growth', 'r_func_growth', 't_func_moisture',
       'r_func_moisture', 't_func_repair', 'r_func_repair', 't_func_color',
       'r_func_color', 'satisfactory', 'temp', 'pos_delivery', 'neg_delivery',
       'neu_delivery', 'polarity_delivery', 'pos_price', 'neg_price',
       'neu_price', 'polarity_price', 'pos_service', 'neg_service',
       'neu_service', 'polarity_service', 'pos_package', 'neg_package',
       'neu_package', 'polarity_package'],
      dtype='object')

In [396]:
df3.to_csv('merged_text.csv')

In [3]:
pd.read_csv('merged_text.csv')

Unnamed: 0.1,Unnamed: 0,asin,review,overall,helpful,summary,unixReviewTime,reviewTime,reviewerID,title,...,neu_price,polarity_price,pos_service,neg_service,neu_service,polarity_service,pos_package,neg_package,neu_package,polarity_package
0,0,1929099886,this was a good buy great shampoo and conditio...,4.0,"[0, 0]",good shampoo and conditioner,1355011200,"12 9, 2012",A2BQ8DVGEGWAFY,bumble and bumble quenching shampoo and condit...,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
1,1,5357954771,i have used kms in the past and always loved t...,3.0,"[0, 0]",not too impressed,1376352000,"08 13, 2013",A2Z9QP4MZ2HXCZ,moisture repair conditioner by kms for unisex ...,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
2,2,5357954771,i remember kms products being a prestige salon...,5.0,"[0, 0]",joey v,1385251200,"11 24, 2013",ACZ94JB8BFMJ9,moisture repair conditioner by kms for unisex ...,...,0.496,0.7263,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
3,3,5357954771,this is a wonderful shampoo doesnt make your h...,5.0,"[1, 1]",the shampoo to go,1270252800,"04 3, 2010",A36ZKFLSDE8WX5,moisture repair conditioner by kms for unisex ...,...,0.769,0.5106,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
4,4,5357954771,i have a very dry frizzy hair i have tried man...,5.0,"[0, 0]",kms conditioner is great,1341532800,"07 6, 2012",AVCSVH9L0BPIF,moisture repair conditioner by kms for unisex ...,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.204,0.0,0.796,0.7501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133153,133153,B00KNCOD26,i really love the shampoo even more than the a...,5.0,"[0, 0]",love this cayenne shampoo hair growth serum,1405641600,"07 18, 2014",AZ8SSDP9TZZWD,2 pc set cayenne hair growth shampoo 84 oz amp...,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
133154,133154,B00KYCLTCM,received a sample of this and its matching con...,2.0,"[0, 1]",performance is average smell is gnarly,1404604800,"07 6, 2014",A2JWGWM7C75365,oribe gold lust repair and restore shampoo,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,1.000,0.0000
133155,133155,B00KYCLTCM,this is an amazing shampoo for my thick straig...,5.0,"[0, 0]",this is an amazing shampoo for my thick,1404950400,"07 10, 2014",A24J358C0ZCFLY,oribe gold lust repair and restore shampoo,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
133156,133156,B00KYWMYEE,definitely worth the money,5.0,"[0, 0]",five stars,1404086400,"06 30, 2014",A385AA5J7QK4VB,lcl beauty reclining hydraulic all purpose cut...,...,0.000,0.0000,0.0,0.0,0.0,0.0,0.000,0.0,0.000,0.0000
