In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import re
import HTMLParser
import string
import itertools
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('Reviews.csv')
print df.columns
df.drop('Id',axis=1, inplace=True)

Index([u'Id', u'ProductId', u'UserId', u'ProfileName', u'HelpfulnessNumerator',
       u'HelpfulnessDenominator', u'Score', u'Time', u'Summary', u'Text'],
      dtype='object')


Use the below code if you want to see the full text

**pd.options.display.max_colwidth = 1000**

Before going ahead with the analysis, we will drop rows having scores as 3 and mark Score of 1,2 as bad and 4,5 as good. This is to make the dataset a binary classification dataset which we will analyze later having multiple classes

In [3]:
len(df[df.Score == 3])

42640

In [4]:
len(df)

568454

In [5]:
df.drop(df[df.Score == 3].index, inplace=True)

In [6]:
len(df[df.Score == 3])

0

In [7]:
len(df)

525814

In [8]:
# lambda function to mark scores as bad or good depending on the Score
f = lambda x: 'bad' if x['Score'] < 3 else 'good'

In [9]:
df['Rating'] = df.apply(f, axis=1)

In [10]:
df.head(2)

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Rating
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,good
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,bad


** Let's also convert time to readable form **

In [11]:
# I have checked and the hour, minute, second component are same for all and hence converting to d, m, y
time_f = lambda x: datetime.datetime.fromtimestamp(x['Time']).strftime('%Y-%m-%d')

In [12]:
df['Readable_Time'] = df.apply(time_f, axis=1)

# Univariate Analysis

1. Let's group by products, to find the unique products and the count for each one as well, so that we will be able to find top n products on reviews

In [13]:
group_by_products = pd.DataFrame({'count' : df.groupby( [ "ProductId"] ).size()}).reset_index()

We could have simple run the below code to find the total count, <br>
__len(df.groupby(['ProductId']).size())__ <br>
However, we require the grouped dataframe to draw some insights

In [14]:
print "We have a total of '{}' unique products in out dataset".format(len(group_by_products))

We have a total of '72005' unique products in out dataset


In [15]:
top_20_products_with_reviews = group_by_products.sort_values(by='count',ascending=False).head(20)['ProductId']

In [16]:
top_20_products_with_reviews

69057    B007JFMH8M
41160    B002QWP89S
36919    B0026RQTGE
41154    B002QWHJOU
41161    B002QWP8H0
44997    B003B3OOPA
27918    B001EO5Q64
16573    B000NMJWZO
34331    B001RVFEP2
36815    B0026KNQSA
22733    B0013NUGDE
66847    B006HYLW32
69207    B007M832YY
19993    B000VK8AVK
69208    B007M83302
36819    B0026KPDG8
34332    B001RVFERK
64031    B005K4Q37A
64030    B005K4Q34S
64033    B005K4Q4LK
Name: ProductId, dtype: object

In [17]:
for i in top_20_products_with_reviews:
    print df[df['ProductId'] == i][['UserId','Summary','Time']].head(1)

                UserId     Summary        Time
562970  A368Z46FIKHSEZ  Delicious!  1343433600
               UserId                                          Summary  \
20982  A21U4DR8M6I9QN  addictive! but works for night coughing in dogs   

             Time  
20982  1318896000  
               UserId                                          Summary  \
74631  A21U4DR8M6I9QN  addictive! but works for night coughing in dogs   

             Time  
74631  1318896000  
                UserId                                          Summary  \
329412  A21U4DR8M6I9QN  addictive! but works for night coughing in dogs   

              Time  
329412  1318896000  
                UserId                                          Summary  \
355503  A21U4DR8M6I9QN  addictive! but works for night coughing in dogs   

              Time  
355503  1318896000  
                UserId                        Summary        Time
367524  A2X674TN3P7AVU  Taste good, great with honey!  1330473600
          

The above result tells us is that, a same product can have different ProductId's, however, the UserId's are same and the same user has commented on all similar products having different product id's

__Conclusion__:
 - or maybe the users are copy pasting the same summary/texts for multiple such Product Id's
 - or something wierd is going on!

In [18]:
word_counter = lambda x: len(x['Text'].split(' '))

In [19]:
df['word_count'] = df.apply(word_counter, axis=1)

In [20]:
df['word_count'].max()

2640

In [21]:
df['word_count'].min()

3

In [22]:
df.sort_values('word_count', ascending=False)

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Rating,Readable_Time,word_count
68700,B0051BWU92,A2GZG363BPADYC,Larry Deemer,4,4,5,1323993600,SEARCHING FOR A PET APPETITE ENHANCER?,**********************************************...,good,2011-12-16,2640
541158,B005EF0HTK,#oc-R1ZR5L29T4LSAE,M. Lafferman,9,17,1,1320019200,NOT FUNNY!!!!/ UPDATE: SECOND GIFT BASKET I R...,"I must admit, I haven't gotten many gift baske...",bad,2011-10-31,2233
175972,B007TGO1U8,A2VW4FYZILSXF2,"Jojoleb ""jojoleb""",53,54,4,1342310400,A fine new option for a low calorie sweetner,PROS:<br />Very nearly like sugar<br />Works g...,good,2012-07-15,2128
331318,B001P74NXM,A1WW22EAVQNM56,W. Christ,19,23,5,1312243200,Disagree With Negative Reviews,"A few reviewers commented in a black & white, ...",good,2011-08-02,2096
346183,B002P0Q14C,A1FLQ698D9C0C8,G. Zhang,0,2,5,1252800000,Tea Antioxidants,Tea Antioxidants<br />----------------<br /><b...,good,2009-09-13,2075
407775,B000EMAZPO,A28P0QPSXBJTN,"Captain Aaron ""Captain Aaron of Her Majesty's...",0,10,4,1282089600,The Real Black Pearl: An Adventure Tale,Lipton Black Pearl Tea. Ahhhh. Now only a bl...,good,2010-08-18,1964
209088,B00004RAMY,A9PC5HJNGD22D,Geoffrey Klos,366,371,5,1193875200,Don't follow the directions,Hallelujah!!! The wicked mole is dead.<br /><...,good,2007-11-01,1942
97610,B0010EI0QG,A150QS4IZB3XJ,"April Vawter ""Wilderness Photographer""",12,13,5,1318377600,Saving Whisper's Life!,Update: 3-9-12<br /><br />Tomorrow marks the s...,good,2011-10-12,1865
269916,B003TDYDU8,A17V9XL4CWTQ6G,Chandler,4,6,5,1328140800,"Sets the bar in dog food, most are not human-g...",I spent 5 years studying canine nutrition and ...,good,2012-02-02,1863
175184,B001Z4KOK6,A17V9XL4CWTQ6G,Chandler,8,8,5,1328140800,Most dog foods are not human-grade. There simp...,I spent 5 years studying canine nutrition and ...,good,2012-02-02,1863


In [23]:
# Normalize helpfullness numerator and denominator
min_help_numerator = df['HelpfulnessNumerator'].min()
max_help_numerator = df['HelpfulnessNumerator'].max()
min_help_denominator = df['HelpfulnessNumerator'].min()
max_help_denominator = df['HelpfulnessNumerator'].max()

df['HelpfulnessNumerator_N'] = (df['HelpfulnessNumerator'] - min_help_numerator)/(max_help_numerator - min_help_numerator)
df['HelpfulnessDenominator_N'] = (df['HelpfulnessDenominator'] - min_help_denominator)/(max_help_denominator - min_help_denominator)

In [24]:
df['HelpfulnessIndicator'] = df['HelpfulnessNumerator']*df['HelpfulnessNumerator_N']/df['HelpfulnessDenominator']

### Longer reviews are more helpful

In [25]:
df.sort_values('HelpfulnessIndicator', ascending=False)[['word_count','Rating','Score','HelpfulnessIndicator']].head(20)

Unnamed: 0,word_count,Rating,Score,HelpfulnessIndicator
190733,1223,good,5,0.986333
566779,346,good,5,0.925012
235722,801,bad,1,0.655064
96104,442,good,5,0.642051
373575,442,good,5,0.642051
293135,442,good,5,0.642051
116802,442,good,5,0.642051
130478,442,good,5,0.642051
210511,442,good,5,0.642051
297366,442,good,5,0.642051


### Positive review are common

In [26]:
len(df[df['Rating'] == 'good'])

443777

In [27]:
len(df[df['Rating'] == 'bad'])

82037

### Positive reviews are shorter

In [28]:
df.sort_values('word_count', ascending=False)[['Rating','word_count','HelpfulnessIndicator']].head(20)

Unnamed: 0,Rating,word_count,HelpfulnessIndicator
68700,good,2640,0.004619
541158,bad,2233,0.005502
175972,good,2128,0.060068
331318,good,2096,0.018124
346183,good,2075,0.0
407775,good,1964,0.0
209088,good,1942,0.416937
97610,good,1865,0.012791
269916,good,1863,0.003079
175184,good,1863,0.009238


In [29]:
group_by_users = pd.DataFrame({'Count' : df.groupby( ['UserId','word_count','Rating'] ).size()}).reset_index()

In [30]:
group_by_users.sort_values(by='Count',ascending=False).head(20)

Unnamed: 0,UserId,word_count,Rating,Count
268054,A3TVZM3ZIXG8YW,864,bad,199
76556,A1TMAVN4CEM8U8,514,good,137
119112,A29JUMRL1US6YP,346,good,56
206334,A36JDIN9RAAIEC,173,good,51
119113,A29JUMRL1US6YP,347,good,46
256538,A3PJZ8TU8FDQ1K,242,good,42
303019,AF3BYMPWKWO8F,203,bad,41
79341,A1UQBFCERIP7VJ,109,good,38
107799,A25C5MVVCIYT5D,280,good,34
95708,A20P8VC55KPPCT,105,good,31
