In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Amazon food reviews

Data Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

The Amazon Fine Food Reviews dataset consists of reviews of fine foods from Amazon.

Number of reviews: 568,454
Number of users: 256,059
Number of products: 74,258
Timespan: Oct 1999 - Oct 2012
Number of Attributes/Columns in data: 10

Attribute Information:

1. Id
2. ProductId - unique identifier for the product
3. UserId - unqiue identifier for the user
4. ProfileName
5. HelpfulnessNumerator - number of users who found the review helpful
6. HelpfulnessDenominator - number of users who indicated whether they found the review helpful or not
7. Score - rating between 1 and 5
8. Time - timestamp for the review
9. Summary - brief summary of the review
10. Text - text of the review


[Q] How to determine if a review is positive or negative?

[Ans] We could use the Score/Rating. A rating of 4 or 5 could be cosnidered a positive review. A review of 1 or 2 could be considered negative. A review of 3 is nuetral and ignored. This is an approximate and proxy way of determining the polarity (positivity/negativity) of a review.

## Reading the data

In [3]:
df = pd.read_csv("Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df.shape

(568454, 10)

## 2. EDA

In [5]:
# check for duplicates
df.groupby('UserId').count()['Id']>1

UserId
#oc-R103C0QSV1DF5E    False
#oc-R109MU5OBBZ59U    False
#oc-R10LFEMQEW6QGZ    False
#oc-R10LT57ZGIB140    False
#oc-R10UA029WVWIUI    False
                      ...  
AZZV9PDNMCOZW          True
AZZVNIMTTMJH6         False
AZZY649VYAHQS         False
AZZYCJOJLUDYR         False
AZZZOVIBXHGDR         False
Name: Id, Length: 256059, dtype: bool

In [6]:
# drop duplicates

df1 = df.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)

In [7]:
df1.shape

(393933, 10)

In [8]:
# data remaining
(df1.shape[0]/df.shape[0])*100

69.29901100176971

In [9]:
# creating a target variable using score

def get_review(x):
    if x<3:
        return 0
    else:
        return 1

In [10]:
df1['review'] = df1['Score'].apply(lambda x: get_review(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['review'] = df1['Score'].apply(lambda x: get_review(x))


In [11]:
df1['review'].value_counts()

1    336825
0     57108
Name: review, dtype: int64

In [12]:
df1[df1['HelpfulnessNumerator']>df1['HelpfulnessDenominator']]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,review
44736,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...,1
64421,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...,1


In [13]:
df1 = df1[df1['HelpfulnessNumerator']<=df1['HelpfulnessDenominator']]

## Text processing

In [14]:
df.iloc[1]['Text']

'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".'

In [15]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sumanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# there are html tags like <br> need to remove that

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned
print(len(stop))

179


In [17]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
# this code takes a while to run as it needs to run on 500k sentences.
i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in df1['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (df1['Score'].values)[i] == '1': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(df1['Score'].values)[i] == '0':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [18]:
df1['CleanedText']=final_string

In [19]:
df1.iloc[0]['CleanedText']

b'bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better'

In [20]:
df1.iloc[0]['Text']

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [22]:
df1.shape

(393931, 12)

# Bag Of Words

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(df1['Text'].values)

In [25]:
final_counts.shape

(393931, 120251)

In [35]:
final_counts

<393931x120251 sparse matrix of type '<class 'numpy.int64'>'
	with 21180238 stored elements in Compressed Sparse Row format>

# Bi-Gram, n-gram

In [36]:
#bi-gram, tri-gram and n-gram

#removing stop words like "not" should be avoided before building n-grams
count_vect = CountVectorizer(ngram_range=(1,2) ) #in scikit-learn

# ngram_range=(1,1) only unigram, (1,2) both uni and bi, (2,2) onyl bi-grams
final_bigram_counts = count_vect.fit_transform(df1['Text'].values)

In [37]:
unigram_vect = CountVectorizer(ngram_range=(1,1))

final_unigram_counts = unigram_vect.fit_transform(df1['Text'].values)

final_unigram_counts.shape

(393931, 120251)

In [38]:
bigram_vect = CountVectorizer(ngram_range=(2,2))

final_bigram_counts = bigram_vect.fit_transform(df1['Text'].values)

final_bigram_counts.shape

(393931, 2966537)

In [39]:
# no of trigrams>no of bigrams>no of unigrams

# Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(df1['Text'].values)

In [None]:
features = tf_idf_vect.get_feature_names()
len(features)


In [None]:
features[100000:100010]
