# Amazon Beauty Products Review-Sentiment Analysis

## Import Necessary Libraries

In [1]:
# Dataframe
import pandas as pd

# Array
import numpy as np

# Visualizations
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import matplotlib.colors as colors
%matplotlib inline

# Datetime
from datetime import datetime

## Warnings
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

## Reading the Inspected Dataset

In [2]:
# Read the inspected dataset into a dataframe.
# NOTE(review): the path below is an absolute, machine-specific Windows location,
# so this cell fails on any other machine until the path is edited.
df2 = pd.read_csv('C:/Users/guzel/Documents/amazon_beauty_products/data_inspected.csv')

In [3]:
# Preview the first three rows to check that the file loaded as expected
df2.head(3)

Unnamed: 0,customer,product,rating,review_text,pos_feedback,neg_feedback,rating_class,time
0,A6VPK7X53QNAQ,B0000CC64W,5.0,If I had to choose only one product to take ca...,5,0,good,2009-06-18
1,A3CHMHGSJSQ02J,B0000CC64W,5.0,Makes my skin lovely and smooth As a woman nea...,2,0,good,2013-01-18
2,A1V1EP514B5H7Y,B0000CC64W,5.0,Works well at a reasonable price I've used thi...,0,0,good,2011-11-29


## Preprocessing the review text

In [4]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata
# Tokenizer used by the stopword-removal step further down
tokenizer = ToktokTokenizer()

# Load the small English spaCy pipeline with its default components.
# The former `parse=True, tag=True, entity=True` keywords were dropped: they are
# not part of the documented `spacy.load` signature (they date from spaCy 1.x)
# and newer spaCy releases reject unknown keyword arguments.
nlp = spacy.load('en_core_web_sm')

**Removing HTML tags**

We will write a function to remove HTML tags, which typically do not add much value towards understanding and analyzing text. 

In [5]:
def strip_html_tags(text):
    """Return the plain text of ``text`` with all HTML markup removed.

    The input is parsed with BeautifulSoup's built-in "html.parser" backend
    and only the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

**Removing accented characters**

We will write a function to convert and standardize accented characters/letters into ASCII characters.

In [6]:
def remove_accented_chars(text):
    """Replace accented letters with their plain ASCII counterparts.

    The text is NFKD-normalized so that an accented letter becomes a base
    letter followed by combining marks; encoding to ASCII with errors ignored
    then drops the marks (and any other non-ASCII character).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

**Expanding Contractions**

We will write a function to convert each contraction to its expanded, original form in order to help with text standardization. 

In [7]:
def expand_contractions(text, contraction_mapping=None):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict, optional
        Maps a contraction (e.g. ``"can't"``) to its expanded form
        (e.g. ``"cannot"``). When omitted, the module-level ``CONTRACTION_MAP``
        is used, as before.

    Returns
    -------
    str
        ``text`` with every known contraction expanded (matching is
        case-insensitive and the first character of the matched text is kept,
        so ``"Can't"`` becomes ``"Cannot"``) and all ``'`` characters removed.
    """
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    expanded_text = text
    # An empty mapping would produce a pattern that matches the empty string
    # everywhere, so the substitution is skipped in that case.
    if contraction_mapping:
        # Longest keys first: regex alternation takes the first alternative that
        # matches, so "can't" must not be tried before "can't've".
        keys_longest_first = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys_longest_first)),
            flags=re.IGNORECASE | re.DOTALL)

        # Fallback table for matches whose casing differs from the stored key
        # (e.g. text "i'm" when only "I'm" is a key).
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowered_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            # Keep the casing of the first character as written in the text.
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    # Drop apostrophes left over from contractions that are not in the mapping.
    return re.sub("'", "", expanded_text)

**Removing Special Characters**

We will use simple regular expressions (regexes) to remove special characters and symbols, i.e. non-alphanumeric characters and, optionally, digits.

In [8]:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are removed as well.

    Returns
    -------
    str
        ``text`` without the disallowed characters (nothing is inserted in
        their place).
    """
    # The upper-case range must be `A-Z`: the previous `A-z` also spans the
    # ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `) and let them through.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

**Lemmatization**

We will reduce each word to its base (dictionary) form, known as its lemma. 

In [9]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the global spaCy pipeline ``nlp``. Tokens whose
    lemma is the placeholder '-PRON-' keep their original text. The resulting
    tokens are joined with single spaces, so punctuation ends up as separate,
    space-delimited items.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

# Quick check on a sample sentence: inflected forms should come back as base forms
lemmatize_text("My system keeps crashing! the car crashed yesterday, ours crashes daily")

'My system keep crash ! the car crash yesterday , ours crash daily'

**Removing stopwords**

We will write a function to remove stopwords which have little or no significance in the text. 

In [10]:
# Tokenizer that remove_stopwords uses to split text into tokens
tokenizer = ToktokTokenizer()

# Start from NLTK's English stopword list
stopword_list = stopwords.words('english')

# Negations can carry sentiment, so they are taken out of the stop list
for negation in ('no', 'not'):
    stopword_list.remove(negation)

In [11]:
def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from ``text`` and return the remaining tokens joined by spaces.

    The text is split with the global ``tokenizer`` and each token is compared
    against the global ``stopword_list`` (which no longer contains 'no' and
    'not'). When ``is_lower_case`` is True the tokens are compared as they
    are; otherwise they are lower-cased for the comparison only, and the
    surviving tokens keep their original casing.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

**Building a Text Normalizer**

Based on the functions we have written above, plus a few additional text-correction steps, we will build a text normalizer to help us preprocess the review text. 

In [12]:
def normalize_corpus(doc, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Clean a single document by running it through the normalization steps.

    Despite the name, ``doc`` is one text, not a collection. The steps run in
    this order, each optional one being controlled by its flag:

    1. strip HTML tags                      (``html_stripping``)
    2. remove accented characters           (``accented_char_removal``)
    3. expand contractions                  (``contraction_expansion``)
    4. lower-case                           (``text_lower_case``)
    5. replace line breaks by a space       (always)
    6. lemmatize                            (``text_lemmatization``)
    7. remove special characters / digits   (``special_char_removal``,
       ``remove_digits``)
    8. collapse repeated spaces, drop apostrophes (always)
    9. remove stopwords                     (``stopword_removal``)

    Parameters
    ----------
    doc : str or None
        Text to normalize. ``None`` and float NaN (pandas' missing value) are
        treated as an empty document.

    Returns
    -------
    str
        The normalized text; ``''`` for a missing document.
    """
    # A missing value would otherwise raise inside the first string operation.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''

    # strip HTML
    if html_stripping:
        doc = strip_html_tags(doc)

    # remove accented characters
    if accented_char_removal:
        doc = remove_accented_chars(doc)

    # expand contractions
    if contraction_expansion:
        doc = expand_contractions(doc)

    # lowercase the text
    if text_lower_case:
        doc = doc.lower()

    # Replace runs of line breaks by a single space. The '|' is part of the
    # set on purpose: the previous pattern `[\r|\n|\r\n]+` was a character
    # class that also contained '|', and that behaviour is kept unchanged.
    doc = re.sub(r'[\r\n|]+', ' ', doc)

    # lemmatize text
    if text_lemmatization:
        doc = lemmatize_text(doc)

    # remove special characters and/or digits
    if special_char_removal:
        # Surround { . ( - ) ! } with spaces first so that removing them does
        # not glue the neighbouring words together. The hyphen is escaped:
        # unescaped, `(-)` inside a character class is the range "(" to ")"
        # and the hyphen itself was never matched.
        doc = re.sub(r'([{.(\-)!}])', r' \1 ', doc)
        doc = remove_special_characters(doc, remove_digits=remove_digits)

    # remove extra whitespace
    doc = re.sub(' +', ' ', doc)

    # remove ' (apostrophe) sign
    doc = re.sub(r"'", r'', doc)

    # remove stopwords
    if stopword_removal:
        doc = remove_stopwords(doc, is_lower_case=text_lower_case)

    return doc

**Applying text normalizer to "review_text"**

In [13]:
# Normalize every review with the default pipeline settings and keep the
# result in a new 'clean_text' column
df2['clean_text'] = df2['review_text'].map(normalize_corpus)

**Applying tokenizer to create tokens for the clean text**

In [14]:
# Split each cleaned review into word tokens (runs of \w characters) and store
# the resulting list in a new "tokens" column.
# A dedicated name is used instead of rebinding the global `tokenizer`:
# remove_stopwords reads that global, so overwriting it here would change its
# tokenization whenever an earlier cell is re-run after this one.
regexp_tokenizer = RegexpTokenizer(r'\w+')
df2["tokens"] = df2["clean_text"].apply(regexp_tokenizer.tokenize)
df2.head()

Unnamed: 0,customer,product,rating,review_text,pos_feedback,neg_feedback,rating_class,time,clean_text,tokens
0,A6VPK7X53QNAQ,B0000CC64W,5.0,If I had to choose only one product to take ca...,5,0,good,2009-06-18,choose one product take care face rest life wo...,"[choose, one, product, take, care, face, rest,..."
1,A3CHMHGSJSQ02J,B0000CC64W,5.0,Makes my skin lovely and smooth As a woman nea...,2,0,good,2013-01-18,make skin lovely smooth woman near need help g...,"[make, skin, lovely, smooth, woman, near, need..."
2,A1V1EP514B5H7Y,B0000CC64W,5.0,Works well at a reasonable price I've used thi...,0,0,good,2011-11-29,work well reasonable price use regenerating se...,"[work, well, reasonable, price, use, regenerat..."
3,A1X2LENOF84LCQ,B0000CC64W,4.0,This does work ladies I have tried so many pro...,62,13,good,2005-04-13,work lady try many product totally disappointe...,"[work, lady, try, many, product, totally, disa..."
4,A2PATWWZAXHQYA,B0000CC64W,1.0,Did not like the feel/texture of this serum I ...,1,0,bad,2013-12-21,not like feel texture serum love oil olay prim...,"[not, like, feel, texture, serum, love, oil, o..."


In [15]:
# Token statistics: corpus size, vocabulary size and review-length extremes
all_words = [word for tokens in df2["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in df2["tokens"]]
vocabulary = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(vocabulary)))
print("Max review length is (word based) %s" % max(sentence_lengths))
# This line reports the minimum; it was previously mislabelled as "Max".
print("Min review length is (word based) %s" % min(sentence_lengths))

2024818 words total, with a vocabulary size of 27739
Max review length is (word based) 1090
Max review length is (word based) 1


## Prepare the Text-Preprocessed Dataset for Modeling by Selecting the Required Columns

In [16]:
# Reduce the review date to its calendar year: the 'time' column is parsed as
# datetimes and only the year component is stored in a new 'year' column
df2['year'] = pd.DatetimeIndex(df2['time']).year

In [17]:
# Keep only the columns listed below. This rebinds df2, so every other column
# is no longer available after this cell has run.
df2 = df2[['customer','product','review_text','rating_class','year','clean_text','tokens']]

In [18]:
# Preview the first three rows of the reduced dataframe
df2.head(3)

Unnamed: 0,customer,product,review_text,rating_class,year,clean_text,tokens
0,A6VPK7X53QNAQ,B0000CC64W,If I had to choose only one product to take ca...,good,2009,choose one product take care face rest life wo...,"[choose, one, product, take, care, face, rest,..."
1,A3CHMHGSJSQ02J,B0000CC64W,Makes my skin lovely and smooth As a woman nea...,good,2013,make skin lovely smooth woman near need help g...,"[make, skin, lovely, smooth, woman, near, need..."
2,A1V1EP514B5H7Y,B0000CC64W,Works well at a reasonable price I've used thi...,good,2011,work well reasonable price use regenerating se...,"[work, well, reasonable, price, use, regenerat..."


## Write dataframe to CSV file

In [19]:
 # Write the reduced dataframe to a comma-separated UTF-8 file in the current
 # working directory, without the index column.
 # NOTE(review): 'tokens' holds Python lists, which CSV stores as plain text, so
 # a reader of this file gets strings back rather than lists - TODO confirm
 # downstream code parses them.
 df2.to_csv('cleaned_dataset.csv', sep = ',', encoding = 'utf-8', index = False)