# Text Analysis

In [None]:
# import Python packages
import pandas as pd
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt

# regression package
import statsmodels.api as sm

# sentiment analysis packages
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from textblob import TextBlob

# topic modeling packages
import gensim
from gensim import corpora

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Sentiment Analysis

In [None]:
# product review data
# Opens Colab's file-upload widget; the recorded output shows the chosen file
# being saved as product_reviews.csv in the working directory.
# NOTE(review): `uploaded` is not referenced again in the visible cells -- the
# CSV is read back from disk by its fixed name on the next line.
uploaded = files.upload()
reviews = pd.read_csv('product_reviews.csv') 
# take a look at the data
reviews.head(2)

Saving product_reviews.csv to product_reviews.csv


Unnamed: 0,Review_ID,Item_ID,Base_item_ID,Review_date,Reviewer_ID,Real_name,Verified_purchase,Rating,Title,Content,...,Helpful_votes_week15,Helpful_votes_week16,Helpful_votes_week17,Helpful_votes_week18,Helpful_votes_week19,Helpful_votes_week20,Helpful_votes_week21,Helpful_votes_week22,Helpful_votes_week23,Helpful_votes_week24
0,R100E6MT94PK6L,B0051VVOB2,,1/8/2012,A1HGATCAMGXTGF,False,True,5,Love My Kindle Fire!,I love my fire and highly recommend it to anyo...,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5
1,R100HU42LKLLD0,B0057O9O6K,,4/10/2012,A3GGO95QT2PP47,False,True,2,Not the best Tablet or a good buy,The operating system is is an early android. Y...,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [None]:
# (number of rows, number of columns) of the raw review table
reviews.shape

(40741, 58)

In [None]:
# keep only the review text (Content) and star ratings
# .copy() makes reviews_sample an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning
reviews_sample = reviews[['Rating','Content']].copy()

In [None]:
# preview the first two rows of the reduced table
reviews_sample.head(2)

Unnamed: 0,Rating,Content
0,5,I love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...


### Text Preprocessing

In [None]:
# remove punctuation and numbers, lower case the text
def clean(text):
    """Normalise one review to lower-case ASCII letters separated by spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any value that is not a str (e.g. None, or the float
        NaN pandas uses for an empty CSV cell) is treated as an empty review.

    Returns
    -------
    str
        The text with every run of non-letter characters collapsed to a single
        space and all letters lower-cased; '' for a missing review.
    """
    # re.sub raises TypeError on non-string input, so handle missing reviews first
    if not isinstance(text, str):
        return ''
    # replace any run of non-letters with a single space
    text = re.sub('[^A-Za-z]+', ' ', text)
    # lower case the text
    return text.lower()

# apply the function clean to each review
# assign() returns a new DataFrame instead of writing a column into what may be
# a slice of `reviews`, which is what raises pandas' SettingWithCopyWarning
reviews_sample = reviews_sample.assign(
    **{'Cleaned Reviews': reviews_sample['Content'].apply(clean)}
)
reviews_sample.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Rating,Content,Cleaned Reviews
0,5,I love my fire and highly recommend it to anyo...,i love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...


In [None]:
# we only want the reviews related to kindle
mentions_kindle = reviews_sample['Cleaned Reviews'].str.contains('kindle')
# .copy() gives an independent DataFrame, so new columns can be added to the
# filtered reviews without a SettingWithCopyWarning
reviews_sample = reviews_sample[mentions_kindle].copy()

In [None]:
# (rows, columns) remaining after keeping only reviews that mention 'kindle'
reviews_sample.shape

(16381, 3)

In [None]:
# tokenize, remove stop words, stem
# we use the Porter stemmer, a process for removing suffixes from words in English
ps = PorterStemmer() 

# load the English stop-word list once, instead of rebuilding it for every token
STOP_WORDS = frozenset(stopwords.words('english'))

def token_stop_stem(text):
    """Tokenize a cleaned review, drop English stop words and stem the rest.

    Parameters
    ----------
    text : str
        Cleaned (lower-cased, letters-only) review text.

    Returns
    -------
    str
        The stemmed tokens, each preceded by a single space (so a non-empty
        result starts with a space); '' when no token survives.
    """
    stems = [ps.stem(word) for word in word_tokenize(text) if word not in STOP_WORDS]
    return ''.join(' ' + stem for stem in stems)

# assign() builds a new DataFrame rather than writing a column into the
# filtered frame, which avoids pandas' SettingWithCopyWarning
reviews_sample = reviews_sample.assign(
    **{'Final Reviews': reviews_sample['Cleaned Reviews'].apply(token_stop_stem)}
)
reviews_sample.head(2)

Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,oper system earli android cant use download k...
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,read kindl sinc kindl releas pretti heavili i...


In [None]:
# Variables: one subset of reviews per product aspect of interest
def reviews_mentioning(keyword):
    """Return an independent copy of the reviews whose cleaned text contains `keyword`.

    The copy matters because Score/Polarity columns are added to each subset
    later; writing into a plain boolean-filtered slice triggers pandas'
    SettingWithCopyWarning.
    """
    mask = reviews_sample['Cleaned Reviews'].str.contains(keyword)
    return reviews_sample[mask].copy()

screen = reviews_mentioning('screen')
customer = reviews_mentioning('customer service')
weight = reviews_mentioning('weight')
price = reviews_mentioning('price')

### Analyzing polarity

In [None]:
# we will score the polarity of each review
# polarity ranges from -1 (negative) to 1 (positive)
# under the hood, TextBlob uses a lexicon-based method for scoring
# for details, see https://github.com/sloria/TextBlob/blob/dev/textblob/_text.py 
def getPolarityScore(review):
    """Return the TextBlob sentiment polarity of a single review string."""
    blob = TextBlob(review)
    sentiment = blob.sentiment
    return sentiment.polarity

# map a numeric polarity score to a sentiment label
def getPolarity(score):
    """Label a polarity score as 'Negative' (< 0), 'Neutral' (== 0) or 'Positive'.

    Any score that is neither below zero nor equal to zero falls through to
    'Positive'.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
# assign() builds a new DataFrame instead of writing columns into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
# Polarity is derived from the freshly computed Score column.
screen = screen.assign(
    Score=screen['Final Reviews'].apply(getPolarityScore),
    Polarity=lambda df: df['Score'].apply(getPolarity),
)
screen.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
9,4,"This is a nice little mini computer, but I am ...",this is a nice little mini computer but i am a...,nice littl mini comput littl disappoint bough...,0.166667,Positive
22,4,"Kindle Fire, Full Color 7"" Multi-touch Display...",kindle fire full color multi touch display wi ...,kindl fire full color multi touch display wi ...,0.444444,Positive


In [None]:
# number of 'screen' reviews per polarity label
screen['Polarity'].value_counts()

Positive    5417
Negative     343
Neutral       46
Name: Polarity, dtype: int64

In [None]:
# assign() builds a new DataFrame instead of writing columns into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
# Polarity is derived from the freshly computed Score column.
customer = customer.assign(
    Score=customer['Final Reviews'].apply(getPolarityScore),
    Polarity=lambda df: df['Score'].apply(getPolarity),
)
customer.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
242,4,I pre-ordered a Kindle Fire even before the pr...,i pre ordered a kindle fire even before the pr...,pre order kindl fire even product st appear m...,0.023512,Positive
328,5,My first two kindles (pre-fire) both died spon...,my first two kindles pre fire both died sponta...,first two kindl pre fire die spontan without ...,0.05,Positive


In [None]:
# number of 'customer service' reviews per polarity label
customer['Polarity'].value_counts()

Positive    526
Negative     69
Neutral       8
Name: Polarity, dtype: int64

In [None]:
# assign() builds a new DataFrame instead of writing columns into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
# Polarity is derived from the freshly computed Score column.
weight = weight.assign(
    Score=weight['Final Reviews'].apply(getPolarityScore),
    Polarity=lambda df: df['Score'].apply(getPolarity),
)
weight.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
30,5,The Kindle Fire does everything it promised it...,the kindle fire does everything it promised it...,kindl fire everyth promis would opinion much ...,0.191304,Positive
60,5,Everything the average person could want in a ...,everything the average person could want in a ...,everyth averag person could want tablet devic...,0.316667,Positive


In [None]:
# number of 'weight' reviews per polarity label
weight['Polarity'].value_counts()

Positive    893
Negative     35
Neutral       3
Name: Polarity, dtype: int64

In [None]:
# assign() builds a new DataFrame instead of writing columns into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
# Polarity is derived from the freshly computed Score column.
price = price.assign(
    Score=price['Final Reviews'].apply(getPolarityScore),
    Polarity=lambda df: df['Score'].apply(getPolarity),
)
price.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,Score,Polarity
22,4,"Kindle Fire, Full Color 7"" Multi-touch Display...",kindle fire full color multi touch display wi ...,kindl fire full color multi touch display wi ...,0.444444,Positive
27,5,"I bought the Fire for my wife for Christmas, k...",i bought the fire for my wife for christmas kn...,bought fire wife christma know match ipad use...,0.25,Positive


In [None]:
# number of 'price' reviews per polarity label
price['Polarity'].value_counts()

Positive    3828
Negative     143
Neutral       27
Name: Polarity, dtype: int64

## Topic Modeling

In [None]:
# tokenize, remove stop words, return tokens

# load the English stop-word list once, instead of rebuilding it for every token
STOP_WORDS = frozenset(stopwords.words('english'))

def token_stop(text):
    """Tokenize a cleaned review and drop English stop words.

    Parameters
    ----------
    text : str
        Cleaned (lower-cased, letters-only) review text.

    Returns
    -------
    list of str
        The remaining tokens in their original order (unstemmed).
    """
    return [word for word in word_tokenize(text) if word not in STOP_WORDS]

# assign() builds a new DataFrame rather than writing a column into the
# filtered frame, which avoids pandas' SettingWithCopyWarning
reviews_sample = reviews_sample.assign(
    **{'LDA Reviews': reviews_sample['Cleaned Reviews'].apply(token_stop)}
)
reviews_sample.head(2)

Unnamed: 0,Rating,Content,Cleaned Reviews,Final Reviews,LDA Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,oper system earli android cant use download k...,"[operating, system, early, android, cant, use,..."
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,read kindl sinc kindl releas pretti heavili i...,"[reading, kindle, since, kindle, released, pre..."


In [None]:
# build a gensim Dictionary from the per-review token lists; the printed
# summary reports how many unique tokens it holds
dict_ = corpora.Dictionary(reviews_sample['LDA Reviews'])
print(dict_)

Dictionary(28518 unique tokens: ['android', 'barns', 'cant', 'com', 'download']...)


In [None]:
# turn every review's token list into its bag-of-words representation via the
# dictionary; the resulting list is the corpus handed to the LDA models
doc_term_matrix = [
    dict_.doc2bow(review_tokens)
    for review_tokens in reviews_sample['LDA Reviews']
]

In [None]:
# LDA hyperparameters for the 3-topic model
num_topics = 3
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

# chunksize was declared above but never handed to the model, so gensim's
# default was silently used; pass it explicitly so the setting takes effect.
# NOTE: topic outputs recorded before this change were produced with the default.
lda_model_3 = lda(doc_term_matrix,
    num_topics=num_topics,
    id2word = dict_,
    chunksize=chunksize,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651  # fixed seed for reproducible topics
)

In [None]:
# show 10 words (with their weights) for each topic of the 3-topic model
lda_model_3.print_topics(num_words=10)

[(0,
  '0.038*"kindle" + 0.037*"fire" + 0.014*"books" + 0.012*"ipad" + 0.011*"great" + 0.011*"love" + 0.011*"read" + 0.010*"reading" + 0.010*"use" + 0.010*"like"'),
 (1,
  '0.039*"kindle" + 0.028*"fire" + 0.017*"amazon" + 0.012*"one" + 0.010*"would" + 0.009*"get" + 0.006*"christmas" + 0.006*"got" + 0.006*"bought" + 0.005*"time"'),
 (2,
  '0.014*"tablet" + 0.010*"device" + 0.010*"android" + 0.009*"app" + 0.009*"apps" + 0.008*"screen" + 0.007*"ipad" + 0.007*"amazon" + 0.007*"like" + 0.006*"use"')]

In [None]:
# LDA hyperparameters for the 5-topic model
num_topics = 5
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

# chunksize was declared above but never handed to the model, so gensim's
# default was silently used; pass it explicitly so the setting takes effect.
# NOTE: topic outputs recorded before this change were produced with the default.
lda_model_5 = lda(doc_term_matrix,
    num_topics=num_topics,
    id2word = dict_,
    chunksize=chunksize,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651  # fixed seed for reproducible topics
)

In [None]:
# show 10 words (with their weights) for each topic of the 5-topic model
lda_model_5.print_topics(num_words=10)

[(0,
  '0.025*"screen" + 0.012*"kindle" + 0.012*"reading" + 0.012*"battery" + 0.011*"touch" + 0.010*"use" + 0.009*"like" + 0.008*"read" + 0.008*"life" + 0.008*"e"'),
 (1,
  '0.050*"kindle" + 0.046*"fire" + 0.017*"books" + 0.015*"love" + 0.015*"amazon" + 0.011*"read" + 0.010*"one" + 0.010*"would" + 0.010*"use" + 0.009*"great"'),
 (2,
  '0.015*"tablet" + 0.013*"android" + 0.012*"device" + 0.012*"app" + 0.012*"apps" + 0.010*"amazon" + 0.007*"like" + 0.006*"screen" + 0.005*"use" + 0.005*"one"'),
 (3,
  '0.030*"kindle" + 0.019*"fire" + 0.018*"amazon" + 0.011*"one" + 0.010*"get" + 0.010*"would" + 0.007*"problem" + 0.007*"back" + 0.006*"customer" + 0.006*"service"'),
 (4,
  '0.067*"ipad" + 0.023*"fire" + 0.023*"kindle" + 0.015*"great" + 0.014*"tablet" + 0.013*"price" + 0.011*"apple" + 0.009*"device" + 0.009*"screen" + 0.008*"good"')]

In [None]:
# LDA hyperparameters for the 7-topic model
num_topics = 7
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

# chunksize was declared above but never handed to the model, so gensim's
# default was silently used; pass it explicitly so the setting takes effect.
# NOTE: topic outputs recorded before this change were produced with the default.
lda_model_7 = lda(doc_term_matrix,
    num_topics=num_topics,
    id2word = dict_,
    chunksize=chunksize,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651  # fixed seed for reproducible topics
)

In [None]:
# show 10 words (with their weights) for each topic of the 7-topic model
lda_model_7.print_topics(num_words=10)

[(0,
  '0.040*"tablet" + 0.021*"android" + 0.012*"apps" + 0.012*"app" + 0.011*"screen" + 0.009*"good" + 0.009*"market" + 0.009*"battery" + 0.008*"great" + 0.007*"get"'),
 (1,
  '0.048*"kindle" + 0.036*"fire" + 0.016*"one" + 0.015*"amazon" + 0.013*"books" + 0.011*"love" + 0.011*"would" + 0.010*"get" + 0.009*"bought" + 0.008*"christmas"'),
 (2,
  '0.009*"use" + 0.009*"usb" + 0.008*"app" + 0.007*"one" + 0.007*"computer" + 0.007*"files" + 0.006*"keyboard" + 0.006*"need" + 0.006*"pc" + 0.006*"laptop"'),
 (3,
  '0.026*"kindle" + 0.020*"amazon" + 0.014*"fire" + 0.011*"problem" + 0.010*"service" + 0.010*"customer" + 0.010*"would" + 0.009*"get" + 0.009*"back" + 0.008*"device"'),
 (4,
  '0.054*"kindle" + 0.045*"fire" + 0.026*"love" + 0.026*"great" + 0.018*"books" + 0.018*"use" + 0.016*"easy" + 0.012*"read" + 0.012*"movies" + 0.011*"games"'),
 (5,
  '0.020*"amazon" + 0.020*"ipad" + 0.019*"device" + 0.015*"fire" + 0.011*"apps" + 0.010*"kindle" + 0.009*"app" + 0.008*"tablet" + 0.007*"like" + 0.007*

In [None]:
# NOTE(review): scored on doc_term_matrix, the same corpus the model was trained on
lda_model_3.log_perplexity(doc_term_matrix)

-7.399004212703191

In [None]:
# NOTE(review): scored on doc_term_matrix, the same corpus the model was trained on
lda_model_5.log_perplexity(doc_term_matrix)

-7.413189683999696

In [None]:
# NOTE(review): scored on doc_term_matrix, the same corpus the model was trained on
lda_model_7.log_perplexity(doc_term_matrix)

-7.420609963631983