In [1]:
import json
import pandas as pd
import itertools
import csv
import string
import pickle
import matplotlib.pyplot as plt

from collections import Counter
import re
import spacy
import nltk
from spacy.lang.en import English
from nltk.util import ngrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

[nltk_data] Downloading package wordnet to /Users/xzhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
plt.style.use('seaborn')

In [3]:
pkl_path = '/Users/xzhou/github/project_files/amazon/amazon_alexa_reviews_w_sentiment.pkl'

with open(pkl_path, 'rb') as pkl_file:
    df = pd.read_pickle(pkl_file)

df.head()

Unnamed: 0,rating,date,review,sentiment
2,4,2018-07-31,"Sometimes while playing a game, you can answer...",1
3,5,2018-07-31,I have had a lot of fun with this thing. My 4 ...,1
5,5,2018-07-31,I received the echo as a gift. I needed anothe...,1
6,3,2018-07-31,"Without having a cellphone, I cannot use many ...",0
7,5,2018-07-31,I think this is the 5th one I've purchased. I'...,1


In [4]:
# Slice data based on sentiment

df_positive = df[df['sentiment']==1]
df_neutral = df[df['sentiment']==0]
df_negative = df[df['sentiment']==-1]

In [5]:
X = df['review']
X_positive = df_positive['review']
X_neutral = df_neutral['review']
X_negative = df_negative['review']

In [6]:
# Define a topic modeling engine for repetitive use. 
# This can help test out models and identify optimal result.

class TopicModelEngine:
    
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.stop_words = set(stopwords.words('English'))
        
        
    def get_lemma(self, word):
        """
        Get the root words
        """
        try:
            lemma = wn.morphy(word)
        except:
            lemma = word
        return lemma
    
    def stop_words_extention(self, add_stop_words):
        """
        Default stop words are provided by nltk
        This is to extend list of stop words. Need to provide a list.  
        """
        add_words_set = set (add_stop_words)
        self.stop_words = self.stop_words.union(add_words_set)
        
        
    def tokenize(self, text):
        """
        This is to perform word tokenizing. It performs ngram (1,2)
        """
        lda_tokens = []
        text=text.strip().lower()

        # Extract word portion out of sentence and excludes spaace
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token for token in tokens if (not token.isspace())]

        for token in tokens:
            lda_tokens.append(token)

        # Extract bigrams for additional analysis
        bigrams = ngrams(tokens, 2)

        for bg, count in Counter(bigrams).most_common():
            token_string = []
            token_string = str(bg[0]) + " " + str(bg[1])
            lda_tokens.append(token_string)

        return lda_tokens
    
    def text_processing(self, sentence):
        """
        This is to break down text sentence into words, and 
        clean up the words to prepare for LDA modeling.
        """
    
        tokens = self.tokenize(sentence)
        tokens = [word for word in tokens if len(tokens)>5]
        tokens = [word for word in tokens if word not in self.stop_words]
        tokens = [self.get_lemma(word) for word in tokens]

        return tokens 
    
    def LDA_preprocessing(self):
        """
        Provided with raw data, the proprocessing will automatically prepare and
        clean up the data, and generate necessay documents for LDA's use.
        """
        self.text_data = []

        for line in self.raw_data:
            tokens = self.text_processing(line)
            tokens=[token for token in tokens if token is not None]
            self.text_data.append(tokens)
        
        self.dictionary = corpora.Dictionary(self.text_data)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.text_data]

    def LDA_model(self, num_of_topics, num_passes, file_name_to_save_as):
        """
        This is to perform topic modeling and save the model under the name
        user provides
        """
        self.LDA_preprocessing()
        num_topics = num_of_topics
        self.lda = LdaModel(corpus=self.corpus,  num_topics=num_topics, 
                       id2word=self.dictionary, passes=num_passes) 
        self.lda.save(file_name_to_save_as)
        
        return self.lda
    
    def print_topics(self, num_of_words):
        """
        Print out topics for users's decision making. 
        User can specify number of words to print out
        """
        topics = self.lda.print_topics(num_words=num_of_words)
        for topic in topics:
            print('Topic ' + str(topic[0]+1)+': '+str(topic[1]))

In [7]:
# Default stop words are provided by nltk
# Additional words to exlude. Those words don't contain meaningful information. 
# They were identified during multiple iternations

additional_words_to_exclude = set(['get', 'one', 'say', 'still', 'amazon',
                        'try', 'need', '34', 'echo', 'dot', 'even'
                        'like', 'go'])

In [8]:
# Identify top 5 negative topics

topics_neg5 = TopicModelEngine(X_negative)
topics_neg5.stop_words_extention(additional_words_to_exclude)
topics_neg5.LDA_model(num_of_topics=5, num_passes=20, file_name_to_save_as='topics_neg5')
topics_neg5.print_topics(num_of_words=10)

Topic 1: 0.014*"use" + 0.013*"music" + 0.013*"know" + 0.010*"question" + 0.010*"ask" + 0.009*"buy" + 0.009*"answer" + 0.009*"plug" + 0.009*"try" + 0.008*"able"
Topic 2: 0.019*"music" + 0.015*"play" + 0.013*"work" + 0.010*"device" + 0.010*"time" + 0.008*"home" + 0.008*"thing" + 0.008*"google" + 0.007*"much" + 0.006*"use"
Topic 3: 0.018*"work" + 0.016*"sound" + 0.015*"like" + 0.012*"music" + 0.011*"really" + 0.011*"use" + 0.011*"speaker" + 0.010*"good" + 0.010*"answer" + 0.010*"ask"
Topic 4: 0.016*"product" + 0.014*"buy" + 0.011*"use" + 0.009*"work" + 0.009*"music" + 0.008*"like" + 0.008*"want" + 0.008*"phone" + 0.007*"month" + 0.007*"connect"
Topic 5: 0.016*"device" + 0.010*"buy" + 0.010*"connect" + 0.009*"work" + 0.008*"time" + 0.008*"return" + 0.007*"speaker" + 0.007*"month" + 0.007*"working" + 0.007*"works"


In [9]:
# Identify top 10 negative topics

topics_neg10 = TopicModelEngine(X_negative)
topics_neg10.stop_words_extention(additional_words_to_exclude)
topics_neg10.LDA_model(num_of_topics=10, num_passes=20, file_name_to_save_as='topics_neg10')
topics_neg10.print_topics(num_of_words=10)

Topic 1: 0.021*"device" + 0.014*"work" + 0.013*"use" + 0.010*"connect" + 0.009*"help" + 0.009*"customer" + 0.009*"warranty" + 0.009*"connection" + 0.007*"tv" + 0.006*"internet"
Topic 2: 0.020*"music" + 0.017*"buy" + 0.013*"use" + 0.012*"product" + 0.011*"device" + 0.011*"month" + 0.011*"purchase" + 0.010*"prime" + 0.009*"connect" + 0.008*"play"
Topic 3: 0.016*"device" + 0.014*"use" + 0.013*"bulb" + 0.010*"hub" + 0.010*"get" + 0.009*"really" + 0.009*"hue" + 0.008*"find" + 0.008*"things" + 0.008*"month"
Topic 4: 0.018*"time" + 0.017*"music" + 0.013*"work" + 0.011*"ask" + 0.011*"question" + 0.010*"light" + 0.009*"times" + 0.009*"play" + 0.009*"use" + 0.008*"like"
Topic 5: 0.025*"work" + 0.011*"connect" + 0.011*"try" + 0.011*"phone" + 0.009*"thing" + 0.009*"wifi" + 0.009*"instructions" + 0.009*"disconnect" + 0.009*"buy" + 0.009*"product"
Topic 6: 0.013*"time" + 0.012*"month" + 0.011*"working" + 0.010*"set" + 0.009*"dot" + 0.009*"product" + 0.009*"stop" + 0.009*"play" + 0.008*"device" + 0.0

In [10]:
# Identify top 5 positive topics

topics_pos5 = TopicModelEngine(X_positive)
topics_pos5.stop_words_extention(additional_words_to_exclude)
topics_pos5.LDA_model(num_of_topics=5, num_passes=20, file_name_to_save_as='topics_pos5')
topics_pos5.print_topics(num_of_words=10)

Topic 1: 0.051*"music" + 0.037*"love" + 0.032*"play" + 0.020*"use" + 0.020*"weather" + 0.020*"ask" + 0.019*"fun" + 0.014*"question" + 0.014*"like" + 0.013*"much"
Topic 2: 0.017*"like" + 0.016*"new" + 0.010*"love" + 0.010*"things" + 0.009*"skill" + 0.008*"add" + 0.008*"time" + 0.008*"use" + 0.008*"list" + 0.008*"voice"
Topic 3: 0.066*"love" + 0.029*"buy" + 0.019*"room" + 0.015*"house" + 0.014*"music" + 0.014*"gift" + 0.013*"use" + 0.013*"speaker" + 0.013*"great" + 0.011*"another"
Topic 4: 0.090*"great" + 0.038*"works" + 0.031*"product" + 0.030*"speaker" + 0.029*"sound" + 0.026*"good" + 0.021*"love" + 0.016*"better" + 0.014*"use" + 0.014*"device"
Topic 5: 0.048*"easy" + 0.030*"use" + 0.030*"set" + 0.021*"home" + 0.020*"smart" + 0.018*"light" + 0.014*"tv" + 0.013*"connect" + 0.013*"love" + 0.013*"control"


In [11]:
# Identify top 10 positive topics

topics_pos10 = TopicModelEngine(X_positive)
topics_pos10.stop_words_extention(additional_words_to_exclude)
topics_pos10.LDA_model(num_of_topics=10, num_passes=20, file_name_to_save_as='topics_pos10')
topics_pos10.print_topics(num_of_words=10)

Topic 1: 0.058*"product" + 0.054*"great" + 0.041*"works" + 0.023*"like" + 0.021*"well" + 0.016*"new" + 0.015*"work" + 0.015*"device" + 0.014*"expect" + 0.011*"recommend"
Topic 2: 0.061*"speaker" + 0.056*"sound" + 0.031*"good" + 0.028*"better" + 0.026*"great" + 0.025*"quality" + 0.018*"device" + 0.015*"price" + 0.015*"connect" + 0.015*"nice"
Topic 3: 0.065*"love" + 0.033*"buy" + 0.025*"use" + 0.022*"home" + 0.019*"house" + 0.016*"dot" + 0.015*"day" + 0.012*"much" + 0.012*"another" + 0.011*"2"
Topic 4: 0.155*"love" + 0.066*"great" + 0.030*"gift" + 0.029*"works" + 0.022*"buy" + 0.019*"use" + 0.019*"good" + 0.019*"amaze" + 0.018*"learning" + 0.017*"things"
Topic 5: 0.021*"best" + 0.021*"buy" + 0.015*"know" + 0.015*"thing" + 0.015*"need" + 0.015*"old" + 0.014*"year" + 0.013*"ever" + 0.011*"take" + 0.011*"think"
Topic 6: 0.117*"easy" + 0.063*"use" + 0.062*"set" + 0.032*"love" + 0.023*"helpful" + 0.022*"setup" + 0.021*"great" + 0.016*"fire" + 0.016*"fun" + 0.013*"tv"
Topic 7: 0.025*"home" + 0