In [24]:
import json
import pandas as pd
import itertools
import csv
import string
import pickle
import matplotlib.pyplot as plt

from collections import Counter
import re
import spacy
import nltk
from spacy.lang.en import English
from nltk.util import ngrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

[nltk_data] Downloading package wordnet to /Users/xzhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
plt.style.use('seaborn')

In [5]:
pkl_path = '/Users/xzhou/github/project_files/amazon/amazon_alexa_reviews_w_sentiment.pkl'

with open(pkl_path, 'rb') as pkl_file:
    df = pd.read_pickle(pkl_file)

df.head()

Unnamed: 0,rating,date,review,sentiment
2,4,2018-07-31,"Sometimes while playing a game, you can answer...",1
3,5,2018-07-31,I have had a lot of fun with this thing. My 4 ...,1
5,5,2018-07-31,I received the echo as a gift. I needed anothe...,1
6,3,2018-07-31,"Without having a cellphone, I cannot use many ...",0
7,5,2018-07-31,I think this is the 5th one I've purchased. I'...,1


In [7]:
# Slice data based on sentiment

df_positive = df[df['sentiment']==1]
df_neutral = df[df['sentiment']==0]
df_negative = df[df['sentiment']==-1]

In [8]:
X = df['review']
X_positive = df_positive['review']
X_neutral = df_neutral['review']
X_negative = df_negative['review']

In [25]:
# Use WordNetLemmatizer to get the root word.

def get_lemma(word):
    try:
        lemma = wn.morphy(word)
    except:
        lemma = word
    return lemma

In [27]:
stop_words = set(stopwords.words('English'))

In [50]:
irrelevant_words = set(['get', 'one', 'say', 'still', 'amazon',
                        'try', 'need', '34', 'echo', 'dot', 'even'])

In [51]:
stop_words = stop_words.union(irrelevant_words)

In [52]:
def tokenize(text):
    lda_tokens = []
    text=text.strip().lower()
    
    # Extract word portion out of sentence and excludes spaace
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [token for token in tokens if (not token.isspace())]
    
    for token in tokens:
        lda_tokens.append(token)
        
    # Extract bigrams for additional analysis
    bigrams = ngrams(tokens, 2)
     
    for bg, count in Counter(bigrams).most_common():
        token_string = []
        token_string = str(bg[0]) + " " + str(bg[1])
        lda_tokens.append(token_string)

    return lda_tokens

In [53]:
def text_processing(sentence):
    
    tokens = tokenize(sentence)
    tokens = [word for word in tokens if len(tokens)>5]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [get_lemma(word) for word in tokens]
    
    return tokens    

In [54]:
text_data = []

for line in X_negative:
    tokens = text_processing(line)
    tokens=[token for token in tokens if token is not None]
    text_data.append(tokens)

In [55]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [56]:
num_topics = 10

lda = LdaModel(corpus=corpus,  num_topics=num_topics, id2word=dictionary, passes=15) 
lda.save('test')


In [58]:
topics = lda.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.014*"music" + 0.014*"back" + 0.013*"return" + 0.013*"buy" + 0.011*"time" + 0.011*"works" + 0.010*"sent" + 0.010*"work" + 0.008*"respond" + 0.008*"get"')
(1, '0.014*"device" + 0.013*"product" + 0.012*"warranty" + 0.011*"like" + 0.010*"month" + 0.008*"support" + 0.008*"3" + 0.008*"light" + 0.008*"customer" + 0.007*"work"')
(2, '0.021*"question" + 0.017*"music" + 0.017*"ask" + 0.015*"answer" + 0.013*"time" + 0.013*"use" + 0.010*"work" + 0.010*"money" + 0.010*"like" + 0.010*"know"')
(3, '0.015*"song" + 0.014*"music" + 0.013*"play" + 0.010*"use" + 0.009*"speaker" + 0.007*"device" + 0.007*"thing" + 0.007*"understand" + 0.007*"never" + 0.006*"like"')
(4, '0.030*"work" + 0.015*"device" + 0.013*"devices" + 0.012*"buy" + 0.011*"problem" + 0.010*"support" + 0.009*"another" + 0.009*"want" + 0.008*"time" + 0.008*"money"')
(5, '0.029*"connect" + 0.018*"music" + 0.016*"work" + 0.015*"speaker" + 0.015*"play" + 0.011*"google" + 0.011*"internet" + 0.011*"terrible" + 0.010*"like" + 0.010*"use"')
(

In [75]:
# Define a topic modeling engine for repetitive use. 
# This can help test out models and identify optimal result.

class TopicModelEngine:
    
    def __init__(self, raw_data):
        self.raw_data = raw_data
        self.stop_words = set(stopwords.words('English'))
        
        
    def get_lemma(self, word):
        """
        Get the root words
        """
        try:
            lemma = wn.morphy(word)
        except:
            lemma = word
        return lemma
    
    def stop_words_extention(self, add_stop_words):
        """
        This is to extend list of stop words. Need to provide a list.  
        """
        add_words_set = set (add_stop_words)
        self.stop_words = self.stop_words.union(add_words_set)
        
        
    def tokenize(self, text):
        """
        This is to perform word tokenizing. It performs ngram (1,2)
        """
        lda_tokens = []
        text=text.strip().lower()

        # Extract word portion out of sentence and excludes spaace
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token for token in tokens if (not token.isspace())]

        for token in tokens:
            lda_tokens.append(token)

        # Extract bigrams for additional analysis
        bigrams = ngrams(tokens, 2)

        for bg, count in Counter(bigrams).most_common():
            token_string = []
            token_string = str(bg[0]) + " " + str(bg[1])
            lda_tokens.append(token_string)

        return lda_tokens
    
    def text_processing(self, sentence):
        """
        This is to break down text sentence into words, and 
        clean up the words to prepare for LDA modeling.
        """
    
        tokens = tokenize(sentence)
        tokens = [word for word in tokens if len(tokens)>5]
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [get_lemma(word) for word in tokens]

        return tokens 
    
    def LDA_preprocessing(self):
        """
        Provided with raw data, the proprocessing will automatically prepare and
        clean up the data, and generate necessay documents for LDA's use.
        """
        self.text_data = []

        for line in self.raw_data:
            tokens = text_processing(line)
            tokens=[token for token in tokens if token is not None]
            self.text_data.append(tokens)
        
        self.dictionary = corpora.Dictionary(self.text_data)
        self.corpus = [dictionary.doc2bow(text) for text in self.text_data]

    def LDA_model(self, num_of_topics, file_name_to_save_as):
        """
        This is to perform topic modeling and save the model under the name
        user provides
        """
        self.LDA_preprocessing()
        num_topics = num_of_topics
        self.lda = LdaModel(corpus=self.corpus,  num_topics=num_topics, 
                       id2word=self.dictionary, passes=15) 
        lda.save(file_name_to_save_as)
        
        return self.lda
    
    def print_topics(self, num_of_words):
        """
        Print out topics for users's decision making. 
        User can specify number of words to print out
        """
        topics = lda.print_topics(num_words=num_of_words)
        for topic in topics:
            print('Topic ' + str(topic[0]+1)+': '+str(topic[1]))

In [76]:

topic_test = TopicModelEngine(X_negative)
topic_test.stop_words_extention(additional_stop_words)
topic_test.LDA_model(10, 'test')
topic_test.print_topics(10)

Topic 1: 0.014*"music" + 0.014*"back" + 0.013*"return" + 0.013*"buy" + 0.011*"time" + 0.011*"works" + 0.010*"sent" + 0.010*"work" + 0.008*"respond" + 0.008*"get"
Topic 2: 0.014*"device" + 0.013*"product" + 0.012*"warranty" + 0.011*"like" + 0.010*"month" + 0.008*"support" + 0.008*"3" + 0.008*"light" + 0.008*"customer" + 0.007*"work"
Topic 3: 0.021*"question" + 0.017*"music" + 0.017*"ask" + 0.015*"answer" + 0.013*"time" + 0.013*"use" + 0.010*"work" + 0.010*"money" + 0.010*"like" + 0.010*"know"
Topic 4: 0.015*"song" + 0.014*"music" + 0.013*"play" + 0.010*"use" + 0.009*"speaker" + 0.007*"device" + 0.007*"thing" + 0.007*"understand" + 0.007*"never" + 0.006*"like"
Topic 5: 0.030*"work" + 0.015*"device" + 0.013*"devices" + 0.012*"buy" + 0.011*"problem" + 0.010*"support" + 0.009*"another" + 0.009*"want" + 0.008*"time" + 0.008*"money"
Topic 6: 0.029*"connect" + 0.018*"music" + 0.016*"work" + 0.015*"speaker" + 0.015*"play" + 0.011*"google" + 0.011*"internet" + 0.011*"terrible" + 0.010*"like" + 0