In [65]:
import json
import pandas as pd
import itertools
import csv
import string
import pickle
import matplotlib.pyplot as plt

from collections import Counter
import re
import spacy
import nltk
from spacy.lang.en import English
from nltk.util import ngrams
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

[nltk_data] Downloading package wordnet to /Users/xzhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/xzhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
plt.style.use('seaborn')

In [3]:
pkl_path = '/Users/xzhou/github/project_files/amazon/amazon_alexa_reviews.pkl'

with open(pkl_path, 'rb') as pkl_file:
    df = pd.read_pickle(pkl_file)

df.head()

Unnamed: 0,rating,date,review,review_by_word,word_count
2,4,2018-07-31,"Sometimes while playing a game, you can answer...","[Sometimes, while, playing, a, game, ,, you, c...",41
3,5,2018-07-31,I have had a lot of fun with this thing. My 4 ...,"[I, have, had, a, lot, of, fun, with, this, th...",38
5,5,2018-07-31,I received the echo as a gift. I needed anothe...,"[I, received, the, echo, as, a, gift, ., I, ne...",38
6,3,2018-07-31,"Without having a cellphone, I cannot use many ...","[Without, having, a, cellphone, ,, I, can, not...",84
7,5,2018-07-31,I think this is the 5th one I've purchased. I'...,"[I, think, this, is, the, 5th, one, I, 've, pu...",45


In [4]:
df.shape

(7248, 5)

In [5]:
# Assign sentiment:
# 1 represents positive sentiment, 0 represents neutral, 
# and -1 represents negative sentiment

def assign_sentiment(x):
    sentiment = ''
    if x < 3:
        sentiment = -1
    elif x == 3:
        sentiment = 0
    else:
        sentiment =1
    return sentiment

In [6]:
df['sentiment'] = df['rating'].apply(lambda x: assign_sentiment(x))

In [116]:
# Slice data based on sentiment

df_positive = df[df['sentiment']==1]
df_neutral = df[df['sentiment']==0]
df_negative = df[df['sentiment']==-1]

In [118]:
X = df['review']
X_positive = df_positive['review']
X_neutral = df_neutral['review']
X_negative = df_negative['review']

In [9]:
class KMeansEngine:
    def __init__(self, data, vectorizer_name, max_df_, min_df_, vector_max_features_):
        """
        The function supports tfidf and count vecotrizer only. Please select 'tiidf' or 'count'
        for vecotrizer. Provided with desired arguments, it will create a vector, 
        and vectorize data
        """
        self.vector_max_features = vector_max_features_
        self.vectorizer(vectorizer_name, max_df_, min_df_)
        self.vectorize_data(data)
            
    
    def vectorizer(self, vectorizer_name, max_df_, min_df_):
        """
        The function supports tfidf and count vecotrizer only. Provided with desired
        argument, return a vector
        """
        if vectorizer_name.lower() == 'tfidf':
            self.vector = TfidfVectorizer(ngram_range=(1,2),
                               stop_words='english',
                               token_pattern="\\b[a-z][a-z]+\\b",
                               lowercase=True,
                               max_df=max_df_,
                               min_df=min_df_,
                               max_features=self.vector_max_features)
            return self.vector
        
        elif vectorizer_name.lower() == 'count':
            self.vector = CountVectorizer(ngram_range=(1,2),
                               stop_words='english',
                               token_pattern="\\b[a-z][a-z]+\\b",
                               lowercase=True,
                               max_df=max_df_,
                               min_df=min_df_,
                               max_features=self.vector_max_features)
            return self.vector
        
        else:
            return 'error'
        
    
    def vectorize_data(self, data):
        """
        Return vectorized data using specificed vector
        """
        self.vectorized_data = self.vector.fit_transform(data)       
        return self.vectorized_data
    
    def dimension_reduction (self, reduction_method, reduction_max_features_):
        """
        Support svd and nmf only.
        """
        if reduction_method.lower() == 'svd':
            lsa = TruncatedSVD(n_components=reduction_max_features_)
            self.reduced_data = lsa.fit_transform(self.vectorized_data)
            
        elif reduction_method.lower() == 'nmf':
            nmf = NMF(n_components=reduction_max_features_)
            self.reduced_data = nmf.fit_transform(self.vectorized_data)
            
        return self.reduced_data
            
    
    def KMeans_iterations(self, max_num_clusters):
        """
        Perform Kmeans fit from 2 to maximum number of clusters specified,
        return sil score and SSE for each number of cluster 
        """
        self.Sil_coefs = []
        self.SSEs = []
        self.k_clusters = range(2, max_num_clusters+1)

        for k in range(2, max_num_clusters+1):
            km = KMeans(n_clusters=k, random_state=99, n_jobs=-1)
            km.fit(self.reduced_data)
            labels = km.labels_
            self.Sil_coefs.append(silhouette_score(self.reduced_data, labels, metric='euclidean'))
            self.SSEs.append(km.inertia_) 
  
            
    def draw_KMeans_plot(self):
        """
        Plot two subplots for Sil_coefficients, and SSEs
        """

        fig, ax = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)

        ax[0].plot(self.k_clusters, self.Sil_coefs)
        ax[0].set_xlabel('number of clusters')
        ax[0].set_ylabel('Silhouette coefficient')

        ax[1].plot(self.k_clusters, self.SSEs)
        ax[1].set_xlabel('number of clusters')
        ax[1].set_ylabel('SSE')
        
    def KMeans_model(self, num_clusters):
        """
        This is to build one KMeans model with given argument
        """
        km = KMeans(n_clusters=num_clusters, random_state=99, n_jobs=-1)
        km.fit(self.reduced_data)
        labels = km.labels_
        
        self.optimized_model=km
        return km      

In [None]:
# min_df=0.002, max_feature =25

# Case 1: 'tfidf', 'svd', min_df=0.002, reduced_features=25
lsa_tfidf_0002_25 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_tfidf_0002_25.dimension_reduction('svd', reduction_max_features_=25)
lsa_tfidf_0002_25.KMeans_iterations(10)


# Case 2: 'count', 'svd', min_df=0.002, reduced_features=25
lsa_cv_0002_25 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_cv_0002_25.dimension_reduction('svd', reduction_max_features_=25)
lsa_cv_0002_25.KMeans_iterations(10)


# Case 3: 'count', 'nmf', min_df=0.002, reduced_features=25
nmf_cv_0002_25 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
nmf_cv_0002_25.dimension_reduction('nmf', reduction_max_features_=25)
nmf_cv_0002_25.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0002_25.draw_KMeans_plot()
lsa_cv_0002_25.draw_KMeans_plot()
nmf_cv_0002_25.draw_KMeans_plot()

In [None]:
# min_df=0.002, max_feature =50

# Case 4: 'tfidf', 'svd', min_df=0.002, reduced_features=50
lsa_tfidf_0002_50 = KMeansEngine(X_negative, 'tfidf', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_tfidf_0002_50.dimension_reduction('svd', reduction_max_features_=50)
lsa_tfidf_0002_50.KMeans_iterations(10)


# Case 5: 'count', 'svd', min_df=0.002, reduced_features=50
lsa_cv_0002_50 = KMeansEngine(X_negative, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_cv_0002_50.dimension_reduction('svd', reduction_max_features_=50)
lsa_cv_0002_50.KMeans_iterations(10)


# Case 6: 'count', 'nmf', min_df=0.002, reduced_features=50
nmf_cv_0002_50 = KMeansEngine(X_negative, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
nmf_cv_0002_50.dimension_reduction('nmf', reduction_max_features_=50)
nmf_cv_0002_50.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0002_50.draw_KMeans_plot()
lsa_cv_0002_50.draw_KMeans_plot()
nmf_cv_0002_50.draw_KMeans_plot()

In [None]:
# min_df=0.002, max_feature =100

# Case 7: 'tfidf', 'svd', min_df=0.002, reduced_features=100
lsa_tfidf_0002_100 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_tfidf_0002_100.dimension_reduction('svd', reduction_max_features_=100)
lsa_tfidf_0002_100.KMeans_iterations(10)


# Case 8: 'count', 'svd', min_df=0.002, reduced_features=100
lsa_cv_0002_100 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_cv_0002_100.dimension_reduction('svd', reduction_max_features_=100)
lsa_cv_0002_100.KMeans_iterations(10)


# Case 9: 'count', 'nmf', min_df=0.002, reduced_features=100
nmf_cv_0002_100 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
nmf_cv_0002_100.dimension_reduction('nmf', reduction_max_features_=100)
nmf_cv_0002_100.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0002_100.draw_KMeans_plot()
lsa_cv_0002_100.draw_KMeans_plot()
nmf_cv_0002_100.draw_KMeans_plot()

In [None]:
# min_df=0.002, max_feature =200

# Case 10: 'tfidf', 'svd', min_df=0.002, reduced_features=200
lsa_tfidf_0002_200 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_tfidf_0002_200.dimension_reduction('svd', reduction_max_features_=200)
lsa_tfidf_0002_200.KMeans_iterations(10)


# Case 11: 'count', 'svd', min_df=0.002, reduced_features=200
lsa_cv_0002_200 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
lsa_cv_0002_200.dimension_reduction('svd', reduction_max_features_=200)
lsa_cv_0002_200.KMeans_iterations(10)


# Case 12: 'count', 'nmf', min_df=0.002, reduced_features=200
nmf_cv_0002_200 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.002, vector_max_features_=2000)
nmf_cv_0002_200.dimension_reduction('nmf', reduction_max_features_=200)
nmf_cv_0002_200.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0002_200.draw_KMeans_plot()
lsa_cv_0002_200.draw_KMeans_plot()
nmf_cv_0002_200.draw_KMeans_plot()


In [None]:
# min_df=0.005, max_feature =25

# Case 13: 'tfidf', 'svd', min_df=0.005, reduced_features=25
lsa_tfidf_0005_25 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_tfidf_0005_25.dimension_reduction('svd', reduction_max_features_=25)
lsa_tfidf_0005_25.KMeans_iterations(10)


# Case 14: 'count', 'svd', min_df=0.005, reduced_features=25
lsa_cv_0005_25 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_cv_0005_25.dimension_reduction('svd', reduction_max_features_=25)
lsa_cv_0005_25.KMeans_iterations(10)


# Case 15: 'count', 'nmf', min_df=0.005, reduced_features=25
nmf_cv_0005_25 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
nmf_cv_0005_25.dimension_reduction('nmf', reduction_max_features_=25)
nmf_cv_0005_25.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0005_25.draw_KMeans_plot()
lsa_cv_0005_25.draw_KMeans_plot()
nmf_cv_0005_25.draw_KMeans_plot()

In [None]:
# min_df=0.005, max_feature =50

# Case 16: 'tfidf', 'svd', min_df=0.005, reduced_features=50
lsa_tfidf_0005_50 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_tfidf_0005_50.dimension_reduction('svd', reduction_max_features_=50)
lsa_tfidf_0005_50.KMeans_iterations(10)


# Case 17: 'count', 'svd', min_df=0.005, reduced_features=50
lsa_cv_0005_50 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_cv_0005_50.dimension_reduction('svd', reduction_max_features_=50)
lsa_cv_0005_50.KMeans_iterations(10)


# Case 18: 'count', 'nmf', min_df=0.005, reduced_features=50
nmf_cv_0005_50 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
nmf_cv_0005_50.dimension_reduction('nmf', reduction_max_features_=50)
nmf_cv_0005_50.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0005_50.draw_KMeans_plot()
lsa_cv_0005_50.draw_KMeans_plot()
nmf_cv_0005_50.draw_KMeans_plot()

In [None]:
# min_df=0.005, max_feature =100

# Case 19: 'tfidf', 'svd', min_df=0.005, reduced_features=100
lsa_tfidf_0005_100 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_tfidf_0005_100.dimension_reduction('svd', reduction_max_features_=100)
lsa_tfidf_0005_100.KMeans_iterations(10)


# Case 20: 'count', 'svd', min_df=0.005, reduced_features=100
lsa_cv_0005_100 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_cv_0005_100.dimension_reduction('svd', reduction_max_features_=100)
lsa_cv_0005_100.KMeans_iterations(10)


# Case 21: 'count', 'nmf', min_df=0.005, reduced_features=100
nmf_cv_0005_100 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
nmf_cv_0005_100.dimension_reduction('nmf', reduction_max_features_=100)
nmf_cv_0005_100.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0005_100.draw_KMeans_plot()
lsa_cv_0005_100.draw_KMeans_plot()
nmf_cv_0005_100.draw_KMeans_plot()

In [None]:
# min_df=0.005, max_feature =200

# Case 22: 'tfidf', 'svd', min_df=0.005, reduced_features=200
lsa_tfidf_0005_200 = KMeansEngine(X, 'tfidf', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_tfidf_0005_200.dimension_reduction('svd', reduction_max_features_=200)
lsa_tfidf_0005_200.KMeans_iterations(10)


# Case 23: 'count', 'svd', min_df=0.005, reduced_features=200
lsa_cv_0005_200 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
lsa_cv_0005_200.dimension_reduction('svd', reduction_max_features_=200)
lsa_cv_0005_200.KMeans_iterations(10)


# Case 24: 'count', 'nmf', min_df=0.005, reduced_features=200
nmf_cv_0005_200 = KMeansEngine(X, 'count', max_df_=0.05, min_df_=0.005, vector_max_features_=2000)
nmf_cv_0005_200.dimension_reduction('nmf', reduction_max_features_=200)
nmf_cv_0005_200.KMeans_iterations(10)


# Plot for all three cases
lsa_tfidf_0005_200.draw_KMeans_plot()
lsa_cv_0005_200.draw_KMeans_plot()
nmf_cv_0005_200.draw_KMeans_plot()

In [None]:
# optimized model is nmf_cv_0002_100

km_optimized = nmf_cv_0005_25.KMeans_model(2)
labels = km_optimized.labels_

In [None]:
# Create a list of models for pickle

model_list=[lsa_tfidf_0002_50, lsa_cv_0002_50, nmf_cv_0002_50,
           lsa_tfidf_0002_100, lsa_cv_0002_100, nmf_cv_0002_100,
           lsa_tfidf_0002_200, lsa_cv_0002_200, nmf_cv_0002_200,
           lsa_tfidf_0005_50, lsa_cv_0005_50, nmf_cv_0005_50,
           lsa_tfidf_0005_100, lsa_cv_0005_100, nmf_cv_0005_100,
           lsa_tfidf_0005_200, lsa_cv_0005_200, nmf_cv_0005_200]

In [None]:
# This is to load pickle file if data is available,
# or store data when pickle file isn't available

pkl_path2 = '/Users/xzhou/github/project_files/amazon/amazon_optimized_km.pkl'

try:    
    with open(pkl_path2, 'rb') as pkl_file:
        km_optimized = pickle.load(pkl_file)
except:
    with open(pkl_path2, 'wb') as pkl_file:
        pickle.dump(km_optimized, pkl_file)

In [None]:
# This is to load pickle file if data is available,
# or store data when pickle file isn't available

pkl_path3 = '/Users/xzhou/github/project_files/amazon/amazon_km_models.pkl'
models = {}

try:    
    with open(pkl_path3, 'rb') as pkl_file:
        for i in range (18):
            models[i] = pickle.load(pkl_file)
except:
    with open(pkl_path3, 'wb') as pkl_file:
        for item in model_list:
            pickle.dump(item, pkl_file)

In [None]:
# how to save the model?
# This is to load pickle file if data is available,
# or store data when pickle file isn't available

# pkl_path3 = '/Users/xzhou/github/project_files/amazon/amazon_km_7clusters.pkl'

# try:    
#     with open(pkl_path2, 'rb') as pkl_file:
#         km_optimized = pickle.load(pkl_file)
# except:
#     with open(pkl_path2, 'wb') as pkl_file:
#         pickle.dump(km_optimized, pkl_file)

In [None]:
centroids = km_optimized.cluster_centers_

In [None]:
centroids.shape

In [None]:
km_optimized.cluster_centersers_

In [None]:
print (centroids_reversed)

In [None]:
df['label'] = labels

In [None]:
df[df['label']==0]

In [None]:
df[df['label']==1]

In [None]:
len(X)

In [None]:
cv_data=CountVectorizer().fit_transform(X)

In [None]:
X

In [None]:
cv_data.toarray()

In [None]:
count_vector.get_feature_names()

In [None]:
# Use WordNetLemmatizer to get the root word.

def get_lemma(word):
    try:
        lemma = wn.morphy(word)
    except:
        lemma = word
    return lemma

def get_lemma2(sentence):
    return [WordNetLemmatizer().lemmatize(word) for word in word_tokenize(sentence)]


In [None]:
stop_words = set(stopwords.words('English'))
other_words = 'one', 'fun'
stop_words.extend()

In [None]:
s='abc adef iiiel ll'
b=re.findall(r"\b[a-z][a-z]+\b", s)
b


In [None]:
parser = English()


def tokenize(text):
    lda_tokens = []
    tokens = parser(text.strip().lower())
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        else:
            lda_tokens.append(token.lower_)
#     for index, token in enumerate(lda_tokens):
#         try:
#             n_gram2 = str(lda_tokens[index]) + ' ' +  str(lda_tokens[index+1])
#             lda_tokens.append(n_gram2)
#         except:
#             contine
        
    return lda_tokens

In [None]:
def text_processing(sentence):
    
    words = tokenize(sentence)
    words = [word for word in words if len(words)>5]
    words = [word for word in words if word not in stop_words]
    words = [get_lemma(word) for word in words]
    
    return words    

In [None]:
text_data = []

for line in X:
    tokens = text_processing(line)
    tokens=[token for token in tokens if token is not None]
    text_data.append(tokens)

In [None]:
text_data

In [None]:
dictionary = corpora.Dictionary(text_data)
#print(dictionary)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [None]:
num_topics = 5

lda = LdaModel(corpus=corpus,  num_topics=num_topics, id2word=dictionary, passes=5) #
lda.save('test')


In [None]:
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [10]:
# Use WordNetLemmatizer to get the root word.

def get_lemma(word):
    try:
        lemma = wn.morphy(word)
    except:
        lemma = word
    return lemma

def get_lemma2(sentence):
    return [WordNetLemmatizer().lemmatize(word) for word in word_tokenize(sentence)]


In [99]:
parser = English()

from nltk.tokenize import RegexpTokenizer



def tokenize(text):
    lda_tokens = []
    text=text.strip().lower()
    
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [token for token in tokens if (not token.isspace())]
    
    
   # tokens = nltk.word_tokenize(text)
    #tokens = [token for token in tokens if token not in string.punctuation]
    bigrams = ngrams(tokens, 2)
    
    for token in tokens:
        lda_tokens.append(token)
    
    for bg, count in Counter(bigrams).most_common():
        token_string = []
        token_string = str(bg[0]) + " " + str(bg[1])
        lda_tokens.append(token_string)

    
   # tokens = parser(text.strip().lower())
#     for token in tokens:
#         if token.orth_.isspace():
#             continue
#         elif token.like_url:
#             lda_tokens.append('URL')
#         else:
#             lda_tokens.append(token.lower_)
#     for index, token in enumerate(lda_tokens):
#         try:
#             n_gram2 = str(lda_tokens[index]) + ' ' +  str(lda_tokens[index+1])
#             lda_tokens.append(n_gram2)
#         except:
#             contine
    #print(lda_tokens)
    return lda_tokens

In [100]:
def text_processing(sentence):
    
    words = tokenize(sentence)
    words = [word for word in words if len(words)>5]
    words = [word for word in words if word not in stop_words]
    words = [get_lemma(word) for word in words]
    
    return words    

In [120]:
text_data = []

for line in X_negative:
    tokens = text_processing(line)
    tokens=[token for token in tokens if token is not None]
    text_data.append(tokens)
text_data

['it', 's', 'like', 'siri', 'in', 'fact', 'siri', 'answers', 'more', 'accurately', 'then', 'alexa', 'i', 'don', 't', 'see', 'a', 'real', 'need', 'for', 'it', 'in', 'my', 'household', 'though', 'it', 'was', 'a', 'good', 'bargain', 'on', 'prime', 'day', 'deals', 'it s', 's like', 'like siri', 'siri in', 'in fact', 'fact siri', 'siri answers', 'answers more', 'more accurately', 'accurately then', 'then alexa', 'alexa i', 'i don', 'don t', 't see', 'see a', 'a real', 'real need', 'need for', 'for it', 'it in', 'in my', 'my household', 'household though', 'though it', 'it was', 'was a', 'a good', 'good bargain', 'bargain on', 'on prime', 'prime day', 'day deals']
['sound', 'is', 'terrible', 'if', 'u', 'want', 'good', 'music', 'too', 'get', 'a', 'bose', 'sound is', 'is terrible', 'terrible if', 'if u', 'u want', 'want good', 'good music', 'music too', 'too get', 'get a', 'a bose']
['stopped', 'working', 'after', '2', 'weeks', 'didn', 't', 'follow', 'commands', 'really', 'fun', 'when', 'it', 

['nope', 'still', 'a', 'lot', 'to', 'be', 'improved', 'for', 'most', 'of', 'the', 'things', 'we', 'ask', 'it', 'says', 'hmmmm', 'i', 'dont', 'know', 'that', 'nope still', 'still a', 'a lot', 'lot to', 'to be', 'be improved', 'improved for', 'for most', 'most of', 'of the', 'the things', 'things we', 'we ask', 'ask it', 'it says', 'says hmmmm', 'hmmmm i', 'i dont', 'dont know', 'know that']
['i', 'reached', 'out', 'to', 'amazon', 'because', 'the', 'device', 'wanted', 'to', 'sync', 'my', 'phone', 'number', 'but', 'it', 'would', 'not', 'allow', 'because', 'it', 'said', 'my', 'number', 'was', 'already', 'in', 'use', 'customer', 'service', 'couldn', 't', 'help', 'they', 'basically', 'told', 'me', 'to', 'contact', 'sprint', 'to', 'assist', 'so', 'echo', 'does', 'the', 'bare', 'minimum', 'without', 'access', 'to', 'my', 'phone', 'for', 'set', 'up', 'so', 'its', 'kind', 'of', 'pointless', 'to', 'have', 'and', 'pay', 'for', 'my phone', 'i reached', 'reached out', 'out to', 'to amazon', 'amazon 

['it', 'was', 'not', 'too', 'hard', 'to', 'set', 'up', 'but', 'it', 'only', 'worked', 'for', 'about', '1', 'hour', 'after', 'answering', 'a', 'few', 'questions', 'about', 'the', 'weather', 'and', 'testing', 'the', 'timer', 'function', 'it', 'stopped', 'responding', 'i', 'restarted', 'rebooted', 'reset', 'the', 'echo', 'it', 'would', 'come', 'on', 'and', 'say', 'hello', 'but', 'would', 'not', 'respond', 'i', 'tried', 'all', 'of', 'the', 'different', 'names', 'for', 'the', 'echo', 'pressed', 'the', 'button', 'to', 'get', 'a', 'direct', 'response', 'from', 'echo', 'but', 'would', 'not', 'respond', 'seems', 'like', 'the', 'microphone', 'failed', 'about', '1', 'hour', 'of', 'being', 'set', 'up', 'alexa', 'was', 'kind', 'of', 'cool', 'when', 'it', 'worked', 'but', 'failing', 'after', '1', 'hour', 'of', 'use', 'seems', 'pretty', 'poor', 'i', 'sent', 'it', 'back', 'for', 'a', 'refund', 'maybe', 'next', 'year', 'they', 'will', 'improve', 'the', 'product', '1 hour', 'set up', 'about 1', 'the ech

['speakers', 'suck', 'it', 'is', 'not', 'wireless', 'essentially', 'the', 'flop', 'alexa', 'and', 'when', 'you', 'buy', 'portable', 'speakers', 'for', 'it', 'it', 's', 'almost', 'the', 'same', 'price', 'as', 'just', 'buying', 'one', 'of', 'those', 'speakers suck', 'suck it', 'it is', 'is not', 'not wireless', 'wireless essentially', 'essentially the', 'the flop', 'flop alexa', 'alexa and', 'and when', 'when you', 'you buy', 'buy portable', 'portable speakers', 'speakers for', 'for it', 'it it', 'it s', 's almost', 'almost the', 'the same', 'same price', 'price as', 'as just', 'just buying', 'buying one', 'one of', 'of those']
['very', 'hard', 'to', 'configure', 'limited', 'use', 'claim', 'to', 'control', 'august', 'lock', 'and', 'got', 'error', 'while', 'trying', 'very hard', 'hard to', 'to configure', 'configure limited', 'limited use', 'use claim', 'claim to', 'to control', 'control august', 'august lock', 'lock and', 'and got', 'got error', 'error while', 'while trying']
['still', '

[['like',
  'fact',
  'answer',
  'accurately',
  'see',
  'real',
  'need',
  'household',
  'though',
  'good',
  'bargain',
  'prime',
  'day',
  'deal'],
 ['sound', 'terrible', 'u', 'want', 'good', 'music', 'get', 'bose'],
 ['stop',
  'working',
  '2',
  'week',
  'follow',
  'command',
  'really',
  'fun',
  'working'],
 [],
 ['really',
  'disappoint',
  'plug',
  'wall',
  'socket',
  'time',
  'fault',
  'check',
  'make',
  'assumption',
  'company',
  'technologically',
  'advance',
  'amazon',
  'sell',
  'product',
  'rechargeable',
  'battery',
  'return',
  'apple',
  'music',
  'boom',
  'speaker',
  'give',
  'flexibility'],
 ['get',
  'great',
  'sound',
  'bass',
  'work',
  'time',
  'still',
  'hot',
  'miss',
  'recognize',
  'things'],
 ['super',
  'impress',
  'prime',
  'lapse',
  'play',
  'smart',
  'enough',
  'differentiate',
  'account',
  'use',
  'either',
  'randomly',
  'speak',
  'nobody',
  'talking',
  'today',
  'unplug',
  'sure',
  'ever',
  'use']

In [121]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [122]:
num_topics = 5

lda = LdaModel(corpus=corpus,  num_topics=num_topics, id2word=dictionary, passes=15) 
lda.save('test')


In [123]:
topics = lda.print_topics(num_words=8)
for topic in topics:
    print(topic)

(0, '0.016*"get" + 0.015*"echo" + 0.014*"ask" + 0.014*"work" + 0.013*"question" + 0.013*"buy" + 0.012*"one" + 0.012*"music"')
(1, '0.027*"sound" + 0.017*"speaker" + 0.016*"work" + 0.014*"use" + 0.013*"music" + 0.012*"get" + 0.011*"good" + 0.011*"like"')
(2, '0.017*"echo" + 0.017*"dot" + 0.014*"amazon" + 0.012*"device" + 0.011*"work" + 0.011*"get" + 0.008*"say" + 0.008*"time"')
(3, '0.025*"echo" + 0.023*"amazon" + 0.018*"music" + 0.012*"dot" + 0.011*"play" + 0.011*"get" + 0.010*"use" + 0.009*"money"')
(4, '0.014*"amazon" + 0.013*"product" + 0.012*"echo" + 0.011*"dot" + 0.011*"try" + 0.011*"34" + 0.010*"connect" + 0.009*"get"')
