In [None]:
import pandas as pd
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the tab-separated customer review file into a DataFrame.
cust_rev = pd.read_csv('customer_review.txt', sep='\t')

In [None]:
# Preview the first five raw review texts before vectorizing them.
cust_rev['review'].head(n=5)

In [None]:

# Build a TF-IDF document-term matrix from the review text.
# max_df=0.9 drops terms that occur in more than 90% of the reviews and
# max_features=100 caps the vocabulary size at 100 terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, max_features=100, use_idf=True)

tfidf = tfidf_vectorizer.fit_transform(cust_rev['review'])

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names())

# Dense view of the matrix (one column per vocabulary term) for inspection.
doc_term_matrix_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_feature_names)
doc_term_matrix_tfidf.head()

In [None]:
# Factorize the TF-IDF matrix into 7 topics with a regularized NMF.
NMF_ALPHA = 0.1
n_docs, n_terms = tfidf.shape
try:
    # scikit-learn >= 1.0 replaces `alpha` (removed in 1.2) with
    # `alpha_W` / `alpha_H`, whose penalties are additionally multiplied by
    # n_features / n_samples. Dividing by those dimensions keeps the penalty
    # strength of the original unscaled `alpha=.1`.
    nmf_estimator = NMF(
        n_components=7,
        random_state=0,
        init='nndsvd',
        alpha_W=NMF_ALPHA / n_terms,
        alpha_H=NMF_ALPHA / n_docs,
    )
except TypeError:
    # Older scikit-learn releases only know the single `alpha` keyword.
    nmf_estimator = NMF(n_components=7, random_state=0, init='nndsvd', alpha=NMF_ALPHA)

nmf = nmf_estimator.fit(tfidf)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; each row is treated as one topic
        and must support ``argsort()``.
    feature_names : sequence
        Terms, indexable by the integer positions found in a topic row.
    no_top_words : int
        How many terms to print per topic, strongest first.

    Returns
    -------
    None
        For each topic, a ``Topic <index>:`` header line and one line of
        space-separated terms are written to stdout.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it before keeping the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_idx] for term_idx in strongest_first]
        print(" ".join(top_terms))

In [None]:
# Show the seven strongest terms for each NMF topic.
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=7)

In [None]:
# Human-readable labels for the NMF topics, keyed by topic index (0-6).
nmf_remap = dict(enumerate((
    'Freshness and Quality',
    'Good customer Service',
    'Delivery Satisfaction',
    'Expensive products',
    'Easy Usage',
    'Local produce',
    'No English Version',
)))

In [None]:
# Topic weights produced by the fitted NMF model for every review.
nmf_topic_values = nmf.transform(tfidf)

# Take the index of the largest weight in each row and translate it to its
# label in a single assignment.
# NOTE(review): a row whose weights are all zero gets index 0 from argmax and
# is therefore labelled with topic 0 - confirm that is acceptable.
dominant_topic = pd.Series(nmf_topic_values.argmax(axis=1), index=cust_rev.index)
cust_rev['nmf_topics'] = dominant_topic.map(nmf_remap)

In [None]:
# Number of reviews per topic label: `nmf_x` is ordered by frequency,
# `nmf_y` holds the same counts ordered alphabetically by label.
nmf_x = cust_rev['nmf_topics'].value_counts()
nmf_y = nmf_x.sort_index()

plt.figure(figsize=(50, 30))
# Bar lengths and bar labels must come from the SAME Series. Pairing the
# frequency-ordered counts of `nmf_x` with the alphabetically ordered labels
# of `nmf_y` attaches counts to the wrong topics. x/y are passed by keyword
# because seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=nmf_y.values, y=nmf_y.index)
plt.title("NMF Topic Distribution", fontsize=50)
plt.ylabel('Customer Review Topics', fontsize=50)
plt.yticks(fontsize=40)
plt.xlabel('Frequency', fontsize=50)
plt.xticks(fontsize=40)
plt.show()

In [None]:
## Trying LDA as an alternative topic model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts (not TF-IDF weights) are what gets fed to the LDA model below.
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=1, max_features=5000)
# Every review is cast to a unicode string before being vectorized.
tf = tf_vectorizer.fit_transform(cust_rev['review'].values.astype('U'))

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = list(tf_vectorizer.get_feature_names())

# Dense count matrix kept for inspection. A bare expression in the middle of
# a cell is never rendered, so display it from its own cell when needed.
doc_term_matrix = pd.DataFrame(tf.toarray(), columns=tf_feature_names)

lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    max_iter=500,
    random_state=0,
).fit(tf)
display_topics(lda_model, tf_feature_names, no_top_words=9)

# Labels for the ten LDA topics, keyed by topic index.
# NOTE(review): topic 5 has an empty label, topic 9 is a placeholder and
# topics 0 and 8 share the same label - confirm these are intended.
lda_remap = {0: 'Customer Service', 1: 'Local', 2: 'Communication', 3: 'Price',
             4: 'Great Service', 5: '', 6: 'Care about Employees',
             7: 'Option for English', 8: 'Customer Service', 9: 'Unknown1'}