In [1]:
#importing libraries
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#### Loading, cleaning and preparing the data

In [4]:
dfm = pd.read_csv('Hotel_Reviews_2.csv')

In [5]:
dfm = dfm.drop_duplicates()

In [6]:
dfm_sub = dfm[['Negative_Review','neg_count','Tags']]

In [7]:
dfm_sub.head()

Unnamed: 0,Negative_Review,neg_count,Tags
0,i am so angry that i made this post available...,0,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
1,no negative,1,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
2,rooms are nice but for elderly a bit difficul...,0,"[' Leisure trip ', ' Family with young childre..."
3,my room was dirty and i was afraid to walk ba...,0,"[' Leisure trip ', ' Solo traveler ', ' Duplex..."
4,you when i booked with your company on line y...,0,"[' Leisure trip ', ' Couple ', ' Suite ', ' St..."


In [14]:
dfm_sub.rename(columns={'Negative_Review':'ReviewText',
                          'neg_count':'labels','Tags':'tags'}, 
                 inplace=True)

In [15]:
df = dfm_sub


In [16]:
df.head()

Unnamed: 0,ReviewText,labels,tags
0,i am so angry that i made this post available...,0,"' Leisure trip ', ' Couple ', ' Duplex Double ..."
1,no negative,1,"' Leisure trip ', ' Couple ', ' Duplex Double ..."
2,rooms are nice but for elderly a bit difficul...,0,"' Leisure trip ', ' Family with young children..."
3,my room was dirty and i was afraid to walk ba...,0,"' Leisure trip ', ' Solo traveler ', ' Duplex ..."
4,you when i booked with your company on line y...,0,"' Leisure trip ', ' Couple ', ' Suite ', ' Sta..."


In [17]:
df['tags'] =  df['tags'].apply(lambda x: x.replace('[','').replace(']','')) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df['CombinedText'] = df['tags'].map(str) + df['ReviewText'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
df['ReviewTextLower'] = df.ReviewText
df['ReviewTextLower'] = df.ReviewTextLower.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Preparing data for modelling
count vectorizer and tfidf

In [20]:
count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [21]:
# def functions for topic modelings
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def display_topics2(model, feature_names, no_top_words=10, topic_names = None):
    for index, topic in enumerate(model.components_):
        if not topic_names or not topic_names[index]:
            print(f"\nTopic {index}")
        else:
            print(f"\nTopic {topic_names[index]}:")
        msg = ", ".join([f'{feature_names[i]} ({topic[i]:6.4f})' 
                             for i in topic.argsort()[:-no_top_words-1:-1]])
        print(msg)

### Instantiating and Fitting the model

In [30]:
n_comp = 50
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

### Fitting the models

In [24]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),3)


Topic 0
room small (0.9968), small room (0.0391), size room (0.0170)

Topic 1
small room (0.9959), room size (0.0212), bit small (0.0171)

Topic 2
rooms small (0.9959), small rooms (0.0432), room service (0.0290)

Topic 3
didn like (0.9758), room service (0.1738), air conditioning (0.0383)

Topic 4
room service (0.9556), air conditioning (0.1053), breakfast expensive (0.1033)

Topic 5
breakfast expensive (0.9845), air conditioning (0.0950), room bit (0.0270)

Topic 6
air conditioning (0.9636), didn work (0.0633), conditioning room (0.0570)

Topic 7
room bit (0.6182), bit small (0.6122), room bit small (0.4342)

Topic 8
small rooms (0.9944), breakfast included (0.0307), wi fi (0.0220)

Topic 9
breakfast included (0.9486), included price (0.1777), breakfast included price (0.1687)

Topic 10
wi fi (0.9765), free wi (0.0712), free wi fi (0.0711)

Topic 11
room little (0.7040), little small (0.5153), room little small (0.3999)

Topic 12
tea coffee (0.7136), facilities room (0.2712), making

In [26]:
display_topics2(lsa_cv, count_vectorizer.get_feature_names(),8)


Topic 0
room (0.8161), hotel (0.2784), small (0.1422), breakfast (0.1244), staff (0.1167), bed (0.0959), rooms (0.0888), night (0.0860)

Topic 1
hotel (0.7201), breakfast (0.2447), staff (0.1783), negative (0.0949), rooms (0.0871), stay (0.0748), like (0.0705), didn (0.0673)

Topic 2
negative (0.9955), room (0.0415), small (0.0055), room small (0.0034), small room (0.0019), size (0.0015), double (0.0013), negative say (0.0013)

Topic 3
breakfast (0.8376), staff (0.1155), expensive (0.0883), food (0.0695), coffee (0.0686), poor (0.0666), included (0.0642), good (0.0634)

Topic 4
small (0.7524), rooms (0.3561), bed (0.1837), bathroom (0.1742), room small (0.1340), bit (0.1173), shower (0.0916), little (0.0775)

Topic 5
hotel (0.3693), breakfast (0.3382), room (0.1962), expensive (0.0456), price (0.0373), included (0.0330), star (0.0318), star hotel (0.0277)

Topic 6
bed (0.3865), didn (0.2863), shower (0.2700), bathroom (0.2308), night (0.1683), like (0.1592), water (0.1293), work (0.10

In [31]:
display_topics2(nmf_tfidf, tfidf_vectorizer.get_feature_names(),10)


Topic 0
room small (7.3859), room small bed (0.1085), small bed (0.1036), size room small (0.0979), room small bathroom (0.0848), single room small (0.0757), single room (0.0738), room small price (0.0700), small room small (0.0682), breakfast room small (0.0678)

Topic 1
small room (6.6174), small room small (0.1114), small room bathroom (0.0959), room bed (0.0918), room small room (0.0885), room bathroom (0.0754), room price (0.0690), extremely small room (0.0671), room view (0.0618), room noisy (0.0617)

Topic 2
rooms small (6.1962), small price (0.0736), standard rooms (0.0720), single rooms (0.0616), small breakfast (0.0572), hotel rooms (0.0553), size rooms (0.0475), small hotel (0.0393), small people (0.0387), small expected (0.0385)

Topic 3
didn like (6.1054), thing didn like (0.2126), thing didn (0.2113), wasn didn like (0.1996), wasn didn (0.1991), didn like fact (0.1888), like fact (0.1831), didn like hotel (0.1464), like hotel (0.1428), think didn like (0.1298)

Topic 4
r

In [29]:
display_topics2(nmf_cv, count_vectorizer.get_feature_names(),5)


Topic 0
room (30.5580), view (0.7726), booked (0.7221), given (0.6206), double (0.5565)

Topic 1
hotel (21.6359), star (1.7160), star hotel (1.2118), like (0.7551), old (0.6331)

Topic 2
negative (18.9104), say (0.0378), negative say (0.0246), thing (0.0181), really (0.0167)

Topic 3
breakfast (17.0821), expensive (1.6968), included (1.3368), price (1.2971), breakfast included (0.9235)

Topic 4
small (19.6331), room small (4.0953), small room (2.1133), rooms small (0.9522), bit small (0.7507)

Topic 5
staff (22.2124), reception (2.0348), rude (2.0173), friendly (1.8064), helpful (1.7068)

Topic 6
shower (19.3878), bath (3.0340), toilet (1.3759), head (1.1778), shower head (0.9826)

Topic 7
bed (14.1502), double (3.0656), double bed (1.8034), beds (1.5589), single (1.3951)

Topic 8
rooms (15.5501), rooms small (1.2128), old (0.7545), small rooms (0.6621), need (0.5547)

Topic 9
didn (14.0458), like (6.2356), work (2.9716), didn work (2.4742), didn like (2.4111)

Topic 10
bit (16.2525),

In [51]:
# initialize vectorizers
count_vectorizer2 = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer2 = TfidfVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

# transfomred my text data using vectorizers
cv_data = count_vectorizer2.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer2.fit_transform(df.ReviewTextLower)

In [52]:
# initialized reducers with dimensions
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

# transformed my vectorizers data using reducers
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [53]:
# initialize standardscaler
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# transform my reducer data using standardscaler
lsa_tfidf_data_sclaed = SS.fit_transform(lsa_tfidf_data)
lsa_cv_data_sclaed = SS.fit_transform(lsa_cv_data)
nmf_tfidf_data_scaled = SS.fit_transform(nmf_tfidf_data)
nmf_cv_data_scaled = SS.fit_transform(nmf_cv_data)

In [55]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),8)


Topic 0
perfect stay (1.0000), little bit (0.0006), helpful staff excellent (0.0003), spacious breakfast (0.0002), good reception (0.0002), feel comfortable (0.0001), handy location (0.0001), station tram (0.0001)

Topic 1
little bit (0.9169), spacious breakfast (0.1482), great customer (0.1384), good reception (0.1380), great location excellent (0.0994), gluten free (0.0897), good thing (0.0814), extremely nice (0.0741)

Topic 2
spacious breakfast (0.4411), gluten free (0.3037), handy location (0.2395), good reception (0.2215), room best (0.2069), great customer (0.1918), breakfast helpful staff (0.1894), helpful staff excellent (0.1693)

Topic 3
good reception (0.6384), good thing (0.2696), breakfast helpful staff (0.2567), location business (0.1499), room best (0.1361), breakfast really good (0.1114), good selection (0.1107), beds excellent (0.0986)

Topic 4
great customer (0.6215), great location excellent (0.3792), room best (0.1615), location center (0.1388), beds excellent (0.1

In [57]:
#lsa = TruncatedSVD(n_components=50)
#lsa_tfidf_data = lsa.fit_transform(tfidf_data)
terms = tfidf_vectorizer.get_feature_names()

for cv in range(0,10):
    word_list=[]
    print("Topic%d:"% cv)
    for j in lsa_tfidf.components_.argsort()[cv, -16:-1]:
        word_list.append(terms[j])
    print(word_list)

Topic0:
['metro easy', 'bathroom really', 'clean room comfortable', 'breakfast helpful staff', 'extremely nice', 'great customer', 'gluten free', 'room best', 'station tram', 'handy location', 'feel comfortable', 'good reception', 'spacious breakfast', 'helpful staff excellent', 'little bit']
Topic1:
['location business', 'metro easy', 'good clean', 'clean room comfortable', 'breakfast helpful staff', 'helpful staff excellent', 'handy location', 'room best', 'extremely nice', 'good thing', 'gluten free', 'great location excellent', 'good reception', 'great customer', 'spacious breakfast']
Topic2:
['beds excellent', 'good breakfast friendly', 'extremely nice', 'comfortable bed staff', 'staff awesome', 'metro easy', 'clean room comfortable', 'good clean', 'helpful staff excellent', 'breakfast helpful staff', 'great customer', 'room best', 'good reception', 'handy location', 'gluten free']
Topic3:
['hot chocolate', 'comfortable large', 'staff beautiful', 'room decent size', 'helpful staff

IndexError: index 5 is out of bounds for axis 0 with size 5

In [58]:

SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(lsa_tfidf_data_sclaed)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(lsa_tfidf_data_sclaed, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(lsa_tfidf_data_sclaed)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:

plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(lsa_tfidf_data_sclaed)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(lsa_tfidf_data_sclaed[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
kmeans.labels_

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==1]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==2]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(nmf_tfidf_data_scaled)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(nmf_tfidf_data_scaled, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(nmf_tfidf_data_scaled)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:

plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
# running cluster
k = 6
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==3]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
# initialize vectorizers
count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

# transfomred my text data using vectorizers
cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [None]:
# initialized reducers with dimensions
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

# transformed my vectorizers data using reducers
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [None]:
# initialize standardscaler
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# transform my reducer data using standardscaler
lsa_tfidf_data_sclaed = SS.fit_transform(lsa_tfidf_data)
lsa_cv_data_sclaed = SS.fit_transform(lsa_cv_data)
nmf_tfidf_data_scaled = SS.fit_transform(nmf_tfidf_data)
nmf_cv_data_scaled = SS.fit_transform(nmf_cv_data)

In [None]:
SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(nmf_tfidf_data_scaled)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(nmf_tfidf_data_scaled, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(nmf_tfidf_data_scaled)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:
plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300, random_state=42)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")


In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")