### DATA CLEANING AND PREPARATION

In [4]:
#getting libraries
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [5]:
dfm = pd.read_csv('Hotel_Reviews_2.csv')

In [6]:
dfm = dfm.drop_duplicates()

In [7]:
dfm_sub = dfm[['Positive_Review','pos_count','Tags']]

In [8]:
dfm_sub.head()

Unnamed: 0,Positive_Review,pos_count,Tags
0,only the park outside of the hotel was beauti...,1,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
1,no real complaints the hotel was great great ...,1,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
2,location was good and staff were ok it is cut...,1,"[' Leisure trip ', ' Family with young childre..."
3,great location in nice surroundings the bar a...,1,"[' Leisure trip ', ' Solo traveler ', ' Duplex..."
4,amazing location and building romantic setting,1,"[' Leisure trip ', ' Couple ', ' Suite ', ' St..."


In [9]:
dfm_sub.rename(columns={'Positive_Review':'ReviewText',
                          'pos_count':'labels','Tags':'tags'}, 
                 inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [10]:
df = dfm_sub


In [11]:
df.head()

Unnamed: 0,ReviewText,labels,tags
0,only the park outside of the hotel was beauti...,1,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
1,no real complaints the hotel was great great ...,1,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
2,location was good and staff were ok it is cut...,1,"[' Leisure trip ', ' Family with young childre..."
3,great location in nice surroundings the bar a...,1,"[' Leisure trip ', ' Solo traveler ', ' Duplex..."
4,amazing location and building romantic setting,1,"[' Leisure trip ', ' Couple ', ' Suite ', ' St..."


In [12]:
df['tags'] =  df['tags'].apply(lambda x: x.replace('[','').replace(']','')) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
df['CombinedText'] = df['tags'].map(str) + df['ReviewText'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
df['ReviewTextLower'] = df.ReviewText
df['ReviewTextLower'] = df.ReviewTextLower.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Preparing data for modelling using count vectorizer and tf-idf

In [15]:
count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [16]:
# def functions for topic modelings
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def display_topics2(model, feature_names, no_top_words=10, topic_names = None):
    for index, topic in enumerate(model.components_):
        if not topic_names or not topic_names[index]:
            print(f"\nTopic {index}")
        else:
            print(f"\nTopic {topic_names[index]}:")
        msg = ", ".join([f'{feature_names[i]} ({topic[i]:6.4f})' 
                             for i in topic.argsort()[:-no_top_words-1:-1]])
        print(msg)

### Fitting the models to the data

In [17]:
n_comp = 20
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [19]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),8)


Topic 0
great location (0.8223), friendly staff (0.3034), staff friendly (0.2000), friendly helpful (0.1818), good location (0.1756), helpful staff (0.1367), staff friendly helpful (0.1131), location friendly (0.1026)

Topic 1
good location (0.5221), staff friendly (0.4865), friendly helpful (0.3583), staff friendly helpful (0.2701), helpful staff (0.1558), friendly helpful staff (0.1153), friendly staff (0.0802), location good (0.0541)

Topic 2
good location (0.6311), friendly staff (0.4327), location friendly staff (0.0961), location friendly (0.0880), staff good (0.0505), friendly staff good (0.0382), staff good location (0.0360), good location friendly (0.0359)

Topic 3
friendly staff (0.7677), location friendly staff (0.1412), location friendly (0.1393), staff friendly (0.0847), excellent location (0.0735), friendly helpful (0.0552), staff friendly helpful (0.0469), helpful friendly staff (0.0438)

Topic 4
location good (0.6755), staff helpful (0.5661), helpful friendly (0.1199),

In [20]:
display_topics2(lsa_cv, count_vectorizer.get_feature_names(),10)


Topic 0
staff (0.4331), location (0.3608), room (0.3380), hotel (0.3263), good (0.2494), great (0.2419), friendly (0.2032), helpful (0.1828), breakfast (0.1780), nice (0.1487)

Topic 1
room (0.5687), hotel (0.3149), good (0.1280), nice (0.1274), bed (0.1060), comfortable (0.1044), clean (0.0932), breakfast (0.0647), bathroom (0.0551), station (0.0393)

Topic 2
staff (0.4420), friendly (0.2500), helpful (0.2391), hotel (0.1501), friendly helpful (0.0981), staff friendly (0.0961), room (0.0862), staff helpful (0.0528), staff friendly helpful (0.0505), friendly staff (0.0474)

Topic 3
hotel (0.7402), great (0.2305), stay (0.0684), great location (0.0651), location (0.0595), walk (0.0478), rooms (0.0469), station (0.0461), hotel great (0.0412), just (0.0399)

Topic 4
good (0.6099), hotel (0.2648), breakfast (0.1552), good location (0.1093), breakfast good (0.0610), good breakfast (0.0586), staff (0.0512), friendly (0.0503), location good (0.0502), rooms (0.0436)

Topic 5
great (0.6329), b

In [21]:
display_topics2(nmf_tfidf, tfidf_vectorizer.get_feature_names(),10)


Topic 0
great location (9.8165), staff great (0.5296), staff great location (0.4866), hotel great (0.4673), hotel great location (0.4615), great location friendly (0.2810), great location great (0.2539), great location close (0.2347), great location good (0.2233), great location staff (0.2192)

Topic 1
staff friendly (7.5696), staff friendly helpful (3.8216), friendly helpful (3.3562), location staff friendly (0.2897), hotel staff friendly (0.2656), hotel staff (0.2510), friendly helpful room (0.2484), helpful room (0.2322), clean staff friendly (0.2196), clean staff (0.2005)

Topic 2
good location (7.8591), staff good location (0.3408), staff good (0.3180), hotel good location (0.2913), hotel good (0.2889), good location nice (0.2355), good location close (0.2174), good location good (0.2161), good location friendly (0.2146), location nice (0.2145)

Topic 3
friendly staff (7.1566), helpful friendly staff (0.4342), friendly staff good (0.3740), helpful friendly (0.3637), friendly staf

In [50]:
display_topics2(nmf_cv, count_vectorizer.get_feature_names(),3)


Topic 0
staff (42.1521), welcoming (1.0889), attentive (1.0682)

Topic 1
room (20.4011), size (0.7430), small (0.6900)

Topic 2
location (19.9206), excellent location (0.8731), location great (0.8662)

Topic 3
hotel (21.1837), hotel staff (0.7900), recommend (0.7165)

Topic 4
good (20.9916), location good (2.0598), good breakfast (1.5613)

Topic 5
great (21.4341), location great (2.8553), great staff (1.5603)

Topic 6
nice (20.0354), staff nice (1.3761), room nice (1.1939)

Topic 7
breakfast (21.2371), breakfast good (1.6567), good breakfast (1.5585)

Topic 8
comfortable (18.7343), beds (2.1369), comfortable room (1.8000)

Topic 9
friendly (14.8236), friendly staff (9.8613), location friendly (1.7178)

Topic 10
helpful (26.3814), staff helpful (11.9427), helpful friendly (3.9155)

Topic 11
clean (19.2916), room clean (2.3470), clean comfortable (1.5918)

Topic 12
excellent (30.5553), excellent location (5.5557), location excellent (3.4969)

Topic 13
walk (19.9751), minutes (5.3970), m

In [51]:
# initialize vectorizers
count_vectorizer2 = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer2 = TfidfVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

# transfomred my text data using vectorizers
cv_data = count_vectorizer2.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer2.fit_transform(df.ReviewTextLower)

In [52]:
# initialized reducers with dimensions
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

# transformed my vectorizers data using reducers
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [53]:
# initialize standardscaler
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# transform my reducer data using standardscaler
lsa_tfidf_data_sclaed = SS.fit_transform(lsa_tfidf_data)
lsa_cv_data_sclaed = SS.fit_transform(lsa_cv_data)
nmf_tfidf_data_scaled = SS.fit_transform(nmf_tfidf_data)
nmf_cv_data_scaled = SS.fit_transform(nmf_cv_data)

In [55]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),8)


Topic 0
perfect stay (1.0000), little bit (0.0006), helpful staff excellent (0.0003), spacious breakfast (0.0002), good reception (0.0002), feel comfortable (0.0001), handy location (0.0001), station tram (0.0001)

Topic 1
little bit (0.9169), spacious breakfast (0.1482), great customer (0.1384), good reception (0.1380), great location excellent (0.0994), gluten free (0.0897), good thing (0.0814), extremely nice (0.0741)

Topic 2
spacious breakfast (0.4411), gluten free (0.3037), handy location (0.2395), good reception (0.2215), room best (0.2069), great customer (0.1918), breakfast helpful staff (0.1894), helpful staff excellent (0.1693)

Topic 3
good reception (0.6384), good thing (0.2696), breakfast helpful staff (0.2567), location business (0.1499), room best (0.1361), breakfast really good (0.1114), good selection (0.1107), beds excellent (0.0986)

Topic 4
great customer (0.6215), great location excellent (0.3792), room best (0.1615), location center (0.1388), beds excellent (0.1

In [57]:
#lsa = TruncatedSVD(n_components=50)
#lsa_tfidf_data = lsa.fit_transform(tfidf_data)
terms = tfidf_vectorizer.get_feature_names()

for cv in range(0,10):
    word_list=[]
    print("Topic%d:"% cv)
    for j in lsa_tfidf.components_.argsort()[cv, -16:-1]:
        word_list.append(terms[j])
    print(word_list)

Topic0:
['metro easy', 'bathroom really', 'clean room comfortable', 'breakfast helpful staff', 'extremely nice', 'great customer', 'gluten free', 'room best', 'station tram', 'handy location', 'feel comfortable', 'good reception', 'spacious breakfast', 'helpful staff excellent', 'little bit']
Topic1:
['location business', 'metro easy', 'good clean', 'clean room comfortable', 'breakfast helpful staff', 'helpful staff excellent', 'handy location', 'room best', 'extremely nice', 'good thing', 'gluten free', 'great location excellent', 'good reception', 'great customer', 'spacious breakfast']
Topic2:
['beds excellent', 'good breakfast friendly', 'extremely nice', 'comfortable bed staff', 'staff awesome', 'metro easy', 'clean room comfortable', 'good clean', 'helpful staff excellent', 'breakfast helpful staff', 'great customer', 'room best', 'good reception', 'handy location', 'gluten free']
Topic3:
['hot chocolate', 'comfortable large', 'staff beautiful', 'room decent size', 'helpful staff

IndexError: index 5 is out of bounds for axis 0 with size 5

In [58]:

SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(lsa_tfidf_data_sclaed)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(lsa_tfidf_data_sclaed, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(lsa_tfidf_data_sclaed)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:

plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 3
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(lsa_tfidf_data_sclaed)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(lsa_tfidf_data_sclaed[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
kmeans.labels_

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==1]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==2]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(nmf_tfidf_data_scaled)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(nmf_tfidf_data_scaled, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(nmf_tfidf_data_scaled)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:

plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
# running cluster
k = 6
kmeans = KMeans(n_clusters=k, random_state=1)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==3]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [None]:
# initialize vectorizers
count_vectorizer = CountVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

# transfomred my text data using vectorizers
cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [None]:
# initialized reducers with dimensions
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

# transformed my vectorizers data using reducers
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [None]:
# initialize standardscaler
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

# transform my reducer data using standardscaler
lsa_tfidf_data_sclaed = SS.fit_transform(lsa_tfidf_data)
lsa_cv_data_sclaed = SS.fit_transform(lsa_cv_data)
nmf_tfidf_data_scaled = SS.fit_transform(nmf_tfidf_data)
nmf_cv_data_scaled = SS.fit_transform(nmf_cv_data)

In [None]:
SSEs = []
Sil_coefs = []
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(nmf_tfidf_data_scaled)
    labels = km.labels_
    Sil_coefs.append(silhouette_score(nmf_tfidf_data_scaled, labels, metric='euclidean'))
    SSEs.append(km.inertia_)

In [None]:

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,5), sharex=True, dpi=200)
k_clusters = range(2,10)
ax1.plot(k_clusters, Sil_coefs)
ax1.set_xlabel('number of clusters')
ax1.set_ylabel('silhouette coefficient')

# plot here on ax2
ax2.plot(k_clusters, SSEs)
ax2.set_xlabel('number of clusters')
ax2.set_ylabel('SSE');

In [None]:
inertia = [0,0]

for n_clusters in range(2, 25):
    km = KMeans(n_clusters = n_clusters)
    km.fit(nmf_tfidf_data_scaled)
    msg = f"""# clusters: {n_clusters:2d}   Inertia: {km.inertia_:8.6f}"""
    inertia.append(km.inertia_)
    print(msg)

In [None]:
plt.figure(figsize=(20,10))
plt.plot(inertia)
plt.xlabel('# of clusters')
plt.xlim((2,25))
plt.ylabel('inertia scores')
#plt.ylim((650,1200))

In [None]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(nmf_tfidf_data_scaled)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tfidf_vectorizer.get_feature_names()

for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:15]:
        word_list.append(terms[j])
    print(word_list)

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=92, n_iter=300, random_state=42)
X_ne = tsne.fit_transform(nmf_tfidf_data_scaled[2000:])

figsize=(20,15)
plt.figure(dpi=300)
sns.scatterplot(X_ne[:, 0], X_ne[:, 1], hue=kmeans.labels_[2000:], alpha=0.5, size = 0.5, palette='rainbow', legend='full');

In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")


In [None]:
indices_max = [index for index, value in enumerate(kmeans.labels_) if value==0]
for rev_index in indices_max[:5]:
    print(rev_index, str(df.ReviewText[rev_index]))
    print("\n")

In [1]:
1+1


2