In [8]:
import os
import csv
import pandas as pd
import numpy as np
import sklearn
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk import tokenize

In [2]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic,
        each supporting ``argsort()``.
    feature_names : sequence of str
        Vocabulary, indexable by the positions found in each weight vector.
    n_top_words : int
        How many words to list per topic.

    One line ``Topic #<k>: w1 w2 ...`` is printed per topic (words ordered
    from highest to lowest weight), followed by a single blank line.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending, so read the last n_top_words indices backwards.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[idx] for idx in top_indices]
        print("Topic #%d: " % topic_no + " ".join(top_words))
    print()

In [3]:
def display_topics(model, feature_names, num_topics, no_top_words):
    """Print the first ``num_topics`` topics as ``weight*word`` lists.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic,
        each supporting ``argsort()`` and integer indexing.
    feature_names : sequence of str
        Vocabulary, indexable by the positions found in each weight vector.
    num_topics : int
        Only topics whose index is below this value are printed.
    no_top_words : int
        How many words to list per topic.

    Each printed line starts with a ``Topic <k>:`` label padded to 11
    characters, followed by comma-separated ``<weight>*<word>`` entries with
    the weight shown to three decimals, highest weight first.
    """
    for topic_no, weights in enumerate(model.components_):
        if topic_no >= num_topics:
            continue
        label = "Topic %d:" % topic_no
        print("{:11}".format(label), end='')
        # argsort() is ascending, so read the last no_top_words indices backwards.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        entries = ['{:04.3f}*'.format(weights[k]) + feature_names[k] for k in ranked]
        print(", ".join(entries))

In [54]:
# Read in Data
# Load the review CSV and expose its 'Unnamed: 0' column under the name
# 'userid' (appended as the last column), then preview the first rows.
data = pd.read_csv('hash_house.csv').assign(
    userid=lambda frame: frame['Unnamed: 0']
)
data.head()

Unnamed: 0.1,Unnamed: 0,name,stars_y,text,userid
0,0,Hash House A Go Go,5,"Firstly, this restaurant is in The Linq Hotel,...",0
1,1,Hash House A Go Go,4,This place had monsterous proportions OMG! One...,1
2,2,Hash House A Go Go,5,This place freaking rocks. Must go to when in ...,2
3,3,Hash House A Go Go,3,Visited HHAGG ago go for the first time on 5/5...,3
4,4,Hash House A Go Go,3,Big portions. Sharing is highly recommended. H...,4


In [69]:
# Split reviews into individual sentences
# Collect one record per sentence and build the frame in a single call.
# Growing a DataFrame with .append() inside a loop copies the whole frame on
# every row (quadratic time), and DataFrame.append was removed in pandas 2.0.
records = [
    {'userid': userid, 'sentence': sentence, 'stars': stars}
    for userid, text, stars in zip(data['userid'], data['text'], data['stars_y'])
    for sentence in tokenize.sent_tokenize(text)
]
# Passing `columns` keeps the column order and still yields the three
# expected columns when there are no reviews at all.
df = pd.DataFrame(records, columns=['userid', 'sentence', 'stars'])

In [70]:
# Preview the first rows of the sentence-level frame (userid, sentence, stars)
df.head()

Unnamed: 0,userid,sentence,stars
0,0,"Firstly, this restaurant is in The Linq Hotel,...",5
1,0,Expect a line.,5
2,0,"Waited only about 15 minutes to be seated, tho...",5
3,0,Greeted by Tony our waiter who was really warm...,5
4,0,Ordered the Sage Fried Chicken and Waffles.,5


In [73]:
# Create Corpus for TFIDF
# Plain Python list holding every entry of the 'sentence' column, in frame order.
corpus = list(df.sentence)

## Number of Topics
### 3 Topics

In [85]:
n_components = 3
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: food good great service place wait amazing vegas really time breakfast worth just delicious definitely
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs delicious hash waffle andy potatoes amazing
Topic #2: huge portions large delicious big share portion food people prices plate massive enormous tasty hungry



### 4 Topics

In [86]:
n_components = 4
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: food great service place wait amazing vegas time delicious worth breakfast definitely just come long
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs delicious hash andy waffle potatoes amazing
Topic #2: huge portions large big delicious share portion people food prices plate massive enormous tasty hungry
Topic #3: good really service pretty food just overall potatoes biscuits thing bloody mary taste coffee biscuit



### 5 Topics

In [87]:
n_components = 5
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: food great service amazing delicious awesome excellent friendly man vs just price came server experience
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs delicious andy waffle potatoes amazing hash
Topic #2: huge portions large big delicious share portion people prices plate massive enormous food hungry tasty
Topic #3: good really pretty service food just overall potatoes biscuits bloody thing mary taste coffee biscuit
Topic #4: place wait vegas worth time definitely breakfast come hash try long house love eat minutes



### 6 Topics

In [88]:
n_components = 6
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great service friendly place excellent experience customer staff slow server fast atmosphere breakfast attentive awesome
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs delicious andy waffle potatoes hash crispy
Topic #2: huge portions large big share portion delicious people prices plate massive enormous hungry meal tasty
Topic #3: good really service pretty overall just potatoes biscuits bloody thing mary taste coffee biscuit eggs
Topic #4: place wait vegas worth time definitely breakfast come hash try long house love eat minutes
Topic #5: food amazing delicious man vs awesome just came lot price excellent took quality tasty large



### 7 Topics

In [89]:
n_components = 7
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great service friendly excellent experience staff customer slow server fast atmosphere attentive waiter quick bad
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs delicious andy waffle potatoes crispy hash
Topic #2: huge portions large big share portion delicious people prices plate massive enormous hungry meal tasty
Topic #3: good really pretty service overall just potatoes biscuits bloody thing mary taste coffee biscuit wasn
Topic #4: place vegas breakfast definitely hash love house try time come eat best recommend just las
Topic #5: food amazing delicious man vs awesome just came lot price excellent took quality tasty large
Topic #6: wait worth long time minutes hour seated 30 table minute 45 20 come definitely 10



### 8 Topics

In [90]:
n_components = 8
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great service friendly excellent experience staff customer slow server fast atmosphere attentive waiter quick bad
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs andy waffle potatoes crispy amazing hash
Topic #2: huge portions large big share portion people prices plate massive hungry enormous meal tasty size
Topic #3: good really pretty service overall just potatoes biscuits bloody thing mary taste coffee biscuit looked
Topic #4: place vegas breakfast definitely hash love try house time come eat best recommend just las
Topic #5: food amazing man vs awesome just came lot price excellent took quality tasty large like
Topic #6: wait worth long time minutes hour seated 30 table minute 45 20 come definitely 10
Topic #7: delicious absolutely bloody mary hash biscuit potatoes pancake fresh house looked coffee bacon biscuits crispy



### 9 Topics

In [91]:
n_components = 9
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great service friendly excellent experience staff customer slow server fast atmosphere attentive waiter quick breakfast
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs andy waffle potatoes amazing crispy best
Topic #2: huge portions large big share portion people prices plate massive hungry enormous meal tasty size
Topic #3: good really pretty service overall just potatoes biscuits bloody thing mary taste coffee looked wasn
Topic #4: place vegas definitely breakfast love try come recommend time eat best awesome amazing just las
Topic #5: food amazing man vs awesome just came lot price excellent took quality tasty like large
Topic #6: wait worth long time minutes hour seated 30 definitely table come minute 45 20 10
Topic #7: delicious absolutely bloody mary biscuit potatoes pancake fresh looked coffee bacon biscuits crispy tried toast
Topic #8: hash house vegas beef corned time ordered breakfa

### 10 Topics

In [92]:
n_components = 10
n_top_words = 15

# TFIDF Vectorizer: turn the sentences in `corpus` into a TF-IDF matrix,
# dropping scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(corpus)

# NMF reduction
# Fit exactly once: the previous .fit() followed by .fit_transform() trained
# the model twice on the same matrix. random_state is pinned so that
# re-running the cell reproduces the same factorisation.
nmf = NMF(n_components=n_components, random_state=0)
W_pos = nmf.fit_transform(tfidf)

# Output Topics
# NMF is left at its default beta_loss ('frobenius'), so the header must not
# claim a Kullback-Leibler objective.
print("\nTopics in NMF model (Frobenius norm):")
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; switch to
# get_feature_names_out() when this notebook moves to a newer release.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: great service friendly experience excellent customer staff slow server fast atmosphere attentive waiter quick nice
Topic #1: chicken waffles fried sage benedict ordered bacon got eggs andy waffle potatoes crispy best try
Topic #2: huge portions large big share portion people prices plate massive hungry enormous meal tasty size
Topic #3: good really pretty service overall just potatoes biscuits bloody thing mary taste coffee looked wasn
Topic #4: place vegas definitely breakfast love try come time recommend eat best awesome just las visit
Topic #5: food man vs awesome just came lot price excellent like took quality tasty large big
Topic #6: wait worth long time minutes hour seated 30 table definitely come minute 45 20 10
Topic #7: delicious absolutely bloody mary biscuit potatoes pancake fresh looked coffee bacon biscuits crispy tried toast
Topic #8: hash house vegas beef corned time ordered breakfast eggs linq lo