In [2]:
import pandas as pd
import numpy as np
import ast
import seaborn
from collections import Counter
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
from bs4 import BeautifulSoup
import logging, gensim, bz2



In [3]:
import json
import urlparse
from itertools import chain
flatten = chain.from_iterable
from nltk import word_tokenize

In [4]:
def load(filename):
    file = open(filename,'rb')
    obj = pickle.load(file)
    file.close()
    return obj

def dump(obj,filename):
    filehandler = open(filename,"wb")
    pickle.dump(obj,filehandler)
    filehandler.close()

### Load Pickled Data

In [5]:
edinburgh_review = load("../Pickled_files/edinburgh_review.pkl")

In [6]:
corpus = load("../Pickled_files/corpus_edinburgh.pkl")
dictionary = load("../Pickled_files/dictionary_edinburgh.pkl")

### Start LDA

In [7]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary, passes=20)
print(ldamodel.print_topics(topics=50, topn=10))

[u'0.043*restaur + 0.028*food + 0.019*menu + 0.015*dish + 0.013*dine + 0.013*servic + 0.012*meal + 0.012*experi + 0.010*well + 0.009*excel', u'0.050*baguett + 0.049*oyster + 0.029*inn + 0.023*tower + 0.021*basil + 0.019*villag + 0.018*turkey + 0.018*sheep + 0.017*fed + 0.015*pepper', u'0.050*curri + 0.038*indian + 0.034*dish + 0.034*chicken + 0.024*food + 0.020*lamb + 0.018*rice + 0.015*order + 0.015*restaur + 0.013*naan', u'0.034*year + 0.027*night + 0.023*time + 0.019*last + 0.017*visit + 0.014*week + 0.013*ago + 0.011*alway + 0.010*get + 0.010*use', u'0.149*thai + 0.028*pad + 0.014*introduc + 0.013*bank + 0.013*stir + 0.012*monkfish + 0.011*august + 0.011*wetherspoon + 0.010*dive + 0.010*account', u'0.034*tempura + 0.031*korean + 0.026*gut + 0.025*shell + 0.020*tang + 0.017*aber + 0.016*wasabi + 0.015*teriyaki + 0.014*kim + 0.014*bonsai', u'0.051*steak + 0.019*rare + 0.015*medium + 0.013*voucher + 0.009*cut + 0.009*vinegar + 0.008*ask + 0.008*grill + 0.007*water + 0.007*oh', u'0.030

In [8]:
ldamodel.save("lda_edinburgh.model")

In [9]:
model =  gensim.models.LdaModel.load('lda_edinburgh.model')

In [10]:
model.print_topics(50,topn=30)

[u'0.043*restaur + 0.028*food + 0.019*menu + 0.015*dish + 0.013*dine + 0.013*servic + 0.012*meal + 0.012*experi + 0.010*well + 0.009*excel + 0.009*visit + 0.009*one + 0.008*edinburgh + 0.008*present + 0.008*staff + 0.007*even + 0.006*would + 0.006*beauti + 0.006*dinner + 0.006*decor + 0.006*qualiti + 0.006*enjoy + 0.006*recommend + 0.006*chef + 0.006*tast + 0.006*friend + 0.005*wine + 0.005*special + 0.005*attent + 0.005*feel',
 u'0.050*baguett + 0.049*oyster + 0.029*inn + 0.023*tower + 0.021*basil + 0.019*villag + 0.018*turkey + 0.018*sheep + 0.017*fed + 0.015*pepper + 0.014*pret + 0.014*kfc + 0.013*heid + 0.013*thyme + 0.012*spa + 0.011*nicolson + 0.011*creation + 0.010*nasti + 0.009*immens + 0.009*fav + 0.009*distilleri + 0.009*manger + 0.008*einer + 0.008*sweetcorn + 0.008*hoos + 0.007*unhealthi + 0.007*express + 0.007*masala + 0.007*tuna + 0.006*valley',
 u'0.050*curri + 0.038*indian + 0.034*dish + 0.034*chicken + 0.024*food + 0.020*lamb + 0.018*rice + 0.015*order + 0.015*restaur 

### Word Cloud generator

In [11]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [12]:
ldamodel =  gensim.models.LdaModel.load('lda_edinburgh.model')
print(ldamodel.show_topics(topics=50, topn=10))

[u'0.043*restaur + 0.028*food + 0.019*menu + 0.015*dish + 0.013*dine + 0.013*servic + 0.012*meal + 0.012*experi + 0.010*well + 0.009*excel', u'0.050*baguett + 0.049*oyster + 0.029*inn + 0.023*tower + 0.021*basil + 0.019*villag + 0.018*turkey + 0.018*sheep + 0.017*fed + 0.015*pepper', u'0.050*curri + 0.038*indian + 0.034*dish + 0.034*chicken + 0.024*food + 0.020*lamb + 0.018*rice + 0.015*order + 0.015*restaur + 0.013*naan', u'0.034*year + 0.027*night + 0.023*time + 0.019*last + 0.017*visit + 0.014*week + 0.013*ago + 0.011*alway + 0.010*get + 0.010*use', u'0.149*thai + 0.028*pad + 0.014*introduc + 0.013*bank + 0.013*stir + 0.012*monkfish + 0.011*august + 0.011*wetherspoon + 0.010*dive + 0.010*account', u'0.034*tempura + 0.031*korean + 0.026*gut + 0.025*shell + 0.020*tang + 0.017*aber + 0.016*wasabi + 0.015*teriyaki + 0.014*kim + 0.014*bonsai', u'0.051*steak + 0.019*rare + 0.015*medium + 0.013*voucher + 0.009*cut + 0.009*vinegar + 0.008*ask + 0.008*grill + 0.007*water + 0.007*oh', u'0.030

In [13]:
plt.figure()
temp = ldamodel.show_topic(3, 50)
temp = map(lambda x: x[::-1],temp)
plt.imshow(WordCloud().fit_words(temp))
plt.axis("off")
plt.title("Topic #" + str(3))
plt.show()

AttributeError: 'list' object has no attribute 'items'

In [None]:
ldamodel.print_topics(50,topn=30)

### Trying more viz

In [14]:
model =  gensim.models.LdaModel.load('lda_edinburgh.model')

In [15]:
lda = model
n_topics = 50

In [16]:
term_list = []
for i in range(0, n_topics):
    temp = lda.show_topic(i, 20)
    terms = []
    for term in temp:
            terms.append(term)
    term_list.append(terms)
    print "Top 10 terms for topic #" + str(i) + ": "+ ", ".join([i[1] for i in terms])

Top 10 terms for topic #0: restaur, food, menu, dish, dine, servic, meal, experi, well, excel, visit, one, edinburgh, present, staff, even, would, beauti, dinner, decor
Top 10 terms for topic #1: baguett, oyster, inn, tower, basil, villag, turkey, sheep, fed, pepper, pret, kfc, heid, thyme, spa, nicolson, creation, nasti, immens, fav
Top 10 terms for topic #2: curri, indian, dish, chicken, food, lamb, rice, order, restaur, naan, spici, meal, delici, flavour, tri, portion, sauc, meat, authent, great
Top 10 terms for topic #3: year, night, time, last, visit, week, ago, alway, get, use, old, christma, chang, still, parti, weekend, live, sinc, place, month
Top 10 terms for topic #4: thai, pad, introduc, bank, stir, monkfish, august, wetherspoon, dive, account, encourag, parsnip, patisseri, intens, liquid, scotsman, novel, rabbi, chap, silver
Top 10 terms for topic #5: tempura, korean, gut, shell, tang, aber, wasabi, teriyaki, kim, bonsai, von, einen, hostel, bru, dem, master, irn, gibt, zu

In [17]:
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [18]:
K=14

def terms_to_wordcounts(terms, multiplier=1000):
    return  " ".join([" ".join(int(multiplier*i[0]) * [i[1]]) for i in terms])



wordcloud = WordCloud().generate(terms_to_wordcounts(term_list[K]))


plt.imshow(wordcloud)
plt.axis("off")
plt.savefig("terms" + str(K))
plt.close()

In [19]:
def wordcloudgen():
    for K in range(0,50):
        wordcloud = WordCloud().generate(terms_to_wordcounts(term_list[K]))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig("terms" + str(K))
        plt.close()


In [20]:
wordcloudgen()

In [16]:
from sklearn.feature_extraction import DictVectorizer

In [18]:
def topics_to_vectorspace(n_topics, n_words=100):
    rows = []
    for i in xrange(n_topics):
            temp = lda.show_topic(i, n_words)
            row = dict(((i[1],i[0]) for i in temp))
            rows.append(row)
    return rows    

In [19]:
vec = DictVectorizer()
X = vec.fit_transform(topics_to_vectorspace(n_topics))
X.shape

(50, 3092)

In [22]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit(X.toarray()).transform(X.toarray())
plt.figure()
for i in xrange(X_pca.shape[0]):
    plt.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
    plt.text(X_pca[i, 0], X_pca[i, 1], s=' ' + str(i),fontsize=8)    
plt.title('PCA Topics of Yelp restaurant')
plt.savefig("pca_topic")
plt.close()

In [23]:
from scipy.cluster.hierarchy import linkage, dendrogram
plt.figure(figsize=(12,6))
R = dendrogram(linkage(X_pca))
plt.savefig("dendro")
plt.close()

In [26]:
## correlation matrix
from scipy.spatial.distance import pdist, squareform

cor = squareform(pdist(X.toarray(), metric="euclidean"))

plt.figure(figsize=(12,6))
R = dendrogram(linkage(cor))
plt.savefig("corr")

plt.close()

## network
import networkx as nx
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
pca_norm = make_pipeline(PCA(n_components=20), Normalizer(copy=False))
X_pca_norm = pca_norm.fit(X.toarray()).transform(X.toarray())
cor = squareform(pdist(X_pca_norm, metric="euclidean"))
G = nx.Graph()
for i in xrange(cor.shape[0]):
    for j in xrange(cor.shape[1]):
        if i == j:
            G.add_edge(i, j, {"weight":0})
        else:
            G.add_edge(i, j, {"weight":1.0/cor[i,j]})

edges = [(i, j) for i, j, w in G.edges(data=True) if w['weight'] > .8]

edge_weight=dict([((u,v,),int(d['weight'])) for u,v,d in G.edges(data=True)])
#pos = nx.graphviz_layout(G, prog="twopi") # twopi, neato, circo
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=100, alpha=.5)
nx.draw_networkx_edges(G, pos, edgelist=edges, width=1)
#nx.draw_networkx_edge_labels(G, pos ,edge_labels=edge_weight)
nx.draw_networkx_labels(G, pos, font_size=8, font_family='sans-serif')
plt.savefig("network")
plt.close()



In [35]:
def printhello():
    a = 2+3
    return a

In [39]:
%timeit -n 1 a=printhello()

1 loop, best of 3: 0 ns per loop


In [37]:
In [51]: import matplotlib.pyplot as plt

In [52]: num_top_words = 10

In [53]: fontsize_base = 70 / np.max(word_topic) # font size for word with largest share in corpus

In [54]: for t in range(num_topics):
        plt.subplot(1, num_topics, t + 1)  # plot numbering starts with 1
        plt.ylim(0, num_top_words + 0.5)  # stretch the y-axis to accommodate the words
   ....:     plt.xticks([])  # remove x-axis markings ('ticks')
   ....:     plt.yticks([]) # remove y-axis markings ('ticks')
   ....:     plt.title('Topic #{}'.format(t))
   ....:     top_words_idx = np.argsort(word_topic[:,t])[::-1]  # descending order
   ....:     top_words_idx = top_words_idx[:num_top_words]
   ....:     top_words = mallet_vocab[top_words_idx]
   ....:     top_words_shares = word_topic[top_words_idx, t]
   ....:     for i, (word, share) in enumerate(zip(top_words, top_words_shares)):
   ....:         plt.text(0.3, num_top_words-i-0.5, word, fontsize=fontsize_base*share)
   ....: 

In [55]: plt.tight_layout()

NameError: name 'a' is not defined

In [43]:
predict = [7,1,4,2,5,9,23,13]
k=5
I = sorted(range(len(predict)), key=lambda k: predict[k], reverse = True)
I = I[:k]

In [44]:
I

[6, 7, 5, 0, 4]