In [2]:
import pandas as pd
import json

# Load the StumbleUpon dataset (gzip-compressed, tab-separated).
data = pd.read_csv("../14/data/stumbleupon.tsv.gz", sep='\t',
                   encoding="utf-8")

# Each `boilerplate` cell is a JSON object; decode it once per row and reuse
# the result for both derived columns instead of parsing the string twice.
boilerplate_parsed = data.boilerplate.map(json.loads)

# A missing key falls back to ''. A key that is present but null comes back
# as None -- presumably why later cells call dropna() on `body`.
data['title'] = boilerplate_parsed.map(lambda doc: doc.get('title', ''))
data['body'] = boilerplate_parsed.map(lambda doc: doc.get('body', ''))

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the article bodies: raw counts rather than
# 0/1 indicators (binary=False), English stop words removed, and terms that
# appear in fewer than 3 documents dropped (min_df=3).
cv = CountVectorizer(binary=False,
                     stop_words='english',
                     min_df=3)

# Rows with a null body are skipped, so the rows of `docs` line up with
# data.body.dropna(), not with `data` itself.
docs = cv.fit_transform(data.body.dropna())

# Column index -> term lookup for gensim.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and current releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
id2word = dict(enumerate(feature_names))

In [8]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

# Expose the scipy sparse count matrix as a gensim corpus. `docs` holds one
# document per row, hence documents_columns=False.
corpus = Sparse2Corpus(docs, documents_columns=False)

# Fit a 15-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so the learned topics will
# presumably differ from run to run.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
)

In [9]:
# Print the top words of every topic.
# num_topics matches the value the LDA model above was trained with (15).
num_topics = 15
num_words_per_topic = 10
for ti, topic in enumerate(lda_model.show_topics(num_topics=num_topics,
                                                 num_words=num_words_per_topic)):
    print("Topic: {}".format(ti))
    # Each entry looks like (topic_id, "weight*word + ..."); the [1:] slice
    # drops the id and keeps the remainder as a 1-tuple, matching the
    # recorded output. Written as a single parenthesised argument so the line
    # is valid on both Python 2 and Python 3 (the bare `print x` statement is
    # a SyntaxError on Python 3).
    print(topic[1:])

Topic: 0
(u'0.015*"images" + 0.014*"image" + 0.011*"2011" + 0.011*"link" + 0.010*"small" + 0.010*"track" + 0.009*"buzz" + 0.009*"campaign" + 0.009*"sports" + 0.008*"jpg"',)
Topic: 1
(u'0.007*"people" + 0.007*"said" + 0.006*"news" + 0.004*"time" + 0.004*"health" + 0.003*"year" + 0.003*"like" + 0.003*"world" + 0.003*"years" + 0.003*"just"',)
Topic: 2
(u'0.006*"new" + 0.006*"sports" + 0.005*"just" + 0.005*"like" + 0.004*"world" + 0.004*"time" + 0.004*"swimsuit" + 0.004*"year" + 0.004*"si" + 0.003*"said"',)
Topic: 3
(u'0.009*"video" + 0.005*"twitter" + 0.004*"append" + 0.004*"text" + 0.004*"apple" + 0.004*"like" + 0.004*"news" + 0.004*"new" + 0.004*"function" + 0.004*"left"',)
Topic: 4
(u'0.006*"use" + 0.006*"skin" + 0.004*"data" + 0.004*"health" + 0.004*"like" + 0.004*"food" + 0.004*"information" + 0.004*"water" + 0.003*"products" + 0.003*"using"',)
Topic: 5
(u'0.009*"fashion" + 0.007*"10" + 0.005*"11" + 0.005*"pm" + 0.005*"dress" + 0.004*"12" + 0.004*"just" + 0.004*"look" + 0.004*"funny"

In [10]:
from gensim.models.word2vec import Word2Vec

# Whitespace-tokenise every non-null article body; the result is a Series
# whose elements are lists of tokens (one list per document).
text = (
    data["body"]
    .dropna()
    .apply(lambda body: body.split())
)

In [11]:
# Train word embeddings on the tokenised bodies.
# NOTE(review): gensim 4.0 renamed `size` to `vector_size`; this call targets
# the pre-4.0 API used by the rest of the notebook.
model = Word2Vec(
    text,
    size=100,      # dimensionality of the word vectors
    window=5,      # context window around each word
    min_count=5,   # ignore words seen fewer than 5 times
    workers=4,     # training threads
)

In [12]:
# Words closest to the combination of 'cookie' and 'brownie'.
# Similarity queries live on the KeyedVectors object (`model.wv`) in newer
# gensim releases, and Word2Vec.most_similar was removed in 4.0. Fall back to
# the model itself on old releases that have no `wv` attribute.
getattr(model, 'wv', model).most_similar(positive=['cookie', 'brownie'])

[(u'cupcake', 0.9114097356796265),
 (u'crust', 0.8440617918968201),
 (u'pie', 0.8420896530151367),
 (u'cake', 0.8395797610282898),
 (u'cheesecake', 0.8327450752258301),
 (u'tart', 0.8247241973876953),
 (u'brownies', 0.8213274478912354),
 (u'mini', 0.815024733543396),
 (u'candy', 0.8143845200538635),
 (u'pancake', 0.8102335929870605)]