In [1]:
import csv
import re
from collections import Counter
from itertools import *

import gensim
import pyLDAvis.gensim
import pyodbc
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
# fetch tweets from database

# Connect through the ODBC data source named 'Twitter' and read the
# tweet_text column of every row in the tweets table.
connection = pyodbc.connect(dsn='Twitter')
try:
    cursor = connection.cursor()
    cursor.execute("SELECT tweet_text FROM tweets")
    # Each element is a one-column result row, not a bare string.
    documents = list(cursor.fetchall())
finally:
    # Release the connection even when the query or the fetch fails.
    connection.close()

In [3]:
# process tweets

# Cleanup steps, applied in this order to every tweet text:
#   1. "RT" / "via" attributions together with the @handles that follow
#   2. any remaining @mentions
#   3. #hashtags
#   4. ftp / http(s) links, including the whitespace right after them
#   5. runs of two or more spaces/tabs, collapsed to one space
#   6. leading and trailing whitespace
# The patterns are compiled once instead of being re-parsed for each tweet.
cleanup_steps = [
    (re.compile("(RT|via)((?:\\b\\W*@\\w+)+)"), " "),
    (re.compile("@\\w+"), " "),
    (re.compile("#\\w+"), " "),
    (re.compile("(f|ht)(tp)(s?)(://)\\S+\\s*"), " "),
    (re.compile("[ \t]{2,}"), " "),
    (re.compile("^\\s+|\\s+$"), ""),
]

document_process = []

for row in documents:
    # A row is iterated column by column; the query selects a single column.
    for text in row:
        if text is None:
            # NULL column value: nothing to clean, and the regex
            # substitution would raise TypeError on it.
            continue
        for pattern, replacement in cleanup_steps:
            text = pattern.sub(replacement, text)
        document_process.append(text)

In [4]:
# lemmatize

# Run gensim's lemmatizer over every cleaned tweet. Judging by the outputs
# further down, tokens come back in 'lemma/POS' form (e.g. 'catch/VB').
documents_lemma = list(map(gensim.utils.lemmatize, document_process))

In [5]:
# find associated words

# Train a Word2Vec model on the lemmatized tweets.
model = models.Word2Vec(
    documents_lemma,
    size=100,
    window=5,
    min_count=3,
    workers=4,
)

# Tokens the model ranks as most similar to the verb "catch".
print(model.most_similar("catch/VB"))

[('rare/JJ', 0.9995890855789185), ('live/VB', 0.9995372891426086), ('do/VB', 0.9993615746498108), ('get/VB', 0.9993143677711487), ('guy/NN', 0.9992812275886536), ('let/VB', 0.9992256760597229), ('make/VB', 0.9991819858551025), ('anyone/NN', 0.9991755485534668), ('walk/VB', 0.9991623759269714), ('try/VB', 0.9991471171379089)]


In [6]:
# remove stopwords and words with only one letter

texts = []
for tokens in documents_lemma:
    kept = []
    for token in tokens:
        # Only the part before the first '/' is checked; the full token
        # (including whatever follows the '/') is what gets kept.
        lemma = token.split('/')[0]
        if lemma in STOPWORDS or len(lemma) < 2:
            continue
        kept.append(token)
    texts.append(kept)

In [7]:
# remove words that appear only once

# Flatten and tally in a single linear pass. Building the flat list with
# sum(texts, []) and calling list.count() once per distinct token both
# scale quadratically with corpus size.
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once = set(word for word, freq in token_counts.items() if freq == 1)

# Keep document boundaries; only drop the tokens seen exactly once overall.
texts = [[word for word in text if word not in tokens_once] for text in texts]

In [8]:
# find most frequent words

from collections import Counter

# Flatten the per-tweet token lists and count each token.
words = [item for sublist in texts for item in sublist]
c = Counter(words)

# print(...) with a single argument works on both Python 2 and Python 3;
# most_common(10) returns the ten highest counts directly.
print(c.most_common(10)) # top 10

[('catch/VB', 453), ('pokemon/NN', 416), ('play/VB', 365), ('follower/NN', 346), ('lucky/JJ', 334), ('coin/NN', 332), ('giveaway/NN', 332), ('tonight/NN', 327), ('pm/NN', 326), ('person/NN', 323)]


In [9]:
# build topic models

In [10]:
# generate 2-gram

from nltk import ngrams

# For every tweet: its adjacent token pairs (tuples) followed by the
# original single tokens.
terms_bigram = []
for tokens in texts:
    terms_bigram.append(list(ngrams(tokens, 2)) + tokens)

# Turn each pair tuple into one 'first_second' string; single tokens are
# passed through unchanged.
texts_full = [
    ['_'.join(term) if isinstance(term, tuple) else term for term in terms]
    for terms in terms_bigram
]

In [11]:
# create corpus

# Build the term dictionary from the unigram+bigram documents, then
# convert every document to its bag-of-words form with doc2bow.
id2word = corpora.Dictionary(texts_full)
mm = list(map(id2word.doc2bow, texts_full))

# NOTE(review): the two commented-out lines below use different file
# paths for saving and loading -- confirm which one is intended.
# corpora.MmCorpus.serialize('/tmp/projectmm.mm', mm) # store to disk, for later use
# corpus = corpora.MmCorpus('/tmp/corpus.mm')

In [12]:
# calculate TF-IDF

# Fit the TF-IDF model on the bag-of-words corpus, then wrap that same
# corpus with it; `corpus` is what the LDA cell trains on.
tfidf = models.TfidfModel(mm)
corpus = tfidf[mm]

In [19]:
# build model

# Train a 50-topic LDA model on the TF-IDF corpus.
# random_state fixes the seed of the model's random initialisation so a
# fresh "Restart & Run All" starts from the same state each time.
lda = models.ldamodel.LdaModel(
    corpus,
    id2word=id2word,
    num_topics=50,
    update_every=1,
    chunksize=500,
    passes=20,
    random_state=42,
)

# lda.save('pokemongo_model.lda')
# lda = gensim.models.LdaModel.load('pokemongo_model.lda')

In [20]:
# print topic models

# Ask for 5 terms per topic across 50 topics (the same topic count the
# model was built with). Left as the last expression so the notebook
# renders the returned list as the cell output.
lda.print_topics(num_topics=50, num_words=5)

[(0,
  u'0.073*phone/NN_playing/NN + 0.073*hold/VB_phone/NN + 0.073*playing/NN_perfect/JJ + 0.073*release/VB_glass/NN + 0.073*time/NN_release/VB'),
 (1,
  u'0.038*new/JJ + 0.018*ask/VB + 0.010*battle/NN + 0.009*new/JJ_pokemon/NN + 0.008*new/JJ_game/NN'),
 (2,
  u'0.054*life/NN + 0.049*save/VB_life/NN + 0.049*save/VB + 0.047*rt/NN_save/VB + 0.047*rt/NN'),
 (3,
  u'0.067*server/NN + 0.026*street/NN + 0.018*correct/JJ + 0.015*bump/VB + 0.015*bump/VB_street/NN'),
 (4,
  u'0.039*romance/NN + 0.039*player/NN_injure/VB + 0.039*phone/NN_romance/NN + 0.039*injure/VB_phone/NN + 0.039*injure/VB'),
 (5,
  u'0.028*tell/VB + 0.028*pidgey/NN + 0.014*library/NN + 0.013*ll/VB + 0.009*center/NN'),
 (6,
  u'0.205*compare/VB + 0.033*help/VB + 0.011*video/NN + 0.010*grow/VB + 0.007*chart/NN'),
 (7,
  u'0.094*turn/VB + 0.038*look/VB + 0.023*walk/VB + 0.018*uk/NN + 0.015*guide/NN'),
 (8,
  u'0.041*think/VB + 0.036*gym/NN + 0.033*right/NN + 0.030*house/NN + 0.024*control/VB'),
 (9,
  u'0.096*glass/NN + 0.042*

In [21]:
# visualize topic models

# Prepare the pyLDAvis data from the trained model, the corpus it was
# trained on and the dictionary, then render it as the cell output.
vis_data = pyLDAvis.gensim.prepare(lda, corpus, id2word)
pyLDAvis.display(vis_data)