In [1]:
import gensim
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from itertools import *
import csv
import pyodbc
import re
import pyLDAvis.gensim

In [2]:
# fetch tweets from database

connection = pyodbc.connect(dsn='Twitter')
cursor = connection.cursor()
cursor.execute("SELECT tweet_text FROM tweets")

documents = []

for row in cursor.fetchall():
    documents.append(row)
    
connection.close()

In [3]:
# process tweets

document_process = []

for i in documents:
    for j in i:
        j = re.sub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", j)
        j = re.sub("@\\w+", " ", j)
        j = re.sub("#\\w+", " ", j)
        j = re.sub("(f|ht)(tp)(s?)(://)\\S+\\s*", " ", j)
        j = re.sub("[ \t]{2,}", " ", j)
        j = re.sub("^\\s+|\\s+$", "", j)
        document_process.append(j)

In [4]:
# lemmatize

documents_lemma = [gensim.utils.lemmatize(i) for i in document_process]

In [6]:
# find associated words

model = gensim.models.Word2Vec(documents_lemma, size=100, window=5, min_count=3, workers=4)

model.most_similar("pokemon/NN")

[('catch/VB', 0.9861735701560974),
 ('go/VB', 0.9795925617218018),
 ('stay/VB', 0.9724176526069641),
 ('win/VB', 0.9718842506408691),
 ('backpack/NN', 0.9690099358558655),
 ('powerbank/NN', 0.968267023563385),
 ('time/NN', 0.9642168283462524),
 ('oneplus/JJ', 0.9622034430503845),
 ('cop/NN', 0.962091863155365),
 ('gotta/VB', 0.9618346095085144)]

In [7]:
# remove stopwords and words with only one letter

texts = [[words for words in line if words.split('/')[0] not in STOPWORDS and len(words.split('/')[0]) >= 2] \
         for line in documents_lemma]

In [8]:
# remove words that appear only once

all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

In [9]:
# find most frequent words

from collections import Counter

words = [item for sublist in texts for item in sublist]
c = Counter(words)

c.most_common()[:10] # top 10

[('catch/VB', 1101),
 ('pokemon/NN', 1085),
 ('time/NN', 687),
 ('legendary/JJ', 528),
 ('play/VB', 309),
 ('person/NN', 305),
 ('phone/NN', 148),
 ('new/JJ', 143),
 ('pokeball/NN', 127),
 ('night/NN', 121)]

In [10]:
# build topic models

In [11]:
# generate 2-gram

from nltk import ngrams 
 
terms_bigram = [list(ngrams(i,2))+i for i in texts]

texts_full = []

for i in terms_bigram:
    sublist = []
    for j in i:
        if isinstance(j, tuple):
            sublist.append('_'.join(j))
        else:
            sublist.append(j)
    texts_full.append(sublist)

In [12]:
# create corpus

id2word = corpora.Dictionary(texts_full)
mm = [id2word.doc2bow(text) for text in texts_full]

# corpora.MmCorpus.serialize('/tmp/projectmm.mm', mm) # store to disk, for later use
# corpus = corpora.MmCorpus('/tmp/corpus.mm')

In [13]:
# calculate TF-IDF

tfidf = gensim.models.tfidfmodel.TfidfModel(mm)
corpus = tfidf[mm]

In [14]:
# build model

lda = models.ldamodel.LdaModel(corpus, id2word=id2word, num_topics=50, update_every=1, chunksize=500, passes=20)

# lda.save('pokemongo_model.lda')
# lda = gensim.models.LdaModel.load('pokemongo_model.lda')

In [15]:
# print topic models

lda.print_topics(num_topics=50, num_words=5)

[(0,
  u'0.031*best/JJ + 0.029*bring/VB + 0.024*love/NN + 0.018*church/NN + 0.018*best/JJ_person/NN'),
 (1,
  u'0.168*meet/VB + 0.082*gotta/VB + 0.081*gotta/VB_catch/VB + 0.049*worst/JJ_pokemon/NN + 0.049*worst/JJ'),
 (2,
  u'0.092*playing/NN + 0.086*public/NN + 0.071*playing/NN_public/NN + 0.049*man/NN + 0.039*catch/VB_man/NN'),
 (3,
  u'0.132*trainer/NN_campus/NN + 0.132*serve/VB_pokemon/NN + 0.132*guard/NN_serve/VB + 0.121*guard/NN + 0.025*pokemon/NN'),
 (4,
  u'0.049*person/NN + 0.027*lame/JJ + 0.026*person/NN_lame/JJ + 0.026*lame/JJ_play/VB + 0.019*value/NN'),
 (5,
  u'0.132*way/NN_woman/NN + 0.132*woman/NN_heart/NN + 0.131*heart/NN + 0.130*woman/NN + 0.102*way/NN'),
 (6,
  u'0.170*meet/VB_friday/NN + 0.170*official/NN_meet/VB + 0.157*official/NN + 0.014*thank/NN + 0.010*latest/JJ'),
 (7,
  u'0.017*fan/NN + 0.015*head/NN + 0.012*cute/JJ + 0.010*charmander/NN + 0.009*catch/VB_charmander/NN'),
 (8,
  u'0.073*rt/VB + 0.043*feel/VB + 0.038*pok\xe9mon/NN + 0.025*day/NN + 0.022*catch/VB

In [16]:
# visualize topic models

pyLDAvis.display(pyLDAvis.gensim.prepare(lda, corpus, id2word))