In [1]:
import csv
import re
from collections import Counter
from itertools import *

import gensim
import pyLDAvis.gensim
import pyodbc
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
# Fetch the raw tweet texts from the database.
#
# `documents` is a list of result rows, each holding the single selected
# column (tweet_text); the cleaning cell iterates over every row to reach
# the text itself.
connection = pyodbc.connect(dsn='Twitter')
try:
    cursor = connection.cursor()
    cursor.execute("SELECT tweet_text FROM tweets")
    documents = list(cursor.fetchall())
finally:
    # Always release the ODBC connection, even when the query fails.
    connection.close()

In [3]:
def _clean_tweet(text):
    """Return `text` with Twitter-specific noise removed.

    Retweet/via markers, @mentions, #hashtags and http/https/ftp URLs are
    each replaced by a single space; runs of spaces/tabs are then collapsed
    and leading/trailing whitespace is trimmed.
    """
    text = re.sub(r"(RT|via)((?:\b\W*@\w+)+)", " ", text)   # "RT @user", "via @user"
    text = re.sub(r"@\w+", " ", text)                        # remaining @mentions
    text = re.sub(r"#\w+", " ", text)                        # hashtags
    text = re.sub(r"(f|ht)(tp)(s?)(://)\S+\s*", " ", text)   # URLs
    text = re.sub("[ \t]{2,}", " ", text)                    # collapse spaces/tabs
    text = re.sub(r"^\s+|\s+$", "", text)                    # trim both ends
    return text


# Flatten the result rows into one cleaned string per tweet.
# A NULL tweet_text is returned by the driver as None, which re.sub cannot
# process (TypeError), so such values are skipped.
document_process = [
    _clean_tweet(text)
    for row in documents
    for text in row
    if text is not None
]

In [4]:
# Lemmatize every cleaned tweet; each entry of documents_lemma is the
# lemmatizer's result for the tweet at the same position.
documents_lemma = list(map(gensim.utils.lemmatize, document_process))

In [5]:
# Filter out stopwords and one-character words.
# Only the part of each token before the first "/" is tested; the token
# itself is kept unchanged when it passes.
texts = []
for lemmas in documents_lemma:
    kept = []
    for token in lemmas:
        word = token.split('/')[0]
        if word in STOPWORDS or len(word) < 2:
            continue
        kept.append(token)
    texts.append(kept)

In [6]:
# Remove tokens that appear only once in the whole corpus.
#
# Frequencies are gathered in a single pass with Counter; building the flat
# list with sum(texts, []) and calling list.count() for every distinct
# token is quadratic in corpus size.
# NOTE: this cell overwrites `texts`, so re-running it on its own filters the
# already-filtered documents again.
all_tokens = [token for text in texts for token in text]
token_counts = Counter(all_tokens)
tokens_once = set(token for token, count in token_counts.items() if count == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

In [7]:
# Add bigram features to every document.

from nltk import ngrams

# Each document becomes its bigrams (as tuples) followed by its unigrams.
terms_bigram = [list(ngrams(tokens, 2)) + tokens for tokens in texts]

# Flatten each bigram tuple into a single "first_second" string; unigram
# strings pass through untouched.
texts_full = [
    ['_'.join(term) if isinstance(term, tuple) else term for term in terms]
    for terms in terms_bigram
]

In [11]:
# topic modeling

In [12]:
# Build the token dictionary from all documents, then convert every document
# to its bag-of-words representation.
id2word = corpora.Dictionary(texts_full)
mm = list(map(id2word.doc2bow, texts_full))

# Optional persistence of the bag-of-words corpus.
# NOTE(review): the save and load paths below differ - confirm which file is
# intended before enabling these lines.
# corpora.MmCorpus.serialize('/tmp/projectmm.mm', mm) # store to disk, for later use
# corpus = corpora.MmCorpus('/tmp/corpus.mm')

In [13]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that same
# corpus; `corpus` is what the LDA cell trains on.
# NOTE(review): LDA is commonly trained on raw counts - confirm that TF-IDF
# weighting is intended here.
tfidf = models.TfidfModel(mm)
corpus = tfidf[mm]

In [14]:
# Train the LDA topic model on the TF-IDF weighted corpus.
#
# LDA training is stochastic; a fixed random_state makes the resulting topics
# reproducible across kernel restarts.
LDA_SEED = 42

lda = models.ldamodel.LdaModel(
    corpus,
    id2word=id2word,
    num_topics=20,       # number of topics to extract
    update_every=1,      # online training: update the model after every chunk
    chunksize=100,       # documents per training chunk
    passes=200,          # full passes over the corpus
    iterations=1000,     # max inference iterations per document
    random_state=LDA_SEED,
)

# lda.save('model.lda')
# lda = gensim.models.LdaModel.load('model.lda')

In [16]:
# Show the top 5 terms for each of the 20 topics. The result is a list of
# (topic_id, "weight*token + weight*token + ...") pairs.
lda.print_topics(num_topics=20, num_words=5)

[(0,
  u'0.019*star/NN + 0.012*right/RB + 0.008*likely/JJ + 0.008*won/NN + 0.008*won/NN_awhile/RB'),
 (1,
  u'0.075*season/NN_delay/VB + 0.071*delay/VB + 0.054*season/NN + 0.047*consider/VB + 0.047*seriously/RB_consider/VB'),
 (2,
  u'0.049*come/VB + 0.044*winter/NN + 0.034*winter/NN_come/VB + 0.026*weather/NN + 0.016*cold/JJ'),
 (3,
  u'0.015*throne/NN_season/NN + 0.015*goodbye/NN + 0.013*delay/VB_season/NN + 0.011*production/NN_delay/VB + 0.010*win/VB'),
 (4,
  u'0.033*king/NN + 0.023*song/NN + 0.017*excited/JJ + 0.014*ice/NN + 0.014*song/NN_ice/NN'),
 (5,
  u'0.039*cersei/NN_like/VB + 0.039*like/VB + 0.039*cersei/NN + 0.033*stream/NN + 0.025*send/VB'),
 (6,
  u'0.019*watch/VB + 0.017*emilia/NN + 0.011*jon/JJ_snow/NN + 0.011*jon/JJ + 0.010*ll/VB'),
 (7,
  u'0.020*episode/NN + 0.019*expect/VB_stab/VB + 0.019*end/VB_rule/VB + 0.019*stab/VB + 0.019*stab/VB_end/VB'),
 (8,
  u'0.020*anticipation/NN + 0.020*anticipation/NN_season/NN + 0.020*wait/VB_anticipation/NN + 0.020*season/NN_like/JJ

In [17]:
# Prepare the pyLDAvis data for the trained model and render it inline.
vis_data = pyLDAvis.gensim.prepare(lda, corpus, id2word)
pyLDAvis.display(vis_data)