In [316]:
# refer: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# refer: https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
import numpy as np 
import pandas as pd 
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import np_utils



In [317]:
df = pd.read_csv("/ilab/users/kc1026/Documents/cs543/sentiment140_clean.csv", sep=',', header=0)
df.dropna(inplace=True)

In [318]:
df.drop(['target'], axis=1, inplace=True)

In [319]:
df = df.rename(index=str, columns={"Unnamed: 0": "index"})

In [320]:
import gensim
from nltk.corpus import wordnet

from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import re

stopWords = set(STOPWORDS)
lmtzr = nltk.WordNetLemmatizer().lemmatize


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def normalize_text(text):
    token_text = nltk.word_tokenize(text)
    cleaned_token_text = []
    for tt in token_text:
        if tt in stopWords or tt == '' or len(tt) < 2: continue    
        cleaned_token_text.append(tt)
    
    word_pos = nltk.pos_tag(cleaned_token_text)
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]
    
    return [x.lower() for x in lemm_words]

In [321]:
# doc_sample='lol'
# print('original document: ')

# print('\n\n tokenized and lemmatized document: ')
# print(normalize_text(doc_sample))



original document: 


 tokenized and lemmatized document: 
['lol']


In [322]:
processed_docs = df['text'].map(normalize_text)

In [323]:
dictionary = gensim.corpora.Dictionary(processed_docs)
# count = 0
# for k, v in dictionary.iteritems():
#     print(k, v)
#     count += 1
#     if count > 10:
#         break

0 awww
1 bummer
2 carr
3 david
4 day
5 get
6 shoulda
7 blah
8 facebook
9 result
10 school


In [324]:
dictionary.filter_extremes(keep_n=2000)


In [325]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(4, 2), (70, 1), (161, 1), (198, 1), (199, 1), (258, 1), (375, 1), (1722, 1)]

In [326]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=75, id2word=dictionary, passes=2, workers=2)

In [327]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.340*"happy" + 0.128*"leave" + 0.099*"hahaha" + 0.073*"stay" + 0.039*"tea" + 0.032*"hmm" + 0.030*"bday" + 0.022*"club" + 0.021*"jb" + 0.020*"shout"
Topic: 1 
Words: 0.295*"hope" + 0.144*"soon" + 0.142*"well" + 0.091*"th" + 0.058*"friday" + 0.029*"good" + 0.028*"ll" + 0.028*"go" + 0.022*"thursday" + 0.021*"mention"
Topic: 2 
Words: 0.254*"great" + 0.223*"look" + 0.080*"birthday" + 0.048*"kid" + 0.045*"forward" + 0.041*"like" + 0.029*"super" + 0.027*"ice" + 0.026*"st" + 0.020*"cream"
Topic: 3 
Words: 0.162*"glad" + 0.148*"www" + 0.118*"later" + 0.097*"hot" + 0.083*"place" + 0.067*"amazing" + 0.057*"hate" + 0.026*"like" + 0.020*"ball" + 0.019*"go"
Topic: 4 
Words: 0.355*"night" + 0.177*"tonight" + 0.140*"movie" + 0.066*"late" + 0.034*"go" + 0.020*"uk" + 0.016*"bar" + 0.015*"complete" + 0.014*"def" + 0.012*"time"
Topic: 5 
Words: 0.141*"idea" + 0.131*"totally" + 0.074*"worth" + 0.072*"gym" + 0.068*"pool" + 0.061*"soooo" + 0.050*"yo" + 0.042*"mad" + 0.037*"good" + 0.020*"s

In [332]:
unseen_document = ' I was sitting in a restaurant with friends  They asked me something which they thought I should know  Actually I know it  but at that time I was not able to remember it  '
bow_vector = dictionary.doc2bow(normalize_text(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.17581303417682648	 Topic: 0.434*"know" + 0.084*"dont" + 0.046*"turn" + 0.040*"site" + 0.036*"problem"
Score: 0.09711853414773941	 Topic: 0.195*"good" + 0.116*"luck" + 0.108*"saw" + 0.081*"mom" + 0.080*"real"
Score: 0.09648895263671875	 Topic: 0.292*"friend" + 0.255*"best" + 0.099*"vip" + 0.062*"line" + 0.058*"shop"
Score: 0.09616363048553467	 Topic: 0.114*"actually" + 0.083*"hehe" + 0.082*"vote" + 0.061*"happen" + 0.057*"cause"
Score: 0.09603578597307205	 Topic: 0.263*"wait" + 0.102*"year" + 0.093*"maybe" + 0.078*"baby" + 0.056*"ask"
Score: 0.09409008175134659	 Topic: 0.468*"think" + 0.103*"wow" + 0.042*"afternoon" + 0.041*"dude" + 0.034*"myspace"
Score: 0.09133269637823105	 Topic: 0.129*"ha" + 0.093*"remember" + 0.075*"damn" + 0.069*"saturday" + 0.064*"green"
Score: 0.09084299206733704	 Topic: 0.198*"hour" + 0.112*"rain" + 0.096*"sit" + 0.069*"agree" + 0.063*"half"
Score: 0.08878091722726822	 Topic: 0.227*"tweet" + 0.134*"check" + 0.090*"god" + 0.087*"send" + 0.068*"plan"
