In [19]:
# refer: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# refer: https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
import numpy as np 
import pandas as pd 
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import np_utils

# Three free-text example sentences used further down to demonstrate the
# topic-word and emoji suggestions.
d = {'text': ['when thieves broke into my house at night, and held my wife and me on gun point for at least ten minutes and took away a lot  of property.',
                 'International sports events won by myfavourite national team or playerbrings me joy when india won the world cup cricket match',
                 'I was sitting in a restaurant with friends they asked me something which they thoughtI should know Actually I know it but atthat time I was not able to remember it']}
d_ex = pd.DataFrame(data=d)
# Only the 'text' column exists at this point; asking for any other column
# would raise a KeyError on a fresh kernel.
d_ex['text']

In [44]:
# Temporarily raise the column-width limit so the long sentences are printed
# in full instead of being truncated.
wide_columns = pd.option_context('display.max_colwidth', 500)
with wide_columns:
    print(d_ex)


                                                                                                                                                                  text
0                           when thieves broke into my house at night, and held my wife and me on gun point for at least ten minutes and took away a lot  of property.
1                                       International sports events won by myfavourite national team or playerbrings me joy when india won the world cup cricket match
2  I was sitting in a restaurant with friends they asked me something which they thoughtI should know Actually I know it but atthat time I was not able to remember it


In [21]:
# Load the cleaned sentiment140 tweets and discard every row that contains a
# missing value.
df = pd.read_csv(
    "/ilab/users/kc1026/Documents/cs543/sentiment140_clean.csv",
    sep=',',
    header=0,
).dropna()

In [22]:
# The 'target' column is not used below, so remove it.
df = df.drop(columns=['target'])

In [23]:
# Give the unnamed CSV column a readable name and convert the row labels to
# strings.
column_renames = {"Unnamed: 0": "index"}
df = df.rename(index=str, columns=column_renames)

In [24]:
import gensim
from nltk.corpus import wordnet

from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import re
# Mutable copy of gensim's stop-word collection.
stopWords = {word for word in STOPWORDS}
# Bound `lemmatize` method of a single shared WordNet lemmatizer instance.
lmtzr = WordNetLemmatizer().lemmatize


def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def normalize_text(text):
    """Tokenize, filter, lemmatize and lower-case a piece of text.

    Steps:
      1. Split ``text`` into tokens with ``nltk.word_tokenize``.
      2. Drop tokens shorter than two characters and tokens found in
         ``stopWords``.
      3. POS-tag the surviving tokens and lemmatize each one with the
         WordNet part of speech derived from its tag.
      4. Lower-case the lemmas.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Lower-cased lemmas in their original order; empty when no token
        survives the filter.
    """
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text)
        # The stop-word check is case-insensitive so that capitalised stop
        # words do not slip through and reappear once the output is
        # lower-cased. The length test also rejects empty tokens.
        if len(token) >= 2 and token.lower() not in stopWords
    ]

    # Tagging happens on the original casing of the kept tokens.
    tagged_tokens = nltk.pos_tag(kept_tokens)
    return [
        lmtzr(word, get_wordnet_pos(tag)).lower()
        for word, tag in tagged_tokens
    ]

In [26]:
# Run the normalisation pipeline on every tweet; each entry becomes a list of
# tokens.
raw_texts = df['text']
processed_docs = raw_texts.map(normalize_text)

In [35]:
# Build the token <-> integer-id vocabulary from the normalised documents.
dictionary = gensim.corpora.Dictionary(documents=processed_docs)

In [40]:
# Persist the dictionary under the file name 'lda_dict'.
# NOTE(review): in file order this cell sits before the filter_extremes cell,
# although its execution count (40) is later than that cell's (36) - confirm
# which state of the dictionary is meant to be saved.
dictionary.save('lda_dict')

In [36]:
# Prune the vocabulary to at most 2000 tokens (filter_extremes' default
# document-frequency cut-offs still apply as well).
dictionary.filter_extremes(keep_n=2000)
# Filtering re-assigns token ids, so write the dictionary out again here.
# This keeps the file on disk consistent with the bag-of-words corpus and the
# LDA model that are built from the filtered dictionary below.
dictionary.save('lda_dict')


In [37]:
# Convert each tokenised document into its sparse bag-of-words form
# (a list of (token_id, count) pairs).
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [38]:
# Fit a 75-topic LDA model on the bag-of-words corpus, making two passes over
# the data with four worker processes.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=75,
    id2word=dictionary,
    passes=2,
    workers=4,
)

In [39]:
# Persist the LDA model under the file name 'lda_model'.
lda_model.save('lda_model')

In [41]:
# Print every topic (-1 = all of them) together with its top weighted words.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_words))

Topic: 0 
Words: 0.218*"yes" + 0.140*"ok" + 0.042*"sister" + 0.039*"nite" + 0.035*"text" + 0.032*"appreciate" + 0.029*"thought" + 0.025*"aw" + 0.025*"lol" + 0.023*"project"
Topic: 1 
Words: 0.104*"damn" + 0.071*"chat" + 0.057*"awake" + 0.052*"absolutely" + 0.050*"wine" + 0.049*"stupid" + 0.045*"bbq" + 0.044*"realize" + 0.038*"invite" + 0.038*"freak"
Topic: 2 
Words: 0.188*"take" + 0.126*"rock" + 0.090*"walk" + 0.080*"ill" + 0.079*"super" + 0.033*"foot" + 0.032*"tune" + 0.030*"boyfriend" + 0.027*"street" + 0.021*"doubt"
Topic: 3 
Words: 0.249*"twitter" + 0.095*"finally" + 0.067*"funny" + 0.048*"real" + 0.044*"goodnight" + 0.041*"give" + 0.029*"line" + 0.027*"online" + 0.026*"city" + 0.019*"new"
Topic: 4 
Words: 0.128*"add" + 0.122*"later" + 0.087*"picture" + 0.080*"plan" + 0.045*"date" + 0.038*"cuz" + 0.030*"hold" + 0.030*"congratulation" + 0.028*"alright" + 0.028*"ipod"
Topic: 5 
Words: 0.134*"talk" + 0.132*"hour" + 0.061*"till" + 0.055*"drive" + 0.047*"join" + 0.045*"pm" + 0.034*"bles

In [13]:
# Score a single unseen document against the trained topics and print the
# five strongest words of each returned topic, best-scoring topic first.
unseen_document = 'lolol'
doc_bow = dictionary.doc2bow(normalize_text(unseen_document))
doc_lda = lda_model[doc_bow]
ranked_topics = sorted(doc_lda, key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print(lda_model.show_topic(index, 5))

[('know', 0.3915512), ('let', 0.14674239), ('god', 0.07042134), ('idea', 0.050161228), ('happen', 0.04144807)]
[('think', 0.47267756), ('lunch', 0.074404486), ('news', 0.040900342), ('da', 0.027649488), ('awake', 0.025618644)]
[('old', 0.16212055), ('sorry', 0.15302135), ('hear', 0.14452542), ('remember', 0.08374428), ('lmao', 0.07042261)]
[('win', 0.16647513), ('train', 0.09225803), ('place', 0.078995794), ('plan', 0.07622332), ('probably', 0.057463672)]
[('read', 0.23063527), ('hard', 0.11917932), ('sit', 0.1178597), ('drive', 0.09761256), ('lady', 0.08243821)]
[('sure', 0.15412599), ('summer', 0.10685616), ('finish', 0.10630787), ('ask', 0.068303294), ('fine', 0.049663723)]
[('need', 0.34333673), ('vip', 0.06955794), ('trip', 0.056927953), ('story', 0.04533144), ('cover', 0.026527233)]


In [77]:
# For every example sentence, collect the five top words of each topic the
# LDA model assigns to it, walking the topics from highest to lowest score.
sug_words_list = []
for text in d_ex['text']:
    topic_scores = lda_model[dictionary.doc2bow(normalize_text(text))]
    ranked_topics = sorted(topic_scores, key=lambda pair: pair[1], reverse=True)
    sug_words_list.append([
        word
        for topic_id, _topic_score in ranked_topics
        for word, _word_weight in lda_model.show_topic(topic_id, 5)
    ])

# Store one list of suggested words per example row.
sug_words = pd.Series(sug_words_list)
d_ex['suggested_words'] = sug_words

In [4]:
import gensim
# Load the GoogleNews word2vec vectors from their gzipped binary file.
word2vec_path = "/ilab/users/kc1026/Documents/cs543/GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    fname=word2vec_path,
    binary=True,
)

In [5]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import np_utils

# English stop-word list from NLTK, held as a set.
stop_words = set(stopwords.words('english'))
EMBEDDING_DIM = 300 # dimensionality of each word vector
MAX_VOCAB_SIZE = 175303 # vocabulary cap passed to the Keras Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 150 # number of tokens each text is padded/truncated to



In [6]:
# Load the labelled emoji data set and hold out 10% of the rows for testing.
emoji_csv_path = "/ilab/users/kc1026/Documents/cs543/emoji.csv"
df2 = pd.read_csv(emoji_csv_path, sep=',', header=0)
train, test = train_test_split(df2, test_size=0.1)
# Non-null row count per column of the full data set.
df2.count()

text     7480
label    7480
dtype: int64

In [7]:
# NOTE(review): this tokenizer is rebound to a Keras Tokenizer in a later cell
# before anything uses it.
tokenizer = RegexpTokenizer(r'\w+')
# Replace every character that is not an ASCII letter or an apostrophe with a
# space. `assign` builds new frames instead of writing into the slices that
# train_test_split returned, which avoids pandas' SettingWithCopyWarning.
train = train.assign(text=train['text'].apply(lambda x: re.sub("[^a-zA-Z']", ' ', x)))
test = test.assign(text=test['text'].apply(lambda x: re.sub("[^a-zA-Z']", ' ', x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Fit the Keras tokenizer on the training texts and turn them into sequences
# of word indices.
train_texts = train["text"].tolist()
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
x_training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

# Pad/truncate every sequence so all of them have MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(x_training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Embedding matrix with one row per word index plus one extra row. Rows of
# words that have no word2vec vector stay all-zero.
num_words = len(train_word_index) + 1
train_embedding_weights = np.zeros((num_words, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]


Found 8506 unique tokens.


In [43]:
import pickle
# Serialise the fitted tokenizer to disk with the highest available pickle
# protocol.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Encode the held-out texts with the tokenizer fitted on the training data and
# pad them to the common sequence length.
test_texts = test["text"].tolist()
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = test_cnn_data
# One-hot encode the integer labels.
test_labels = test['label'].values
y_test = np_utils.to_categorical(test_labels)

In [10]:
# Model inputs are the padded training sequences; targets are the one-hot
# encoded training labels.
train_labels = train['label'].values
x_train = train_cnn_data
y_train = np_utils.to_categorical(train_labels)

In [12]:
# word2vec + CNN
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

# training params
batch_size = 128
num_epochs = 6
drop_rate = 0.2

EMBEDDING_DIM = 300  # how big is each word vector
MAX_SEQUENCE_LENGTH = 150
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding layer initialised from the word2vec-based weight matrix and left
# trainable.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[train_embedding_weights],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)(sequence_input)

# One convolution + global-max-pooling branch per kernel width (2, 3 and 4),
# all reading the same embedded sequence.
pooled_branches = []
for kernel_width in (2, 3, 4):
    conv_branch = Conv1D(
        filters=100,
        kernel_size=kernel_width,
        padding='valid',
        activation='relu',
        strides=1,
    )(embedding_layer)
    pooled_branches.append(GlobalMaxPooling1D()(conv_branch))

merged = concatenate(pooled_branches, axis=1)

# Classification head: dense layer, dropout, then a 7-way softmax.
merged = Dense(256, activation='relu')(merged)
merged = Dropout(drop_rate)(merged)
merged = Dense(7)(merged)
output = Activation('softmax')(merged)
model = Model(inputs=[sequence_input], outputs=[output])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     2552100     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 149, 100)     60100       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 148, 100)     90100       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_6 (

In [13]:
# Train the network, holding back 10% of the training data for validation and
# shuffling the samples.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
)

Train on 6058 samples, validate on 674 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f1b7b6be2e8>

In [14]:
# Save the fitted Keras model to the file 'cnn_model.h5'.
model.save('cnn_model.h5') 

In [15]:
# Encode and pad the example sentences the same way as the training data,
# then get the model's per-class scores for each of them.
example_sequences = tokenizer.texts_to_sequences(d_ex["text"].tolist())
example_padded = pad_sequences(example_sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_predict = model.predict(example_padded, batch_size=64, verbose=1)



In [16]:
# Index of the highest-scoring class for every prediction row. np.argmax
# returns the first maximum of each row, and also gives the right index when a
# row contains no value above 0.0. `.tolist()` keeps the result a plain Python
# list of ints.
y_predict_top1 = np.argmax(np.asarray(y_predict), axis=1).tolist()

In [17]:
# Wrap the predicted class indices in a Series with a default 0..n-1 index.
suggested_emoji = pd.Series(data=y_predict_top1)

In [18]:
# Show the predicted class index for each example sentence.
suggested_emoji

0    1
1    0
2    5
dtype: int64

In [81]:
# Attach the predicted class indices to the example frame as a new column.
d_ex['suggested_emoji'] = suggested_emoji

In [82]:
# Print the example frame with a raised column-width limit so the long
# sentences and suggestion lists are not truncated.
wide_columns = pd.option_context('display.max_colwidth', 500)
with wide_columns:
    print(d_ex)


                                                                                                                                                                  text  \
0                             when thieves broke into my house at night and held my wife and me on gun point for at least ten minutes and took away a lot  of property   
1                                       International sports events won by myfavourite national team or playerbrings me joy when india won the world cup cricket match   
2  I was sitting in a restaurant with friends they asked me something which they thoughtI should know Actually I know it but atthat time I was not able to remember it   

                                                                                                                                                                                                                                                                                                   suggested_words  \
0  [yeah,