In [65]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
from IPython.display import display, HTML

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


 **Load data file**

In [66]:
data = pd.read_csv('training.csv', sep=",", encoding = "ISO-8859-1",  error_bad_lines=False)

In [67]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [68]:
x_raw = data.iloc[:,-1]
x_raw = x_raw.values.tolist()
x_raw = [a.replace("just", "") for a in x_raw]
x_raw = [a.replace("good", "") for a in x_raw]
x_raw = [a.replace("like", "") for a in x_raw]
x_raw = [a.replace("today", "") for a in x_raw]
x_raw = [a.replace("day", "") for a in x_raw]
x_raw = [a.replace("got", "") for a in x_raw]
x_raw = [a.replace("don", "") for a in x_raw]
x_raw = [a.replace("quot", "") for a in x_raw]

**Preprocess**

In [69]:
#Preprocess corpus
import string
from string import punctuation
from nltk.corpus import stopwords

def preprocess(text):
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')

    # lowercasing
    text = [t.lower() for t in text]

    # stopword removal
    stop = set(stopwords.words('english'))
    text = [t for t in text if t not in stop]
    
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    text = [lemmatiser.lemmatize(t) for t in text]

    # remove numbers and empty space
    digits = ' 0123456789'
    text = [t for t in text if t not in digits]

    #tokenize sentences
    for sent in text:
        yield(gensim.utils.simple_preprocess(str(sent)))
        
    return text

**Ngrams**

In [None]:
# Use bigrams to improve model
def create_bigrams(text):
    bigram = gensim.models.Phrases(text, min_count=4, threshold=40)


**Define Model and parameters**

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

model = ['LDA', 'NMF']
vectorizer = ['count', 'tfidf']

def topic_model(data, model, vectorizer, n_topics=20):
    """
    model: method for topic model
    vectorizer: method for representation of words
    n_topics = number of topics
    """

    if vectorizer == 'count':
        vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if vectorizer == 'tfidf':
        vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if model == 'LDA':
        LDA = LatentDirichletAllocation(n_components=n_topics, learning_decay=0.7, random_state=0, n_jobs=-1, learning_method='online')
        _model = LDA.fit(x_train)
        print('Done!')
    
    if model == 'NMF':
        nmf = NMF(n_components=n_topics, random_state=1, alpha=0.1, l1_ratio = 0.5, init= 'nndsvd')
        _model = nmf.fit(x_train)
        print('Done!')

    return _model, vect

**Prepare data to train**

In [71]:
x = list(preprocess(x_raw))
bigram = gensim.models.Phrases(x, min_count=3, threshold=30)
#bigram = gensim.models.phrases.Phraser(bigram)
#trigram = gensim.models.Phrases(bigram[x], threshold=30)  
#x = [trigram[bigram[line]] for line in x]
x = [bigram[line] for line in x]
x = [' '.join(i) for i in x]



In [None]:
print('Training...')
LDA, vect = topic_model(x, model='LDA', vectorizer='count')
#LDA.get_params()
nmf, vect = topic_model(x, model='NMF', vectorizer='tfidf')
#nmf.get_params()

Training...


**Show topics infered**

In [62]:
def create_topics_table(model, vectorizer, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topics = []
    for weights in model.components_:
        top_index = (-weights).argsort()[:n_words]
        #topic = keywords.take(top_index)
        #w = np.sort(-weights)[:5]
        #w = w.round(1)
        #w = w.astype(str)
        #ind = map(' '.join, zip(topic, w))
        #topics.append(ind)
        topics.append(keywords.take(top_index))
    return topics

print('Top 10 words for each topic with LDA')
topic_keywords = create_topics_table(model=LDA, vectorizer=vect, n_words=10)       
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
display(df_topic_keywords)

print('Top 10 words for each topic with NMF')
topic_keywords2 = create_topics_table(model=nmf, vectorizer=vect, n_words=10)
df_topic_keywords2 = pd.DataFrame(topic_keywords2)
df_topic_keywords2.columns = ['Word '+str(i) for i in range(df_topic_keywords2.shape[1])]
df_topic_keywords2.index = ['Topic '+str(i) for i in range(df_topic_keywords2.shape[0])]
display(df_topic_keywords2)



Top 10 words for each topic with LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,going,twitter,getting,working,late,thank,ready,bored,tomorrow,looking_forward
Topic 1,love,amp,little,feel,poor,sick,trying,guy,makes,heart
Topic 2,thanks,miss,really,welcome,hey,mean,great,hope,weeks,think
Topic 3,im,lol,yeah,haha,gonna,yay,ur,rain,think,omg
Topic 4,new,feeling,phone,need,come,iphone,car,want,buy,twitter
Topic 5,know,oh,yes,ll,people,sure,sorry,let,hear,maybe
Topic 6,time,morning,long,going,wish,friends,amp,wow,having,great
Topic 7,sad,happy,lost,ll,missing,home,th,soon,year,come
Topic 8,work,sleep,watching,school,tired,watch,going,tomorrow,night,tonight
Topic 9,hope,better,make,http_bit,ly,love,song,video,hi,lol


Top 10 words for each topic with NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,lol,haha,yeah,ur,did,yes,twitter,need,right,ok
Topic 1,love,lt,song,haha,guys,miss,life,watching,amazing,ya
Topic 2,work,tomorrow,morning,need,week,weekend,hours,sleep,tired,early
Topic 3,going,bed,tomorrow,sleep,tonight,miss,night,school,watch,soon
Topic 4,time,long,great,bed,sleep,having,just,ve,fun,little
Topic 5,amp,watching,sad,just,ur,lt,make,fun,went,ve
Topic 6,really,bad,need,sad,feel,miss,wanna,right,hate,nice
Topic 7,com,http_twitpic,http_tinyurl,http_plurk,http,http_www,says,look,www,check
Topic 8,im,gonna,sorry,sad,tired,haha,bored,sick,right,happy
Topic 9,know,let,didn,right,people,did,haha,sad,yeah,twitter


**Show reviews and topic assigned**

In [None]:
# Topic ID for each review in dataset: review ID, review text, topic ID

def show_results(model, vectorizer):
    x_train = vectorizer.fit_transform(x)
    topic_values = model.transform(x_train)
    data['Topic'] = topic_values.argmax(axis=1)
    #data[['REVIEW_TEXT', 'Topic']]

    result = data[['REVIEW_TEXT', 'Topic']][:50]
    from IPython.display import display, HTML

    display(HTML(result.to_html()))

In [None]:
lda_result = show_results(LDA, vect)

Unnamed: 0,REVIEW_TEXT,Topic
0,"When least you think so, this product will save the day. Just keep it around just in case you need it for something.",5
1,Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked.,5
2,"I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",5
3,I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue is that it tilts slightly to one side so when I hit any keys it rocks a little bit. Not a big deal.,5
4,"I only use it twice a week and the results are great. I have used other teeth whitening solutions and most of them, for the same results I would have to use it at least three times a week. Will keep using this because of the potency of the solution and also the technique of the trays, it keeps everything in my teeth, in my mouth.",9
5,I'm not sure what this is supposed to be but I would recommend that you do a little more research into the culture of using pipes if you plan on giving this as a gift or using it yourself.,5
6,"Pleased with ping pong table. 11 year old and 13 year old having a blast, plus lots of family entertainment too. Plus better than kids sitting on video games all day. A friend put it together. I do believe that was a challenge, but nothing they could not handle",5
7,"Great vitamin C serum... I really like the oil feeling, not too sticky. I used it last week on some of my recent bug bites and it helps heal the skin faster than normal.",6
8,"I've used tide pods laundry detergent for many years,its such a great detergent to use having a nice scent and leaver the cloths smelling fresh.",0
9,"Everybody wants to fall for their promises. But this is a relatively unheard of brand, some even say a non existant company. Look at how amateur their labels and products are. You have to ask yourself if you would trust this kind of amateur stuff? No way! Don't waste your money.",5


In [None]:
nmf_result = show_results(nmf, vect)

Unnamed: 0,REVIEW_TEXT,Topic
0,"When least you think so, this product will save the day. Just keep it around just in case you need it for something.",7
1,Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked.,7
2,"I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",3
3,I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue is that it tilts slightly to one side so when I hit any keys it rocks a little bit. Not a big deal.,0
4,"I only use it twice a week and the results are great. I have used other teeth whitening solutions and most of them, for the same results I would have to use it at least three times a week. Will keep using this because of the potency of the solution and also the technique of the trays, it keeps everything in my teeth, in my mouth.",0
5,I'm not sure what this is supposed to be but I would recommend that you do a little more research into the culture of using pipes if you plan on giving this as a gift or using it yourself.,0
6,"Pleased with ping pong table. 11 year old and 13 year old having a blast, plus lots of family entertainment too. Plus better than kids sitting on video games all day. A friend put it together. I do believe that was a challenge, but nothing they could not handle",8
7,"Great vitamin C serum... I really like the oil feeling, not too sticky. I used it last week on some of my recent bug bites and it helps heal the skin faster than normal.",2
8,"I've used tide pods laundry detergent for many years,its such a great detergent to use having a nice scent and leaver the cloths smelling fresh.",2
9,"Everybody wants to fall for their promises. But this is a relatively unheard of brand, some even say a non existant company. Look at how amateur their labels and products are. You have to ask yourself if you would trust this kind of amateur stuff? No way! Don't waste your money.",0


In [16]:
import pickle
import joblib

def save_model(model):
    pickle.dump(model)
    #model = pickle.load('model.pk')

def save_model2(model):
    joblib.dump(model)
    #model = joblib.load('model.jl')

lda_model = save_model(LDA)
#lda_model2 = save_model2(LDA, 'lda_model2.pk')
nmf_model = save_model(nmf)
#nmf_model = save_model2(nmf, 'nmf_model2.jl')

TypeError: ignored

In [None]:
!pip install pyLDAvis
from pyLDAvis import sklearn as sklearn_lda
import os
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(10))

if 1 == 1:
    x_train = vect.fit_transform(x)
    LDAvis_prepared = sklearn_lda.prepare(LDA, x_train, vect)
with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

with open(LDAvis_data_filepath, 'rb') as f:
     LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(10) +'.html')

pyLDAvis.display(LDAvis_prepared)