#SoMe Topic Modeling Notebook | Release canvas 1 📖

## Installations and Libraries 💽

In [3]:
import time 
# Time the running of everything
start_of_notebook_time = time.time()

In [4]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)



pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working


`scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.



##Data Cleaning 🧹

In [5]:
df = pd.read_csv('jan_uber_positive.csv')
df = pd.DataFrame({"original_tweets":df['Tweets']})
df.head()

#Removing emojies from text
#Refrence 1 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Refrence 2 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets
0,_Support\n_NYC\n_India\n \n\nMy account has go...,_Support _NYC _India My account has gone too l...,_Support _NYC _India My account has gone too l...
1,"Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ..."
2,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...
3,"_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c..."
4,"Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ..."
...,...,...,...
6467,"Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio..."
6468,is anyone alive?,is anyone alive?,is anyone alive?
6469,"Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe..."
6470,"Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap..."


##Tokenizing 🕵🏻‍♂

In [6]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')


In [7]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# Timing End
program_end_time = time.time()

# View df
df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens
0,_Support\n_NYC\n_India\n \n\nMy account has go...,_Support _NYC _India My account has gone too l...,_Support _NYC _India My account has gone too l...,"[_support, _nyc, _india, account, gone, long, ..."
1,"Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","[let, corporate, personal, life,, need, help, ..."
2,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,"[india, hacked, uber, account, u]"
3,"_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","[_luv, hey, rahul,, kindly, share, registered,..."
4,"Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","[hey, biju,, safety, priority,, recommended, 2..."
...,...,...,...,...
6467,"Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","[hey,, send, note, help, section, app, direct,..."
6468,is anyone alive?,is anyone alive?,is anyone alive?,[alive?]
6469,"Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","[hey, rajiv,, share, registered, details, dire..."
6470,"Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","[hey,, sorry, hear, weren't, able, apply, prom..."


In [16]:
import re
df['url_free_tweets']=df['url_free_tweets'].apply(lambda x: re.sub(r"_", ' ', str(x)))
df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens
0,_Support\n_NYC\n_India\n \n\nMy account has go...,_Support _NYC _India My account has gone too l...,Support NYC India My account has gone too l...,"['support', 'nyc', 'india', 'account', 'gone',..."
1,"Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","['let', 'corporate', 'personal', 'life,', 'nee..."
2,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,"['india', 'hacked', 'uber', 'account', 'u']"
3,"_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","luv Hey Rahul, kindly share your registered c...","['luv', 'hey', 'rahul,', 'kindly', 'share', 'r..."
4,"Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","['hey', 'biju,', 'safety', 'priority,', 'recom..."
...,...,...,...,...
6467,"Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","['hey,', 'send', 'note', 'help', 'section', 'a..."
6468,is anyone alive?,is anyone alive?,is anyone alive?,['alive?']
6469,"Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","['hey', 'rajiv,', 'share', 'registered', 'deta..."
6470,"Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","['hey,', 'sorry', 'hear', ""weren't"", 'able', '..."


In [6]:
# See how long it took
print(program_end_time - program_start_time, "seconds to finish")

3.8468899726867676 seconds to finish


##Lemmatization🇬🇧

In [7]:
# Refrence 4 : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column

# Timing Start
program_start_time = time.time()

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]

# Timing End
program_end_time = time.time()


In [8]:
#Printing Lemmetization Time
print(program_end_time - program_start_time, "seconds to finish")

197.85924100875854 seconds to finish


In [9]:
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(r'[^a-zA-Z 0-9]', '', text)
    tokens = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub(r'\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub(r'@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# Timing End
program_end_time = time.time()

# View those tokens (the 4th column)
df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,_Support\n_NYC\n_India\n \n\nMy account has go...,_Support _NYC _India My account has gone too l...,_Support _NYC _India My account has gone too l...,"[_support, _nyc, _india, account, gone, long, ...",_support _nyc _india account gone long fixed k...,"[support, nyc, india, account, go, long, fix, ...",support nyc india account go long fix know,"[support, nyc, india, account, go, long, fix, ..."
1,"Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","Let it corporate or personal life, in need of ...","[let, corporate, personal, life,, need, help, ...","let corporate personal life, need help crisis,...","[let, corporate, personal, life, need, help, c...",let corporate personal life need help crisis h...,"[let, corporate, personal, life, need, help, c..."
2,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,Someone in India really hacked my uber account...,"[india, hacked, uber, account, u]",india hacked uber account u,"[india, hack, uber, account, u]",india hack uber account u,"[india, hack, uber, account, u]"
3,"_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","_luv Hey Rahul, kindly share your registered c...","[_luv, hey, rahul,, kindly, share, registered,...","_luv hey rahul, kindly share registered contac...","[luv, hey, rahul, kindly, share, register, con...",luv hey rahul kindly share register contact de...,"[luv, hey, rahul, kindly, share, register, con..."
4,"Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","Hey Biju, with safety as our top priority, it ...","[hey, biju,, safety, priority,, recommended, 2...","hey biju, safety priority, recommended 2 peopl...","[hey, biju, safety, priority, recommend, 2, pe...",hey biju safety priority recommend 2 people ub...,"[hey, biju, safety, priority, recommend, 2, pe..."
...,...,...,...,...,...,...,...,...
6467,"Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","Hey, please send us a note via the help sectio...","[hey,, send, note, help, section, app, direct,...","hey, send note help section app direct message...","[hey, send, note, help, section, app, direct, ...",hey send note help section app direct message ...,"[hey, send, note, help, section, app, direct, ..."
6468,is anyone alive?,is anyone alive?,is anyone alive?,[alive?],alive?,[alive],alive,[alive]
6469,"Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","Hey Rajiv, could you please share your registe...","[hey, rajiv,, share, registered, details, dire...","hey rajiv, share registered details direct mes...","[hey, rajiv, share, register, detail, direct, ...",hey rajiv share register detail direct message...,"[hey, rajiv, share, register, detail, direct, ..."
6470,"Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","Hey, sorry to hear that you weren't able to ap...","[hey,, sorry, hear, weren't, able, apply, prom...","hey, sorry hear weren't able apply promotional...","[hey, sorry, hear, able, apply, promotional, c...",hey sorry hear able apply promotional code sha...,"[hey, sorry, hear, able, apply, promotional, c..."


In [10]:
#Printing Tokenization Time
print(program_end_time - program_start_time, "seconds to finish")


1.7772495746612549 seconds to finish


##Topic Modeling ㊙️

###id2word 📒

In [11]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

9049


In [12]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

3696


###Corpus Object & Generating Base Model Topics 📚

In [13]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

####Base Model

In [14]:
# Timing Start
base_model_program_start_time = time.time()

# Instantiating a LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

# Timing End
base_model_program_end_time = time.time()


In [15]:
#Printing First Model Time
base_model_runtime = round(base_model_program_end_time - base_model_program_start_time, 2)
print(base_model_runtime)


47.92


In [16]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [17]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]


In [18]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
hey help message direct kindly share detail app register section

------ Topic 1 ------
new health make simple manage premium device seamless style express

------ Topic 2 ------
service support hey uber india driver essential zone help kindly

------ Topic 3 ------
india uber driver ride good cab support work delhi great

------ Topic 4 ------
uber service guideline ride hey available city notice driver thank



In [19]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)



Perplexity:  -6.286402913419075

Coherence Score:  0.4561066501444625


#### Base Model Topic Distance Visualization 📈

In [20]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

###Grid Seach 🔍

In [21]:
lemmas_df = df['lemmas_back_to_text']
print(type(lemmas_df[0]))


<class 'str'>


In [22]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])


In [23]:
gs_start_time = time.time()

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             iid=True, n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

gs_end_time = time.time()






In [24]:
print(gs_end_time - gs_start_time, "seconds to finish")


1556.9325757026672 seconds to finish


In [55]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
# Best Model's Params:  {'learning_decay': 0.9, 'n_topics': 10}

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -249400.0726756264
Model Perplexity:  752.7927832092748


###Hyperparameter Tuning 🦾

####Model iteration 1 (number of topics)


#####1.0 Topics = 5

In [56]:
# Let's start with parameter tuning for the LDA model and,
# find an optimal number of topics to reach the best coherence score

# Define chunksize and passes
# Chunksize is Number of documents to be used in each training chunk
# Passes is Number of passes through the corpus during training

# Timing Start
model_1_0_start_time = time.time()

model_1_0 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=5,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_0_end_time = time.time()


In [57]:
#Printing First Model Time
model_1_0_runtime = round(model_1_0_end_time - model_1_0_start_time, 2)
print(model_1_0_runtime)


42.55


In [58]:
# Filtering for words 
words_1_0 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_0.print_topics()]

# Create Topics
topics_1_0 = [' '.join(t[0:10]) for t in words_1_0]

# Getting the topics
for id, t in enumerate(topics_1_0): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
uber service read hey guideline safety available ride driver india

------ Topic 1 ------
india uber driver work good ola support time car come

------ Topic 2 ------
hey message direct share register kindly detail follow sorry number

------ Topic 3 ------
help app hey section send kindly response way concern uber

------ Topic 4 ------
health new make premium simple manage device seamless style express



In [59]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_0_perplexity = model_1_0.log_perplexity(corpus)
print('\nPerplexity: ', model_1_0_perplexity) 

# Compute Coherence Score
coherence_model_1_0 = CoherenceModel(model=model_1_0, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_0 = coherence_model_1_0.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_0)



Perplexity:  -6.302528402577665

Coherence Score:  0.5275585254774091


In [60]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_0, corpus, id2word)


#####1.1 Topics = 10

In [61]:
#Increasing number of topics to 10
#Timing Start
model_1_1_start_time = time.time()

model_1_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=10,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_1_end_time = time.time()


In [62]:
#Printing First Model Time
model_1_1_runtime = round(model_1_1_end_time - model_1_1_start_time, 2)
print(model_1_1_runtime)


35.24


In [63]:
# Filtering for words 
words_1_1 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_1.print_topics()]

# Create Topics
topics_1_1 = [' '.join(t[0:10]) for t in words_1_1]

# Getting the topics
for id, t in enumerate(topics_1_1): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
uber service read hey guideline available safety ride city accordance

------ Topic 1 ------
india uber good great work ola sir driver company job

------ Topic 2 ------
message detail direct share hey register kindly follow trip sorry

------ Topic 3 ------
help app section hey send kindly response way concern update

------ Topic 4 ------
thank issue uber support team day ac need driver thanks

------ Topic 5 ------
new health make simple manage seamless device premium express style

------ Topic 6 ------
hey number message direct kindly register follow concern share look

------ Topic 7 ------
driver ride cab cancel india uber book pay trip location

------ Topic 8 ------
thank driver uber india hai action soon work bring ko

------ Topic 9 ------
driver uber india safety customer rider time ride safe well



In [64]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_1_perplexity = model_1_1.log_perplexity(corpus)
print('\nPerplexity: ', model_1_1_perplexity) 

# Compute Coherence Score
coherence_model_1_1 = CoherenceModel(model=model_1_1, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_1 = coherence_model_1_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_1)



Perplexity:  -6.342547666613483

Coherence Score:  0.4971956661440924


In [65]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_1, corpus, id2word)


#####1.2 Topics = 15

In [66]:
#Increasing number of topics to 15
#Timing Start
model_1_2_start_time = time.time()

model_1_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=15,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_2_end_time = time.time()


In [67]:
#Printing First Model Time
model_1_2_runtime = round(model_1_2_end_time - model_1_2_start_time, 2)
print(model_1_2_runtime)


43.61


In [68]:
# Filtering for words 
words_1_2 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_2.print_topics()]

# Create Topics
topics_1_2 = [' '.join(t[0:10]) for t in words_1_2]

# Getting the topics
for id, t in enumerate(topics_1_2): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
uber service available guideline india read ride mean city government

------ Topic 1 ------
india uber work driver love pls year car delivery drive

------ Topic 2 ------
share message direct hey detail register kindly sorry follow hear

------ Topic 3 ------
help uber driver happy account drop hey hope india trip

------ Topic 4 ------
cancel charge ride driver uber trip day pay ask india

------ Topic 5 ------
kindly hey uber trip allow update fare cab support app

------ Topic 6 ------
number hey message direct register early follow uber mobile send

------ Topic 7 ------
account uber issue trip provide hey book assist request face

------ Topic 8 ------
thank driver action india people give uber work understand take

------ Topic 9 ------
driver good great safety job uber ride time take rider

------ Topic 10 ------
hey help app section concern kindly send way response update

------ Topic 11 ------
new make health simple manage seamless device premium expres

In [69]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_2_perplexity = model_1_2.log_perplexity(corpus)
print('\nPerplexity: ', model_1_2_perplexity) 

# Compute Coherence Score
coherence_model_1_2 = CoherenceModel(model=model_1_2, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_2 = coherence_model_1_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_2)



Perplexity:  -6.333067227401998

Coherence Score:  0.47320311850378516


In [70]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_2, corpus, id2word)


##### 1.3 Topics = 20

In [71]:
#Increasing number of topics to 20
#Timing Start
model_1_3_start_time = time.time()

model_1_3 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_3_end_time = time.time()


In [72]:
#Printing First Model Time
model_1_3_runtime = round(model_1_3_end_time - model_1_3_start_time, 2)
print(model_1_3_runtime)


48.78


In [73]:
# Filtering for words 
words_1_3 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_3.print_topics()]

# Create Topics
topics_1_3 = [' '.join(t[0:10]) for t in words_1_3]

# Getting the topics
for id, t in enumerate(topics_1_3): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
service uber read available guideline ride city mean government notice

------ Topic 1 ------
india uber delivery pls local love encourage eat zomato food

------ Topic 2 ------
trip time platform india date kind detail experience share uber

------ Topic 3 ------
help app section hey trip kindly update fare review happy

------ Topic 4 ------
driver ride charge pay ac day reply issue ask cancel

------ Topic 5 ------
hey kindly travel cab support mumbai essential allow ride availability

------ Topic 6 ------
uber safety 2 hey people priority vehicle distance follow local

------ Topic 7 ------
issue account know hey india cancel uber app let kindly

------ Topic 8 ------
thank driver india uber think action trip happen help respect

------ Topic 9 ------
message direct send kind experience definitely registered detail hey address

------ Topic 10 ------
help hey kindly app check way send section concern response

------ Topic 11 ------
new health make simple man

In [74]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_3_perplexity = model_1_3.log_perplexity(corpus)
print('\nPerplexity: ', model_1_3_perplexity) 

# Compute Coherence Score
coherence_model_1_3 = CoherenceModel(model=model_1_3, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_3 = coherence_model_1_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_3)



Perplexity:  -6.322442138314105

Coherence Score:  0.4895762869976711


In [75]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_3, corpus, id2word)


##### 1.4 Topics = 25

In [76]:
#Increasing number of topics to 25
#Timing Start
model_1_4_start_time = time.time()

model_1_4 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=25,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_4_end_time = time.time()


In [77]:
#Printing First Model Time
model_1_4_runtime = round(model_1_4_end_time - model_1_4_start_time, 2)
print(model_1_4_runtime)


41.62


In [78]:
# Filtering for words 
words_1_4 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_4.print_topics()]

# Create Topics
topics_1_4 = [' '.join(t[0:10]) for t in words_1_3]

# Getting the topics
for id, t in enumerate(topics_1_4): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
service uber read available guideline ride city mean government notice

------ Topic 1 ------
india uber delivery pls local love encourage eat zomato food

------ Topic 2 ------
trip time platform india date kind detail experience share uber

------ Topic 3 ------
help app section hey trip kindly update fare review happy

------ Topic 4 ------
driver ride charge pay ac day reply issue ask cancel

------ Topic 5 ------
hey kindly travel cab support mumbai essential allow ride availability

------ Topic 6 ------
uber safety 2 hey people priority vehicle distance follow local

------ Topic 7 ------
issue account know hey india cancel uber app let kindly

------ Topic 8 ------
thank driver india uber think action trip happen help respect

------ Topic 9 ------
message direct send kind experience definitely registered detail hey address

------ Topic 10 ------
help hey kindly app check way send section concern response

------ Topic 11 ------
new health make simple man

In [79]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_4_perplexity = model_1_4.log_perplexity(corpus)
print('\nPerplexity: ', model_1_4_perplexity) 

# Compute Coherence Score
coherence_model_1_4 = CoherenceModel(model=model_1_4, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_4 = coherence_model_1_4.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_4)



Perplexity:  -6.311173451379648

Coherence Score:  0.4518313625062108


In [80]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_4, corpus, id2word)


#####1.5 Topics = 30

In [81]:
#We have had a reduction from .39 to .35 in coherence score 
#by going from 20 to 25. Let's try 30 topics and see what 
#coherence score we'll get. 

# Timing Start
model_1_5_start_time = time.time()

model_1_5 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=30,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_5_end_time = time.time()


In [82]:
#Printing First Model Time
model_1_5_runtime = round(model_1_5_end_time - model_1_5_start_time, 2)
print(model_1_5_runtime)


48.45


In [83]:
# Filtering for words 
words_1_5 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_5.print_topics()]

# Create Topics
topics_1_5 = [' '.join(t[0:10]) for t in words_1_5]

# Getting the topics
for id, t in enumerate(topics_1_5): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
uber service read available safety guideline city government resume mean

------ Topic 1 ------
issue provide hey look payment able detail time dm face

------ Topic 2 ------
action understand driver drive bring uber appropriate instance monitor basis

------ Topic 3 ------
uber sure good able close time bhaiya business live rental

------ Topic 4 ------
india uber ac service thank ride ok pune driver turn

------ Topic 5 ------
thank sir day team great okay wow month uber receive

------ Topic 6 ------
india uber thanks ride sir support come ola cancellation charge

------ Topic 7 ------
drop location driver uber safety use right guy away app

------ Topic 8 ------
book pay ask india day uber reach destination take need

------ Topic 9 ------
trip time date kindly reference kind experience detail share request

------ Topic 10 ------
travel hey kindly appreciate essential allow uber support cab mumbai

------ Topic 11 ------
new health make simple manage seamless

In [84]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_5_perplexity = model_1_5.log_perplexity(corpus)
print('\nPerplexity: ', model_1_5_perplexity) 

# Compute Coherence Score
coherence_model_1_5 = CoherenceModel(model=model_1_5, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_5 = coherence_model_1_5.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_5)



Perplexity:  -6.311927894526822

Coherence Score:  0.47283554071016026


In [85]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_5, corpus, id2word)


#####1.6 Topics = 35

In [86]:
# Coherence score jumped back to 0.39, let's try 35 topics
# Timing Start
model_1_6_start_time = time.time()

model_1_6 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=35,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_6_end_time = time.time()


In [87]:
#Printing First Model Time
model_1_6_runtime = round(model_1_6_end_time - model_1_6_start_time, 2)
print(model_1_6_runtime)


50.09


In [88]:
# Filtering for words 
words_1_6 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_6.print_topics()]

# Create Topics
topics_1_6 = [' '.join(t[0:10]) for t in words_1_6]

# Getting the topics
for id, t in enumerate(topics_1_6): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
driver uber ask book wow bangalore option cancel india pay

------ Topic 1 ------
drop location app request outstanding able account balance ride trip

------ Topic 2 ------
hey guideline uber vehicle people priority 2 safety follow distance

------ Topic 3 ------
safe hey phone india travel provide support confirm kindly number

------ Topic 4 ------
hey guideline update mha late car uber accordance service resume

------ Topic 5 ------
team hey trip book need touch issue help step email

------ Topic 6 ------
new health make simple manage device seamless premium style express

------ Topic 7 ------
thank yes india reply sir partner uber team help time

------ Topic 8 ------
driver time uber cancel pune minute place cancellation 20 है

------ Topic 9 ------
uber india mobile help cause time trip lose normal hour

------ Topic 10 ------
help happy guy nice true love day proud fight jai

------ Topic 11 ------
register share message direct hey kindly follow detail 

In [89]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_6_perplexity = model_1_6.log_perplexity(corpus)
print('\nPerplexity: ', model_1_6_perplexity) 

# Compute Coherence Score
coherence_model_1_6 = CoherenceModel(model=model_1_6, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_6 = coherence_model_1_6.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_6)



Perplexity:  -6.3603054794619815

Coherence Score:  0.4156631452524796


In [90]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_6, corpus, id2word)


#####1.7 Topics = 40

In [91]:
# Let's try 40 topics
# Timing Start
model_1_7_start_time = time.time()

model_1_7 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=40,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_7_end_time = time.time()


In [92]:
#Printing First Model Time
model_1_7_runtime = round(model_1_7_end_time - model_1_7_start_time, 2)
print(model_1_7_runtime)


45.19


In [93]:
# Filtering for words 
words_1_7 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1_7.print_topics()]

# Create Topics
topics_1_7 = [' '.join(t[0:10]) for t in words_1_7]

# Getting the topics
for id, t in enumerate(topics_1_7): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
great drive thank news india uber ola legal है world

------ Topic 1 ------
trip kind experience detail hey direct message registered date time

------ Topic 2 ------
uber shall okay help people nation india say thank driver

------ Topic 3 ------
help trip happy fare hey app refund section kindly refer

------ Topic 4 ------
thank india uber help ur support good great lose hour

------ Topic 5 ------
help way concern app send section hey response kindly address

------ Topic 6 ------
driver safety well rider ensure ask possible switch mode saferforeachother

------ Topic 7 ------
number time driver ride uber d place day minute abhishek

------ Topic 8 ------
good job great price guy wow sell post booking pakistan

------ Topic 9 ------
early update number hey kindly message share register direct follow

------ Topic 10 ------
message direct register hey share detail kindly follow sorry hear

------ Topic 11 ------
issue uber safety hey provide detail know change 

In [94]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_7_perplexity = model_1_7.log_perplexity(corpus)
print('\nPerplexity: ', model_1_7_perplexity) 

# Compute Coherence Score
coherence_model_1_7 = CoherenceModel(model=model_1_7, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1_7 = coherence_model_1_7.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1_7)



Perplexity:  -6.333637765414876

Coherence Score:  0.44280368554788263


In [95]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1_7, corpus, id2word)


#####1.8 Topics = 5-200


In [96]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of tipics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic      

In [97]:
# Can take a long time to run.
model_1_8_start_time = time.time()

model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['lemma_tokens'],
                                                        start=2, limit=200, step=6)
model_1_8_end_time = time.time()


In [98]:
#Printing First Model Time
model_1_8_runtime = round(model_1_8_end_time - model_1_8_start_time, 2)
print(model_1_8_runtime)


1414.43


In [99]:
limit=200; start=2; step=6;
x_topic = range(start, limit, step)

topic_ts = {'coherence_value': coherence_values_topic,
            'number_of_topics': x_topic}

topic_chart = pd.DataFrame(data=topic_ts)

topic_fig = px.line(topic_chart, x="number_of_topics", y="coherence_value")
topic_fig.show()
