In [1]:
!pip install pyLDAvis --quiet
!pip install chart_studio --quiet


In [2]:
import pandas as pd
import numpy as np
import time
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from spacy.tokenizer import Tokenizer
import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls
from operator import itemgetter
from ipywidgets import interact
import tqdm
from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets


# supress warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


We will load in our preprocessed tweets from our [data cleaning notebook](https://github.com/tarrantcarter/Final_Capstone/blob/main/Modern_Motivation_Data_Cleaning_Feature_Engineering.ipynb). The csv can be found [here](https://drive.google.com/file/d/1-d-61YuocweY0F3rSnf-idL1ixbuKq4H/view?usp=sharing). 

In [4]:
# load in cleaned tweets from data cleaning notebook
tweets_cleaned = pd.read_json("/content/drive/MyDrive/Data/NLP_Capstone/motivational_tweets_cleaned.json")

In [5]:
tweets_cleaned.head()

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams_trigrams,bigrams_trigrams_strings
44,2021-01-17 22:13:17,LewisHowes,Know this. Everything is happening for a reaso...,know happen reason favor betterment future pai...,"[know, happen, reason, favor, betterment, futu...",know_happen_reason favor betterment future pai...,"[know_happen_reason, favor, betterment, future...","[reason, favor, betterment, future, pain, feel...","[know, reason, favor, betterment, future, pain...",[know_happen_reason],know_happen_reason
61,2021-01-15 15:28:06,LewisHowes,Protect your inner peace at all costs. Create ...,protect inner peace cost create daily practice...,"[protect, inner, peace, cost, create, daily, p...",protect inner_peace cost create daily_practice...,"[protect, inner_peace, cost, create, daily_pra...","[peace, cost, practice, communicate, stress, d...","[peace, cost, practice, communicate, stress, d...","[inner_peace, daily_practice]",inner_peace daily_practice
161,2021-01-07 16:00:29,LewisHowes,Always remember to ask for exactly what you wa...,remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...",remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...","[health, abundance, peace, ask, wisdom, creati...","[remember, health, abundance, peace, ask, wisd...","[health_abundance, ask_wisdom]",health_abundance ask_wisdom
274,2021-01-01 02:44:46,LewisHowes,This will be your greatest year ever. All your...,great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...",great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...","[year, work, start, earn, love, embrace, fear,...","[year, work, start, pay, earn, love, embrace, ...",[massive_action],massive_action
317,2020-12-30 01:38:35,LewisHowes,Be grateful for the breakdown this year. It's ...,grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...",grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...","[year, breakthrough, money, mission, get, rela...","[year, set, breakthrough, money, come, mission...",[real_friend],real_friend


In [6]:
tweets_cleaned.applymap(type)

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams_trigrams,bigrams_trigrams_strings
44,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
61,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
161,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
274,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
317,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
...,...,...,...,...,...,...,...,...,...,...,...
977598,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977599,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977606,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977608,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>


In [7]:
tweets_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621449 entries, 44 to 977610
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   date                      621449 non-null  datetime64[ns]
 1   user_name                 621449 non-null  object        
 2   content                   621449 non-null  object        
 3   content_preprocessed      621449 non-null  object        
 4   unigram_tokens            621449 non-null  object        
 5   ngrams                    621449 non-null  object        
 6   ngram_tokens              621449 non-null  object        
 7   nouns_only                621449 non-null  object        
 8   nouns_verbs               621449 non-null  object        
 9   bigrams_trigrams          621449 non-null  object        
 10  bigrams_trigrams_strings  621449 non-null  object        
dtypes: datetime64[ns](1), object(10)
memory usage: 56.9+ MB


In [8]:
tweets_cleaned.shape

(621449, 11)

# Topic Modeling

In [9]:
# create dictionary
id2word = corpora.Dictionary(tweets_cleaned['nouns_verbs'])
# create texts corpus
texts = tweets_cleaned['nouns_verbs']
# term document frequency
corpus = [id2word.doc2bow(text) for text in texts]
# print first 30 tuples from corpus
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]


In [10]:
# number of topics
num_topics = 20
# build LDA model
base_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# # print the keyword in the 10 topics
# pprint(base_model.print_topics())
# doc_lda = base_model[corpus]

In [11]:
# filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

# create topic sorted by 10 most relevent words
topics = [' '.join(t[0:10]) for t in words]


# print most relevent words for each topic
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
life way person matter david day thing care think mind

------ Topic 1 ------
life dyer power david soul create control action thinking thought

------ Topic 2 ------
dyer life way success amp thing opportunity matter happen future

------ Topic 3 ------
life heart churchill winston person problem rule happiness dream hell

------ Topic 4 ------
time life fall let person david rise feel john mind

------ Topic 5 ------
thing man david world person aurelius think time life power

------ Topic 6 ------
mind dyer goal step fact life amp david peace attract

------ Topic 7 ------
life thing word david change help stress happiness emerson look

------ Topic 8 ------
life look david time day way mirror change word dyer

------ Topic 9 ------
life love let aurelius time self truth speak man lao

------ Topic 10 ------
amp life love dyer act man find matter way change

------ Topic 11 ------
dream day life thing dyer result feel einstein way energy

------ Topic 12 ------

In [12]:
# Compute Perplexity
## a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=tweets_cleaned['nouns_verbs'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.716074277851314

Coherence Score:  0.4487116925229948


In [13]:
# topic distance visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

In [14]:
get_document_topics = [base_model.get_document_topics(item) for item in corpus]

In [15]:
len(get_document_topics)

621449

In [16]:
get_document_topics[:20]

[[(4, 0.6131946), (12, 0.3338643)],
 [(2, 0.15983823), (7, 0.63149077), (16, 0.14328636)],
 [(10, 0.57430613), (11, 0.3564631)],
 [(0, 0.21321432), (2, 0.6215954), (16, 0.09980571)],
 [(6, 0.26057056), (10, 0.58053476), (16, 0.08806134)],
 [(6, 0.39537886), (10, 0.3546025), (13, 0.20001861)],
 [(5, 0.3410168), (10, 0.4726136), (19, 0.115536265)],
 [(3, 0.23232241), (10, 0.5876776)],
 [(5, 0.4670222), (7, 0.45115957)],
 [(5, 0.27518785), (10, 0.6348122)],
 [(9, 0.2283047), (10, 0.6966953)],
 [(7, 0.90500003)],
 [(16, 0.010000001), (17, 0.010000001), (18, 0.81000006)],
 [(0, 0.45705965), (12, 0.39294034)],
 [(0, 0.73108405), (6, 0.17891595)],
 [(4, 0.010000001), (10, 0.6069313), (19, 0.21306872)],
 [(0, 0.025),
  (1, 0.025),
  (2, 0.025),
  (3, 0.025),
  (4, 0.025),
  (5, 0.025),
  (6, 0.025),
  (7, 0.025),
  (8, 0.025),
  (9, 0.025),
  (10, 0.025),
  (11, 0.525),
  (12, 0.025),
  (13, 0.025),
  (14, 0.025),
  (15, 0.025),
  (16, 0.025),
  (17, 0.025),
  (18, 0.025),
  (19, 0.025)],
 [(2

In [17]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           alpha=a,
                                           eta=b,
                                           random_state=222)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [18]:
# start preprocess runtime
start_time = time.time() 

# Topics range
min_topics = 20
max_topics = 80
step_size = 5
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = [.01,.1,.5,1,'symmetric','asymmetric']
# Beta parameter
beta = [.01,.1,.5,1,'symmetric']

# model results dict
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=420)

    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterare through beta values
            for b in beta:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus, dictionary=id2word, 
                                              k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                
                pbar.update(1)
    lda_tuning_results = pd.DataFrame(model_results)                
    lda_tuning_results.to_csv('/content/drive/MyDrive/Data/NLP_Capstone/lda_tuning_20_plus_topics.csv', index=False)
    pbar.close()

# print preprocess runtime
print(time.strftime(f'%H hours, %M minutes, %S seconds', time.gmtime(time.time() - start_time)))

 86%|████████▌ | 360/420 [1:58:30<19:45, 19.75s/it]

01 hours, 58 minutes, 30 seconds





In [19]:
lda_tuning_results.sort_values('Coherence',ascending=False).head(30)

Unnamed: 0,Topics,Alpha,Beta,Coherence
269,60,asymmetric,symmetric,0.521515
299,65,asymmetric,symmetric,0.521234
266,60,asymmetric,0.1,0.52028
295,65,asymmetric,0.01,0.517416
146,40,asymmetric,0.1,0.516659
296,65,asymmetric,0.1,0.515638
145,40,asymmetric,0.01,0.515555
245,60,0.1,0.01,0.514674
149,40,asymmetric,symmetric,0.513614
335,75,0.1,0.01,0.513385


In [29]:
# number optimal of topics
num_topics = 60
# build optimal LDA model
optimal_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                        alpha='asymmetric',
                                        eta='symmetric',
                                        random_state=222
                                        )
# filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in optimal_model.print_topics()]

# create topic sorted by 10 most relevent words
topics = [' '.join(t[0:10]) for t in words]


# print most relevent words for each topic
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
step miracle world accept person time wilde word echo mind

------ Topic 1 ------
find leader spirit work dyer job vision c release life

------ Topic 2 ------
eye thing heart truth look conscience jonathan feel behavior find

------ Topic 3 ------
risk take tzu understanding order lao reputation change wisdom mark

------ Topic 4 ------
attract jim pain rohn covey stephen change circumstance peter learn

------ Topic 5 ------
stay direction happiness choice month wise speech life come memory

------ Topic 6 ------
proverb strike george time hepburn heart realise level sign mission

------ Topic 7 ------
life option thing inquire peter remain opinion amazing question amp

------ Topic 8 ------
create ralph emerson person dyer jump quit achievement self tracy

------ Topic 9 ------
search listen light succeed robert aspire ppl sun love amp

------ Topic 10 ------
dyer fall place thought presence life clement time stone result

------ Topic 11 ------
thing failure d

In [30]:
# Compute Perplexity
## a measure of how good the model is. lower the better
base_perplexity = optimal_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=optimal_model, texts=tweets_cleaned['nouns_verbs'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.708353668373961

Coherence Score:  0.5161769647965639


In [31]:
# topic distance visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)

In [32]:
# number optimal of topics
num_topics = 40
# build optimal LDA model
optimal_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                        alpha='asymmetric',
                                        eta=.1,
                                        random_state=222
                                        )
# filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in optimal_model.print_topics()]

# create topic sorted by 10 most relevent words
topics = [' '.join(t[0:10]) for t in words]


# print most relevent words for each topic
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
ask question heart work life courage search man bhajan nietzsche

------ Topic 1 ------
dream god discover life achievement heart character test reason responsibility

------ Topic 2 ------
treat winston man way tracy character judge answer friend life

------ Topic 3 ------
thing play man life albert know covey storm hand stephen

------ Topic 4 ------
life dare schuller mind attempt day w till blessing stevenson

------ Topic 5 ------
dyer grow risk self hold thing keep life worth wisdom

------ Topic 6 ------
peale vincent bind energy heart kennedy speak let confucius dwell

------ Topic 7 ------
time forget life man world mediocrity child decision freedom chain

------ Topic 8 ------
change happiness win step maya time attitude trouble design goethe

------ Topic 9 ------
fight consequence aim pull head person brian push deng xiaoping

------ Topic 10 ------
dyer place fall rule heart experience hell jefferson patience presence

------ Topic 11 ------
thing fa

In [33]:
# Compute Perplexity
## a measure of how good the model is. lower the better
base_perplexity = optimal_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=optimal_model, texts=tweets_cleaned['nouns_verbs'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.733660263099588

Coherence Score:  0.5150786105875539


In [34]:
# topic distance visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)