In [None]:
!pip install pyLDAvis --quiet
!pip install chart_studio --quiet


In [None]:
import pandas as pd
import numpy as np
import time
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from spacy.tokenizer import Tokenizer
import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls
from operator import itemgetter
from ipywidgets import interact
import tqdm
from IPython.display import display, Markdown, clear_output
# widget packages
import ipywidgets as widgets


# suppress warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive inside the Colab runtime so the data files below are reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


We will load in our preprocessed tweets from our [data cleaning notebook](https://github.com/tarrantcarter/Final_Capstone/blob/main/Modern_Motivation_Data_Cleaning_Feature_Engineering.ipynb). The csv can be found [here](https://drive.google.com/file/d/1-d-61YuocweY0F3rSnf-idL1ixbuKq4H/view?usp=sharing). 

In [None]:
# Location of the cleaned tweets exported by the data cleaning notebook
cleaned_tweets_path = "/content/drive/MyDrive/Data/NLP_Capstone/motivational_tweets_cleaned_bigram.json"

# Read the JSON export into a DataFrame
tweets_cleaned = pd.read_json(cleaned_tweets_path)

In [None]:
# Preview the first five rows of the cleaned tweets
tweets_cleaned.head(n=5)

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams,bigram_strings
44,2021-01-17 22:13:17,LewisHowes,Know this. Everything is happening for a reaso...,know happen reason favor betterment future pai...,"[know, happen, reason, favor, betterment, futu...",know happen_reason favor betterment future pai...,"[know, happen_reason, favor, betterment, futur...","[reason, favor, betterment, future, pain, feel...","[know, reason, favor, betterment, future, pain...",[happen_reason],happen_reason
61,2021-01-15 15:28:06,LewisHowes,Protect your inner peace at all costs. Create ...,protect inner peace cost create daily practice...,"[protect, inner, peace, cost, create, daily, p...",protect inner_peace cost create daily_practice...,"[protect, inner_peace, cost, create, daily_pra...","[peace, cost, practice, communicate, stress, d...","[peace, cost, practice, communicate, stress, d...","[inner_peace, daily_practice]",inner_peace daily_practice
161,2021-01-07 16:00:29,LewisHowes,Always remember to ask for exactly what you wa...,remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...",remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...","[health, abundance, peace, ask, wisdom, creati...","[remember, health, abundance, peace, ask, wisd...","[health_abundance, ask_wisdom]",health_abundance ask_wisdom
274,2021-01-01 02:44:46,LewisHowes,This will be your greatest year ever. All your...,great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...",great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...","[year, work, start, earn, love, embrace, fear,...","[year, work, start, pay, earn, love, embrace, ...",[massive_action],massive_action
317,2020-12-30 01:38:35,LewisHowes,Be grateful for the breakdown this year. It's ...,grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...",grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...","[year, breakthrough, money, mission, get, rela...","[year, set, breakthrough, money, come, mission...",[real_friend],real_friend


In [None]:
# Show the Python type stored in every cell (confirms the token columns hold lists, not strings)
# NOTE(review): recent pandas versions deprecate DataFrame.applymap in favour of
# DataFrame.map -- confirm against the installed pandas before upgrading.
tweets_cleaned.applymap(lambda cell_value: type(cell_value))

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams,bigram_strings
44,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
61,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
161,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
274,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
317,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
...,...,...,...,...,...,...,...,...,...,...,...
977598,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977599,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977606,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977608,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>


In [None]:
# Summarise the columns: non-null counts, dtypes and approximate memory usage
tweets_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583340 entries, 44 to 977610
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   date                  583340 non-null  datetime64[ns]
 1   user_name             583340 non-null  object        
 2   content               583340 non-null  object        
 3   content_preprocessed  583340 non-null  object        
 4   unigram_tokens        583340 non-null  object        
 5   ngrams                583340 non-null  object        
 6   ngram_tokens          583340 non-null  object        
 7   nouns_only            583340 non-null  object        
 8   nouns_verbs           583340 non-null  object        
 9   bigrams               583340 non-null  object        
 10  bigram_strings        583340 non-null  object        
dtypes: datetime64[ns](1), object(10)
memory usage: 53.4+ MB


In [None]:
# (number of rows, number of columns) of the cleaned tweets
tweets_cleaned.shape

(583340, 11)

# Topic Modeling

In [None]:
# Documents used for topic modelling: the list of bigram tokens of each tweet
texts = tweets_cleaned['bigrams']

# Dictionary mapping every distinct token to an integer id
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Show (at most) the first 30 (token_id, count) pairs of the first document
first_document_bow = corpus[0]
print(first_document_bow[:30])

[(0, 1)]


In [None]:
# Number of topics for the untuned baseline model
num_topics = 10

# Build the baseline LDA model with gensim's default alpha/eta priors.
# random_state is fixed (same seed as the tuned models in this notebook) so that
# the topics and the scores computed from them can be reproduced on a re-run.
base_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=num_topics,
                                        random_state=222)

In [None]:
# Fetch the 10 highest-weighted terms of every topic directly from the model.
# show_topics(formatted=False) returns (topic_id, [(term, weight), ...]) pairs, which
# avoids regex-parsing the formatted strings produced by print_topics().
# num_topics=-1 asks for all topics (print_topics() silently caps at 20).
topic_terms = base_model.show_topics(num_topics=-1, num_words=10, formatted=False)

# one list of terms per topic, most relevant first
words = [[term for term, _weight in terms] for _topic_id, terms in topic_terms]

# one space-separated string of the 10 most relevant terms per topic
topics = [' '.join(t[0:10]) for t in words]

# print the most relevant terms of each topic, labelled with the model's own topic id
# (the loop variable is not called `id`, which would shadow the builtin)
for (topic_id, _terms), t in zip(topic_terms, topics):
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
wisdom_avoid_thought_weaken irritate_lead_understanding_carl_jung happiness_exist_acceptance_denis_de_rougamont shoe_fit_person_pinch_recipe_live_suit_case_carl_jung better_better_maya_anjelou lose_sight_shore_gide discover_new_ocean_courage right_stick_george_eliot time_valuable_asset_tend_waste_kill_spend_invest_jim_rohn stuck_decide

------ Topic 1 ------
thomas_edison tao_teach_storm_last_forever_matter release_feelings amp_guilt_dyer practice_keep amp_uncomplicated_dyer amp_willingness_allow care_choose winston_churchill key_change

------ Topic 2 ------
day_harvest_reap_seed_plant_robert_louis_stevenson bring_peace hope_meet farther_backward_look_farther hard_beat_person give_babe_ruth mark_twain power_choice_stephen_covey strong_conviction_precede_great strong_doubt_weak

------ Topic 3 ------
longer_dwell_misfortune_great power_harm_voltaire late_happy_childhood_dyer expect_miracle_instead_miracle learn_appreciate trust_heart follow_heart_quiet_ask_questio

In [None]:
# Compute the per-word likelihood bound of the base model on the corpus.
# NOTE: gensim's log_perplexity() returns this bound, not the perplexity itself
# (perplexity = 2^(-bound)), so a value closer to zero indicates a better fit.
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute the c_v coherence score of the base model's topics (higher is better)
coherence_model = CoherenceModel(model=base_model, texts=tweets_cleaned['bigrams'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -8.866752213571143

Coherence Score:  0.7624842297263832


In [None]:
# Interactive inter-topic distance map for the baseline model
pyLDAvis.enable_notebook()

# Prepare the visualisation data, then leave it as the cell's last expression so it renders
base_model_vis = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
base_model_vis

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, tokenized_texts=None):
    """Train an LDA model with the given hyper-parameters and return its c_v coherence.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaMulticore``.
    dictionary : token-id mapping; used both to train the model and to score it.
    k : number of topics.
    a : document-topic prior, forwarded as ``alpha``.
    b : topic-word prior, forwarded as ``eta``.
    tokenized_texts : optional tokenised documents used for the coherence
        computation. When omitted, the notebook-level ``texts`` variable is used,
        which is what this function always did.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    # fall back to the notebook-level documents to stay compatible with existing calls
    if tokenized_texts is None:
        tokenized_texts = texts

    # fixed random_state keeps every grid point comparable and reproducible
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           alpha=a,
                                           eta=b,
                                           random_state=222)

    # score with the dictionary that was passed in, not the global `id2word`,
    # so the model and its coherence always use the same vocabulary
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=tokenized_texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# start the grid-search timer
start_time = time.time()

# Topics range
min_topics = 11
max_topics = 18
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = [.01,.1,.5,1,'symmetric','asymmetric']
# Beta parameter
beta = [.01,.1,.5,1,'symmetric']

# model results dict
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# where the tuning results are persisted
tuning_results_path = '/content/drive/MyDrive/Data/NLP_Capstone/lda_tuning_results4.csv'

# The search can take hours. Set this to False to reuse the results saved by a
# previous run instead of retraining every model.
run_grid_search = True

if run_grid_search:
    # derive the progress-bar total from the grid so it stays right if the grid changes
    total_runs = len(topics_range) * len(alpha) * len(beta)

    # the context manager closes the bar even if a model fails part-way through
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus, dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

    lda_tuning_results = pd.DataFrame(model_results)
    lda_tuning_results.to_csv(tuning_results_path, index=False)
else:
    # reuse the results written by an earlier run so later cells still work
    lda_tuning_results = pd.read_csv(tuning_results_path)

# print the grid-search runtime (the %H field wraps around after 24 hours)
print(time.strftime('%H hours, %M minutes, %S seconds', time.gmtime(time.time() - start_time)))



  0%|          | 0/210 [00:00<?, ?it/s][A[A

  0%|          | 1/210 [00:30<1:46:13, 30.50s/it][A[A

  1%|          | 2/210 [01:07<1:52:48, 32.54s/it][A[A

  1%|▏         | 3/210 [01:47<1:59:22, 34.60s/it][A[A

  2%|▏         | 4/210 [02:26<2:03:43, 36.04s/it][A[A

  2%|▏         | 5/210 [03:04<2:04:39, 36.49s/it][A[A

  3%|▎         | 6/210 [03:34<1:57:28, 34.55s/it][A[A

  3%|▎         | 7/210 [04:12<2:01:04, 35.78s/it][A[A

  4%|▍         | 8/210 [04:53<2:05:45, 37.35s/it][A[A

  4%|▍         | 9/210 [05:35<2:09:39, 38.70s/it][A[A

  5%|▍         | 10/210 [06:14<2:09:04, 38.72s/it][A[A

  5%|▌         | 11/210 [06:46<2:01:31, 36.64s/it][A[A

  6%|▌         | 12/210 [07:34<2:12:11, 40.06s/it][A[A

  6%|▌         | 13/210 [08:22<2:19:45, 42.56s/it][A[A

  7%|▋         | 14/210 [09:11<2:24:58, 44.38s/it][A[A

  7%|▋         | 15/210 [09:58<2:27:23, 45.35s/it][A[A

  8%|▊         | 16/210 [10:30<2:12:59, 41.13s/it][A[A

  8%|▊         | 17/210 [11:11<2

02 hours, 16 minutes, 35 seconds





In [None]:
# Rank the hyper-parameter combinations by coherence, best first, and show the top 30
tuning_results_ranked = lda_tuning_results.sort_values(by='Coherence', ascending=False)
tuning_results_ranked.head(n=30)

Unnamed: 0,Topics,Alpha,Beta,Coherence
198,17,1,1.0,0.818768
48,12,1,1.0,0.817982
108,14,1,1.0,0.81742
165,16,1,0.01,0.816741
168,16,1,1.0,0.816721
15,11,1,0.01,0.816575
138,15,1,1.0,0.816066
195,17,1,0.01,0.815091
197,17,1,0.5,0.81458
78,13,1,1.0,0.814304


In [None]:
# number of topics chosen from the tuning results
num_topics = 12
# build the tuned LDA model with the selected alpha/eta priors
optimal_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           alpha=1,
                                           eta=1,
                                           random_state=222
                                           )

# Fetch the 10 highest-weighted terms of every topic directly from the model.
# show_topics(formatted=False) returns (topic_id, [(term, weight), ...]) pairs, which
# avoids regex-parsing the formatted strings produced by print_topics().
# num_topics=-1 asks for all topics (print_topics() silently caps at 20).
topic_terms = optimal_model.show_topics(num_topics=-1, num_words=10, formatted=False)

# one list of terms per topic, most relevant first
words = [[term for term, _weight in terms] for _topic_id, terms in topic_terms]

# one space-separated string of the 10 most relevant terms per topic
topics = [' '.join(t[0:10]) for t in words]

# print the most relevant terms of each topic, labelled with the model's own topic id
# (the loop variable is not called `id`, which would shadow the builtin)
for (topic_id, _terms), t in zip(topic_terms, topics):
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
longer_dwell_misfortune_great heaven_earth_choice_place excuses_dyer passion_trumps suit_wear_need_pocket_dyer detach_stuff_dyer time_decision_forget_harry_truman albert_einstein wayne_dyer thing_frustration_drive_improve_john_lyons

------ Topic 1 ------
henry_david_thoreau longer_dwell_misfortune_great suit_wear_need_pocket_dyer excuses_dyer passion_trumps heaven_earth_choice_place detach_stuff_dyer time_decision_forget_harry_truman thomas_edison abraham_lincoln

------ Topic 2 ------
albert_einstein longer_dwell_misfortune_great passion_trumps excuses_dyer heaven_earth_choice_place suit_wear_need_pocket_dyer matter_slowly dare_fail_greatly detach_stuff_dyer time_decision_forget_harry_truman

------ Topic 3 ------
happy_pursue_want jim_rohn albert_einstein longer_dwell_misfortune_great thomas_edison passion_trumps excuses_dyer heaven_earth_choice_place suit_wear_need_pocket_dyer peaceful_life

------ Topic 4 ------
wayne_dyer abraham_lincoln try_fail wisdom_avoi

In [None]:
# Compute the per-word likelihood bound of the tuned model on the corpus.
# NOTE: gensim's log_perplexity() returns this bound, not the perplexity itself
# (perplexity = 2^(-bound)), so a value closer to zero indicates a better fit.
# The result goes into its own variable so the baseline's `base_perplexity`
# is not overwritten and both models can still be compared.
optimal_perplexity = optimal_model.log_perplexity(corpus)
print('\nPerplexity: ', optimal_perplexity)

# Compute the c_v coherence score of the tuned model's topics (higher is better),
# again without overwriting the baseline's coherence objects.
optimal_coherence_model = CoherenceModel(model=optimal_model, texts=tweets_cleaned['bigrams'],
                                         dictionary=id2word, coherence='c_v')
coherence_lda_model_optimal = optimal_coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_optimal)


Perplexity:  -9.18996763398098

Coherence Score:  0.8022484547518839


In [None]:
# Interactive inter-topic distance map for the tuned model
pyLDAvis.enable_notebook()

# Prepare the visualisation data, then leave it as the cell's last expression so it renders
optimal_model_vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
optimal_model_vis