In [None]:
!pip install pyLDAvis --quiet
!pip install chart_studio --quiet


In [None]:
import pandas as pd
import numpy as np
import time
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from spacy.tokenizer import Tokenizer
import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls
from operator import itemgetter
from ipywidgets import interact
import tqdm
from IPython.display import display, Markdown, clear_output
# widget packages
import ipywidgets as widgets


# suppress warnings (the "ignore" filter applies to every warning category raised after this point)
warnings.filterwarnings("ignore")

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


We will load in our preprocessed tweets from our [data cleaning notebook](https://github.com/tarrantcarter/Final_Capstone/blob/main/Modern_Motivation_Data_Cleaning_Feature_Engineering.ipynb). The cleaned data file can be found [here](https://drive.google.com/file/d/1-d-61YuocweY0F3rSnf-idL1ixbuKq4H/view?usp=sharing).

In [None]:
# read the cleaned tweets (JSON export of the data cleaning notebook) from the mounted Drive
tweets_cleaned = pd.read_json(
    "/content/drive/MyDrive/Data/NLP_Capstone/motivational_tweets_cleaned.json"
)

In [None]:
# preview the first five rows of the loaded frame
tweets_cleaned.head()

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams_trigrams,bigrams_trigrams_strings
44,2021-01-17 22:13:17,LewisHowes,Know this. Everything is happening for a reaso...,know happen reason favor betterment future pai...,"[know, happen, reason, favor, betterment, futu...",know_happen_reason favor betterment future pai...,"[know_happen_reason, favor, betterment, future...","[reason, favor, betterment, future, pain, feel...","[know, reason, favor, betterment, future, pain...",[know_happen_reason],know_happen_reason
61,2021-01-15 15:28:06,LewisHowes,Protect your inner peace at all costs. Create ...,protect inner peace cost create daily practice...,"[protect, inner, peace, cost, create, daily, p...",protect inner_peace cost create daily_practice...,"[protect, inner_peace, cost, create, daily_pra...","[peace, cost, practice, communicate, stress, d...","[peace, cost, practice, communicate, stress, d...","[inner_peace, daily_practice]",inner_peace daily_practice
161,2021-01-07 16:00:29,LewisHowes,Always remember to ask for exactly what you wa...,remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...",remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good...","[health, abundance, peace, ask, wisdom, creati...","[remember, health, abundance, peace, ask, wisd...","[health_abundance, ask_wisdom]",health_abundance ask_wisdom
274,2021-01-01 02:44:46,LewisHowes,This will be your greatest year ever. All your...,great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...",great year work start pay earn happy love deep...,"[great, year, work, start, pay, earn, happy, l...","[year, work, start, earn, love, embrace, fear,...","[year, work, start, pay, earn, love, embrace, ...",[massive_action],massive_action
317,2020-12-30 01:38:35,LewisHowes,Be grateful for the breakdown this year. It's ...,grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...",grateful breakdown year set massive breakthrou...,"[grateful, breakdown, year, set, massive, brea...","[year, breakthrough, money, mission, get, rela...","[year, set, breakthrough, money, come, mission...",[real_friend],real_friend


In [None]:
# Check which Python type each column holds (e.g. that the token columns were loaded as lists).
# Only the first rows are inspected: mapping `type` over every cell of the full frame is slow,
# and the rendered table is truncated to a handful of rows anyway.
tweets_cleaned.head().applymap(type)

Unnamed: 0,date,user_name,content,content_preprocessed,unigram_tokens,ngrams,ngram_tokens,nouns_only,nouns_verbs,bigrams_trigrams,bigrams_trigrams_strings
44,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
61,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
161,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
274,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
317,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
...,...,...,...,...,...,...,...,...,...,...,...
977598,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977599,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977606,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>
977608,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>,<class 'str'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'list'>,<class 'str'>


In [None]:
# column dtypes, non-null counts and memory usage of the loaded frame
tweets_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621449 entries, 44 to 977610
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   date                      621449 non-null  datetime64[ns]
 1   user_name                 621449 non-null  object        
 2   content                   621449 non-null  object        
 3   content_preprocessed      621449 non-null  object        
 4   unigram_tokens            621449 non-null  object        
 5   ngrams                    621449 non-null  object        
 6   ngram_tokens              621449 non-null  object        
 7   nouns_only                621449 non-null  object        
 8   nouns_verbs               621449 non-null  object        
 9   bigrams_trigrams          621449 non-null  object        
 10  bigrams_trigrams_strings  621449 non-null  object        
dtypes: datetime64[ns](1), object(10)
memory usage: 56.9+ MB


In [None]:
# (rows, columns) of the loaded frame
tweets_cleaned.shape

(621449, 11)

# Topic Modeling

In [None]:
# documents used for topic modelling: the `nouns_verbs` token list of every tweet
texts = tweets_cleaned['nouns_verbs']
# dictionary mapping each distinct token to an integer id
id2word = corpora.Dictionary(texts)
# bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))
# show (at most) the first 30 pairs of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]


In [None]:
# number of topics for the baseline model
num_topics = 10
# Build the baseline LDA model.
# random_state is fixed (same seed as compute_coherence_values uses for the tuning runs) so the
# topics and scores reported in the following cells do not change arbitrarily between reruns.
# NOTE(review): with several worker processes the training may still not be bit-for-bit
# repeatable — confirm if exact reproducibility matters.
base_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=222)

In [None]:
# Top words of every topic, taken directly from the model as (word, weight) pairs instead of
# regex-parsing the formatted string produced by print_topics().
# num_topics=-1 requests all topics; num_words=10 matches the print_topics() default.
words = [[word for word, _weight in word_weights]
         for _topic_id, word_weights in base_model.show_topics(num_topics=-1,
                                                               num_words=10,
                                                               formatted=False)]

# one space-separated string holding the 10 most relevant words of each topic
topics = [' '.join(t[0:10]) for t in words]


# print the most relevant words for each topic
# (the loop variable is deliberately not named `id`: at notebook scope that would shadow
# the builtin `id` for every later cell)
for topic_id, t in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
love man choice life happiness judge passion treat reason desire

------ Topic 1 ------
day matter mind energy body step lie ability power go

------ Topic 2 ------
dyer person time place problem churchill winston wisdom george life

------ Topic 3 ------
amp look life create opportunity ask eye thought woman word

------ Topic 4 ------
way change heart let find fall learn edison result life

------ Topic 5 ------
dream thing reality control thought trust goal man action presence

------ Topic 6 ------
man think lincoln life happen franklin roosevelt time abraham ralph

------ Topic 7 ------
life time day act john today henry tomorrow enemy habit

------ Topic 8 ------
world proverb question courage fear jim mean attract amp rohn

------ Topic 9 ------
thing einstein get success try peace strength albert failure come



In [None]:
# Per-word likelihood bound of the baseline model on the training corpus.
# log_perplexity() returns this bound (a negative, log-scale number), not the perplexity itself:
# a bound closer to 0 is better, and the perplexity is 2 ** (-bound), where lower is better.
# The variable keeps its original name so later cells that read it still work.
base_perplexity = base_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', base_perplexity)
print('\nPerplexity: ', 2 ** (-base_perplexity))

# Compute the c_v coherence score of the baseline model (higher is better)
coherence_model = CoherenceModel(model=base_model, texts=tweets_cleaned['nouns_verbs'],
                                 dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -7.027831968962231

Coherence Score:  0.5739083655829538


In [None]:
# interactive inter-topic distance map for the baseline model, rendered inline in the notebook
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(topic_model=base_model, corpus=corpus, dictionary=id2word)

In [None]:
# topic distribution of every document in the corpus, in corpus order
get_document_topics = list(map(base_model.get_document_topics, corpus))

In [None]:
# one entry per corpus document is expected here
len(get_document_topics)

621449

In [None]:
# preview the (topic_id, probability) pairs of the first 20 documents
get_document_topics[:20]

[[(0, 0.07759577),
  (3, 0.1459197),
  (4, 0.24289474),
  (5, 0.14068808),
  (6, 0.07820044),
  (8, 0.07310224),
  (9, 0.22395071)],
 [(0, 0.19800296),
  (1, 0.12737502),
  (7, 0.10281024),
  (8, 0.26659524),
  (9, 0.26675215)],
 [(1, 0.09309117),
  (2, 0.08692066),
  (3, 0.088717036),
  (5, 0.088662416),
  (6, 0.08744971),
  (7, 0.17251125),
  (9, 0.35956553)],
 [(1, 0.09000237),
  (5, 0.46093732),
  (6, 0.0831831),
  (7, 0.1662822),
  (8, 0.16111732)],
 [(5, 0.42661807), (7, 0.34115666), (9, 0.17386495)],
 [(0, 0.07064529),
  (3, 0.2158849),
  (4, 0.15357126),
  (5, 0.10551868),
  (7, 0.20773077),
  (8, 0.13850185),
  (9, 0.09049938)],
 [(0, 0.18421452),
  (2, 0.09828032),
  (4, 0.14827089),
  (5, 0.3020424),
  (6, 0.13036136),
  (7, 0.10349163)],
 [(0, 0.02),
  (1, 0.02),
  (2, 0.41979742),
  (3, 0.020000003),
  (4, 0.22020216),
  (5, 0.020000022),
  (6, 0.020000014),
  (7, 0.22000036),
  (8, 0.02),
  (9, 0.02)],
 [(0, 0.107988626),
  (2, 0.11538682),
  (3, 0.22285599),
  (6, 0.2263

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train one LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus
        Bag-of-words corpus the model is trained on.
    dictionary
        Id-to-word mapping; used both to train the model and to score it.
    k
        Number of topics.
    a
        Value forwarded to ``LdaMulticore`` as ``alpha``.
    b
        Value forwarded to ``LdaMulticore`` as ``eta``.

    Returns
    -------
    The result of ``CoherenceModel.get_coherence()`` for the trained model.

    Notes
    -----
    The tokenised documents are read from the notebook-level ``texts`` variable,
    so that variable has to exist before this function is called.
    """
    # random_state is fixed so every parameter combination is trained from the same seed
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           alpha=a,
                                           eta=b,
                                           random_state=222)

    # Score with the dictionary that was passed in. Using the notebook-level `id2word` here
    # would silently ignore the argument whenever a different dictionary is supplied.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
# start of the tuning runtime measurement
start_time = time.time()

# Set to False to skip the search (several hours on the full corpus) and reload the
# results written by a previous run instead.
RUN_GRID_SEARCH = True
TUNING_RESULTS_PATH = '/content/drive/MyDrive/Data/NLP_Capstone/lda_tuning_results2.csv'

# Topics range (max_topics is exclusive)
min_topics = 11
max_topics = 18
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = [.01,.1,.5,1,'symmetric','asymmetric']
# Beta parameter
beta = [.01,.1,.5,1,'symmetric']

# model results dict: one entry per trained model in each list
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

if RUN_GRID_SEARCH:
    # One model per (topics, alpha, beta) combination. The total is derived from the grids
    # so the progress bar stays correct when any of them is edited.
    n_combinations = len(topics_range) * len(alpha) * len(beta)

    # the context manager closes the bar even when a model fails part-way through
    with tqdm.tqdm(total=n_combinations) as pbar:
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus, dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # save the model results
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)
            # checkpoint after every topic count so an interrupted run keeps what it has computed
            pd.DataFrame(model_results).to_csv(TUNING_RESULTS_PATH, index=False)

    lda_tuning_results = pd.DataFrame(model_results)
    lda_tuning_results.to_csv(TUNING_RESULTS_PATH, index=False)
else:
    lda_tuning_results = pd.read_csv(TUNING_RESULTS_PATH)

# print the tuning runtime; divmod is used because formatting through time.gmtime()
# wraps the hour field back to 00 once a run exceeds 24 hours
elapsed_seconds = int(time.time() - start_time)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print(f'{hours:02d} hours, {minutes:02d} minutes, {seconds:02d} seconds')


  0%|          | 0/210 [00:00<?, ?it/s][A
  0%|          | 1/210 [01:17<4:29:07, 77.26s/it][A
  1%|          | 2/210 [02:38<4:31:35, 78.34s/it][A
  1%|▏         | 3/210 [04:01<4:35:21, 79.81s/it][A
  2%|▏         | 4/210 [05:26<4:39:23, 81.37s/it][A
  2%|▏         | 5/210 [06:47<4:37:35, 81.25s/it][A
  3%|▎         | 6/210 [08:01<4:29:26, 79.25s/it][A
  3%|▎         | 7/210 [09:20<4:27:51, 79.17s/it][A
  4%|▍         | 8/210 [10:41<4:27:59, 79.60s/it][A
  4%|▍         | 9/210 [12:03<4:28:46, 80.23s/it][A
  5%|▍         | 10/210 [13:21<4:25:08, 79.54s/it][A
  5%|▌         | 11/210 [14:42<4:25:10, 79.95s/it][A
  6%|▌         | 12/210 [16:06<4:28:37, 81.40s/it][A
  6%|▌         | 13/210 [17:32<4:31:38, 82.73s/it][A
  7%|▋         | 14/210 [18:59<4:34:03, 83.89s/it][A
  7%|▋         | 15/210 [20:23<4:32:47, 83.94s/it][A
  8%|▊         | 16/210 [21:40<4:24:32, 81.82s/it][A
  8%|▊         | 17/210 [23:01<4:22:29, 81.60s/it][A
  9%|▊         | 18/210 [24:22<4:20:36, 81.44s/

04 hours, 49 minutes, 05 seconds





In [None]:
# parameter combinations ranked by coherence, best first (top 30 rows)
lda_tuning_results.sort_values('Coherence',ascending=False).head(30)

Unnamed: 0,Topics,Alpha,Beta,Coherence
170,16,symmetric,0.01,0.639318
179,16,asymmetric,symmetric,0.633133
125,15,0.1,0.01,0.631851
176,16,asymmetric,0.1,0.63034
155,16,0.1,0.01,0.629119
185,17,0.1,0.01,0.629045
186,17,0.1,0.1,0.625642
205,17,asymmetric,0.01,0.624912
175,16,asymmetric,0.01,0.620752
189,17,0.1,symmetric,0.620421
