In [11]:
!pip install pyLDAvis --quiet
!pip install chart_studio --quiet


In [12]:
import pandas as pd
import numpy as np
import time
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation
import gensim
from spacy.tokenizer import Tokenizer
import gensim.corpora as corpora
from gensim.models.ldamulticore import LdaMulticore
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
import pyLDAvis.gensim
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls
from operator import itemgetter
from ipywidgets import interact
import tqdm
from IPython.display import display, Markdown, clear_output
# widget packages
import ipywidgets as widgets


# supress warnings
warnings.filterwarnings("ignore")

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


We will load in our cleaned tweets from our [data cleaning notebook](https://github.com/tarrantcarter/Final_Capstone/blob/main/Modern_Motivation_Data_Cleaning_Feature_Engineering.ipynb). The csv can be found here. 

In [14]:
# load in cleaned tweets from data cleaning notebook
tweets_cleaned = pd.read_json("/content/drive/MyDrive/Data/NLP_Capstone/motivational_tweets_cleaned.json"
                    )

In [15]:
tweets_cleaned.head()

Unnamed: 0,date,user_name,display_name,content,content_preprocessed,tokenized
44,2021-01-17 22:13:17,LewisHowes,Lewis Howes,Know this. Everything is happening for a reaso...,know happen reason favor betterment future pai...,"[know, happen, reason, favor, betterment, futu..."
61,2021-01-15 15:28:06,LewisHowes,Lewis Howes,Protect your inner peace at all costs. Create ...,protect inner peace cost create daily practice...,"[protect, inner, peace, cost, create, daily, p..."
108,2021-01-12 16:40:46,LewisHowes,Lewis Howes,You are stronger than you think. The painful m...,strong think painful moment hurt past mean bre...,"[strong, think, painful, moment, hurt, past, m..."
161,2021-01-07 16:00:29,LewisHowes,Lewis Howes,Always remember to ask for exactly what you wa...,remember ask exactly want ask love good health...,"[remember, ask, exactly, want, ask, love, good..."
206,2021-01-05 00:11:24,LewisHowes,Lewis Howes,What if you treated yourself like someone you ...,treat like madly love imagine positive energy ...,"[treat, like, madly, love, imagine, positive, ..."


In [16]:
tweets_cleaned.applymap(type)

Unnamed: 0,date,user_name,display_name,content,content_preprocessed,tokenized
44,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
61,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
108,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
161,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
206,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
...,...,...,...,...,...,...
977606,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
977607,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
977608,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>
977609,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'str'>,<class 'str'>,<class 'str'>,<class 'str'>,<class 'list'>


In [17]:
tweets_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 694155 entries, 44 to 977610
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   date                  694155 non-null  datetime64[ns]
 1   user_name             694155 non-null  object        
 2   display_name          694155 non-null  object        
 3   content               694155 non-null  object        
 4   content_preprocessed  694155 non-null  object        
 5   tokenized             694155 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 37.1+ MB


In [18]:
tweets_cleaned.shape

(694155, 6)

# Topic Modeling

In [19]:
# create dictionary
id2word = corpora.Dictionary(tweets_cleaned['tokenized'])
# create texts corpus
texts = tweets_cleaned['tokenized']
# term document frequency
corpus = [id2word.doc2bow(text) for text in texts]
# print first 30 tuples from corpus
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]


In [20]:
# number of topics
num_topics = 5
# build LDA model
base_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# # print the keyword in the 10 topics
# pprint(base_model.print_topics())
# doc_lda = base_model[corpus]

In [21]:
# filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

# create topic sorted by 10 most relevent words
topics = [' '.join(t[0:10]) for t in words]


# print most relevent words for each topic
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

NameError: ignored

In [None]:
# Compute Perplexity
## a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=tweets_cleaned['tokenized'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)

In [None]:
# topic distance visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(base_model, corpus, id2word)

In [None]:
get_document_topics = [base_model.get_document_topics(item) for item in corpus]

In [None]:
len(get_document_topics)

In [None]:
get_document_topics[:20]

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
# start preprocess runtime
start_time = time.time() 

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 15
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

# print preprocess runtime
print(time.strftime(f'%H hours, %M minutes, %S seconds', time.gmtime(time.time() - start_time)))

In [None]:
# Function to sort the list by second item of tuple 
def sort_tuple(tup):  
  
    # reverse = None (Sorts in Ascending order)  
    # key is set to sort using second element of  
    # sublist lambda has been used  
    return(sorted(tup, key = lambda x: x[1],reverse=True))  

In [None]:
sort_tuple(get_document_topics[0])

In [None]:
sorted_tuples = [sort_tuple(tup) for tup in get_document_topics]

In [None]:
sorted_tuples[:5]

In [None]:
def test(x):
  if x[1] > .5:
    return x[0]
  return 'None'


In [None]:
highest_topic = [test(tup[0]) for tup in sorted_tuples]

In [None]:
highest_topic[:5]

In [None]:
max(get_document_topics[0],key = itemgetter(1))[0]

In [None]:
topic_df = tweets_cleaned.copy()

topic_df['best_topic_fit'] = highest_topic

topic_df['best_topic_fit'].value_counts()

In [None]:
# tweet bot will have a drop down for topic and seperate one for twitter user account
test = topic_df['content'].sample(5).reset_index()
test

In [None]:
# helper function to display ipywidgets in colab
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))
  from plotly.offline import init_notebook_mode
  init_notebook_mode(connected=False)

In [None]:
usernames = tweets_cleaned['user_name'].unique().tolist()
usernames.insert(0,'All')
usernames

In [None]:
configure_plotly_browser_state()

@interact(Topic=['All','None',0,1,2,3,4,5,6,7,8,9],User_Name=usernames,button=button)

def topics(Topic,User_Name):
  topic_filtered = topic_df[topic_df['best_topic_fit'] == Topic]
  if Topic:
    if Topic == 'All':
      if User_Name == 'All':
        quote = topic_df['content'].sample(1).reset_index(drop=True)
        return quote.item()
      else:
        user_name_filtered = topic_df[topic_df['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
    else:  
      if User_Name == 'All':
        quote = topic_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
      else:
        user_name_filtered = topic_filtered[topic_filtered['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
  else:
    return 'Sorry, there are no quotes for these categories'

In [None]:
def topics(Topic,User_Name):
  topic_filtered = topic_df[topic_df['best_topic_fit'] == Topic]
  if Topic:
    if Topic == 'All':
      if User_Name == 'All':
        quote = topic_df['content'].sample(1).reset_index(drop=True)
        return quote.item()
      else:
        user_name_filtered = topic_df[topic_df['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
    else:  
      if User_Name == 'All':
        quote = topic_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
      else:
        user_name_filtered = topic_filtered[topic_filtered['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        return quote.item()
  else:
    return 'Sorry, there are no quotes for these categories'

In [None]:
configure_plotly_browser_state()

# button = widgets.Button(description='My Button')
# out = widgets.Output()

@interact(Topic=['All','None',0,1,2,3,4,5,6,7,8,9],User_Name=usernames,Click_Me=True)

def topics(Topic,User_Name,Click_Me):
  topic_filtered = topic_df[topic_df['best_topic_fit'] == Topic]
  if Topic:
    if Topic == 'All':
      if User_Name == 'All':
        quote = topic_df['content'].sample(1).reset_index(drop=True)
        print(quote.item())
      else:
        user_name_filtered = topic_df[topic_df['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
    else:  
      if User_Name == 'All':
        quote = topic_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
      else:
        user_name_filtered = topic_filtered[topic_filtered['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
  else:
    print('Sorry, there are no quotes for these categories')

# def on_button_clicked(Topic):
#       # "linking function with output"
#       with out:
#           # what happens when we press the button
#           clear_output()
#           topics(Topic,User_Name)
#           print(Topic)
# linking button and function together using a button's method
# button.on_click(topics)
# # displaying button and its output together
# widgets.VBox([button,out])


In [None]:
btn = widgets.Button(description='Medium')
display(btn)
def btn_eventhandler(obj):
    print('Hello from the {} button!'.format(obj.description))
btn.on_click(btn_eventhandler)

In [None]:
def unique_values(array):
  unique = array.unique().tolist()
  unique.insert(0,'All')
  return unique

In [None]:
dropdown_user = widgets.Dropdown(options = unique_values(tweets_cleaned['user_name']))
#dropdown_topic = widgets.Dropdown(options = unique_values(topic_df['best_topic_fit']))
dropdown_topic = widgets.Dropdown(options = ['All','None',0,1,2,3,4,5,6,7,8,9])

In [None]:
dropdown_topic = widgets.Dropdown(options = ['All','None',0,1,2,3,4,5,6,7,8,9])

output_topic = widgets.Output()

def dropdown_topic_eventhandler(change):
  #output_topic.clear_output(wait=True)
  #with output_topic:
    if (change.new == 'All'):
      #quote = topic_df['content'].sample(1).reset_index(drop=True)
      display(output_topic)
    else:
      #topic_filtered = topic_df[topic_df['best_topic_fit'] == change.new]
      #quote = topic_filtered['content'].sample(1).reset_index(drop=True)
      #display(quote.item())
      display(tweets_cleaned.tail())

dropdown_topic.observe(dropdown_topic_eventhandler, names='values')

display(dropdown_topic)

In [None]:
import pandas as pd
import numpy as np
url = "https://data.london.gov.uk/download/number-international-visitors-london/b1e0f953-4c8a-4b45-95f5-e0d143d5641e/international-visitors-london-raw.csv"
df_london = pd.read_csv(url, encoding= 'unicode_escape')

In [None]:
ALL = 'ALL'
def unique_sorted_values_plus_ALL(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL)
    return unique

In [None]:
dropdown_year = widgets.Dropdown(options = unique_sorted_values_plus_ALL(df_london.year))

In [None]:
dropdown_year = widgets.Dropdown(options =    unique_sorted_values_plus_ALL(df_london.year))

output_year = widgets.Output()

display(dropdown_year)

def dropdown_year_eventhandler(change):
    # with output_year:
    #   output_year.clear_output()
      if (change.new == ALL):
          display(df_london)
      else:
          display(df_london[df_london.year == change.new])

dropdown_year.observe(dropdown_year_eventhandler, names='value')

In [None]:
def topics(Topic,User_Name,Click_Me):
  topic_filtered = topic_df[topic_df['best_topic_fit'] == Topic]
  if Topic:
    if Topic == 'All':
      if User_Name == 'All':
        quote = topic_df['content'].sample(1).reset_index(drop=True)
        print(quote.item())
      else:
        user_name_filtered = topic_df[topic_df['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
    else:  
      if User_Name == 'All':
        quote = topic_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
      else:
        user_name_filtered = topic_filtered[topic_filtered['user_name'] == User_Name]
        quote = user_name_filtered['content'].sample(1).reset_index(drop=True)
        print(quote.item())
  else:
    print('Sorry, there are no quotes for these categories')


In [None]:



def on_button_clicked(_):
      # "linking function with output"
      with out:
          # what happens when we press the button
          clear_output()
          topics('All','All')
# linking button and function together using a button's method
button.on_click(on_button_clicked)
# displaying button and its output together
widgets.VBox([button,out])