# Topic Modeling with Top2Vec
Here we'll build an NLP pipeline that extracts topics from our tweet text data. Then we can see how those topics might correlate with virality.

In [18]:
# Importing packages
import pandas as pd
import numpy as np
from top2vec import Top2Vec
import regex

### Loading and Processing Data

In [13]:
# Load the combined tweet dataset; the first CSV column is used as the DataFrame index.
data = pd.read_csv('data/combined_data.csv', index_col=0)

In [14]:
# Build the document list: one entry per row of the `tweet` column, in row order.
docs = data['tweet'].tolist()

In [20]:
# Clean every document in a single pass, removing (in this order) the
# substrings "https", "daysofcode" and ".co".
# These are literal substring removals, so ".co" is also stripped when it
# appears inside a longer token such as ".com".
docs = [
    d.replace("https", "").replace("daysofcode", "").replace(".co", "")
    for d in docs
]

### Top2Vec

In [21]:
# Model parameters: the threshold handed to Top2Vec as `min_count` scales with
# corpus size (one 700th of the number of documents, rounded down).
# NOTE(review): per the Top2Vec docs `min_count` is a word-frequency cutoff
# rather than a per-topic count -- confirm the variable name reflects the intent.
min_ct_for_topic = len(docs) // 700
print(f"Min Occurrance Count: {min_ct_for_topic}")

# Train the topic model on the cleaned documents.
model = Top2Vec(
    docs,
    embedding_model='universal-sentence-encoder',
    min_count=min_ct_for_topic,
    workers=8,
    ngram_vocab=False,
    speed="deep-learn",
)

2022-10-15 13:12:36,371 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


Min Occurrance Count: 90


2022-10-15 13:12:41,175 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-10-15 13:39:28,090 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-10-15 13:39:38,752 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-10-15 13:40:12,040 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-10-15 13:40:15,665 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


### Interpreting Outputs

In [22]:
# Peek topic length & distribution
# get_topic_sizes() is unpacked into the per-topic sizes and the topic numbers;
# the count of topic numbers is reported as the number of topics found.
topic_sizes, topic_nums = model.get_topic_sizes()
print("Number of Topics: ",len(topic_nums))
print()
print(topic_sizes)

Number of Topics:  228

[1706 1700 1657 1310 1304 1183 1149 1084 1035  984  892  871  859  852
  688  669  653  648  618  616  594  586  582  577  573  557  551  541
  520  518  504  502  481  464  448  444  443  442  414  414  413  409
  392  392  385  380  372  357  352  352  349  346  346  342  337  335
  334  331  325  320  319  315  314  313  309  306  305  302  296  293
  292  289  285  278  277  277  276  275  272  271  270  265  262  261
  261  255  246  242  240  238  236  234  228  228  227  222  211  210
  208  207  204  202  200  197  195  194  193  193  190  186  184  184
  177  176  175  173  170  169  168  167  167  165  162  162  162  161
  160  160  159  156  155  151  150  147  140  140  139  137  137  137
  136  136  134  133  133  131  128  128  126  125  125  123  123  122
  121  120  119  118  117  116  114  114  113  112  112  112  110  109
  108  108  106  105  104  101  101  100   99   99   95   94   92   92
   92   89   89   89   86   86   84   84   82   79   

In [55]:
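# # Examine all topics (uncomment only after the `model.get_topics()` cell below has run; it defines topic_words and word_scores)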
# for words, scores, num in zip(topic_words, word_scores, topic_nums):
#     print(num)
#     print(f"Topic Keywords: {words}")

In [54]:
# Extract topic data: keywords per topic, the score of each keyword, and the topic numbers.
# Note: this rebinds `topic_nums`, which the topic-size cell above also assigns.
topic_words, word_scores, topic_nums = model.get_topics()

# Individual Tweet Lookup
def tweet_topic_lookup(tweet_int, model, data):
    """Display one tweet next to the topic information the model holds for it.

    Args:
        tweet_int: Integer used both as the row position in `data` and as the
            document id passed to `model`.
        model: Top2Vec-style model exposing `get_documents_topics(doc_ids)`.
        data: DataFrame whose rows correspond to the model's documents.

    Returns:
        None. Three objects are displayed in order: the left-aligned row of
        `data`, the document's topic score, and the keywords of its topic.
    """
    # Show the raw row first so the tweet text can be checked against the topic.
    display(data.iloc[[tweet_int]].style.set_properties(**{'text-align': 'left'}))

    # Query the model once and reuse the result instead of repeating the same
    # lookup for every field that is displayed.
    doc_topics = model.get_documents_topics([tweet_int])
    display(doc_topics[1])  # top2vec confidence for this single tweet
    display(doc_topics[2])  # keywords of the topic assigned to this tweet

# Look up topic and get information and examples
def examine_topic(topic, model):
    """Print the keywords, keyword scores and sample tweets for one topic.

    Args:
        topic: Topic number to inspect; used to index the model's topic
            keywords/scores and passed to the document search.
        model: Top2Vec-style model exposing `get_topics()` and
            `search_documents_by_topic(topic_num, num_docs)`.

    Returns:
        None. Everything is written to stdout, including up to five sample
        documents for the topic.
    """
    # Read keywords and scores from the model that was passed in, so the
    # function does not rely on notebook-level variables set by another cell.
    topic_words, word_scores, _ = model.get_topics()

    print('Main Keywords:')
    print(topic_words[topic])
    print('----------')
    print('Keyword Importance:')
    print(word_scores[topic]) # Look at word scores by topic
    print('----------')
    print('Sample Tweets:')
    print('----------')
    documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic, num_docs=5)
    for doc, score, doc_id in zip(documents, document_scores, document_ids):
        print(f"### Document: {doc_id}, Score: {score} ###")
        print(doc)
        print('----------')

In [53]:
# Examine specific topic: print keywords, keyword scores and sample tweets for topic number 3.
examine_topic(3, model)

Main Keywords:
['nlp' 'deepmind' 'neural' 'tensorflow' 'kaggle' 'generative' 'openai'
 'computational' 'algorithms' 'ai' 'coursera' 'dataset' 'learning'
 'freecodecamp' 'algorithm' 'bigdata' 'datasets' 'training' 'webinar'
 'introducing' 'neuroscience' 'taught' 'data' 'mit' 'blockchain'
 'cognitive' 'iot' 'reinforcement' 'recognition' 'stanford'
 'classification' 'econometrics' 'pytorch' 'quantum' 'gpu' 'text'
 'framework' 'knows' 'featuring' 'ml' 'developed' 'inference'
 'interactive' 'api' 'computing' 'github' 'trained' 'abstract' 'improved'
 'study']
----------
Keyword Importance:
[0.39640322 0.35023597 0.33974016 0.32590184 0.322042   0.30702546
 0.28158936 0.27387494 0.26199976 0.25702336 0.25467148 0.2471179
 0.22766978 0.21247593 0.2123543  0.21055162 0.20482895 0.19830191
 0.1968064  0.19650778 0.19054145 0.18923914 0.18612158 0.18451017
 0.18440522 0.18187502 0.18078029 0.17971212 0.17963073 0.17637622
 0.17589478 0.17390436 0.17373878 0.17256013 0.17111129 0.17054862
 0.16924

In [47]:
# Show the tweet at row position 0 together with its topic score and topic keywords.
tweet_topic_lookup(0, model, data)

Unnamed: 0,creation_time,user,tweet,retweets,favorites,followers,lists,topic,confidence
0,2022-09-26 20:14:17+00:00,GregL_Intel,I am looking forward to Intel Fellow @brendangregg joining me during my keynote at Intel Innovation. Hear directly from a globally recognized expert in computing performance and eBPF as well as other industry experts. https://t.co/cq9gt1SuPx #IntelON,10,54,4123,41,25,0.484086


array([0.4840864], dtype=float32)

array([['webinar', 'ai', 'deepmind', 'kaggle', 'openai', 'attending',
        'meeting', 'conference', 'technologies', 'robotics', 'discussed',
        'panel', 'session', 'meetings', 'mit', 'vr', 'collaboration',
        'talks', 'joining', 'iot', 'introducing', 'institute',
        'neuroscience', 'conferences', 'neural', 'ml', 'lab', 'coursera',
        'implications', 'invited', 'speakers', 'participate',
        'autonomous', 'computational', 'offline', 'announced', 'virtual',
        'generative', 'discussing', 'guest', 'speaker', 'cohort',
        'sessions', 'cybersecurity', 'workshop', 'based', 'met',
        'joined', 'discussions', 'third']], dtype='<U15')

In [58]:
# Top tweets for a topic: the five most-retweeted tweets assigned to topic 70, left-aligned.
# NOTE: needs the `topic` column, which this notebook assigns in the
# "Preparing data for export" section.
(
    data.loc[data['topic'] == 70]
    .sort_values('retweets', ascending=False)
    .head()
    .style.set_properties(**{'text-align': 'left'})
)

Unnamed: 0,creation_time,user,tweet,retweets,favorites,followers,lists,viral,virality,topic,confidence
14577,2021-10-09 19:18:19+00:00,RomeoStevens76,(source unknown) https://t.co/cdJ25qAEIE,128,712,4104,82,1,0.001975,70,0.279908
17158,2022-05-22 15:58:20+00:00,MCMCD_,other coal pieces https://t.co/hJmMZzhwvv,25,367,3499,28,1,0.000452,70,0.147615
14730,2022-02-17 09:22:35+00:00,smallkewlhuman,@PrimeOgHarris @zaharoorin rest in peace sweet antwain <3,18,1818,2516,20,1,0.000453,70,0.338763
17323,2021-08-24 23:35:05+00:00,metaphorician,Earthling 😵 @ziyatong https://t.co/xngI3On2jh,9,58,3063,55,1,0.000186,70,0.367412
16607,2021-08-18 17:16:41+00:00,CharlesCrooks,@ReturnofR “It Makes No Difference” by The Band. https://t.co/sapjCcxHrF,8,16,351,1,1,0.001443,70,0.395987


### Preparing data for export

In [38]:
# Look up the topic of every document in one batched query and attach the
# results to the original dataframe.
# get_documents_topics accepts a list of ids, so a single call replaces the
# two single-document model queries the per-document loop needed.
doc_topics = model.get_documents_topics(list(model.document_ids)) # model.document_ids: all document IDs
topic_ids = doc_topics[0] # Topic ID assigned to each document
topic_confidence = doc_topics[1] # Confidence of each document for its assigned topic
# Assignment is positional: rows of `data` are assumed to be in the same order
# as the documents the model was trained on.
data['topic'] = topic_ids
data['confidence'] = topic_confidence

### Export

In [46]:
# Write only the `topic` column to topics.csv in the working directory;
# the `confidence` column is not part of this export.
data['topic'].to_csv('topics.csv')

:)