In [12]:
# Packages
import pandas as pd
import datetime
from top2vec import Top2Vec


In [16]:
# Loading Data
df = pd.read_csv('data/combined_data.csv', index_col=0)
df

Unnamed: 0,creation_time,user,tweet,retweets,favorites,followers,lists
0,2022-09-26 20:14:17+00:00,GregL_Intel,I am looking forward to Intel Fellow @brendang...,10,54,4123,41
1,2022-09-26 20:11:39+00:00,GregL_Intel,Open software ecosystems are key to fostering ...,4,26,4123,41
2,2022-09-21 18:48:59+00:00,GregL_Intel,“AI Everywhere” will require optimized hardwar...,11,25,4123,41
3,2022-09-20 18:00:01+00:00,GregL_Intel,Our collaboration with @TU_Muenchen and the It...,2,12,4123,41
4,2022-09-19 22:52:59+00:00,GregL_Intel,"I am looking forward to having @AndrewYNg, fou...",19,85,4123,41
...,...,...,...,...,...,...,...
63028,2020-04-15 16:15:57+00:00,BarryJOGorman,@S_dF speaking of 'daily digital habits' - int...,2,1,909,103
63029,2020-04-15 13:54:53+00:00,BarryJOGorman,@nyike - if 'pushing envelope' - will always m...,1,0,909,103
63030,2021-02-03 18:17:58+00:00,RahulRJB,Even the mighty fall\n#FarmersProstest,1,2,47,0
63031,2022-10-06 06:42:42+00:00,jonsadventures,I've been writing notes for my second year non...,10,62,99,4


In [17]:
# Separating tweet text as text documents
docs = df.tweet.tolist()

# Some cleanup of words that are not removed by Top2Vec's stopword cleaning:
docs = [d.replace("https", "") for d in docs]
docs = [d.replace(".co", "") for d in docs]

# Setting threshold for word frequency - this will determine how nuanced our topics are
min_ct_for_topic = int(len(docs) / 1000) # Change the number in this equation to tune - higher numbers here increases topic ct
print(f"Min Occurrance Count: {min_ct_for_topic}")

Min Occurrance Count: 63


In [18]:
# Building our Top2Vec model - this cell takes a long time to run!
start = datetime.datetime.now()
tv_model = Top2Vec(docs, 
#                    embedding_model='universal-sentence-encoder', 
                   min_count=min_ct_for_topic, 
                   workers=8, 
                   ngram_vocab=False, 
                   speed="deep-learn")
end = datetime.datetime.now(); elapsed = end-start
print('Cell took a total of {}'.format(elapsed))

2022-10-11 12:54:38,541 - top2vec - INFO - Pre-processing documents for training
2022-10-11 12:54:42,699 - top2vec - INFO - Creating joint document/word embedding
2022-10-11 13:35:02,288 - top2vec - INFO - Creating lower dimension embedding of documents
2022-10-11 13:35:43,536 - top2vec - INFO - Finding dense areas of documents
2022-10-11 13:35:49,597 - top2vec - INFO - Finding topics


Cell took a total of 0:41:11.664962


In [19]:
topic_sizes, topic_nums = tv_model.get_topic_sizes()
topic_words, word_scores, topic_nums = tv_model.get_topics()

print("Number of Topics: ",len(topic_nums))
print()
print("First 20 Topic Sizes:", topic_sizes[0:20])

Number of Topics:  433

First 20 Topic Sizes: [718 543 479 472 469 460 435 433 417 414 406 400 386 384 379 353 347 347
 342 334]


In [20]:
# Functions for examining our topics and tweet/topic relationships:
def examine_topic(topic, model):
    print('Main Keywords:')
    print(topic_words[topic])
    print('----------')
    print('Keyword Importance:')
    print(word_scores[topic]) # Look at word scores by topic
    print('----------')
    print('Sample Tweets:')
    print('----------')
    documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic, num_docs=5)
    for doc, score, doc_id in zip(documents, document_scores, document_ids):
        print(f"### Document: {doc_id}, Score: {score} ###")
        print(doc)
        print('----------')

# Individual Tweet Lookup
def tweet_topic_lookup(tweet_int, model, df=df):
    display(df.iloc[[tweet_int]].style.set_properties(**{'text-align': 'left'})) # Testing document correlation with DF by looking up tweets
    display(model.get_documents_topics([tweet_int])[1]) # Look at top2vec confidence of single tweet
    display(model.get_documents_topics([tweet_int])[2]) # Look at top2vec topic keywords compared to a single tweet

In [21]:
examine_topic(140, tv_model) # Examine specific topic

Main Keywords:
['knows' 'gretl' 'econometrics' 'everybody' 'statistics' 'everything'
 'nobody' 'else' 'datascience' 'gap' 'elon' 'doc' 'crazy' 'absolutely'
 'god' 'talking' 'pdf' 'contributed' 'terrible' 'guy' 'programming'
 'realize' 'fossil' 'wants' 'either' 'machines' 'somehow' 'statistical'
 'none' 'gop' 'neuralnetworks' 'stuff' 'someone' 'function' 'package'
 'oil' 'math' 'difference' 'selling' 'keeps' 'elonmusk' 'medicare'
 'mathematics' 'republicans' 'jack' 'file' 'green' 'materials' 'anyone'
 'he']
----------
Keyword Importance:
[0.23365511 0.22360906 0.219239   0.12581164 0.11512724 0.11330457
 0.10870527 0.10527524 0.10105331 0.0988408  0.08908126 0.08776836
 0.08202932 0.0757618  0.07196143 0.06944071 0.06817529 0.06389932
 0.06136418 0.06037543 0.05991491 0.05960588 0.05906182 0.05860115
 0.05764069 0.05731321 0.05648596 0.05558082 0.05443136 0.05431063
 0.05377773 0.05132705 0.05113176 0.05033844 0.05006592 0.04919706
 0.04902262 0.04827056 0.04681633 0.04583693 0.04425872

In [22]:
tweet_topic_lookup(4612, tv_model)

Unnamed: 0,creation_time,user,tweet,retweets,favorites,followers,lists
4612,2022-08-02 15:39:31+00:00,benthompson,Thank you for the warm welcome back Taiwan! https://t.co/bAJP0r4f3k,5,135,224268,5978


array([0.50866127], dtype=float32)

array([['ocean', 'fire', 'weekend', 'classes', 'mentor', 'midjourney',
        'lecture', 'chairs', 'ipcc', 'friday', 'warm', 'pr', 'tips',
        'conflict', 'appreciate', 'managing', 'digitalart', 'lucky',
        'distance', 'lectures', 'forever', 'card', 'christmas',
        'graduate', 'sea', 'ambassador', 'scholarship', 'dreams',
        'harvard', 'god', 'astra', 'trading', 'gift', 'beta', 'county',
        'walk', 'send', 'father', 'okay', 'program', 'ex', 'forms',
        'max', 'shares', 'generativeart', 'guys', 'yourself', 'title',
        'clarkwa', 'mental']], dtype='<U15')

In [23]:
# for loop - look up each document and append the topic ID to the original dataframe
topic_ids = []
for doc in tv_model.document_ids: # model.document_ids # List of all document IDs
    # tv_model.get_documents_topics([doc])[1][0] # Grabbing Confidence
    topic_ids.append(tv_model.get_documents_topics([doc])[0][0]) # Grabbing Topic ID
df['topic'] = topic_ids

In [None]:
### Saving Topics
# df['topic'].to_csv('data/topics.csv')