In [62]:
import random
import torch
import pickle

import re

import pandas
pandas.set_option('display.max_rows', None)

from sentence_transformers import SentenceTransformer, models
from bertopic import BERTopic

## Data importing and cleaning

* Remove links, images, @usernames and RTs indicating re-tweeting.
* Remove all content which is empyt after this

In [63]:
data = pandas.read_csv("realDonaldTrump_in_office.csv")

## remove links and images
data['Tweet.Text'] = data['Tweet.Text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
data['Tweet.Text'] = data['Tweet.Text'].replace(r'pic.twitter.com\S+', '', regex=True)

## remove usernames
data['Tweet.Text'] = data['Tweet.Text'].replace(r'@\S+', '', regex=True)

## remove RTs
data['Tweet.Text'] = data['Tweet.Text'].replace(r'RT', '', regex=True)

data['Tweet.Text'] = data['Tweet.Text'].str.strip()

## remove items which have no remaining content after cleaning it
data = data.drop( data[ data['Tweet.Text'] == '' ].index )

data.head()

Unnamed: 0.1,Unnamed: 0,ID,Time,Tweet.URL,Tweet.Text
0,1,@realDonaldTrump,2017-01-20 06:31,https://twitter.com/realDonaldTrump/status/82...,It all begins today! I will see you at 11:00 A...
1,2,@realDonaldTrump,2017-01-20 11:51,https://twitter.com/realDonaldTrump/status/82...,Today we are not merely transferring power fro...
2,3,@realDonaldTrump,2017-01-20 11:51,https://twitter.com/realDonaldTrump/status/82...,"power from Washington, D.C. and giving it back..."
3,4,@realDonaldTrump,2017-01-20 11:52,https://twitter.com/realDonaldTrump/status/82...,What truly matters is not which party controls...
4,5,@realDonaldTrump,2017-01-20 11:53,https://twitter.com/realDonaldTrump/status/82...,"January 20th 2017, will be remembered as the d..."


## BERTopics created

* Resetting seeds for each model, extracting the word embeddings and then running the analysis with as close to default settings as possible

In [64]:
def create_model( model_name ):

    torch.manual_seed(0)
    random.seed(0)

    word_embedding_model = models.Transformer(model_name, max_seq_length=256)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

    transformer_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    topic_model = BERTopic(embedding_model=transformer_model )
    topic_model.fit_transform(data['Tweet.Text'])

    reduced_topics = topic_model.reduce_topics(data['Tweet.Text'], nr_topics="auto")
    
    pickle.dump( ( reduced_topics, topic_model, transformer_model ) , file = open( f"{model_name.replace('/', '_').replace('..', '')}", "wb") )
    return reduced_topics, topic_model, transformer_model

In [65]:
capitalist, capitalist_full, capitalist_transformer = create_model("microsoft/deberta-v3-base")
marxist, marxist_full, marxist_transformer = create_model("../models/microsoft_deberta-v3-base-finetuned-masked-model")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISMhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the env

## Basic summaries

In [67]:
capitalist.merge_topics(data['Tweet.Text'], [-1, 0])
capitalist.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19428,-1_the_to_and_of,"[the, to, and, of, in, is, for, that, on, are]",[It is amazing that I became President of the ...
1,0,205,0_great_true_wow_thanks,"[great, true, wow, thanks, job, book, amazing,...","[Great!, Great!, Great!]"
2,1,127,1_thank_you_great_nice,"[thank, you, great, nice, honor, job, keep, wo...","[THANK YOU GREAT JOB!!, Great news, thank you..."
3,2,97,2_live_delivers_remarks_potus,"[live, delivers, remarks, potus, president, sa...","[LIVE: President delivers remarks on testing,..."
4,3,85,3_again_make_america_great,"[again, make, america, great, watcher, volunte...","[MAKE AMERICA GREAT AGAIN!, MAKE AMERICA GREAT..."
5,4,84,4_thank_you_000_matt,"[thank, you, 000, matt, payrolls, adp, 219, es...","[Thank you Mark!, Thank you John!, THANK YOU B..."
6,5,78,5_interviewed_00_enjoy_tonight,"[interviewed, 00, enjoy, tonight, at, eastern,...",[I will be interviewed by tonight at 9:00 P.M...
7,6,77,6_great_usa_job_agree,"[great, usa, job, agree, wow, dirty, congratul...","[WOW, they got caught. End the Witch Hunt now!..."
8,7,77,7_true_watch_enjoy_now,"[true, watch, enjoy, now, address, on, weekly,...","[Thanks - TRUE!, Watch on NOW. Enjoy!, Than..."
9,8,75,8_pleased_announce_secretary_united,"[pleased, announce, secretary, united, of, am,...",[We are pleased to announce that Matthew G. Wh...


In [69]:
marxist.merge_topics(data['Tweet.Text'], [-1, 0])
marxist.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19006,-1_the_to_and_of,"[the, to, and, of, in, is, for, that, on, are]","[....and other really bad people, SPIED ON MY ..."
1,0,462,0_via_trump_biden_election,"[via, trump, biden, election, fbi, to, clinton...",[EXCLUSIVE: Report Shows FBI Official Received...
2,1,118,1_thank_you_matt_franklin,"[thank, you, matt, franklin, rudy, brad, steve...","[Thank you!, Thank you!, Thank you!]"
3,2,101,2_true_wow_thanks_great,"[true, wow, thanks, great, amazing, yes, inter...","[True!, Thanks - TRUE!, Thanks - TRUE!]"
4,3,99,3_interviewed_00_enjoy_tonight,"[interviewed, 00, enjoy, tonight, at, be, east...",[Will be interviewed by tonight at 9:00 P.M. ...
5,4,89,4_see_soon_way_state,"[see, soon, way, state, landed, everyone, head...","[On my way, see you soon!, On my way to the Gr..."
6,5,87,5_thank_you_love_nice,"[thank, you, love, nice, honor, so, true, grea...","[Thank you. I love you too!, Great news, thank..."
7,6,83,6_thank_maga_you_maga2020,"[thank, maga, you, maga2020, kag2020, missoula...","[THANK YOU! #MAGA, THANK YOU! #MAGA, THANK YOU..."
8,7,82,7_again_make_america_great,"[again, make, america, great, keep, making, pr...","[MAKE AMERICA GREAT AGAIN!, MAKE AMERICA GREAT..."
9,8,79,8_thank_you_mike_great,"[thank, you, mike, great, dirty, usa, group, k...","[Thank you! ‚Äö√Ñ¬∂, Thank you! ‚Äö√Ñ¬∂, Thank you! ‚Äö√Ñ¬∂]"


In [70]:
capitalist.visualize_barchart().write_image("capitalist-barchart.pdf")
capitalist.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [71]:
marxist.visualize_barchart().write_image("marxist-barchart.pdf")
marxist.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [72]:
marxist.get_representative_docs(3)

['Will be interviewed by  tonight at 9:00 P.M. on  Enjoy!!!',
 'I will be interviewed by  tonight at 9:00 P.M. on  Enjoy!',
 'Will be interviewed by  on  tonight at 8:00 P.M. Enjoy!']

## Focus on the Nasdaq topic as an example

In [100]:
marxist.get_representative_docs(55)

['DOW, S&amp;P 500 and NASDAQ close at record highs! #MAGA',
 '‚ÄúDOW, NASDAQ, S&amp;P 500 CLOSE AT RECORD HIGHS‚Äù',
 '‚ÄúDOW, NASDAQ, S&amp;P 500 CLOSE AT RECORD HIGHS‚Äù']

In [109]:
topics_over_time = marxist.topics_over_time( data['Tweet.Text'], data['Time'], nr_bins = 50  )
marxist.visualize_topics_over_time(topics_over_time, topics = [55 ])



In [107]:
for topic in capitalist.find_topics('NASDAQ')[0]:
    print( topic )
    print( capitalist.get_representative_docs( topic ) )

32
['Congratulations', 'Congratulations', 'Congratulations']
2
['So true!', 'So true!', 'So true']
49
['OBAMAGATE!', 'OBAMAGATE!', 'OBAMAGATE!']
43
['Presidential Harassment!', 'Presidential Harassment!', 'Presidential Harassment!']
20
['READ THE TRANSCRIPTS!', 'Read the transcripts!', 'Read the Transcripts!']
