In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import json
import pprint

In [11]:
data = pd.read_json("bluesky_posts_0131.json",lines=True,orient='columns')

In [12]:
data[:10]

Unnamed: 0,author,text,timestamp,likes,reposts,comment
0,worldbytenews.bsky.social,Renowned Dallas burger spot hosting fundraiser...,2025-01-31T12:31:07Z,0,0,[]
1,worldbytenews.bsky.social,Renowned Dallas burger spot hosting fundraiser...,2025-01-31T12:20:41Z,0,0,[]
2,smcclain2110.bsky.social,I feel like when we were a more cohesive socie...,2025-01-31T12:14:58.402Z,1,0,[]
3,rufatto.bsky.social,Ontem rolou o Fire Aid em beneficio das vitima...,2025-01-31T11:47:13.570Z,2,0,[fui direto ver o Dawes com Stephen Stills e G...
4,thecapitalist.bsky.social,"California has been devastated by fires, trans...",2025-01-31T11:34:54.638Z,0,0,[]
5,greendayitaly.bsky.social,Billie Joe Armstrong of Green Day poses with D...,2025-01-31T11:14:30.018Z,3,0,[]
6,elpaischile.bsky.social,Durante 2025 se firmará un acuerdo de colabora...,2025-01-31T10:53:57.509Z,2,0,[]
7,greendayitaly.bsky.social,Green Day performing with Billie Eilish during...,2025-01-31T09:44:07.267Z,16,6,[]
8,emoryro.bsky.social,"No notes. Okay, one note.",2025-01-31T09:33:36.536Z,0,0,[]
9,aptronym.bsky.social,Chubb Ltd. is estimating that it will need to ...,2025-01-31T09:22:33.129Z,0,0,[]


In [13]:
docs = data[data['text'].str.len() > 30].text.unique()

In [14]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')


In [50]:
stop_words.extend(['california wildfires', 'california', 'los angeles', 'california fire', 'california fires', 'california wildfire',
                   'fire', 'wildfire', 'fires', 'wildfires', 'ca', 'los', 'angeles', 'la', 'san', 'francisco', 'sf'])

In [56]:
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
representation_model = KeyBERTInspired()
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors=20, n_components=2, min_dist=0)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

In [57]:
from bertopic import BERTopic

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=5,
    language='english',
    calculate_probabilities=True,
    verbose=True
)
topics, probs = model.fit_transform(docs)

2025-03-04 22:22:08,917 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/136 [00:00<?, ?it/s]

2025-03-04 22:22:50,959 - BERTopic - Embedding - Completed ✓
2025-03-04 22:22:50,960 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-04 22:23:00,446 - BERTopic - Dimensionality - Completed ✓
2025-03-04 22:23:00,453 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-04 22:23:00,713 - BERTopic - Cluster - Completed ✓
2025-03-04 22:23:00,738 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-04 22:23:09,454 - BERTopic - Representation - Completed ✓


In [58]:
model.visualize_topics()

In [59]:
freq = model.get_topic_info()
freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,799,-1_ending fema_fema ahead_fema_trump recommends,"[ending fema, fema ahead, fema, trump recommen...",[Dumbass President wants to end FEMA. If you v...
1,0,1094,0_firefighters_cal_palisadesfire_pacific,"[firefighters, cal, palisadesfire, pacific, pa...",[Progress is made on a huge fire north of LA a...
2,1,732,1_trump wants_gavin newsom_disaster relief_trump,"[trump wants, gavin newsom, disaster relief, t...",[Trump Shifts Tone on California After Touring...
3,2,299,2_battery facility_battery plants_battery plan...,"[battery facility, battery plants, battery pla...",[Fire at world’s largest battery facility is a...
4,3,294,3_dangerous winds_gusty winds_potential rain_w...,"[dangerous winds, gusty winds, potential rain,...",[Fire risk and strong winds continue in Southe...
5,4,238,4_hughes burning_hughesfire_burning near_hughe...,"[hughes burning, hughesfire, burning near, hug...",[Southern California Wildfires Grow As Santa A...
6,5,230,5_fireaidla org_fireaid_fireaidla_fundraising,"[fireaidla org, fireaid, fireaidla, fundraisin...",[FireAid for LA ~ \nBringing together some of ...
7,6,183,6_trump water_water trump_water policy_trump cr,"[trump water, water trump, water policy, trump...",[Trump targets California water policy as he p...
8,7,179,7_homes resistant_implementing law_law make_law,"[homes resistant, implementing law, law make, ...",[California is years behind in implementing a ...
9,8,165,8_eaton altadena_southern edison_deadly eaton_...,"[eaton altadena, southern edison, deadly eaton...",[Altadena’s Black residents disproportionally ...


In [60]:
model.visualize_heatmap()

In [61]:
model.visualize_hierarchy()