In [1]:
import pandas as pd

# Notebook-wide display settings so long article text stays readable in previews:
# render up to 100 rows, never hide columns, and allow 500 characters per cell.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 500

In [2]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

In [3]:
# Fixed seed for the stochastic UMAP step. Without it the reduced embeddings,
# and therefore the topic ids saved later in this notebook, change on every run
# and cannot be regenerated from a fresh kernel.
RANDOM_STATE = 42

# Sentence encoder that turns each document into a dense embedding vector.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction: project the embeddings down to 5 components
# (cosine metric) before clustering.
# NOTE: per the UMAP documentation, fixing random_state gives reproducible
# output at the cost of some parallelism.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_STATE,
)

# Clustering of the reduced embeddings; clusters smaller than 15 documents are
# not formed. prediction_data=True keeps the extra data HDBSCAN needs to score
# new points later.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Bag-of-words tokenizer for topic descriptions, with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english")

# Class-based TF-IDF weighting with the library's default settings.
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
)

In [4]:
# Read the "subsample-2023-4" article table straight from the GCS bucket.
news_df = pd.read_parquet(
    path="gs://scraped-news-article-data-null/subsample-2023-4.parquet",
)
# Preview the first five rows to sanity-check the columns that were loaded.
news_df.head(n=5)

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
0,reuters,25639,Middle East,Major Gulf markets retreat in early trade,2023-04-05T08:16:00,"April 5 (Reuters) - Major stock markets in the Gulf were subdued in early trade on Wednesday, as signs of a slowing U.S. labour market made investors nervous about the economic outlook.\n\nAsia trade was thinned by holidays in Hong Kong and China, leaving MSCI's Asia-Pacific index excluding Japan (.MIAPJ0000PUS) faring little better than flat, while Japan's Nikkei (.N225) fell 1.6% and was set for the biggest one-day percentage fall since mid-March.\n\nSaudi Arabia's benchmark index (.TASI) ...",,
1,reuters,109921,Europe,Father of hundreds gets sperm donation ban from Dutch court,2023-04-28T22:44:00,"THE HAGUE, April 28 (Reuters) - A Dutch court on Friday the court ruling a man who judges said had fathered between 500 and 600 children around the world to stop donating sperm.\n\na man who judges said had fathered between 500 and 600 children around the world was forbidden to donate more semen to clinics, the court ruling said. a man who judges said had fathered between 500 and 600 children around the world could be fined 100,000 euros ($110,000) per infraction.\n\nA Dutch court also order...",,
2,reuters,82443,Asia Pacific,Former NZ PM Jacinda Ardern accepts Harvard fellowships,2023-04-25T22:54:00,"WELLINGTON, April 26 (Reuters) - Former New Zealand Prime Minister Jacinda Ardern said on Wednesday that Former New Zealand Prime Minister Jacinda Ardern was taking up three fellowships at Harvard University later in 2023.\n\nFormer New Zealand Prime Minister Jacinda Ardern stepped down as prime minister in January saying Former New Zealand Prime Minister Jacinda Ardern had ""no more in the tank"" to lead New Zealand and would also not seek re-election to parliament. Former New Zealand Prime M...",,
3,reuters,55850,United States,Factbox: What is known about latest leak of U.S. secrets,2023-04-11T13:55:00,"WASHINGTON, April 10 (Reuters) - (This April 10 story has been corrected to read NOFORN instead of NONFORN, in paragraph 6)\n\nU.S. national security agencies and the Justice Department are investigating the release of dozens of classified documents to assess the damage to national security and relations with allies and other countries, including Ukraine.\n\nHere is what we know and do not know about what appears to be the gravest leak of U.S. secrets in years:\n\nU.S. officials believe most...",,
4,reuters,63446,World,Explainer: South Korea's nuclear dilemma: Would U.S. trade Seattle for Seoul?,2023-04-26T10:31:00,"WASHINGTON, April 26 (Reuters) - At a summit with South Korean leader Yoon Suk Yeol on Wednesday, U.S. President Joe Biden will seek to reassure South Koreans that the U.S. nuclear umbrella is sufficient protection from North Korean attack.\n\nBut with North Korean missiles now a direct threat to the United States, an old Cold-War dilemma has resurfaced: would the United States risk nuclear retaliation against the United States to defend an ally?\n\nWHY IS South Koreans WORRIED?\n\nU.S. ""ext...",,


In [5]:
# Fit the topic model on the article bodies; the returned pair is kept whole
# here and unpacked in a separate cell.
clusters = topic_model.fit_transform(news_df["body"])

In [6]:
# `clusters` is the pair returned by fit_transform: split it into the
# per-document topic assignments and the accompanying probabilities.
topics, prob = clusters
# Inspect the container type of the topic assignments (displayed as `list`).
type(topics)

list

In [7]:
# Show every discovered topic as {topic_id: [(term, weight), ...]}.
# Per the BERTopic documentation, topic -1 collects the outlier documents.
topic_model.get_topics()

{-1: [('group', 0.002760418941299727),
  ('said', 0.0024537963976513613),
  ('new', 0.0023968438094255236),
  ('year', 0.002325335936270146),
  ('bank', 0.002266154077591328),
  ('billion', 0.0021967172699500362),
  ('reuters', 0.002191912984461558),
  ('april', 0.002178235023287301),
  ('federal', 0.0021427616819330504),
  ('china', 0.0020967023976799434)],
 0: [('joe', 0.037498663356501614),
  ('biden', 0.03677295208782699),
  ('desantis', 0.024933955497762398),
  ('ron', 0.024516071469747793),
  ('president', 0.02321174477989267),
  ('trump', 0.02275658697480155),
  ('republican', 0.022335747067233423),
  ('donald', 0.02191379743278028),
  ('florida', 0.01869382111797152),
  ('campaign', 0.015298878259509483)],
 1: [('abortion', 0.0564951878656614),
  ('mifepristone', 0.0515505975141842),
  ('pill', 0.04220723785789389),
  ('drug', 0.03855570843182264),
  ('administration', 0.026885244995672335),
  ('judge', 0.024573699175937363),
  ('food', 0.02108798656788588),
  ('approval', 0.01

In [15]:
# Attach the model output to the article table (topic id first, then its
# probability, so the column order is unchanged) and write the enriched table
# one directory above the notebook, without the index.
# NOTE(review): assumes `topics` and `prob` are still aligned row-for-row with
# news_df, i.e. the frame was not filtered or re-ordered since fit_transform.
news_df["topic"], news_df["probability"] = topics, prob
news_df.to_parquet(path="../2023-topics.parquet", index=False)