In [1]:
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

In [2]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

In [3]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

vectorizer_model = CountVectorizer(stop_words="english")

ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
)

In [4]:
news_df = pd.read_parquet("../2023-coref.parquet")
news_df.head()

Unnamed: 0,source,id,category,title,published,body,summary,summary_type
0,reuters,29667,Europe,"European shares rise on upbeat cues from Powell, strong earnings",2023-02-08T17:11:00,"Feb 8 (Reuters) - European shares rose on Wednesday, lifted by positive cues from U.S. Federal Reserve Chair Jerome Powell's overnight remarks as well as upbeat earnings from energy and chemicals firms.\n\nThe pan-European STOXX 600 (.STOXX) closed 0.3% higher having retreated from nine-month highs hit earlier in the session after other U.S. Federal Reserve policymakers sounded a more hawkish tone.\n\nU.S. Federal Reserve Chair Jerome Powell's's remarks on Tuesday, in which U.S. Federal Rese...","* \n* STOXX 600 pulls back from 9-mth high to close up 0.3%\n* Neste, Equinor boost energy stocks on strong earnings\n* Pandora jumps on better than expected results",BULLETS
1,reuters,79619,China,China Evergrande debt restructuring incentive to test creditor appetite,2023-04-27T06:05:00,"HONG KONG, April 27 (Reuters) - Embattled property developer China Evergrande Group (3333.HK) will get a signal on Thursday of the level of creditor backing for Embattled property developer China Evergrande Group (3333.HK)'s offshore debt restructuring proposal, with investors to be given an incentive for early support.\n\nWhile some major creditors have already agreed to support its offshore debt restructuring proposal, others are eager to cut ties with Embattled property developer China Ev...",,
2,reuters,48010,European Markets,European stocks rally as banking worries fade,2023-03-30T16:27:00,"March 30 (Reuters) - European stocks rose to near three-week highs on Thursday, lifted by upbeat results from retail giant H&M and as fading concerns about the global banking sector fuelled a Wall Street rally.\n\nThe pan-European STOXX 600 index (.STOXX) rose 1.1%, hitting The pan-European STOXX 600 index (.STOXX) strongest level since March 10, tracking the recovery in global markets.\n\n""It's a combination of easing banking fears and expectation that we are very close to peak rates which ...","* \n* H&M posts surprise profit in Dec-Feb\n* Vestas wins order in Brazil\n* Spain March inflation falls more than expected\n* German inflation eases but less than expected in March\n* STOXX 600 rises 1.0%, banks add 1.5%",BULLETS
3,reuters,67859,Aerospace & Defense,China gears up to compete with SpaceX's Starlink this year,2023-03-02T10:29:00,"BEIJING, March 2 (Reuters) - China's military-industrial complex is set to start building China's military-industrial complex's first constellation of very low-Earth orbit (LEO) satellites later this year, the latest Chinese bid to catch up with U.S.-based Starlink.\n\nU.S.-based Starlink has built a fast-growing network of more than 3,500 satellites in low-Earth orbit. U.S.-based Starlink has tens of thousands of users in the United States so far, and plans to add tens of thousands more sat...",,
4,reuters,109054,World,Russia's war on Ukraine latest: Moscow denies reports of Ukrainian breakthroughs,2023-05-12T05:13:00,"May 11 (Reuters) - Russia's defence ministry on Thursday denied reports that Ukrainian forces had broken through in various places along the front lines and said the military situation was under control.\n\nRussia's reacted after Russian military bloggers, writing on Telegram, reported what Russian military bloggers said were Ukrainian advances north and south of the city of Bakhmut, with some suggesting a long-awaited counter offensive by pro-Kyiv forces had started.\n\n* President Volodymy...",,


In [5]:
clusters = topic_model.fit_transform(news_df.body)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
topics, prob = clusters
type(topics)

list

In [12]:
topic_model.get_topics()

{-1: [('fox', 0.0008343562388895845),
  ('meta', 0.0006664647082011821),
  ('business', 0.000660936842484573),
  ('russian', 0.000652827467150431),
  ('rose', 0.0006249531092944191),
  ('federal', 0.0006221544346418792),
  ('china', 0.000619572992807452),
  ('said', 0.0006164097201609378),
  ('company', 0.0006154021427715742),
  ('german', 0.0006138148960396302)],
 0: [('openai', 0.01948428043776365),
  ('microsoft', 0.019245174024242268),
  ('chatgpt', 0.01781041933242382),
  ('ai', 0.017308309083302442),
  ('chatbot', 0.01438820736972961),
  ('bing', 0.011979222950661932),
  ('msft', 0.010647218723821864),
  ('artificial', 0.010457368085216957),
  ('bard', 0.00999661352389688),
  ('generative', 0.0097481673776943)],
 1: [('opec', 0.019149169226633938),
  ('crude', 0.01715375326935137),
  ('brent', 0.014308209809166805),
  ('barrel', 0.013642263336666682),
  ('intermediate', 0.01125594257774884),
  ('wti', 0.011071934278289638),
  ('oil', 0.008573241328084959),
  ('barrels', 0.0085479

In [15]:
news_df["topic"] = topics
news_df["probability"] = prob
news_df.to_parquet("../2023-topics.parquet", index=False)