In [1]:
import pandas as pd
from bertopic import BERTopic
import re
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Load data
df = pd.read_csv("../Data/ukraine_textcontain_after_new_preprocessed.csv")
df

Unnamed: 0.1,Unnamed: 0,article_title,author,published_time,article_text,article_category_one,article_category_two,picture_description,author_title,author_description
0,0,US can no longer be perfunctory toward Guterre...,Global Times,2022-08-22,"Since this year, the United Nations (UN) has r...",OPINION,EDITORIAL,United Nations Secretary-General Antonio Guter...,Author details not found,Author details not found
1,1,Healthy and stable China-S.Korea ties depend o...,Wang Junsheng,2022-08-23,Wednesday marks the 30th anniversary of the es...,OPINION,VIEWPOINT,Illustration: Chen Xia/GT,Author details not found,Author details not found
2,2,"China’s power in the world, economically and p...",Global Times,2022-08-23,"Editor's Note: For Chinese people, the past de...",OPINION,VIEWPOINT,"Tower Bridge, London, the UK Photo:VCG",Author details not found,Author details not found
3,3,Europe faces ‘coldest’ winter as energy crisis...,Qi Xijia,2022-08-23,A deepening energy crisis in Europe could put ...,SOURCE,ECONOMY,A coal-fired power plant operated by German en...,Author details not found,Author details not found
4,4,Severe drought in Europe ‘further expanding an...,AFP,2022-08-23,A severe drought hitting swathes of Europe is ...,WORLD,EUROPE,Picture description not found,Author details not found,Author details not found
...,...,...,...,...,...,...,...,...,...,...
484,485,US policy of ‘technological apartheid’ could l...,William Jones,2022-06-06,A New York Times article earlier this month do...,OPINION,VIEWPOINT,Illustration: Tang Tengfei/Global Times,Author details not found,Author details not found
485,486,US behind global grain crises: Chinese FM spok...,Global Times,2022-06-07,Zhao Lijian Photo: VCG Each global grain cris...,SOURCE,ECONOMY,Picture description not found,Author details not found,Author details not found
486,487,"‘No longer US’ backyard,’ Latin America sends ...",GT staff reporters,2022-06-07,Mexico's President Andres Manuel Lopez Obrador...,WORLD,AMERICAS,Picture description not found,Author details not found,Author details not found
487,488,Chinese envoy warns against providing weapons ...,Xinhua,2022-06-07,A Chinese envoy on Monday warned against const...,CHINA,DIPLOMACY,Picture description not found,Author details not found,Author details not found


In [3]:
sentences = []
dates = []
for article, date in zip(df['article_text'], df['published_time']):
    sentence_cout = len(sentences)
    sentences.extend(sent_tokenize(str(article)))
    delta = len(sentences) - sentence_cout
    for _ in range(delta):
        dates.append(date)
sentences = [re.sub(r"[^a-zA-Z]+", " ", s) for s in sentences]

In [12]:
#prapare models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(sentences, show_progress_bar=True)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english")
# client = openai.OpenAI(api_key="sk-WuvQLP8aUps80YkLbDDRKYHCFi1k8SfHJ4VJF2-Ow0T3BlbkFJMkZXPa3NSyP9-nWqhxncLvIvJ5wOpctXxbOtvM0ioA")
# prompt = """
# I have a topic that contains the following documents:
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]
#
# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic: <topic label>
# """
# openai_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=5, prompt=prompt)
representation_model = MaximalMarginalRelevance(diversity=0.2)

Batches:   0%|          | 0/398 [00:00<?, ?it/s]

In [13]:
# Create BERTopic model
topic_model = BERTopic(verbose=True, vectorizer_model=vectorizer_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model, representation_model=representation_model)
topics, probs = topic_model.fit_transform(sentences)

2024-12-10 20:21:20,265 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/398 [00:00<?, ?it/s]

2024-12-10 20:23:04,991 - BERTopic - Embedding - Completed ✓
2024-12-10 20:23:04,992 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-10 20:23:22,534 - BERTopic - Dimensionality - Completed ✓
2024-12-10 20:23:22,535 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-10 20:23:23,078 - BERTopic - Cluster - Completed ✓
2024-12-10 20:23:23,084 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-10 20:23:24,515 - BERTopic - Representation - Completed ✓


In [14]:
topics_over_time = topic_model.topics_over_time(sentences,dates, nr_bins=20)

20it [00:22,  1.10s/it]


In [15]:
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"china, countries, products, belarusian, chinese",147,2022-06-05 22:07:40.800
1,0,"countries, policy, violence, interests, global",42,2022-06-05 22:07:40.800
2,1,"ukraine, russia, weapons, conflict, oil",62,2022-06-05 22:07:40.800
3,2,"china, chinese, nations, zhang, guns",36,2022-06-05 22:07:40.800
4,3,"farm, xinjiang, agricultural, bozhou, harvester",2,2022-06-05 22:07:40.800
...,...,...,...,...
492,18,"covid, pandemic, hiatus, wave, visits",3,2022-08-19 02:24:00.000
493,20,"nato, asia, destabilizing, establish, seeking",3,2022-08-19 02:24:00.000
494,22,"committee, cpc, pandemic, jiechi, visits",1,2022-08-19 02:24:00.000
495,31,"ali, champion, seconds, championships, garcia",4,2022-08-19 02:24:00.000


In [24]:
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
fig.write_image("topic_over_time.png")

In [17]:
#print topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2910,-1_china_global_chinese_countries,"[china, global, chinese, countries, economic, ...",[ It s clear that NATO is no longer a military...
1,0,1713,0_world_countries_democracy_political,"[world, countries, democracy, political, secur...",[And we also need to find a way to a new archi...
2,1,1078,1_ukraine_russia_eu_sanctions,"[ukraine, russia, eu, sanctions, oil, ukrainia...","[Take Russia Ukraine conflict as an example , ..."
3,2,1047,2_china_chinese_relations_countries,"[china, chinese, relations, countries, russia,...",[Chinese Vice Premier Liu He a member of the P...
4,3,873,3_xinjiang_institute_training_vocational,"[xinjiang, institute, training, vocational, ar...",[Xinjiang Uyghur Autonomous Region Economic an...
5,4,660,4_taiwan_island_straits_mainland,"[taiwan, island, straits, mainland, chinese, i...",[Taiwan island has always been a central part ...
6,5,397,5_food_grain_ukraine_crisis,"[food, grain, ukraine, crisis, wheat, istanbul...",[The longer and bigger the conflict is the gre...
7,6,356,6_brics_global_summit_cooperation,"[brics, global, summit, cooperation, emerging,...",[Accounting for more than percent of the world...
8,7,313,7_japan_japanese_constitution_kishida,"[japan, japanese, constitution, kishida, milit...",[Japan is desperately trying to pull the US an...
9,8,287,8_pelosi_visit_taiwan_nancy,"[pelosi, visit, taiwan, nancy, chinese, counte...",[ It was published on Wednesday following the ...


In [23]:
fig = topic_model.visualize_topics()
#save plotly png
fig.write_image("topic_distance.png")

In [19]:
topic_model.visualize_documents(sentences, embeddings=embeddings)