# BERTrend quickstart
The purpose of this notebook is to complement the existing demos available in the directory `bertrend/demos` with some code examples that explain how to integrate BERTrend with your application code.

In [1]:

%load_ext autoreload
%autoreload 2

## BERTrend installation

In [54]:
import json
from pathlib import Path

import pandas as pd
from pandas import Timestamp
from IPython.display import display
from IPython.display import HTML
from loguru import logger

from bertrend import DATA_PATH
from bertrend.BERTrend import BERTrend
from bertrend import MODELS_DIR
from bertrend.utils.data_loading import load_data, split_data, TEXT_COLUMN
from bertrend.utils.data_loading import group_by_days
from bertrend.services.embedding_service import EmbeddingService
from bertrend.BERTopicModel import BERTopicModel
from bertrend.topic_analysis.topic_description import generate_topic_description
from bertrend.trend_analysis.weak_signals import analyze_signal


In [55]:
# Uncomment to install BERTrend into the kernel's environment
# %pip install bertrend

### Configuration of topic models

In [6]:
# Build the reference topic model from an inline TOML configuration.
# Each BERTopic parameter can be set from the constructor or read from a configuration file;
# the configuration below overrides the default one to use English.
config = '''
# Default configuration file to be used for topic model

# Global parameters
[global]
language = "English"

# BERTopic parameters: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.__init__
[bertopic_model]
top_n_words = 10
verbose = true
representation_model = ["MaximalMarginalRelevance"] # KeyBERTInspired, OpenAI
zeroshot_topic_list = []
zeroshot_min_similarity = 0

# UMAP parameters: https://umap-learn.readthedocs.io/en/latest/api.html
[umap_model]
n_neighbors = 5
n_components = 5
min_dist = 0.0
metric = "cosine"
random_state = 42

# HDBSCAN parameters: https://hdbscan.readthedocs.io/en/latest/api.html
[hdbscan_model]
min_cluster_size = 5
min_samples = 5
metric = "euclidean"
cluster_selection_method = "eom"
prediction_data = true

# CountVectorizer: https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
[vectorizer_model]
ngram_range = [1, 1]
stop_words = true # If true, will check `language` parameter and load associated stopwords file
min_df = 2

# ClassTfidfTransformer: https://maartengr.github.io/BERTopic/api/ctfidf.html
[ctfidf_model]
bm25_weighting = false
reduce_frequent_words = true

# MaximalMarginalRelevance: https://maartengr.github.io/BERTopic/api/representation/mmr.html
[mmr_model]
diversity = 0.3

# Reduce outliers: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers
[reduce_outliers]
strategy = "c-tf-idf"
'''

# The configuration text defined above is handed directly to the BERTopicModel constructor
topic_model = BERTopicModel(config)

In [8]:
# The BERTopicModel class is mainly a wrapper around BERTopic and can be used as-is, for example for a first analysis of the data without considering evolving trends (this preliminary analysis is entirely optional)


## Using BERTrend for retrospective analysis

### Instantiation of BERTrend


In the case of a **retrospective trend analysis** task, the goal is to identify and evaluate patterns or changes over time within a dataset, allowing for insights into historical performance, behaviors, or events that can inform future decision-making and strategy development.

In this context, the general principle consists in splitting the past data into different time slices. Each slice is then used to train a separate topic model. The topic model description corresponding to an older data slice is merged with the next one, and decay factors are applied. This gives a view of how topics evolve over time.

In [9]:
# Basic creation of the BERTrend object with its default parametrization.
# BERTrend uses several topic models; therefore, a topic_model object must be passed as a reference.
bertrend = BERTrend(topic_model=topic_model)

### 1. Gather historical data to be analyzed


In [10]:
# Sample corpus: Trump tweets from https://github.com/MarkHershey/CompleteTrumpTweetsArchive/blob/master/data/realDonaldTrump_in_office.csv
# Uncomment the next line to download the file into the working directory.
#!wget "https://raw.githubusercontent.com/MarkHershey/CompleteTrumpTweetsArchive/refs/heads/master/data/realDonaldTrump_in_office.csv"
df = (
    pd.read_csv(
        "realDonaldTrump_in_office.csv",
        sep=',',
        quotechar='"',
        skipinitialspace=True,
    )
    # BERTrend expects a specific data format: rename the raw columns accordingly
    .rename(columns={'Time': 'timestamp', 'Tweet URL': 'url', "Tweet Text": "text"})
    .assign(
        # the account handle stored in "ID" is reused as the document source
        source=lambda frame: frame["ID"],
        # the row index is reused as the document identifier
        document_id=lambda frame: frame.index,
    )
    .reset_index(drop=True)
)
df.head(5)

Unnamed: 0,ID,timestamp,url,text,source,document_id
0,@realDonaldTrump,2017-01-20 06:31,https://twitter.com/realDonaldTrump/status/822...,It all begins today! I will see you at 11:00 A...,@realDonaldTrump,0
1,@realDonaldTrump,2017-01-20 11:51,https://twitter.com/realDonaldTrump/status/822...,Today we are not merely transferring power fro...,@realDonaldTrump,1
2,@realDonaldTrump,2017-01-20 11:51,https://twitter.com/realDonaldTrump/status/822...,"power from Washington, D.C. and giving it back...",@realDonaldTrump,2
3,@realDonaldTrump,2017-01-20 11:52,https://twitter.com/realDonaldTrump/status/822...,What truly matters is not which party controls...,@realDonaldTrump,3
4,@realDonaldTrump,2017-01-20 11:53,https://twitter.com/realDonaldTrump/status/822...,"January 20th 2017, will be remembered as the d...",@realDonaldTrump,4


In [11]:
# Inspect the index; its range gives the number of rows that were loaded
df.index

RangeIndex(start=0, stop=23075, step=1)

### 2. Embed data

In [None]:
# Selection of a subset of data: keep only the first 1000 rows.
# Note that `df` is overwritten here, so the full dataset must be reloaded to get it back.
df = df.head(1000)

# Settings of a remote embedding service ("local": False).
# NOTE(review): host and port are hard-coded to a private address; adapt them to your own deployment.
embedding_service_cfg = {"local": False, "host":"10.132.5.44",  "port": 6464}

embedding_service = EmbeddingService(**embedding_service_cfg)
# embed() returns three values; only `embeddings` is used by the later cells of this notebook
embeddings, token_strings, token_embeddings = embedding_service.embed(
                texts=df["text"],
            )

[32m2025-01-20 16:00:36.345[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m203[0m - [34m[1mComputing embeddings...[0m
[32m2025-01-20 16:01:16.205[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m210[0m - [34m[1mComputing embeddings done for batch[0m
[32m2025-01-20 16:01:16.779[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_get_remote_model_name[0m:[36m226[0m - [34m[1mModel name: OrdalieTech/Solon-embeddings-large-0.1[0m


In [14]:
# Name of the embedding model as reported by the embedding service; it is passed to BERTrend when training
embedding_model_name = embedding_service.embedding_model_name


### 3. Split the data into time slices

This can be done manually if needed, or automatically based on a specified time granularity.

In [16]:
# Split the documents into time slices of `day_granularity` days each.
# `group_by_days` is imported in the import cell at the top of the notebook.
# The result is passed as-is to `bertrend.train_topic_models`
# (presumably a mapping from slice timestamp to the slice's DataFrame — TODO confirm).
day_granularity = 30
grouped_data = group_by_days(df=df, day_granularity=day_granularity)

In [17]:
# Number of time slices produced by the grouping
len(grouped_data)

6

### 4. Train topic models

In [19]:
# Train the topic models on the time slices, reusing the embeddings computed earlier
# (the recorded log shows one model being trained per slice)
bertrend.train_topic_models(grouped_data=grouped_data, embedding_model=embedding_model_name, embeddings=embeddings)

[32m2025-01-20 16:01:17.216[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mtrain_topic_models[0m:[36m240[0m - [1mTraining topic model 1/6...[0m
[32m2025-01-20 16:01:17.217[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m148[0m - [34m[1mProcessing period: 2017-01-20 00:00:00[0m
[32m2025-01-20 16:01:17.218[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m149[0m - [34m[1mNumber of documents: 184[0m
[32m2025-01-20 16:01:17.218[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m151[0m - [34m[1mCreating topic model...[0m
[32m2025-01-20 16:01:17.219[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTopicModel[0m:[36mfit[0m:[36m212[0m - [34m[1m	Initializing BERTopic model[0m
[32m2025-01-20 16:01:17.221[0m | [32m[1mSUCCESS [0m | [36mbertrend.BERTopicModel[0m:[36mfit[0m:[36m222[0m - [32m[1m	BERTopic model instance created successful

### 5. (Optional) Save trained models

In [21]:
# Persist the trained models; no path is given, so the library's default location is used
bertrend.save_models()

[32m2025-01-20 16:07:12.514[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36msave_models[0m:[36m652[0m - [1mModels saved to: /home/jerome/dev/cache/bertrend/models[0m


### 6. Merge models

In [23]:
# Merge the topic models trained on the successive time slices
bertrend.merge_all_models()

[32m2025-01-20 16:07:13.172[0m | [32m[1mSUCCESS [0m | [36mbertrend.BERTrend[0m:[36mmerge_all_models[0m:[36m351[0m - [32m[1mAll models merged successfully[0m


### 7. Calculate signal popularity

In [25]:
# Compute the popularity of the signals; the classification cells that follow are run after this step
bertrend.calculate_signal_popularity()

In [26]:
# Trained topic models; the recorded output shows them keyed by period timestamp
bertrend.topic_models

{Timestamp('2017-01-20 00:00:00'): <bertopic._bertopic.BERTopic at 0x760aff0ede50>,
 Timestamp('2017-02-19 00:00:00'): <bertopic._bertopic.BERTopic at 0x760c231cff50>,
 Timestamp('2017-03-21 00:00:00'): <bertopic._bertopic.BERTopic at 0x760c20863e90>,
 Timestamp('2017-04-20 00:00:00'): <bertopic._bertopic.BERTopic at 0x760c15964f10>,
 Timestamp('2017-05-20 00:00:00'): <bertopic._bertopic.BERTopic at 0x760c23a23350>,
 Timestamp('2017-06-19 00:00:00'): <bertopic._bertopic.BERTopic at 0x760c209eab90>}

In [27]:
# Window size handed to classify_signals
window_size = 30

# For every trained period, show the first weak and strong signals (at most 5 of each)
for ts in bertrend.topic_models:
    print(ts)
    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, ts)
    labelled_signals = (
        ("Weak signals", weak_signal_topics_df),
        ("Strong signals", strong_signal_topics_df),
    )
    for label, signals_df in labelled_signals:
        # Nothing to show for an empty category
        if signals_df.empty:
            continue
        print(label)
        display(signals_df[["Topic","Representation"]].head(5))
    print()


2017-01-20 00:00:00
Strong signals


Unnamed: 0,Topic,Representation
0,0,healthcare_getting_together_disaster_new_despi...
1,1,https_great_at_meeting_amp_american_trump_we_f...



2017-02-19 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,0,win_republicans_immigration_illegal_dems_until...
1,1,https_our_today_jobs_american_great_at_preside...



2017-03-21 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,4,night_interviewed_saturday_foxnews_next_tax_me...
1,10,healthcare_obamacare_plan_dead_lie_great_compa...


Strong signals


Unnamed: 0,Topic,Representation
0,0,fake_news_said_possible_amp_yates_while_china_...
1,1,https_today_order_at_presidential_foxandfriend...
2,2,democrats_our_wall_insurance_companies_governm...



2017-04-20 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,2,help_country_our_justice_must_before_peace_his...
1,4,g7_jobs_terrorism_italy_trip_melania_security_...
2,8,nato_hard_east_saudi_trying_countries_2016_sho...
3,9,healthcare_cuts_obamacare_montana_republican_w...


Strong signals


Unnamed: 0,Topic,Representation
0,0,news_media_dems_they_now_no_london_phony_faken...
1,1,deal_workers_trump_again_promise_realdonaldtru...



2017-05-20 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,12,gop_georgia_foxnews_steel_congressional_foxand...


Strong signals


Unnamed: 0,Topic,Representation
0,0,fbi_cia_asked_disgraceful_hoax_refused_seat_ta...
1,1,realdonaldtrump_potus_rt_weekly_friends_trump_...
2,3,obama_meddling_election_nothing_2016_russian_w...
3,7,south_deals_uswomensopen_women_meetings_moon_m...
4,9,democrats_healthcare_would_dems_senate_failed_...



2017-06-19 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,3,obama_meddling_election_nothing_2016_russian_w...
1,7,south_deals_uswomensopen_women_meetings_moon_m...
2,9,democrats_healthcare_would_dems_senate_failed_...
3,12,gop_georgia_foxnews_steel_congressional_foxand...
4,13,market_jobs_another_deal_syria_like_border_ste...


Strong signals


Unnamed: 0,Topic,Representation
0,0,fbi_cia_asked_disgraceful_hoax_refused_seat_ta...
1,1,realdonaldtrump_potus_rt_weekly_friends_trump_...





In [1]:
# Selection of one particular timestamp to look at.
# It should be one of the keys of `bertrend.topic_models` listed earlier
# (presumably a plain dict, for which .get() yields None on a missing key — TODO confirm).
selected_timestamp = Timestamp('2017-04-20 00:00:00')
selected_topic_model = bertrend.topic_models.get(selected_timestamp)


### Get topic description


In [38]:
# Generate a description of topic 5 for the selected topic model (the recorded log shows an OpenAI API call).
# NOTE(review): in one recorded run this call logged an API error and returned an error string
# rather than a result with "title" / "description" entries — check the value before indexing it.
desc = generate_topic_description(topic_model=selected_topic_model, topic_number=5, filtered_docs=df, language_code="en")


[32m2025-01-20 16:09:22.895[0m | [31m[1mERROR   [0m | [36mbertrend.topic_analysis.topic_description[0m:[36mgenerate_topic_description[0m:[36m51[0m - [31m[1mError calling OpenAI API: ' "title"'[0m


'Error generating description: \' "title"\''

In [68]:
# Title of the generated topic description (fails if `desc` is an error string)
desc["title"]

[32m2025-01-20 16:23:13.247[0m | [34m[1mDEBUG   [0m | [36mbertrend.llm_utils.openai_client[0m:[36mgenerate_from_history[0m:[36m128[0m - [34m[1mAPI returned: ChatCompletion(id='chatcmpl-ArnuKCesKptMpkREbYsHA1tBs6qI2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "title": "Inauguration Night: Power Shift and Future Meetings",\n  "description": "Ce thème explore la dynamique de la nuit d\'inauguration, marquée par des interviews et des discussions sur les prochaines étapes du gouvernement. Les événements de samedi, notamment sur Fox News, mettent en lumière les enjeux fiscaux et les réunions à venir avec des représentants étrangers. L\'accent est mis sur le retour du pouvoir aux citoyens américains, soulignant l\'importance des visites à domicile et des interactions directes. Ce moment symbolique représente un tournant dans la politique américaine, où les attentes et les promesses de changement sont au cœur des préo

In [None]:
# Full text of the generated topic description
desc["description"]

### Get topic analysis

In [33]:
# Analyse topic 7 at the selected timestamp; three values are unpacked (summary, analysis, formatted HTML).
# NOTE(review): in the recorded run this call raised "No data available for topic 7 within the
# specified date range" — use a topic number that exists for `selected_timestamp`.
summary, analysis, formatted_html = analyze_signal(bertrend, 7, selected_timestamp)

[32m2025-01-20 16:07:16.814[0m | [31m[1mERROR   [0m | [36mbertrend.trend_analysis.weak_signals[0m:[36manalyze_signal[0m:[36m416[0m - [31m[1mNo data available for topic 7 within the specified date range. Please enter a valid topic number.[0m


Exception: No data available for topic 7 within the specified date range. Please enter a valid topic number.

In [39]:
# Render the HTML report returned by analyze_signal.
# `display` and `HTML` are imported in the import cell at the top of the notebook.
display(HTML(formatted_html))

## Using BERTrend for prospective analysis

In the case of a **prospective trend analysis task**, the goal is to **forecast future** developments or outcomes based on current data and trends, enabling organizations to make informed decisions, allocate resources effectively, and strategize for upcoming challenges or opportunities.


In this example, we are going to simulate a prospective task:
- we simulate new data coming in
- for each new batch of data, we compute a new topic model, merge it with the previous ones, and detect strong and weak signals at each iteration


In [18]:
# Directory holding the feed files used to simulate incoming data
MY_DATA_DIR = DATA_PATH / "feeds/feed_sobriete"

# Data slices to process, one file per slice; each file name starts with the slice date
input_data = [
    MY_DATA_DIR / "2024-12-30_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-06_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-20_feed_sobriete.jsonl",
]

# Window size passed to classify_signals; replaces the value of 30 used in the retrospective section
window_size = 7

In [48]:
# Embedding service used by process_new_data; same remote settings as in the retrospective section.
# NOTE(review): host and port are hard-coded to a private address; adapt them to your own deployment.
embedding_service_cfg = {"local": False, "host":"10.132.5.44",  "port": 6464}

embedding_service = EmbeddingService(**embedding_service_cfg)
# Model name reported by the service; passed to BERTrend when training
embedding_model_name = embedding_service.embedding_model_name

In [13]:
# Directory from which process_new_data restores the models and to which it saves them
BERTREND_MODELS_PATH = MODELS_DIR / "sobriete_models"

In [56]:
def process_new_data(data_slice_path: Path, timestamp: pd.Timestamp) -> None:
    """Process one new data slice and update the persisted BERTrend models.

    The models previously saved under ``BERTREND_MODELS_PATH`` are restored
    (a fresh ``BERTrend`` object is created when they cannot be restored).
    A topic model is then trained on the new slice and merged with the
    existing ones, signal popularity is recomputed, a description of each
    weak signal is logged, and the models are saved back to
    ``BERTREND_MODELS_PATH``.

    The function reads the notebook-level variables ``embedding_service``,
    ``embedding_model_name``, ``window_size`` and ``BERTREND_MODELS_PATH``.

    Args:
        data_slice_path: Path of the file containing the new documents.
        timestamp: Timestamp associated with the slice. It is the key under
            which the new topic model is trained and it is passed to
            ``classify_signals``.
    """
    logger.debug(f"Processing new data: {data_slice_path}")

    # Restore previous models. `Exception` (not a bare `except`) so that
    # KeyboardInterrupt / SystemExit still interrupt the notebook.
    try:
        bertrend = BERTrend.restore_models(BERTREND_MODELS_PATH)
    except Exception as exc:
        logger.warning("Cannot restore previous models, creating new one")
        logger.debug(f"Restoring models failed with: {exc!r}")
        bertrend = BERTrend(topic_model=BERTopicModel())

    # Read data
    df = load_data(data_slice_path, language="French")
    df = split_data(df)
    text = df[TEXT_COLUMN]

    # Embed new data (only the document embeddings are needed here)
    embeddings, _, _ = embedding_service.embed(texts=text)

    # Create topic model for new data
    bertrend.train_topic_models({timestamp: df}, embeddings=embeddings, embedding_model=embedding_model_name)

    # Merge models
    bertrend.merge_all_models()

    # Compute popularities
    bertrend.calculate_signal_popularity()

    # Classify last signals
    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, timestamp)
    # TODO: save dfs

    # Describe the weak signals (the noise topics were used here by mistake).
    # An empty frame is handled explicitly, as in the retrospective cells.
    weak_topics = [] if weak_signal_topics_df.empty else weak_signal_topics_df["Topic"].tolist()
    logger.info(f"Weak topics: {weak_topics}")
    for topic in weak_topics:
        desc = generate_topic_description(topic_model=bertrend.topic_models[timestamp], topic_number=topic, filtered_docs=df, language_code="fr")
        # On failure generate_topic_description returns an error string instead of
        # a result with 'title' / 'description'; skip it rather than crash before saving.
        if isinstance(desc, str):
            logger.warning(f"No description available for topic {topic}: {desc}")
            continue
        logger.info(f"Topic: {topic}\t\t{desc['title']}\n{desc['description']}")

    # Save models
    bertrend.save_models(models_path=BERTREND_MODELS_PATH)



In [57]:
# Restrict this run to the first data slice (replaces the three-file list defined earlier)
input_data = [MY_DATA_DIR / "2024-12-30_feed_sobriete.jsonl"]

for data_file in input_data:
    # The file name starts with the slice date, e.g. "2024-12-30_feed_sobriete.jsonl"
    timestamp = pd.Timestamp(data_file.name.partition('_')[0])
    process_new_data(data_file, timestamp)

[32m2025-01-26 22:08:10.435[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_new_data[0m:[36m2[0m - [34m[1mProcessing new data: /home/jerome/dev/data/bertrend/feeds/feed_sobriete/2024-12-30_feed_sobriete.jsonl[0m
[32m2025-01-26 22:08:10.650[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m203[0m - [34m[1mComputing embeddings...[0m


Index(['title', 'summary', 'link', 'url', 'text', 'timestamp', 'document_id',
       'source'],
      dtype='object')


[32m2025-01-26 22:08:40.748[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m210[0m - [34m[1mComputing embeddings done for batch[0m
[32m2025-01-26 22:08:40.800[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_get_remote_model_name[0m:[36m226[0m - [34m[1mModel name: OrdalieTech/Solon-embeddings-large-0.1[0m
[32m2025-01-26 22:08:40.804[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mtrain_topic_models[0m:[36m240[0m - [1mTraining topic model 1/1...[0m
[32m2025-01-26 22:08:40.810[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m148[0m - [34m[1mProcessing period: 2024-12-30 00:00:00[0m
[32m2025-01-26 22:08:40.811[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m149[0m - [34m[1mNumber of documents: 932[0m
[32m2025-01-26 22:08:40.812[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTren

KeyboardInterrupt: 