# BERTrend quickstart
The purpose of this notebook is to complement the existing demos available in the directory `bertrend/demos` with some code examples that explain how to integrate BERTrend with your application code.

In [1]:

# Load IPython's autoreload extension
%load_ext autoreload
# Mode 2: edits made to imported .py modules are picked up live, without restarting the kernel
%autoreload 2

## BERTrend installation

In [2]:
import json
from pathlib import Path
import pandas as pd
from pandas import Timestamp
from IPython.display import display
from loguru import logger

from bertrend import DATA_PATH
from bertrend.BERTrend import BERTrend
from bertrend import MODELS_DIR
from bertrend.utils.data_loading import load_data, split_data, TEXT_COLUMN
from bertrend.services.embedding_service import EmbeddingService
from bertrend.BERTopicModel import BERTopicModel
from bertrend.topic_analysis.topic_description import generate_topic_description
from bertrend.trend_analysis.weak_signals import analyze_signal


In [3]:
#!pip install bertrend

### Configuration of topic models

In [None]:
# Topic model configuration. Each BERTopic parameter can be set from the constructor or read from a
# configuration file; here the configuration is passed to BERTopicModel as a string.
# The text below is written out in full with `language` set to "English" (overriding the default config).
config = '''
# Default configuration file to be used for topic model

# Global parameters
[global]
language = "English"

# BERTopic parameters: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.__init__
[bertopic_model]
top_n_words = 10
verbose = true
representation_model = ["MaximalMarginalRelevance"] # KeyBERTInspired, OpenAI
zeroshot_topic_list = []
zeroshot_min_similarity = 0

# UMAP parameters: https://umap-learn.readthedocs.io/en/latest/api.html
[umap_model]
n_neighbors = 5
n_components = 5
min_dist = 0.0
metric = "cosine"
random_state = 42

# HDBSCAN parameters: https://hdbscan.readthedocs.io/en/latest/api.html
[hdbscan_model]
min_cluster_size = 5
min_samples = 5
metric = "euclidean"
cluster_selection_method = "eom"
prediction_data = true

# CountVectorizer: https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
[vectorizer_model]
ngram_range = [1, 1]
stop_words = true # If true, will check `language` parameter and load associated stopwords file
min_df = 2

# ClassTfidfTransformer: https://maartengr.github.io/BERTopic/api/ctfidf.html
[ctfidf_model]
bm25_weighting = false
reduce_frequent_words = true

# MaximalMarginalRelevance: https://maartengr.github.io/BERTopic/api/representation/mmr.html
[mmr_model]
diversity = 0.3

# Reduce outliers: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers
[reduce_outliers]
strategy = "c-tf-idf"
'''

# Build the topic model wrapper from the configuration string above
topic_model = BERTopicModel(config)

In [None]:
# The BERTopicModel class is mainly a wrapper around BERTopic and can be used as-is, for example for a first analysis of the data (without considering evolving trends); this step is entirely optional


## Using BERTrend for retrospective analysis

### Instantiation of BERTrend


In the case of a **retrospective trend analysis** task, the goal is to identify and evaluate patterns or changes over time within a dataset, allowing for insights into historical performance, behaviors, or events that can inform future decision-making and strategy development.

In this context, the general principle is to split the past data into different time slices. Each slice is then used to train a separate topic model. The topic model description of each older data slice is merged with the next one, and decay factors are applied. This gives a view of how topics evolve over time.

In [None]:
# Create the BERTrend object with its default parameters.
# BERTrend uses several topic models; therefore, a topic_model object (the BERTopicModel built earlier
# in this notebook) must be passed as a reference.
bertrend = BERTrend(topic_model=topic_model)

### 1. Gather historical data to be analyzed


In [None]:
# Sample data: Trump tweets from https://github.com/MarkHershey/CompleteTrumpTweetsArchive/blob/master/data/realDonaldTrump_in_office.csv
# Uncomment the next line to download the file into the working directory:
#!wget "https://raw.githubusercontent.com/MarkHershey/CompleteTrumpTweetsArchive/refs/heads/master/data/realDonaldTrump_in_office.csv"
tweets_csv = "realDonaldTrump_in_office.csv"
df = (
    pd.read_csv(tweets_csv, sep=',', quotechar='"', skipinitialspace=True)
    # BERTrend expects a specific data format: rename the columns accordingly
    .rename(columns={'Time': 'timestamp', 'Tweet URL': 'url', "Tweet Text": "text"})
    # `source` mirrors the tweet ID; `document_id` is taken from the row index
    .assign(source=lambda frame: frame["ID"], document_id=lambda frame: frame.index)
    .reset_index(drop=True)
)
df.head(5)

In [None]:
# Inspect the index of the prepared DataFrame
df.index

### 2. Embed data

In [None]:
# Work on a subset of the data: the first 1000 rows
df = df.head(1000)

# Settings of the embedding service: not local, reached through the URL below
embedding_service_cfg = {"local": False, "url": "https://localhost:6464"}
embedding_service = EmbeddingService(**embedding_service_cfg)

# Embed the text column; the service returns three values, unpacked below
embedding_outputs = embedding_service.embed(texts=df["text"])
embeddings, token_strings, token_embeddings = embedding_outputs

In [None]:
# Name of the embedding model, as reported by the embedding service; passed to train_topic_models later on
embedding_model_name = embedding_service.embedding_model_name


### 3. Split the data into time slices

The split can be done manually if needed, or automatically based on a specified time granularity.

In [None]:
# NOTE(review): `load_data` is already imported in the imports cell at the top of the notebook;
# only `group_by_days` is new here.
from bertrend.utils.data_loading import group_by_days, load_data

# Granularity passed to group_by_days (presumably the width, in days, of each time slice — TODO confirm)
day_granularity = 30
# Split the documents into time slices using that granularity
grouped_data = group_by_days(df=df, day_granularity=day_granularity)

In [None]:
# Number of data slices produced by the grouping step
len(grouped_data)

### 4. Train topic models

In [None]:
# Train the topic models on the grouped data, reusing the embeddings computed earlier
# (as described in the introduction of this section, each data slice gets its own topic model)
bertrend.train_topic_models(grouped_data=grouped_data, embedding_model=embedding_model_name, embeddings=embeddings)

### 5. (Optional) Save trained models

In [None]:
# Save the trained models. No path is given here, so presumably a default location is used — TODO confirm
bertrend.save_model()

### 6. Merge models

In [None]:
# Merge the models trained on the successive time slices
bertrend.merge_all_models()

### 7. Calculate signal popularity

In [None]:
# Compute the popularity of signals; the signal classification further down runs after this step
bertrend.calculate_signal_popularity()

In [None]:
# Topic models held by BERTrend; the following cells access them by timestamp key
bertrend.topic_models

In [None]:
window_size = 30

# For every model timestamp, show the first weak and strong signals found at that point in time
for ts in bertrend.topic_models:
    print(ts)
    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, ts)
    labelled_signals = (
        ("Weak signals", weak_signal_topics_df),
        ("Strong signals", strong_signal_topics_df),
    )
    for label, signals_df in labelled_signals:
        # Nothing is printed for a category without any topic
        if signals_df.empty:
            continue
        print(label)
        display(signals_df[["Topic","Representation"]].head(5))
    print()


In [None]:
# Select one particular timestamp to look at; it should be one of the timestamps printed by the loop above
selected_timestamp = Timestamp('2017-04-20 00:00:00')
# NOTE(review): assuming topic_models is a plain dict, `.get` yields None (no error) when the
# timestamp is not a key — TODO confirm
selected_topic_model = bertrend.topic_models.get(selected_timestamp)


### Get topic description


In [None]:
# Generate a description for topic number 5 of the selected model, in English ("en");
# the result is read through its "title" and "description" keys in the next cells
desc = generate_topic_description(topic_model=selected_topic_model, topic_number=5, filtered_docs=df, language_code="en")


In [None]:
# Generated title of the topic
desc["title"]

In [None]:
# Generated description of the topic
desc["description"]

### Get topic analysis

In [None]:
# Analyze topic 7 at the selected timestamp (note: the description cells use topic 5);
# three values are returned, the last one being HTML rendered in the next cell
summary, analysis, formatted_html = analyze_signal(bertrend, 7, selected_timestamp)

In [None]:
# NOTE(review): `display` is already imported in the imports cell at the top; `HTML` is the only new name here
from IPython.display import display, HTML
# Render the formatted analysis as HTML in the notebook
display(HTML(formatted_html))

## Using BERTrend for prospective analysis

In the case of a **prospective trend analysis task**, the goal is to **forecast future** developments or outcomes based on current data and trends, enabling organizations to make informed decisions, allocate resources effectively, and strategize for upcoming challenges or opportunities.


In this example, we are going to simulate a prospective task:
- we simulate new data coming in
- for each new batch of data, we compute a new topic model, merge it with the previous ones, and detect strong and weak signals at each iteration


In [3]:
# Directory containing the data files used to simulate incoming data
MY_DATA_DIR = DATA_PATH / "feeds/feed_sobriete"

# Data files replayed one after the other; the part of each file name before the first "_"
# is parsed as the timestamp of the slice in the replay loop
input_data = [
    MY_DATA_DIR / "2024-12-30_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-06_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-20_feed_sobriete.jsonl",
]

# Window size passed to classify_signals in process_new_data
window_size = 7

In [4]:
# Same embedding service settings as in the retrospective section (not local, reached through a URL)
embedding_service_cfg = {"local": False, "url":"https://localhost:6464"}

embedding_service = EmbeddingService(**embedding_service_cfg)
# Name of the embedding model, passed to train_topic_models in process_new_data
embedding_model_name = embedding_service.embedding_model_name

In [5]:
# Location used by process_new_data to restore the previous BERTrend models and to save the updated ones
BERTREND_MODELS_PATH = MODELS_DIR / "sobriete_models"

In [6]:
def process_new_data(data_slice_path: Path, timestamp: pd.Timestamp) -> "pd.DataFrame | None":
    """Ingest one new data slice and describe the weak signals detected for it.

    Steps: restore the BERTrend models saved under ``BERTREND_MODELS_PATH`` (or
    start from a fresh ``BERTrend`` when they cannot be restored), load and
    split the data slice, embed its text, train a topic model for ``timestamp``,
    merge all models and save them back. If the models are reported as merged,
    signal popularity is computed, signals are classified for ``timestamp`` and
    a title/description is generated for every weak-signal topic.

    Relies on the notebook-level names ``embedding_service``,
    ``embedding_model_name``, ``window_size`` and ``BERTREND_MODELS_PATH``.

    Args:
        data_slice_path: Path of the data file to load (loaded with ``language="French"``).
        timestamp: Timestamp under which the topic model of this slice is trained and looked up.

    Returns:
        A DataFrame with the columns ``timestamp``, ``topic``, ``title`` and
        ``description`` (one row per weak-signal topic), or ``None`` when the
        models are not merged or when no weak signal was found.
    """
    logger.debug(f"Processing new data: {data_slice_path}")

    # Restore previous models
    try:
        bertrend = BERTrend.restore_model(BERTREND_MODELS_PATH)
    except Exception as exc:
        # `Exception` instead of a bare `except`: KeyboardInterrupt / SystemExit must still propagate,
        # and the reason of the failure is logged instead of being silently dropped.
        logger.warning(f"Cannot restore previous models, creating new one ({exc!r})")
        bertrend = BERTrend(topic_model=BERTopicModel())

    # Read data
    df = load_data(data_slice_path, language="French")
    df = split_data(df)
    text = df[TEXT_COLUMN]

    # Embed new data; only the first returned value is used in this function
    embeddings, _token_strings, _token_embeddings = embedding_service.embed(texts=text)

    # Create topic model for new data
    bertrend.train_topic_models({timestamp: df}, embeddings=embeddings, embedding_model=embedding_model_name)

    # Merge models
    bertrend.merge_all_models()

    logger.info(f"BERTrend contains {len(bertrend.topic_models)} topic models")

    # Save models
    bertrend.save_model(models_path=BERTREND_MODELS_PATH)

    # NOTE(review): relies on a private attribute of BERTrend; presumably False while there is
    # nothing to merge yet (e.g. first slice) — TODO confirm
    if not bertrend._are_models_merged:
        return None

    # Compute popularities
    bertrend.calculate_signal_popularity()

    # Classify the signals of the new slice; only the weak signals are used below
    _noise_topics_df, weak_signal_topics_df, _strong_signal_topics_df = bertrend.classify_signals(window_size, timestamp)
    # TODO: save dfs

    if weak_signal_topics_df.empty:
        return None

    wt = weak_signal_topics_df['Topic']
    logger.info(f"Weak topics: {wt}")

    # The model of the new slice is the same for every topic: look it up once
    slice_topic_model = bertrend.topic_models[timestamp]
    wt_list = []
    for topic in wt:
        desc = generate_topic_description(topic_model=slice_topic_model, topic_number=topic, filtered_docs=df, language_code="fr")
        wt_list.append({"timestamp": timestamp, "topic": topic, "title": desc["title"], "description": desc["description"]})

    return pd.DataFrame(wt_list)


In [None]:
# Replay the data files one by one, as if each were newly arrived data
for data_file in input_data:
    # The part of the file name before the first "_" is the date of the slice
    date_prefix = data_file.name.split('_')[0]
    timestamp = pd.Timestamp(date_prefix)
    display(process_new_data(data_file, timestamp))

[32m2025-01-27 18:02:32.141[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_new_data[0m:[36m2[0m - [34m[1mProcessing new data: /scratch/nlp/data/bertrend/feeds/feed_sobriete/2024-12-30_feed_sobriete.jsonl[0m
[32m2025-01-27 18:02:32.142[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mrestore_models[0m:[36m668[0m - [1mLoading models from: /scratch/nlp/cache/bertrend/models/sobriete_models[0m
[32m2025-01-27 18:02:32.335[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m203[0m - [34m[1mComputing embeddings...[0m
[32m2025-01-27 18:02:49.922[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m210[0m - [34m[1mComputing embeddings done for batch[0m
[32m2025-01-27 18:02:49.937[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_get_remote_model_name[0m:[36m226[0m - [34m[1mModel name: OrdalieTec

None

[32m2025-01-27 18:03:14.252[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mprocess_new_data[0m:[36m2[0m - [34m[1mProcessing new data: /scratch/nlp/data/bertrend/feeds/feed_sobriete/2025-01-06_feed_sobriete.jsonl[0m
[32m2025-01-27 18:03:14.252[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mrestore_models[0m:[36m668[0m - [1mLoading models from: /scratch/nlp/cache/bertrend/models/sobriete_models[0m
[32m2025-01-27 18:03:17.627[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m203[0m - [34m[1mComputing embeddings...[0m
[32m2025-01-27 18:03:32.419[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m210[0m - [34m[1mComputing embeddings done for batch[0m
[32m2025-01-27 18:03:32.434[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_get_remote_model_name[0m:[36m226[0m - [34m[1mModel name: OrdalieTec