In [1]:
from collections import Counter, defaultdict
import re
import time
from concurrent.futures import ThreadPoolExecutor
import random
import string
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from bertopic import BERTopic
from bertopic.representation import TextGeneration
from umap import UMAP
from hdbscan import HDBSCAN

import boe_risk_monitoring.config as config

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Check if CUDA is available and print the version
print(torch.version.cuda)
print(torch.cuda.is_available())

11.8
True


#### **SET GLOBAL VARIABLES**

In [3]:
# Make sure the working directory is set correctly
print("Initial working directory: ", os.getcwd())
os.chdir("../..")
print("New working directory: ", os.getcwd())

Initial working directory:  c:\python\bank-of-eng-risk-monitoring\boe_risk_monitoring\nlp
New working directory:  c:\python\bank-of-eng-risk-monitoring


In [150]:
ROOT_FPATH = os.getcwd()
DATA_FOLDER = config.DATA_FOLDER
AGGREGATED_DATA_FOLDER_NAME = config.AGGREGATED_DATA_FOLDER_NAME
ALL_TEXT_FNAME = "all_text.parquet"
ALL_TEXT_FPATH = f"{ROOT_FPATH}/{DATA_FOLDER}/{AGGREGATED_DATA_FOLDER_NAME}/{ALL_TEXT_FNAME}"
GLOSSARY_FNAME = "glossary_dictionary_citigroup.csv"
GLOSSARY_FPATH = f"{ROOT_FPATH}/{DATA_FOLDER}/{GLOSSARY_FNAME}"

#### **SETUP**

In [5]:
# Read in the data
df_all_text = pd.read_parquet(ALL_TEXT_FPATH)
df_all_text

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source
0,"Hello, and welcome to Citi's First Quarter 202...",quarter,Operator,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Operator (Host)\nCitigroup, Q1, 2023 Earnings ..."
1,"Ms. Landis, you may begin.",quarter,Operator,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Operator (Host)\nCitigroup, Q1, 2023 Earnings ..."
2,"Thank you, operator. Good morning and thank yo...",quarter,Jennifer Landis,Host,1,Disclaimer,Q1_2023,2023-04-14,Citigroup,transcript,"Jennifer Landis (Host)\nCitigroup, Q1, 2023 Ea..."
3,"With that, I'll turn it over to Jane.",quarter,Jennifer Landis,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Jennifer Landis (Host)\nCitigroup, Q1, 2023 Ea..."
4,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
...,...,...,...,...,...,...,...,...,...,...,...
9654,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,..."
9655,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni..."
9656,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni..."
9657,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ..."


In [6]:
# Check categories of text
df_all_text['section'].value_counts(dropna=False, normalize=True).head(20)

section
Financial Results    0.472927
Q and A              0.261725
Prepared remarks     0.165442
Footnotes            0.041309
Outlook              0.018946
Vision               0.010871
Glossary             0.010871
Introduction         0.006626
Conclusion           0.005798
Disclaimer           0.003934
Title                0.001553
Name: proportion, dtype: float64

In [7]:
# Drop text tagged as Introduction, Conclusion, Disclaimer or Title
df_all_text_main = df_all_text[~df_all_text['section'].isin(['Introduction', 'Conclusion', 'Disclaimer', 'Title'])].reset_index(drop=True)

In [8]:
# Print rows before and after dropping
print("Rows before dropping:", df_all_text.shape[0])
print("Rows after dropping:", df_all_text_main.shape[0])

Rows before dropping: 9659
Rows after dropping: 9486


#### **SENTIMENT ANALYSIS**

In [9]:
# Let's check we don't have any missing data
df_all_text_main['text'].isna().sum()

np.int64(0)

In [10]:
# We'll use finbert-tone for sentiment analysis which is the finetuned version of BERT for financial sentiment analysis
model_name = "yiyanghkust/finbert-tone"

In [11]:
# Check max token length for the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)
print(model.config.max_position_embeddings)

512


In [12]:
# Find max token length in the dataset
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
max_tokens = df_all_text_main["text"].apply(
    lambda x: len(tokenizer(x, truncation=False)["input_ids"])
).max()
print("Estimated max tokens in dataset:", max_tokens)

Estimated max tokens in dataset: 379


In [13]:
# Split the reviews into two parts for parallel processing (2x GPUs)
text_list = df_all_text_main['text'].tolist()
# Split list into 2 roughly equal parts
midpoint = len(text_list) // 2
text_list_split1 = text_list[:midpoint]
text_list_split2 = text_list[midpoint:]

In [14]:
def run_sentiment_analysis_on_gpu(text_subset, model_name, device_id):
	"""
	Run sentiment analysis on a subset of text using a specified model and device.
	"""
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)

	clf = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_id)

	results = []
	batch_size = 32

	for i in range(0, len(text_subset), batch_size):
		batch = text_subset[i:i+batch_size]
		batch_results = clf(batch, truncation=True, max_length=512)
		results.extend(batch_results)
		print(f"Device {device_id}: Processed {i + batch_size} reviews out of {len(text_subset)}")

	return results

In [15]:
# Run sentiment analysis on both subsets in parallel using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=2) as executor:
    future_0 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split1, model_name, 0)
    future_1 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split2, model_name, 1)

    results_0 = future_0.result()
    results_1 = future_1.result()
    results = results_0 + results_1

Device set to use cuda:1
Device set to use cuda:0


Device 0: Processed 32 reviews out of 4743
Device 1: Processed 32 reviews out of 4743
Device 0: Processed 64 reviews out of 4743
Device 1: Processed 64 reviews out of 4743
Device 0: Processed 96 reviews out of 4743
Device 1: Processed 96 reviews out of 4743
Device 0: Processed 128 reviews out of 4743
Device 1: Processed 128 reviews out of 4743
Device 0: Processed 160 reviews out of 4743
Device 1: Processed 160 reviews out of 4743
Device 0: Processed 192 reviews out of 4743
Device 1: Processed 192 reviews out of 4743
Device 0: Processed 224 reviews out of 4743
Device 1: Processed 224 reviews out of 4743
Device 0: Processed 256 reviews out of 4743
Device 1: Processed 256 reviews out of 4743
Device 0: Processed 288 reviews out of 4743


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Device 1: Processed 288 reviews out of 4743
Device 0: Processed 320 reviews out of 4743
Device 1: Processed 320 reviews out of 4743
Device 0: Processed 352 reviews out of 4743
Device 1: Processed 352 reviews out of 4743
Device 0: Processed 384 reviews out of 4743
Device 1: Processed 384 reviews out of 4743
Device 0: Processed 416 reviews out of 4743
Device 1: Processed 416 reviews out of 4743
Device 0: Processed 448 reviews out of 4743
Device 1: Processed 448 reviews out of 4743
Device 0: Processed 480 reviews out of 4743
Device 1: Processed 480 reviews out of 4743
Device 0: Processed 512 reviews out of 4743
Device 1: Processed 512 reviews out of 4743
Device 0: Processed 544 reviews out of 4743
Device 1: Processed 544 reviews out of 4743
Device 0: Processed 576 reviews out of 4743
Device 1: Processed 576 reviews out of 4743
Device 0: Processed 608 reviews out of 4743
Device 1: Processed 608 reviews out of 4743
Device 0: Processed 640 reviews out of 4743
Device 1: Processed 640 reviews 

In [16]:
# Postprocess results and add to DataFrame
results_clean = [d['label'] for d in results]
df_all_text_main['sentiment'] = results_clean
df_all_text_main['sentiment'] = df_all_text_main['sentiment'].replace(
	{'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'},
)
df_all_text_main.head(10)

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
5,And it is also thanks to our people. I want to...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
6,Recent events have shown that prudent asset an...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive
7,While Mark is going to walk you through our ap...,quarter,Jane Fraser,CEO,2,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Neutral
8,"In terms of assets, our loans are high-quality...",quarter,Jane Fraser,CEO,2,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Neutral
9,We have over $1 trillion of available liquidit...,quarter,Jane Fraser,CEO,2,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Neutral


In [17]:
df_all_text_main['sentiment'].value_counts(dropna=False, normalize=True)

sentiment
Positive    0.425258
Neutral     0.370651
Negative    0.204090
Name: proportion, dtype: float64

In [18]:
df_all_text_main.loc[df_all_text_main['sentiment']=="Negative"].to_csv("negative_sentiment.csv", index=False)

#### **TOPIC MODELLING - UNSUPERVISED**

In [19]:
def clean_text(text_list_raw):
    """
    Cleans a list of raw text by converting to lowercase and
    and filtering out stop words.

    Args:
        text_list_raw: List of raw text strings.

    Returns:
        text_list_clean: List of cleaned text strings.
    """
    stop_words = set(stopwords.words('english'))
    text_list_clean = []

    for text in text_list_raw:
        if text and text.lower() != "nan":
            text = text.lower()
            word_tokens = word_tokenize(text)
            filtered_tokens = [w for w in word_tokens if w not in stop_words]
            cleaned_text = " ".join(filtered_tokens)
            text_list_clean.append(cleaned_text)

    return text_list_clean

In [20]:
# Conduct some basic text cleaning
clean_text = clean_text(df_all_text_main['text'].tolist())
clean_text

["thank , jenn , hello everyone joining us today . well , 2023 shaping another interesting year , given tumultuous events last weeks . going share observations , 'll turn good quarter .",
 'first , banking system whole strong . small handful institutions still challenges overcome , u.s. financial system remains unmatched globally . feel confident saying someone worked many different systems around world .',
 'u.s. system comprises healthy mix community banks , regional banks global banks including citi . important different roles play , serving different clients different needs different scales . would also point rapid response state , federal international regulators helped reinforce confidence system critical juncture .',
 'pleased citi source stability financial system source strength clients . ’ accident .',
 'position play role strategy delivering simpler , focused bank . benefit diversified earnings base resilient business model . reinforced robust balance sheet management , liqu

In [21]:
# We'll use Fin-MPNET-Base for the embedding model in BERTopic as it hsa been tuned on financial data
embedding_model_name = "mukaj/fin-mpnet-base"
embedding_model = SentenceTransformer(embedding_model_name)

In [22]:
# BERTopic uses UMAP, a dimensionality reduction technique, to reduce the dimensionality of the embeddings before clustering
# UMAP introduces stochastic behaviour so we'll set a random seed for reproducibility
# https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#preventing-stochastic-behavior
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # These are the default parameters for UMAP used in _bertopic.py with the additional parameter of random_state = 0

Let's take a look at the top topics ranked by the number of reviews they are assigned to

In [23]:
# Run BERTopic to extract topics from the cleaned text
model = BERTopic(verbose=True, embedding_model=embedding_model, umap_model=umap_model)
model.fit(clean_text)
topics, probabilities = model.transform(clean_text)

2025-06-17 12:31:22,115 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:31:30,047 - BERTopic - Embedding - Completed ✓
2025-06-17 12:31:30,049 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-17 12:32:11,793 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:32:11,796 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-17 12:32:12,135 - BERTopic - Cluster - Completed ✓
2025-06-17 12:32:12,149 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-17 12:32:12,547 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:32:20,669 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-06-17 12:32:20,739 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:32:20,741 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-06-17 12:32:21,028 - BERTopic - Cluster - Completed ✓


In [24]:
df_topics = model.get_topic_freq()
df_topics.head(11)

Unnamed: 0,Topic,Count
0,-1,2123
27,0,180
21,1,141
6,2,125
18,3,99
209,4,98
28,5,93
1,6,88
215,7,84
9,8,72


In [25]:
pct_outliers = df_topics.loc[df_topics['Topic'] == -1, 'Count'].iloc[0]/df_topics['Count'].sum() * 100
print(f"Percentage of outliers: {pct_outliers:.2f}%")

Percentage of outliers: 22.38%


In [26]:
df_all_text_main['topic_idx'] = topics
df_all_text_main

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,6
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,24
9482,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni...",Positive,-1
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,44
9484,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ...",Positive,-1


Let's look at the top words in the top two topics

In [27]:
model.get_topic(0)

[('buybacks', np.float64(0.028257192241204234)),
 ('stock', np.float64(0.021264376627203225)),
 ('dividend', np.float64(0.019526768722112245)),
 ('buyback', np.float64(0.019393495198891338)),
 ('common', np.float64(0.01690648707885611)),
 ('capital', np.float64(0.015992970833465903)),
 ('dividends', np.float64(0.014374735976861005)),
 ('shareholders', np.float64(0.014356433534986621)),
 ('buy', np.float64(0.012816734393634887)),
 ('repurchases', np.float64(0.012587657314229235))]

In [28]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 0, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
And as it relates to buybacks, we did not buy back any stock this quarter and we will continue to make that decision on a quarter-by-quarter basis.
-----------------------------------------------------
And as it relates to your question regarding capital, this in a normal cycle is a very healthy returning business, and as the market turns and as we recover, we would look to deploy capital appropriate with the growth and return prospects that we see in front of us.
-----------------------------------------------------
So think about the Basel III end game that's out there and the capital requirements that could come out of that. Think about the CCAR DFAST that has been submitted and currently under review, and what that might mean for stress capital buffers, and also think about just where we are in the broader economy and broader global macro environment that we're playing in and needing to see how that kind of evolves. And so when 

In [29]:
model.get_topic(1)

[('data', np.float64(0.039707061067306766)),
 ('consent', np.float64(0.022599771369456317)),
 ('infrastructure', np.float64(0.02195789852455233)),
 ('processes', np.float64(0.021645235198955323)),
 ('transformation', np.float64(0.019897026591180148)),
 ('platforms', np.float64(0.019227645426736748)),
 ('order', np.float64(0.018087301836141833)),
 ('controls', np.float64(0.017609633793786154)),
 ('regulatory', np.float64(0.016877079493896618)),
 ('reporting', np.float64(0.01634667807727264))]

In [30]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 1, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
We are also modernizing our infrastructure and the security of our data and information by enhancing cyber security through the use of AI and improving the security of our infrastructure and devices, leading to fewer operating losses. And we are leveraging industry-leading cloud-based solutions to modernize and streamline the connectivity between our front office systems and the general ledger, eliminating manual processes and operating costs over time.
-----------------------------------------------------
We are keeping a close eye on the execution of these efforts and overall resourcing to ensure we safeguard our commitment to the Transformation. As you know, given its magnitude and scale, the Transformation is a multi-year effort to address issues that have spanned over two decades.
-----------------------------------------------------
We have made steady progress as we retire multiple legacy platforms, streamline end-to-end proc

In [31]:
model.visualize_topics()

In [32]:
model.visualize_barchart()

In [33]:
model.visualize_heatmap(top_n_topics=10)

We can use a Hugging Face text generation model along with BERTopic's representation_model wrapper and pass this into BERTopic to summarise the topics for us

Since we'll be using Microsoft Phi 4 later, we'll use it here too.

In [34]:
generator = pipeline(
    "text-generation",
    model="microsoft/Phi-4-mini-instruct",
    model_kwargs={"torch_dtype": "auto"},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [35]:
# Create prompt template to pass into BERTopic TextGeneration wrapper
prompt_template = """<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>"""
result = generator(prompt_template, max_new_tokens=200)
result[0]["generated_text"]

"<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>Sure, I can help with that. However, you haven't provided the actual content of the documents or the keywords. Please share the documents or the list of keywords, and I'll be happy to create a short label for the topic."

In [36]:
# Let's create the BERTopic TextGeneration wrapper now
representation_model = TextGeneration(
	model=generator,
	prompt=prompt_template,
	pipeline_kwargs={"max_new_tokens": 200},
	random_state=0,
)

Now let's rerun the BERTopic model exactly as before except this time we'll include the Phi 4 representation model above which will provide clean, interpretable topic labels for us

In [37]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # These are the default parameters for UMAP used in _bertopic.py with the additional paramater of random_state = 0
model = BERTopic(verbose=True, embedding_model=embedding_model, umap_model=umap_model, representation_model=representation_model)
model.fit(clean_text)
topics, probabilities = model.transform(clean_text)

2025-06-17 12:32:39,463 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:32:46,900 - BERTopic - Embedding - Completed ✓
2025-06-17 12:32:46,901 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-17 12:33:06,801 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:33:06,804 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-17 12:33:07,143 - BERTopic - Cluster - Completed ✓
2025-06-17 12:33:07,152 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 271/271 [05:21<00:00,  1.19s/it]
2025-06-17 12:38:29,762 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:38:38,632 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-06-17 12:38:38,702 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:38:38,702 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-06-17 12:38:38,993 - BERTopic - Cluster - Completed ✓


Let's print out the topic summaries generated by Phi 4.

In [38]:
topic_summaries_dict = defaultdict(list)
max_topic_idx = df_all_text_main['topic_idx'].max()
for i in range(max_topic_idx + 1):
	topic_summaries_dict["topic_idx"].append(i)
	topic_summaries_dict["reviews"].append(model.get_topic_freq(i))
	topic_summaries_dict["summary"].append(model.topic_representations_[i][0][0])
	print("-----------------------------------------------------")
	print(f"Topic {i}:")
	print(f"Number of reviews: {model.get_topic_freq(i)}")
	print(f"Summary: {model.topic_representations_[i][0][0]}")

df_topic_summaries = pd.DataFrame(topic_summaries_dict)


-----------------------------------------------------
Topic 0:
Number of reviews: 180
Summary: Capital Share Buybacks and Dividends Growth
-----------------------------------------------------
Topic 1:
Number of reviews: 141
Summary: Data Transformation and Standardization for Regulatory Compliance and Modern Infrastructure
-----------------------------------------------------
Topic 2:
Number of reviews: 125
Summary: Yield Curve Analysis during Recessionary Period with Fed Rate Cuts and Dollar Sensitivity Considerations
-----------------------------------------------------
Topic 3:
Number of reviews: 99
Summary: "Year-over-Year Expense Growth Driven by Investments and Risk Controls, Partially Offset by Productivity Savings"
-----------------------------------------------------
Topic 4:
Number of reviews: 98
Summary: "Quarterly Loan Trends: Decreased Average Loans and Increased Year-on-Year Growth in Home Lending"
-----------------------------------------------------
Topic 5:
Number of 

#### **TOPIC MODELLING - SEMI-SUPERVISED - GUIDED**

In [39]:
# Specify some seed topics
seed_topic_list = [
    # 0. Capital Adequacy
    ["capital", "tier 1", "tier 2", "risk-weighted assets", "capital buffer", "regulatory capital", "cet1", "capital ratio", "tangible book value", "leverage ratio", "supplementary leverage ratio", "leverage exposure", "capital constraints", "basel iii"],

    # 1. Liquidity Risk
    ["liquidity", "cash", "short-term funding", "liquid assets", "deposit outflows", "liquidity coverage ratio", "cash reserves"],

    # 2. Profitability
    ["profit", "earnings", "revenue", "margin", "return on equity", "net income", "operating income", "eps", "return on tangible common equity", "efficiency ratio"],

    # 3. Asset Quality / Credit Risk (NPLs)
    ["non-performing loans", "credit risk", "default", "delinquencies", "loan loss provisions", "impairment", "charge-offs", "allowance for credit losses", "write-downs", "reserve to funded loans"],

    # 4. Macroeconomic Risk / Interest Rates
    ["interest rates", "rate hikes", "monetary policy", "inflation", "yield curve", "economic outlook", "central bank", "tariffs"],

    # 5. Market Risk / Volatility
    ["market volatility", "value at risk", "trading losses", "asset prices", "derivatives", "hedging", "market downturn"],

    # 6. Operational Risk / Technology
    ["cybersecurity", "system failure", "fraud", "data breach", "internal control", "technology risk", "disruption"],

    # 7. Regulatory & Compliance
    ["regulatory", "compliance", "supervisory", "basel", "reporting standards", "audit", "oversight"],

    # 8. ESG / Reputation Risk
    ["sustainability", "climate risk", "reputation", "governance", "social responsibility", "stakeholders", "diversity"],

    # 9. Capital Returns / Shareholder Value
    ["dividends", "share buybacks", "capital return", "payout ratio", "shareholder value", "stock repurchase"],

    # 10. Strategic Risk / Business Model
    ["business strategy", "growth plans", "restructuring", "core business", "competitive advantage", "market positioning"]
]

In [40]:
# Run BERTopic to extract topics from the cleaned text
model = BERTopic(
	verbose=True,
	nr_topics=50,
	seed_topic_list=seed_topic_list,
	embedding_model=embedding_model,
	umap_model=umap_model
	)
model.fit(clean_text)
topics, probabilities = model.transform(clean_text)

2025-06-17 12:38:39,365 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:38:47,413 - BERTopic - Embedding - Completed ✓
2025-06-17 12:38:47,416 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-17 12:38:47,536 - BERTopic - Guided - Completed ✓
2025-06-17 12:38:47,538 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-17 12:39:19,214 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:39:19,217 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-17 12:39:19,624 - BERTopic - Cluster - Completed ✓
2025-06-17 12:39:19,625 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-17 12:39:19,961 - BERTopic - Representation - Completed ✓
2025-06-17 12:39:19,964 - BERTopic - Topic reduction - Reducing number of topics
2025-06-17 12:39:19,991 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-17 12:39:20,165 - BERTopic - Representation - Completed ✓
2025-06-17 12:39:20,169 - BERTopic - Topic reduction - Reduced number of topics from 251 to 50


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:39:28,805 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-06-17 12:39:47,590 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:39:47,592 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-06-17 12:39:47,961 - BERTopic - Cluster - Completed ✓


In [41]:
df_topics = model.get_topic_freq()
df_topics.head(50)

Unnamed: 0,Topic,Count
8,0,2377
1,-1,1765
0,1,768
17,2,649
3,3,281
9,4,258
6,5,251
4,6,220
16,7,204
30,8,203


In [42]:
pct_outliers = df_topics.loc[df_topics['Topic'] == -1, 'Count'].iloc[0]/df_topics['Count'].sum() * 100
print(f"Percentage of outliers: {pct_outliers:.2f}%")

Percentage of outliers: 18.61%


In [43]:
df_all_text_main['topic_idx'] = topics
df_all_text_main

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,1
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,0
9482,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni...",Positive,0
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,0
9484,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ...",Positive,-1


In [44]:
model.get_topic(0)

[('year', np.float64(0.03713016426951046)),
 ('trend', np.float64(0.035872534496043225)),
 ('quarter', np.float64(0.03553978130678159)),
 ('income', np.float64(0.03143450802396315)),
 ('preceding', np.float64(0.031151395584992723)),
 ('revenue', np.float64(0.030947558465530805)),
 ('on', np.float64(0.030638695336216853)),
 ('increased', np.float64(0.030559539320404294)),
 ('net', np.float64(0.026578524503654104)),
 ('revenues', np.float64(0.026411778518279312))]

In [45]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 0, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
Now, turning to how we performed during the quarter… We reported net income of $4.6 billion and an EPS of $2.19. We had good revenue growth of 6% ex-divestitures and both revenue and expenses were in line with our guidance. Our ROTCE of nearly 11% benefitted from the closing of the sales of our consumer businesses in India and Vietnam and would have been over 9% without those gains.
-----------------------------------------------------
Within Markets, our Fixed Income revenues were up 4% from a year ago. We benefitted from excellent performance in Rates and continued engagement from our corporate clients. The first quarter of 2022 was no slouch, as you may recall, but this quarter was our third-best in a decade. Equities was weaker however - down markedly in both derivatives and cash, but still had revenues north of $1 billion.
-----------------------------------------------------
In US Personal Banking, our cards businesses gained 

In [46]:
model.get_topic(6)

[('liquidity', np.float64(0.0634685624886439)),
 ('deposits', np.float64(0.032270832342674825)),
 ('cash', np.float64(0.03213719520951941)),
 ('hqla', np.float64(0.028565833840318324)),
 ('lcr', np.float64(0.028015744681667162)),
 ('resources', np.float64(0.026966377966119576)),
 ('available', np.float64(0.025398411683980392)),
 ('liquid', np.float64(0.025122624066750733)),
 ('quality', np.float64(0.022525961866683774)),
 ('high', np.float64(0.02184746731413847))]

In [47]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 1, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
Thank you, Jenn, hello to everyone joining us today. Well, 2023 is shaping up to be another interesting year, given the tumultuous events of the last few weeks. I am going to share some observations, and then we'll turn to what was a good quarter.
-----------------------------------------------------
And it is also thanks to our people. I want to express my pride in our colleagues around the world who have worked tirelessly last month to serve clients as they turned to Citi as a port in the storm.
-----------------------------------------------------
We made some significant leadership announcements. I am delighted that Andy Sieg will join Citi at my table as the new head of Wealth Management. Andy is a widely respected leader in this space and comes to us after running an $18 billion business with $2.8 trillion in client balances. He is the latest and the most visible example of the excellent talent we have attracted over the last 

In [48]:
model.visualize_topics()

In [49]:
model.visualize_barchart()

In [50]:
model.visualize_heatmap(top_n_topics=20)

We can use a Hugging Face text generation model along with BERTopic's representation_model wrapper and pass this into BERTopic to summarise the topics for us

Since we'll be using Microsoft Phi 4 later, we'll use it here too.

In [51]:
generator = pipeline(
    "text-generation",
    model="microsoft/Phi-4-mini-instruct",
    model_kwargs={"torch_dtype": "auto"},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [52]:
# Create prompt template to pass into BERTopic TextGeneration wrapper
prompt_template = """<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>"""
result = generator(prompt_template, max_new_tokens=200)
result[0]["generated_text"]

'<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>Sure, I can provide a short label for the topic. However, I need you to provide the actual documents and the keywords associated with them. Once I have that information, I can help you label the topic succinctly.'

In [53]:
# Let's create the BERTopic TextGeneration wrapper now
representation_model = TextGeneration(
	model=generator,
	prompt=prompt_template,
	pipeline_kwargs={"max_new_tokens": 200},
	random_state=0,
)

Now let's rerun the BERTopic model exactly as before except this time we'll include the Phi 4 representation model above which will provide clean, interpretable topic labels for us

In [54]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # These are the default parameters for UMAP used in _bertopic.py with the additional paramater of random_state = 0
model = BERTopic(
	verbose=True,
	nr_topics= 50,
	seed_topic_list=seed_topic_list,
	embedding_model=embedding_model,
	umap_model=umap_model,
	representation_model=representation_model
	)
model.fit(clean_text)
topics, probabilities = model.transform(clean_text)

2025-06-17 12:40:01,152 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:40:09,319 - BERTopic - Embedding - Completed ✓
2025-06-17 12:40:09,320 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-17 12:40:09,417 - BERTopic - Guided - Completed ✓
2025-06-17 12:40:09,420 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-17 12:40:39,686 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:40:39,688 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-17 12:40:40,050 - BERTopic - Cluster - Completed ✓
2025-06-17 12:40:40,052 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-17 12:40:40,397 - BERTopic - Representation - Completed ✓
2025-06-17 12:40:40,399 - BERTopic - Topic reduction - Reducing number of topics
2025-06-17 12:40:40,412 - BERTopic - Representation - Fine-tuning topics using representation models.
100%|██████████| 50/50 [00:59<00:00,  1.19s/it]
2025-06-17 12:41:40,405 - BERTopic - Representation - Completed ✓
2025-06-17 12:41:40,408 - BERTopic - Topic reduction - Reduced number of topics from 251 to 50


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 12:41:48,479 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-06-17 12:41:55,941 - BERTopic - Dimensionality - Completed ✓
2025-06-17 12:41:55,943 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-06-17 12:41:56,291 - BERTopic - Cluster - Completed ✓


Let's print out the topic summaries generated by Phi 4.

In [55]:
topic_summaries_dict = defaultdict(list)
max_topic_idx = df_all_text_main['topic_idx'].max()
for i in range(max_topic_idx + 1):
	topic_summaries_dict["topic_idx"].append(i)
	topic_summaries_dict["reviews"].append(model.get_topic_freq(i))
	topic_summaries_dict["summary"].append(model.topic_representations_[i][0][0])
	print("-----------------------------------------------------")
	print(f"Topic {i}:")
	print(f"Number of reviews: {model.get_topic_freq(i)}")
	print(f"Summary: {model.topic_representations_[i][0][0]}")

df_topic_summaries = pd.DataFrame(topic_summaries_dict)

-----------------------------------------------------
Topic 0:
Number of reviews: 2377
Summary: "Quarterly Trends in Banking Net Income and Interest Revenue"
-----------------------------------------------------
Topic 1:
Number of reviews: 768
Summary: Transformation and Investment Strategy in Client and Business Growth.
-----------------------------------------------------
Topic 2:
Number of reviews: 649
Summary: Year-on-Year Quarterly Trends in Average Loans and Deposits in Retail Banking
-----------------------------------------------------
Topic 3:
Number of reviews: 281
Summary: CET1 Capital Ratio Increase in Banking Sector (13.4% YoY)
-----------------------------------------------------
Topic 4:
Number of reviews: 258
Summary: Branded Credit Card Spend Volume Growth and Trends
-----------------------------------------------------
Topic 5:
Number of reviews: 251
Summary: Federal Reserve Interest Rate Outlook and Economic Unemployment Scenarios
------------------------------------

#### **TOPIC MODELLING - SEMI-SUPERVISED - ZERO-SHOT**

In [119]:
# topic_mapping_dict = {
# 	"Capital Adequacy": ["capital", "tier 1 capital", "tier 2 capital", "risk-weighted assets", "capital buffer", "regulatory capital", "cet1 ratio", "capital ratio", "tangible book value per share", "leverage ratio", "supplementary leverage ratio", "leverage exposure", "capital constraints", "basel iii"],

# 	"Liquidity Risk": ["liquidity risk", "cash position", "short-term funding", "liquid assets", "deposit outflows", "liquidity coverage ratio", "cash reserves"],

# 	"Profitability": ["profitability", "earnings performance", "revenue growth", "profit margin", "net interest margin", "return on equity", "net income", "operating income", "earnings per share", "return on tangible common equity", "efficiency ratio"],

# 	"Asset Quality and Credit Risk": ["non-performing loans", "credit risk", "loan defaults", "delinquencies", "loan loss provisions", "impairments", "charge-offs", "allowance for credit losses", "loan write-downs", "reserve to funded loans", "stage 3 loans"],

# 	"Macroeconomic Risk": ["rate hikes", "monetary policy", "inflation", "economic slowdown", "central bank policy", "unemployment", "tariffs", "macroeconomic outlook", "geopolitical risk"],

# 	"Interest Rate Risk":  ["interest rate risk", "banking book interest rate risk", "IRRBB", "net interest margin sensitivity", "repricing gap", "duration mismatch", "yield curve exposure", "interest rate sensitivity", "rate shock scenarios", "basis risk"
# 	],

# 	"Market and Volatility Risk": ["market volatility", "value at risk", "trading losses", "asset price fluctuation", "derivative exposure", "hedging strategy", "market downturn"],

# 	"Operational Risk": ["cybersecurity threat", "system failure", "fraud risk", "data breach", "internal controls", "technology risk", "operational disruption"],

# 	"Regulatory & Compliance Risk": ["regulatory requirements", "compliance risk", "supervisory review", "basel framework", "reporting standards", "audit finding", "regulatory oversight"],

# 	"ESG and Reputation Risk": ["sustainability goals", "climate risk", "reputation risk", "corporate governance", "social responsibility", "stakeholder engagement", "diversity and inclusion", "community impact"],

# 	"Strategic and Business Model Risk": ["business strategy", "growth plans", "corporate restructuring", "core business focus", "competitive positioning", "market entry strategy"],

# 	"Legal Risk": ["litigation risk", "lawsuit", "legal proceedings", "class action", "settlement", "regulatory investigation", "legal exposure", "contractual dispute", "fines and penalties"]
# }


# "Macroeconomic and Interest Rate Risk": ["interest rates", "rate hikes", "monetary policy", "inflation", "yield curve", "economic outlook", "central bank policy", "tariffs", "macroeconomic environment", "geopolitical risk"],

# "Capital Returns and Shareholder Value": ["dividends", "share buybacks", "capital return", "payout ratio", "shareholder value", "stock repurchase"],


topic_mapping_dict = {
    "Capital Adequacy": [
        "Capital", "Tier 1 Capital", "Tier 2 Capital", "Risk-Weighted Assets", "Capital Buffer",
        "Regulatory Capital", "CET1 Ratio", "Capital Ratio", "Tangible Book Value Per Share",
        "Leverage Ratio", "Supplementary Leverage Ratio", "Leverage Exposure", "Capital Constraints", "Basel III"
    ],

    "Liquidity Risk": [
        "Liquidity Risk", "Cash Position", "Short-Term Funding", "Liquid Assets", "Deposit Outflows",
        "Liquidity Coverage Ratio", "Cash Reserves"
    ],

    "Profitability": [
        "Profitability", "Earnings Performance", "Revenue Growth", "Profit Margin", "Net Interest Margin",
        "Return On Equity", "Net Income", "Operating Income", "Earnings Per Share",
        "Return On Tangible Common Equity", "Efficiency Ratio"
    ],

    "Asset Quality and Credit Risk": [
        "Non-Performing Loans", "Credit Risk", "Loan Defaults", "Delinquencies", "Loan Loss Provisions",
        "Impairments", "Charge-Offs", "Allowance For Credit Losses", "Loan Write-Downs",
        "Reserve To Funded Loans", "Stage 3 Loans"
    ],

    "Macroeconomic Risk": [
        "Rate Hikes", "Monetary Policy", "Inflation", "Economic Slowdown", "Central Bank Policy",
        "Unemployment", "Tariffs", "Macroeconomic Outlook", "Geopolitical Risk"
    ],

    "Interest Rate Risk": [
        "Interest Rate Risk", "Banking Book Interest Rate Risk", "IRRBB", "Net Interest Margin Sensitivity",
        "Repricing Gap", "Duration Mismatch", "Yield Curve Exposure", "Interest Rate Sensitivity",
        "Rate Shock Scenarios", "Basis Risk"
    ],

    "Market and Volatility Risk": [
        "Market Volatility", "Value At Risk", "Trading Losses", "Asset Price Fluctuation",
        "Derivative Exposure", "Hedging Strategy", "Market Downturn"
    ],

    "Operational Risk": [
        "Cybersecurity Threat", "System Failure", "Fraud Risk", "Data Breach",
        "Internal Controls", "Technology Risk", "Operational Disruption"
    ],

    "Regulatory & Compliance Risk": [
        "Regulatory Requirements", "Compliance Risk", "Supervisory Review", "Basel Framework",
        "Reporting Standards", "Audit Finding", "Regulatory Oversight"
    ],

    "ESG and Reputation Risk": [
        "Sustainability Goals", "Climate Risk", "Reputation Risk", "Corporate Governance",
        "Social Responsibility", "Stakeholder Engagement", "Diversity And Inclusion", "Community Impact"
    ],

    "Strategic and Business Model Risk": [
        "Business Strategy", "Growth Plans", "Corporate Restructuring", "Core Business Focus",
        "Competitive Positioning", "Market Entry Strategy"
    ],

    "Legal Risk": [
        "Litigation Risk", "Lawsuit", "Legal Proceedings", "Class Action",
        "Settlement", "Regulatory Investigation", "Legal Exposure", "Contractual Dispute", "Fines And Penalties"
    ]
}

In [120]:
topic_mapping_df = pd.DataFrame()
for risk_category, keywords in topic_mapping_dict.items():
	for keyword in keywords:
		topic_mapping_df = pd.concat(
			[topic_mapping_df, pd.DataFrame({"risk_category": [risk_category], "topic_label": [keyword]})],
			ignore_index=True
		)
topic_mapping_df

Unnamed: 0,risk_category,topic_label
0,Capital Adequacy,Capital
1,Capital Adequacy,Tier 1 Capital
2,Capital Adequacy,Tier 2 Capital
3,Capital Adequacy,Risk-Weighted Assets
4,Capital Adequacy,Capital Buffer
...,...,...
101,Legal Risk,Settlement
102,Legal Risk,Regulatory Investigation
103,Legal Risk,Legal Exposure
104,Legal Risk,Contractual Dispute


In [121]:
# Now we'll use the topic mapping values in a zero shot topic modelling approach and later map back to the broader topic categories
zeroshot_topic_list = topic_mapping_df['topic_label'].tolist()

In [122]:
# Run BERTopic to extract topics from the cleaned text
model = BERTopic(
	verbose=True,
	# nr_topics=200,
	zeroshot_topic_list=zeroshot_topic_list,
	zeroshot_min_similarity= 0.5,
	embedding_model=embedding_model,
	umap_model=umap_model
	)
model.fit(clean_text)
topics, probabilities = model.transform(clean_text)

2025-06-17 16:03:41,249 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 16:03:49,174 - BERTopic - Embedding - Completed ✓
2025-06-17 16:03:49,175 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-17 16:04:09,087 - BERTopic - Dimensionality - Completed ✓
2025-06-17 16:04:09,092 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2025-06-17 16:04:09,368 - BERTopic - Zeroshot Step 1 - Completed ✓
2025-06-17 16:04:14,558 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-17 16:04:14,831 - BERTopic - Cluster - Completed ✓
2025-06-17 16:04:14,833 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2025-06-17 16:04:14,888 - BERTopic - Zeroshot Step 2 - Completed ✓
2025-06-17 16:04:14,892 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-17 16:04:15,177 - BERTopic - Representation - Completed ✓


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

2025-06-17 16:04:23,298 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [123]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2178,-1_think_re_ve_going,"[think, re, ve, going, see, us, would, bit, th...","[, let 's pivot outlook 2024 , starting nii pa..."
1,0,3,Risk-Weighted Assets,"[prudently, weighted, considering, actively, p...","[rwa : risk-weighted assets, rwa : risk-weight..."
2,1,1,Regulatory Capital,"[45b, operational, 30b, 15b, regulations, mini...",[increase operational risk capital : proposed ...
3,2,194,CET1 Ratio,"[cet1, ratio, capital, standardized, 13, requi...","[cet1 capital ratio - standardized , year-on-y..."
4,3,1,Capital Ratio,"[150, ending, maintained, repurchases, returne...","[maintained strong capital , ending year preli..."
...,...,...,...,...,...
259,258,11,258_nii_94_completeness_bolstered,"[nii, 94, completeness, bolstered, uneven, dir...",[firmwide nii outlook increased $ 94.5 billion...
260,259,26,259_nii_ex_finish_consensus,"[nii, ex, finish, consensus, previously, outlo...","[finish , let 's turn outlook page 8. expect 2..."
261,260,15,260_technology_spend_cyber_development,"[technology, spend, cyber, development, transf...","[finally , 'll note n't actually talked techno..."
262,261,73,261_productivity_savings_expenses_investments,"[productivity, savings, expenses, investments,...","[expenses increased 8 % , driven investments t..."


In [124]:
df_topics = model.get_topic_freq()
df_topics

Unnamed: 0,Topic,Count
1,-1,2178
42,15,461
16,2,194
35,18,177
31,227,159
...,...,...
187,44,1
179,24,1
53,45,1
202,39,1


In [125]:
# Identify topics with only a few reviews
minor_topics_list = df_topics.loc[df_topics['Count'] <= 3,"Topic"].tolist()
minor_topics_list

[41,
 0,
 35,
 30,
 23,
 51,
 33,
 42,
 7,
 27,
 13,
 32,
 12,
 1,
 3,
 40,
 36,
 34,
 29,
 46,
 37,
 44,
 24,
 45,
 39,
 38]

In [126]:
pct_outliers = df_topics.loc[df_topics['Topic'] == -1, 'Count'].iloc[0]/df_topics['Count'].sum() * 100
print(f"Percentage of outliers: {pct_outliers:.2f}%")

Percentage of outliers: 22.96%


In [127]:
df_all_text_main['topic_idx'] = topics
df_all_text_main

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,250
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,237
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,60
9482,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni...",Positive,15
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,154
9484,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ...",Positive,122


In [128]:
model.get_topic(52)

[('afs', np.float64(0.29605062653053543)),
 ('duration', np.float64(0.20150715493352367)),
 ('240b', np.float64(0.14102910151992495)),
 ('years', np.float64(0.13236746787553538)),
 ('255b', np.float64(0.09713335003572383)),
 ('securities', np.float64(0.08874884710640989)),
 ('237b', np.float64(0.06965094776412306)),
 ('249b', np.float64(0.06475556669048256)),
 ('257b', np.float64(0.06475556669048256)),
 ('265b', np.float64(0.0390166766650691))]

In [129]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 52, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
Third, we generated 12 basis points from unrealized AFS gains.
-----------------------------------------------------
Third, we benefitted from the impact of lower rates on our AFS investment portfolio, which drove an increase of 20 basis points.
-----------------------------------------------------
AFS: Available for Sale
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $250B in 4Q22 to $240B in 1Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $257B in 4Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $257B in 4Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $227B in 4Q24.
---------------------

In [130]:
topic_summary_df = model.get_topic_info()
topic_summary_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2178,-1_think_re_ve_going,"[think, re, ve, going, see, us, would, bit, th...","[, let 's pivot outlook 2024 , starting nii pa..."
1,0,3,Risk-Weighted Assets,"[prudently, weighted, considering, actively, p...","[rwa : risk-weighted assets, rwa : risk-weight..."
2,1,1,Regulatory Capital,"[45b, operational, 30b, 15b, regulations, mini...",[increase operational risk capital : proposed ...
3,2,194,CET1 Ratio,"[cet1, ratio, capital, standardized, 13, requi...","[cet1 capital ratio - standardized , year-on-y..."
4,3,1,Capital Ratio,"[150, ending, maintained, repurchases, returne...","[maintained strong capital , ending year preli..."
...,...,...,...,...,...
259,258,11,258_nii_94_completeness_bolstered,"[nii, 94, completeness, bolstered, uneven, dir...",[firmwide nii outlook increased $ 94.5 billion...
260,259,26,259_nii_ex_finish_consensus,"[nii, ex, finish, consensus, previously, outlo...","[finish , let 's turn outlook page 8. expect 2..."
261,260,15,260_technology_spend_cyber_development,"[technology, spend, cyber, development, transf...","[finally , 'll note n't actually talked techno..."
262,261,73,261_productivity_savings_expenses_investments,"[productivity, savings, expenses, investments,...","[expenses increased 8 % , driven investments t..."


In [131]:
len(zeroshot_topic_list)

106

In [132]:
# Get unsupervised topics
sel_bool = ((topic_summary_df['Topic'] >= 0) & (~topic_summary_df['Name'].isin(zeroshot_topic_list)))
topic_summary_df.loc[sel_bool, ['Topic','Count','Name']]

Unnamed: 0,Topic,Count,Name
53,52,15,52_afs_duration_240b_years
54,53,35,53_thank_thanks_candor_appreciate
55,54,22,54_allocated_tce_average_166b
56,55,84,55_ebt_preceding_trend_decreased
57,56,19,56_ebt_meaningful_change_nm
...,...,...,...
259,258,11,258_nii_94_completeness_bolstered
260,259,26,259_nii_ex_finish_consensus
261,260,15,260_technology_spend_cyber_development
262,261,73,261_productivity_savings_expenses_investments


In [133]:
unsupervised_topics_list = topic_summary_df.loc[sel_bool, 'Topic'].tolist()

In [134]:
df_all_text_main_unsupervised = df_all_text_main[df_all_text_main['topic_idx'].isin(unsupervised_topics_list)]
df_all_text_main_unsupervised

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,250
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,237
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9477,"NIR excluding Markets, preceding year trend: N...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (NIR excluding Markets)\nJPMorgan, Q...",Neutral,196
9480,"Adjusted expense, preceding year trend: Adjust...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted expense)\nJPMorgan, Q4, 20...",Neutral,113
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,60
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,154


In [135]:
from boe_risk_monitoring.llms.processing_llms import TopicLabellingLLM
from pydantic import BaseModel, Field
from typing import List, Literal, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

In [136]:
topic_mapping_dict.keys()

dict_keys(['Capital Adequacy', 'Liquidity Risk', 'Profitability', 'Asset Quality and Credit Risk', 'Macroeconomic Risk', 'Interest Rate Risk', 'Market and Volatility Risk', 'Operational Risk', 'Regulatory & Compliance Risk', 'ESG and Reputation Risk', 'Strategic and Business Model Risk', 'Legal Risk'])

In [137]:
# Create a Pydantic model for the topic label
class TopicLabel(BaseModel):
	topic_label: str
	broad_topic: Optional[Literal['Capital Adequacy', 'Liquidity Risk', 'Profitability', 'Asset Quality and Credit Risk', 'Macroeconomic Risk', 'Interest Rate Risk', 'Market and Volatility Risk', 'Operational Risk', 'Regulatory & Compliance Risk', 'ESG and Reputation Risk', 'Strategic and Business Model Risk', 'Legal Risk']]


In [138]:
def _make_topic_labelling_prompt(docs):
	"""Define a prompt for the topic labelling task."""
	return (
		"You are a financial risk analyst. Below is a topic extracted from earnings calls using topic modeling. Your task is to:\n"
		"1. topic_label: assign a concise and reusable topic label (2–4 words) that captures the subject without adding interpretations or qualifiers (e.g., avoid words like 'significant', 'concerning', or 'not meaningful'). Use neutral phrasing that could apply across quarters or contexts.\n"
		"2. broad_topic: classify the topic under one of the standard financial risk categories.\n"
		"If the topic does not reflect any financial risk (e.g., greetings, procedural comments), set 'broad_topic' to null.\n"
		"Here are the documents that make up the topic:\n"
		"\n---\n".join(docs) + "\n"
	)



In [139]:
df_all_text_main_unsupervised

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,250
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,237
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9477,"NIR excluding Markets, preceding year trend: N...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (NIR excluding Markets)\nJPMorgan, Q...",Neutral,196
9480,"Adjusted expense, preceding year trend: Adjust...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted expense)\nJPMorgan, Q4, 20...",Neutral,113
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,60
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,154


In [140]:
def label_topic_with_llm(topic_idx, topic_docs, llm_backend, llm_model_name):
	"""Label a topic using the LLM."""
	prompt = _make_topic_labelling_prompt(topic_docs)
	topic_label_llm = TopicLabellingLLM(
		topic_labelling_prompt=prompt,
		response_schema=TopicLabel,
		backend=llm_backend,
		model_name=llm_model_name,
	)
	topic_output = topic_label_llm.invoke()
	topic = topic_output.topic_label
	risk_category = topic_output.broad_topic
	print(f"Topic {topic_idx}: {topic} ({risk_category})")
	return {"topic": topic, "risk_category": risk_category}

In [141]:
# Initialize a dataframe to hold topic labels
unsupervised_topics_df = pd.DataFrame(columns=["topic_idx", "topic_label", "risk_category"])

In [142]:
# Create a ThreadPoolExecutor to process docs concurrently
with ThreadPoolExecutor(max_workers=10) as executor:
	futures = {}
	for unsupervised_topic_idx in unsupervised_topics_list:
		# Get the documents for the current topic
		topic_docs = df_all_text_main_unsupervised.loc[df_all_text_main_unsupervised['topic_idx'] == unsupervised_topic_idx, 'text'].tolist()
		future = executor.submit(
			label_topic_with_llm,
			topic_idx=unsupervised_topic_idx,
			topic_docs=topic_docs,
			llm_backend="gemini",
			llm_model_name="gemini-2.5-flash-preview-05-20"
			# llm_backend="openai",
			# llm_model_name="gpt-4o"
		)
		futures[future] = unsupervised_topic_idx

	# Wait for all futures to complete
	for future in as_completed(futures):
		try:
			topic_idx = futures[future]
			res = future.result()  # This will raise an exception if the processing failed
			topic = res['topic']
			risk_category = res['risk_category']
			# Append the result to the labels_df
			unsupervised_topics_df = pd.concat(
				[unsupervised_topics_df, pd.DataFrame({
					"topic_idx": [topic_idx],
					"topic_label": [topic],
					"risk_category": [risk_category]
				})],
				ignore_index=True
			)
		except Exception as e:
			print(f"Error processing {futures[future]}: {e}")

Topic 56: EBT Trend (Profitability)
Topic 59: Credit Costs Trend (Asset Quality and Credit Risk)
Topic 61: ROE Performance Trends (Profitability)
Topic 58: HTM Securities Duration (Interest Rate Risk)
Topic 53: Procedural comments (None)
Topic 54: Allocated TCE Trend (Capital Adequacy)
Topic 55: EBT Trends (Profitability)
Topic 52: AFS Securities Valuation (Interest Rate Risk)
Topic 57: Total and Investment Assets (Strategic and Business Model Risk)
Topic 62: Liquidity Metrics (Liquidity Risk)
Topic 60: Overhead Ratio Analysis (Profitability)
Topic 64: Net Interest Income Trend (Profitability)
Topic 67: ROTCE Performance (Profitability)
Topic 66: Capital & Equity Levels (Capital Adequacy)
Topic 71: ACLL Loan Ratio (Asset Quality and Credit Risk)
Topic 68: Citigold Performance Trends (Strategic and Business Model Risk)
Topic 70: Digital User Growth (None)
Topic 77: Analyst question intro (None)
Topic 72: EOP Loan Balances (Asset Quality and Credit Risk)
Topic 69: AUC/AUA Trend (Strategi

In [143]:
unsupervised_topics_df = unsupervised_topics_df.sort_values(by='topic_idx').reset_index(drop=True)
unsupervised_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category
0,52,AFS Securities Valuation,Interest Rate Risk
1,53,Procedural comments,
2,54,Allocated TCE Trend,Capital Adequacy
3,55,EBT Trends,Profitability
4,56,EBT Trend,Profitability
...,...,...,...
206,258,Net Interest Income,Interest Rate Risk
207,259,NII Guidance,Profitability
208,260,Technology Spend and Transformation,Strategic and Business Model Risk
209,261,Operating Expense Trends,Profitability


In [144]:
# Zeroshot topic idxs and names
zeroshot_topics_df = topic_summary_df.loc[topic_summary_df['Name'].isin(zeroshot_topic_list), ['Topic', 'Name']]
# Rename columns for clarity
zeroshot_topics_df.rename(columns={'Topic': 'topic_idx', 'Name': 'topic_label'}, inplace=True)
# Add risk category based on the topic mapping
zeroshot_topics_df = zeroshot_topics_df.merge(topic_mapping_df, on='topic_label', how='left')
zeroshot_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category
0,0,Risk-Weighted Assets,Capital Adequacy
1,1,Regulatory Capital,Capital Adequacy
2,2,CET1 Ratio,Capital Adequacy
3,3,Capital Ratio,Capital Adequacy
4,4,Tangible Book Value Per Share,Capital Adequacy
5,5,Leverage Ratio,Capital Adequacy
6,6,Supplementary Leverage Ratio,Capital Adequacy
7,7,Capital Constraints,Capital Adequacy
8,8,Basel III,Capital Adequacy
9,9,Liquid Assets,Liquidity Risk


In [215]:
all_topics_df = pd.concat([zeroshot_topics_df, unsupervised_topics_df], ignore_index=True)
all_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category
0,0,Risk-Weighted Assets,Capital Adequacy
1,1,Regulatory Capital,Capital Adequacy
2,2,CET1 Ratio,Capital Adequacy
3,3,Capital Ratio,Capital Adequacy
4,4,Tangible Book Value Per Share,Capital Adequacy
...,...,...,...
258,258,Net Interest Income,Interest Rate Risk
259,259,NII Guidance,Profitability
260,260,Technology Spend and Transformation,Strategic and Business Model Risk
261,261,Operating Expense Trends,Profitability


In [216]:
# Let's take a closer look a this topic
for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 52, "text"].tolist():
	print("-----------------------------------------------------")
	print(text)

-----------------------------------------------------
Third, we generated 12 basis points from unrealized AFS gains.
-----------------------------------------------------
Third, we benefitted from the impact of lower rates on our AFS investment portfolio, which drove an increase of 20 basis points.
-----------------------------------------------------
AFS: Available for Sale
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $250B in 4Q22 to $240B in 1Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $257B in 4Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $257B in 4Q23.
-----------------------------------------------------
AFS Securities (Duration: ~2 Years), preceding quarter trend: Decreased from $227B in 4Q24.
---------------------

In [217]:
# Aid the clustering process by replacing acronyms with their full forms
glossary_df = pd.read_csv(GLOSSARY_FPATH)
glossary_dict = glossary_df.set_index('Term')['Definition'].to_dict()
glossary_dict

{'1H': 'First Half',
 'ACL': 'Allowance for Credit Losses',
 'ACLL': 'Allowance for Credit Losses on Loans',
 'AFS': 'Available for Sale',
 'AI': 'Artificial Intelligence',
 'AML': 'Anti-Money Laundering',
 'AOCI': 'Accumulated Other',
 'API': 'Application Programming Interface',
 'ATM': 'Automated Teller Machine',
 'AUA': 'Assets Under Administration',
 'AUC': 'Assets Under Custody',
 'BCRA': 'Central Bank of Argentina',
 'BHC': 'Bank Holding Company',
 'BOPREAL': 'Bonds for the',
 'bps': 'Basis Points',
 'BSA': 'Bank Secrecy Act',
 'CAGR': 'Compound Annual Growth Rate',
 'CCAR': 'Comprehensive Capital Analysis',
 'CCB': 'Citi Commercial Bank',
 'CECL': 'Current Expected Credit Losses',
 'CET1': 'Common Equity Tier 1',
 'CoC': 'Cost of Credit',
 'CRE': 'Commercial Real Estate',
 'CSO': 'Common Shares Outstanding',
 'CTA': 'Cumulative Translation Adjustment',
 'DCM': 'Debt Capital Markets',
 'DFAST': 'Dodd-Frank Act Stress Test',
 'DIA': 'Deposit Insurance Agency',
 'DM': 'Developed Ma

In [218]:
# Map occurrences of the acronyms to their full forms in the text
def replace_acronyms(text, glossary):
	"""
	Replace acronyms in the text with their full forms based on the provided glossary.
	"""
	for acronym, full_form in glossary.items():
		text = re.sub(r'\b' + re.escape(acronym) + r'\b', full_form, text)
	return text

In [219]:
# Apply the acronym replacement to the t column
all_topics_df['topic_label'] = all_topics_df['topic_label'].apply(lambda x: replace_acronyms(x, glossary_dict))

In [220]:
all_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category
0,0,Risk-Weighted Assets,Capital Adequacy
1,1,Regulatory Capital,Capital Adequacy
2,2,Common Equity Tier 1 Ratio,Capital Adequacy
3,3,Capital Ratio,Capital Adequacy
4,4,Tangible Book Value Per Share,Capital Adequacy
...,...,...,...
258,258,Net Interest Income,Interest Rate Risk
259,259,Net Interest Income Guidance,Profitability
260,260,Technology Spend and Transformation,Strategic and Business Model Risk
261,261,Operating Expense Trends,Profitability


In [221]:
embedding_model_name = "mukaj/fin-mpnet-base"
# embedding_model_name = "FinLang/finance-embeddings-investopedia"
embedding_model = SentenceTransformer(embedding_model_name)

In [222]:
# Let's reduce the number of topics by merging similar topic labels through clustering
all_topic_labels = all_topics_df['topic_label'].tolist()
all_topic_label_embeddings = embedding_model.encode(all_topic_labels, normalize_embeddings=True)

In [223]:
from sklearn.metrics.pairwise import cosine_distances
distance_matrix = cosine_distances(all_topic_label_embeddings)
distance_matrix

array([[0.        , 0.8662983 , 0.89418864, ..., 0.9262887 , 0.8849335 ,
        0.84150475],
       [0.8662983 , 0.        , 0.6472338 , ..., 0.82055473, 0.785542  ,
        0.74531627],
       [0.89418864, 0.6472338 , 0.        , ..., 0.9387093 , 0.94066685,
        0.8823922 ],
       ...,
       [0.9262887 , 0.82055473, 0.9387093 , ..., 0.        , 0.6426339 ,
        0.8145535 ],
       [0.8849335 , 0.785542  , 0.94066685, ..., 0.6426339 , 0.        ,
        0.2514702 ],
       [0.84150475, 0.74531627, 0.8823922 , ..., 0.8145535 , 0.2514702 ,
        0.        ]], shape=(263, 263), dtype=float32)

In [224]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.2, min_samples=2, metric='precomputed') # EPS of 0.2 ensures that only very similar topic labels are clustered together
cluster_labels = dbscan.fit_predict(distance_matrix)
cluster_labels

array([-1,  0, -1, -1, -1,  1,  1, -1,  2, -1, -1,  3, -1, -1, -1,  4, -1,
       -1, -1,  5, -1,  6,  7, -1, -1,  8, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  2, -1, -1, -1, -1,  9, -1, -1, -1,
       -1, -1, -1, -1, 10, 10, -1, -1, 11, -1, -1, -1, -1, 12,  7, -1, 13,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, 14,  6, 13, -1,  3, 12, -1, -1,
       14, 12, 12, -1, -1, 14, -1, 12, -1, -1,  8, 15, -1,  8,  8, -1,  6,
       16, -1, 17, 17, 14, -1, -1, 18, -1,  4, 18, -1, 14, 14, -1, -1, -1,
       -1, 15, -1, 11, 11, -1, 19, 11, 11, 20, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1, 22, 19, -1, 21,  4,  4,
        4, 17, -1, -1, -1,  4,  4, 17, 17, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, 23, -1, -1, 24, -1, -1, -1,  4, 25, -1, 24, -1, -1,
       -1, -1, 23, -1, 16, -1, -1, -1, -1, 20, 20, -1, -1, 15, -1, -1, -1,
       -1, -1, 16, -1, 16, -1, -1, -1, 25, -1, 26,  4, 26, -1,  0, -1, -1,
       -1, -1, -1, -1, -1

In [225]:
cluster_labels.shape

(263,)

In [226]:
# Merge topics which are in the same cluster together
all_topics_df['cluster'] = cluster_labels
all_topics_df.groupby('cluster').size()


cluster
-1     172
 0       4
 1       2
 2       2
 3       2
 4       9
 5       2
 6       3
 7       2
 8       4
 9       2
 10      2
 11      5
 12     10
 13      2
 14      6
 15      3
 16      4
 17      5
 18      3
 19      2
 20      3
 21      2
 22      2
 23      2
 24      2
 25      2
 26      2
 27      2
dtype: int64

In [227]:
all_topics_df[all_topics_df['cluster'] == 27]

Unnamed: 0,topic_idx,topic_label,risk_category,cluster
248,248,Net Interest Income and Guidance,Interest Rate Risk,27
259,259,Net Interest Income Guidance,Profitability,27


In [228]:
all_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category,cluster
0,0,Risk-Weighted Assets,Capital Adequacy,-1
1,1,Regulatory Capital,Capital Adequacy,0
2,2,Common Equity Tier 1 Ratio,Capital Adequacy,-1
3,3,Capital Ratio,Capital Adequacy,-1
4,4,Tangible Book Value Per Share,Capital Adequacy,-1
...,...,...,...,...
258,258,Net Interest Income,Interest Rate Risk,12
259,259,Net Interest Income Guidance,Profitability,27
260,260,Technology Spend and Transformation,Strategic and Business Model Risk,-1
261,261,Operating Expense Trends,Profitability,18


In [229]:
all_topics_df[['topic_idx','topic_label']].duplicated().sum()

np.int64(0)

In [230]:
# Merge topics which are in the same cluster together
def merge_topics(df):
	"""
	Merge topics based on the cluster and risk category.
	"""
	if df['cluster'].iloc[0] == -1:
		# If the cluster is -1, it means it's an outlier, so we don't make any updates
		return df[['topic_idx', 'topic_label']].rename(columns={
			'topic_idx': 'post_merge_topic_idx',
			'topic_label': 'post_merge_topic_label'
		})
	# For other clusters, get the most common topic index and associated topic label
	most_common_topic_idx = df['topic_idx'].mode()[0]
	most_common_topic_label = df['topic_label'].mode()[0]
	return pd.DataFrame({
		'post_merge_topic_idx': [most_common_topic_idx]*len(df),
		'post_merge_topic_label': [most_common_topic_label]*len(df),
	})
merged_df = all_topics_df.groupby(['cluster', 'risk_category'], group_keys=False,sort=False).apply(merge_topics).reset_index(drop=True)
merged_topics_df = pd.concat([all_topics_df.reset_index(drop=True), merged_df], axis=1)
merged_topics_df

Unnamed: 0,topic_idx,topic_label,risk_category,cluster,post_merge_topic_idx,post_merge_topic_label
0,0,Risk-Weighted Assets,Capital Adequacy,-1,0,Risk-Weighted Assets
1,1,Regulatory Capital,Capital Adequacy,0,2,Common Equity Tier 1 Ratio
2,2,Common Equity Tier 1 Ratio,Capital Adequacy,-1,3,Capital Ratio
3,3,Capital Ratio,Capital Adequacy,-1,4,Tangible Book Value Per Share
4,4,Tangible Book Value Per Share,Capital Adequacy,-1,7,Capital Constraints
...,...,...,...,...,...,...
258,258,Net Interest Income,Interest Rate Risk,12,,
259,259,Net Interest Income Guidance,Profitability,27,,
260,260,Technology Spend and Transformation,Strategic and Business Model Risk,-1,,
261,261,Operating Expense Trends,Profitability,18,,


In [231]:
df_all_text_main

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,sentiment,topic_idx
0,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,250
1,"First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
2,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,223
3,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,237
4,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning...",Positive,241
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9481,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,...",Negative,60
9482,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni...",Positive,15
9483,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni...",Neutral,154
9484,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ...",Positive,122
