In [1]:
from collections import Counter, defaultdict
import re
import time
from typing import List, Literal, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import string
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydantic import BaseModel, Field

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from bertopic import BERTopic
from bertopic.representation import TextGeneration
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.cluster import DBSCAN

import boe_risk_monitoring.config as config
from boe_risk_monitoring.llms.processing_llms import TopicLabellingLLM, QuestionAnswerTaggingLLM, EvasivenessTaggingLLM

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rahim1z\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Report the CUDA version torch was built against, then whether a CUDA device is usable
for cuda_info in (torch.version.cuda, torch.cuda.is_available()):
    print(cuda_info)

11.8
True


#### **SET GLOBAL VARIABLES**

In [3]:
# Make sure the working directory is set correctly
# Remember the directory the kernel started in, so that re-running this cell
# always lands two levels above it instead of climbing two more levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
print("Initial working directory: ", _NOTEBOOK_START_DIR)
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))
print("New working directory: ", os.getcwd())

Initial working directory:  c:\python\bank-of-eng-risk-monitoring\boe_risk_monitoring\nlp
New working directory:  c:\python\bank-of-eng-risk-monitoring


In [4]:
# Folder names come from the project config; the root is the current working
# directory (set by the chdir cell above).
ROOT_FPATH = os.getcwd()
DATA_FOLDER = config.DATA_FOLDER
AGGREGATED_DATA_FOLDER_NAME = config.AGGREGATED_DATA_FOLDER_NAME

# Input files
ALL_TEXT_FNAME = "all_text.parquet"
GLOSSARY_FNAME = "glossary_dictionary_citigroup.csv"
ALL_TEXT_FPATH = "{}/{}/{}/{}".format(ROOT_FPATH, DATA_FOLDER, AGGREGATED_DATA_FOLDER_NAME, ALL_TEXT_FNAME)
GLOSSARY_FPATH = "{}/{}/{}".format(ROOT_FPATH, DATA_FOLDER, GLOSSARY_FNAME)

# Switches controlling which stages are recomputed
RERUN_SENTIMENT_ANALYSIS = False
RERUN_UNSUPERVISED_TOPIC_MODELLING = False
RERUN_GUIDED_TOPIC_MODELLING = False
RERUN_ZERO_SHOT_TOPIC_MODELLING = False
RERUN_MULTI_TOPIC_LABELLING = False
RERUN_Q_AND_A_TAGGING = False
RERUN_EVASIVE_ANSWER_DETECTION = True

In [5]:
# Read the glossary CSV and build a Term -> Definition lookup
# (if a Term is repeated, the last Definition wins)
GLOSSARY_DF = pd.read_csv(GLOSSARY_FPATH)
GLOSSARY_DICT = dict(zip(GLOSSARY_DF['Term'], GLOSSARY_DF['Definition']))

#### **SETUP**

In [6]:
# Load the aggregated text table from parquet and display it
df_all_text = pd.read_parquet(path=ALL_TEXT_FPATH)
df_all_text

Unnamed: 0,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source
0,"Hello, and welcome to Citi's First Quarter 202...",quarter,Operator,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Operator (Host)\nCitigroup, Q1, 2023 Earnings ..."
1,"Ms. Landis, you may begin.",quarter,Operator,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Operator (Host)\nCitigroup, Q1, 2023 Earnings ..."
2,"Thank you, operator. Good morning and thank yo...",quarter,Jennifer Landis,Host,1,Disclaimer,Q1_2023,2023-04-14,Citigroup,transcript,"Jennifer Landis (Host)\nCitigroup, Q1, 2023 Ea..."
3,"With that, I'll turn it over to Jane.",quarter,Jennifer Landis,Host,1,Introduction,Q1_2023,2023-04-14,Citigroup,transcript,"Jennifer Landis (Host)\nCitigroup, Q1, 2023 Ea..."
4,"Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,Q1_2023,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
...,...,...,...,...,...,...,...,...,...,...,...
9654,"Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,..."
9655,"Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni..."
9656,"Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni..."
9657,"Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,Q4_2024,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ..."


In [7]:
# Let's process the reporting period to give something which sorts chronologically,
# e.g. "Q1_2023" -> "2023Q1".
# The swap is anchored on a single "<part>_<part>" pattern, so values that have
# already been converted (no underscore) are left untouched and the cell can be
# re-run safely.
df_all_text['reporting_period'] = df_all_text['reporting_period'].str.replace(
    r'^([^_]+)_([^_]+)$', r'\2\1', regex=True
)
df_all_text['reporting_period'].value_counts()

reporting_period
2024Q3    1220
2023Q1    1205
2023Q4    1182
2024Q2    1175
2024Q1    1091
2023Q2    1070
2025Q1    1013
2023Q3    1003
2024Q4     700
Name: count, dtype: int64

In [8]:
# Proportion of rows per section label (missing labels counted too), top 20
section_shares = df_all_text['section'].value_counts(normalize=True, dropna=False)
section_shares.head(20)

section
Financial Results    0.472927
Q and A              0.261725
Prepared remarks     0.165442
Footnotes            0.041309
Outlook              0.018946
Vision               0.010871
Glossary             0.010871
Introduction         0.006626
Conclusion           0.005798
Disclaimer           0.003934
Title                0.001553
Name: proportion, dtype: float64

In [9]:
# Remove rows whose section is Introduction, Conclusion, Disclaimer or Title,
# then renumber the index from zero
boilerplate_sections = ['Introduction', 'Conclusion', 'Disclaimer', 'Title']
is_boilerplate = df_all_text['section'].isin(boilerplate_sections)
df_all_text_main = df_all_text.loc[~is_boilerplate].reset_index(drop=True)

In [10]:
# Show how many rows the section filter removed
print("Rows before dropping:", len(df_all_text))
print("Rows after dropping:", len(df_all_text_main))

Rows before dropping: 9659
Rows after dropping: 9486


In [11]:
# Number of rows with a missing text value
df_all_text_main['text'].isnull().sum()

np.int64(0)

#### **TEXT CLEANING**

In [12]:
# Rename text column as orig_text
# Guarded so that re-running this cell after the cleaned 'text' column has been
# added does not rename that column too and create a duplicate 'orig_text'.
if 'orig_text' not in df_all_text_main.columns:
    df_all_text_main = df_all_text_main.rename(columns={'text': 'orig_text'})

In [13]:
# Map occurrences of the acronyms to their full forms in the text
def replace_acronyms(text, glossary=GLOSSARY_DICT):
	"""
	Replace acronyms in the text with their full forms based on the provided glossary.

	Matching is case-sensitive and whole-token: an acronym is only replaced when it
	is not directly preceded or followed by a word character. Replacements are applied
	one glossary entry at a time, in the glossary's iteration order.

	Args:
		text: The string to process.
		glossary: Mapping of acronym -> full form. Entries whose key or value is not
			a string (e.g. NaN from a CSV with blank cells) or whose key is empty are skipped.

	Returns:
		The text with every matched acronym replaced by its full form.
	"""
	for acronym, full_form in glossary.items():
		if not isinstance(acronym, str) or not isinstance(full_form, str) or not acronym:
			continue
		# Lookarounds instead of \b so acronyms that start/end with punctuation
		# (e.g. "U.S.") still match when followed by a space.
		pattern = r'(?<!\w)' + re.escape(acronym) + r'(?!\w)'
		# A callable replacement inserts full_form literally; a plain string would have
		# its backslashes interpreted as escapes / group references by re.sub.
		text = re.sub(pattern, lambda _match, _full=full_form: _full, text)

	return text

In [14]:
def clean_text_func(text_list_raw):
    """
    Cleans a list of raw text by expanding glossary acronyms, converting to
    lowercase, tokenising and filtering out English stop words.

    Entries that are not strings (e.g. None or float NaN), are empty, or are the
    literal string "nan" are skipped, so the output can be shorter than the input.

    Args:
        text_list_raw: List of raw text strings.

    Returns:
        text_list_clean: List of cleaned text strings (remaining tokens joined by spaces).
    """
    stop_words = set(stopwords.words('english'))
    text_list_clean = []

    for text in text_list_raw:
        # A float NaN is truthy and has no .lower(), so check the type explicitly
        if not isinstance(text, str) or not text or text.lower() == "nan":
            continue
        # Expand acronyms before lowercasing: the glossary match is case-sensitive
        lowered_text = replace_acronyms(text).lower()
        kept_tokens = [w for w in word_tokenize(lowered_text) if w not in stop_words]
        text_list_clean.append(" ".join(kept_tokens))

    return text_list_clean

In [15]:
def clean_text_func2(text_list_raw):
    """
    Cleans a list of raw text by expanding glossary acronyms and collapsing
    runs of whitespace into single spaces (leading/trailing whitespace removed).
    Case, punctuation and stop words are left untouched.

    Entries that are not strings (e.g. None or float NaN), are empty, or are the
    literal string "nan" are skipped, so the output can be shorter than the input.

    Args:
        text_list_raw: List of raw text strings.

    Returns:
        text_list_clean: List of cleaned text strings.
    """
    text_list_clean = []

    for text in text_list_raw:
        # A float NaN is truthy and has no .lower(), so check the type explicitly
        if not isinstance(text, str) or not text or text.lower() == "nan":
            continue
        cleaned_text = replace_acronyms(text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        text_list_clean.append(cleaned_text)

    return text_list_clean

In [16]:
# Conduct some basic text cleaning
clean_text = clean_text_func2(df_all_text_main['orig_text'].tolist())
# clean_text_func2 skips empty / "nan" entries, so make sure nothing was dropped
# before attaching the result row-by-row to the DataFrame.
assert len(clean_text) == len(df_all_text_main), (
    f"Cleaning returned {len(clean_text)} texts for {len(df_all_text_main)} rows; "
    "some rows were skipped and the cleaned text would be misaligned"
)
if 'text' in df_all_text_main.columns:
    # Re-run of this cell: overwrite instead of insert, which raises if the column exists
    df_all_text_main['text'] = clean_text
else:
    df_all_text_main.insert(1, 'text', clean_text)  # Insert cleaned text at index 1
df_all_text_main

Unnamed: 0,orig_text,text,fiscal_period_ref,speaker,role,page,section,reporting_period,date_of_earnings_call,bank,document_type,source
0,"Thank you, Jenn, hello to everyone joining us ...","Thank you, Jenn, hello to everyone joining us ...",quarter,Jane Fraser,CEO,1,Prepared remarks,2023Q1,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
1,"First, our banking system as a whole is very s...","First, our banking system as a whole is very s...",quarter,Jane Fraser,CEO,1,Prepared remarks,2023Q1,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
2,The U.S. system comprises a healthy mix of com...,The U.S. system comprises a healthy mix of com...,quarter,Jane Fraser,CEO,1,Prepared remarks,2023Q1,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
3,I am pleased that Citi has been a source of st...,I am pleased that Citi has been a source of st...,quarter,Jane Fraser,CEO,1,Prepared remarks,2023Q1,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
4,We are in a position to play this role because...,We are in a position to play this role because...,quarter,Jane Fraser,CEO,1,Prepared remarks,2023Q1,2023-04-14,Citigroup,transcript,"Jane Fraser (CEO)\nCitigroup, Q1, 2023 Earning..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9481,"Adjusted overhead ratio, preceding year trend:...","Adjusted overhead ratio, preceding year trend:...",year,,,5,Financial Results,2024Q4,2025-01-15,JPMorgan,presentation,"Table Row (Adjusted overhead ratio)\nJPMorgan,..."
9482,"Revenue, preceding year trend: Revenue increas...","Revenue, preceding year trend: Revenue increas...",year,,,8,Financial Results,2024Q4,2025-01-15,JPMorgan,presentation,"Table Row (Revenue)\nJPMorgan, Q4, 2024, Earni..."
9483,"Expense, preceding year trend: Expense decreas...","Expense, preceding year trend: Expense decreas...",year,,,8,Financial Results,2024Q4,2025-01-15,JPMorgan,presentation,"Table Row (Expense)\nJPMorgan, Q4, 2024, Earni..."
9484,"Credit costs, preceding year trend: Credit cos...","Credit costs, preceding year trend: Credit cos...",year,,,8,Financial Results,2024Q4,2025-01-15,JPMorgan,presentation,"Table Row (Credit costs)\nJPMorgan, Q4, 2024, ..."


#### **SENTIMENT ANALYSIS**

In [17]:
def run_sentiment_analysis_on_gpu(text_subset, model_name, device_id):
	"""
	Run sentiment analysis on a subset of text using a specified model and device.

	Args:
		text_subset: List of text strings to classify.
		model_name: Name of the sequence-classification model (and its tokenizer) to load.
		device_id: Device passed to the text-classification pipeline.

	Returns:
		List with one pipeline result per input text, in input order.
	"""
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)

	clf = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_id)

	results = []
	batch_size = 32
	total = len(text_subset)

	for i in range(0, total, batch_size):
		batch = text_subset[i:i+batch_size]
		# Inputs longer than 512 tokens are truncated
		batch_results = clf(batch, truncation=True, max_length=512)
		results.extend(batch_results)
		# Cap the counter so the last (partial) batch does not report more than the total
		print(f"Device {device_id}: Processed {min(i + batch_size, total)} reviews out of {total}")

	return results

In [18]:
if RERUN_SENTIMENT_ANALYSIS:
    # We'll use finbert-tone for sentiment analysis which is the finetuned version of BERT for financial sentiment analysis
    model_name = "yiyanghkust/finbert-tone"

    # Check max token length for the model
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    print("Max token length of model:", model.config.max_position_embeddings)

    # Find max token length in the dataset
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_tokens = df_all_text_main["text"].apply(
        lambda x: len(tokenizer(x, truncation=False)["input_ids"])
    ).max()
    print("Estimated max tokens in dataset:", max_tokens)

    # Split the texts into two parts for parallel processing (2x GPUs: devices 0 and 1)
    text_list = df_all_text_main['text'].tolist()
    # Split list into 2 roughly equal parts
    midpoint = len(text_list) // 2
    text_list_split1 = text_list[:midpoint]
    text_list_split2 = text_list[midpoint:]

    # Run sentiment analysis on both subsets in parallel using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_0 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split1, model_name, 0)
        future_1 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split2, model_name, 1)

        # Concatenate in split order so results line up with the DataFrame rows
        results_0 = future_0.result()
        results_1 = future_1.result()
        results = results_0 + results_1
    # Postprocess results and add to DataFrame
    results_clean = [d['label'] for d in results]
    df_all_text_main['sentiment'] = results_clean
    # Translate generic LABEL_n names, should the pipeline emit them.
    # The finbert-tone model card documents: LABEL_0 = neutral, LABEL_1 = positive, LABEL_2 = negative.
    df_all_text_main['sentiment'] = df_all_text_main['sentiment'].replace(
        {'LABEL_0': 'neutral', 'LABEL_1': 'positive', 'LABEL_2': 'negative'},
    )
    # Save the results to a CSV file
    df_all_text_main.to_csv("sentiment_analysis_results.csv", index=False)
else:
    # Load the results from the CSV file (this replaces the in-memory DataFrame)
    df_all_text_main = pd.read_csv("sentiment_analysis_results.csv")

#### **TOPIC MODELLING - UNSUPERVISED**

In [19]:
# Unsupervised BERTopic run: fit once with default keyword representations to inspect
# the topics, then fit again with an LLM representation model to get readable labels.
# Only executed when RERUN_UNSUPERVISED_TOPIC_MODELLING is True.
if RERUN_UNSUPERVISED_TOPIC_MODELLING:
	# We'll use Fin-MPNET-Base for the embedding model in BERTopic as it has been tuned on financial data
	embedding_model_name = "mukaj/fin-mpnet-base"
	embedding_model = SentenceTransformer(embedding_model_name)
	# BERTopic uses UMAP, a dimensionality reduction technique, to reduce the dimensionality of the embeddings before clustering
	# UMAP introduces stochastic behaviour so we'll set a random seed for reproducibility
	# https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#preventing-stochastic-behavior
	umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # These are the default parameters for UMAP used in _bertopic.py with the additional parameter of random_state = 0

	# Run BERTopic to extract topics from the cleaned text
	# NOTE(review): fit() followed by transform() on the same documents processes the corpus
	# twice; confirm whether fit_transform() assignments were intended instead.
	model = BERTopic(verbose=True, embedding_model=embedding_model, umap_model=umap_model)
	model.fit(clean_text)
	topics, probabilities = model.transform(clean_text)

	# Topic sizes as reported by the fitted model; topic -1 is treated as the outlier bucket below
	df_topics = model.get_topic_freq()
	print("Top 10 topics and outliers: ", df_topics.head(11))
	pct_outliers = df_topics.loc[df_topics['Topic'] == -1, 'Count'].iloc[0]/df_topics['Count'].sum() * 100
	print(f"Percentage of outliers: {pct_outliers:.2f}%")

	# Attach the per-document assignment; assumes clean_text is row-aligned with df_all_text_main
	df_all_text_main['topic_idx'] = topics

	print("---------------------TOPIC 0 ---------------------")
	print(model.get_topic(0))
	# Let's take a closer look at this topic
	for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 0, "text"].tolist():
		print("-----------------------------------------------------")
		print(text)

	print("---------------------TOPIC 1 ---------------------")
	print(model.get_topic(1))
	# Let's take a closer look at this topic
	for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 1, "text"].tolist():
		print("-----------------------------------------------------")
		print(text)

	# We can use a Hugging Face text generation model along with BERTopic's representation_model wrapper and pass this into BERTopic to summarise the topics for us
	# Since we'll be using Microsoft Phi 4 later, we'll use it here too.
	generator = pipeline(
		"text-generation",
		model="microsoft/Phi-4-mini-instruct",
		model_kwargs={"torch_dtype": "auto"},
		device_map="auto",
	)
	# Create prompt template to pass into BERTopic TextGeneration wrapper
	# NOTE(review): the prompt talks about "customer reviews" although the documents come from
	# earnings-call material - looks like a leftover from another project; confirm the wording.
	prompt_template = """<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]
	The topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>"""

	# Let's create the BERTopic TextGeneration wrapper now
	representation_model = TextGeneration(
		model=generator,
		prompt=prompt_template,
		pipeline_kwargs={"max_new_tokens": 200},
		random_state=0,
	)

	# Now let's rerun the BERTopic model exactly as before except this time we'll include the Phi 4 representation model above which will provide clean, interpretable topic labels for us
	umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # These are the default parameters for UMAP used in _bertopic.py with the additional parameter of random_state = 0
	model = BERTopic(verbose=True, embedding_model=embedding_model, umap_model=umap_model, representation_model=representation_model)
	model.fit(clean_text)
	topics, probabilities = model.transform(clean_text)

	# Let's print out the topic summaries generated by Phi 4.
	# NOTE(review): max_topic_idx is read from the 'topic_idx' column written by the FIRST model
	# above; the second model's `topics` are not written back - presumably both runs give the
	# same topics because of the fixed seed, verify.
	topic_summaries_dict = defaultdict(list)
	max_topic_idx = df_all_text_main['topic_idx'].max()
	for i in range(max_topic_idx + 1):
		topic_summaries_dict["topic_idx"].append(i)
		topic_summaries_dict["reviews"].append(model.get_topic_freq(i))
		topic_summaries_dict["summary"].append(model.topic_representations_[i][0][0])
		print("-----------------------------------------------------")
		print(f"Topic {i}:")
		print(f"Number of reviews: {model.get_topic_freq(i)}")
		print(f"Summary: {model.topic_representations_[i][0][0]}")

	# One row per topic: index, document count and generated summary
	df_topic_summaries = pd.DataFrame(topic_summaries_dict)


In [20]:
if RERUN_UNSUPERVISED_TOPIC_MODELLING:
	# Plot the topics
	# A figure returned inside an `if` block is not auto-displayed by the notebook, so show it explicitly
	model.visualize_topics().show()


In [21]:
if RERUN_UNSUPERVISED_TOPIC_MODELLING:
	# Bar chart per topic; shown explicitly because values inside an `if` block are not auto-displayed
	model.visualize_barchart().show()

In [22]:
if RERUN_UNSUPERVISED_TOPIC_MODELLING:
	# Heatmap for the top 10 topics; shown explicitly because values inside an `if` block are not auto-displayed
	model.visualize_heatmap(top_n_topics=10).show()

#### **TOPIC MODELLING - SEMI-SUPERVISED - GUIDED**

In [23]:
# Specify some seed topics
# One list of lowercase seed keywords per risk theme; passed as `seed_topic_list`
# to BERTopic in the guided topic modelling cell. The numbered comments give the
# position of each theme in the list.
seed_topic_list = [
    # 0. Capital Adequacy
    ["capital", "tier 1", "tier 2", "risk-weighted assets", "capital buffer", "regulatory capital", "cet1", "capital ratio", "tangible book value", "leverage ratio", "supplementary leverage ratio", "leverage exposure", "capital constraints", "basel iii"],

    # 1. Liquidity Risk
    ["liquidity", "cash", "short-term funding", "liquid assets", "deposit outflows", "liquidity coverage ratio", "cash reserves"],

    # 2. Profitability
    ["profit", "earnings", "revenue", "margin", "return on equity", "net income", "operating income", "eps", "return on tangible common equity", "efficiency ratio"],

    # 3. Asset Quality / Credit Risk (NPLs)
    ["non-performing loans", "credit risk", "default", "delinquencies", "loan loss provisions", "impairment", "charge-offs", "allowance for credit losses", "write-downs", "reserve to funded loans"],

    # 4. Macroeconomic Risk / Interest Rates
    ["interest rates", "rate hikes", "monetary policy", "inflation", "yield curve", "economic outlook", "central bank", "tariffs"],

    # 5. Market Risk / Volatility
    ["market volatility", "value at risk", "trading losses", "asset prices", "derivatives", "hedging", "market downturn"],

    # 6. Operational Risk / Technology
    ["cybersecurity", "system failure", "fraud", "data breach", "internal control", "technology risk", "disruption"],

    # 7. Regulatory & Compliance
    ["regulatory", "compliance", "supervisory", "basel", "reporting standards", "audit", "oversight"],

    # 8. ESG / Reputation Risk
    ["sustainability", "climate risk", "reputation", "governance", "social responsibility", "stakeholders", "diversity"],

    # 9. Capital Returns / Shareholder Value
    ["dividends", "share buybacks", "capital return", "payout ratio", "shareholder value", "stock repurchase"],

    # 10. Strategic Risk / Business Model
    ["business strategy", "growth plans", "restructuring", "core business", "competitive advantage", "market positioning"]
]

In [24]:
if RERUN_GUIDED_TOPIC_MODELLING:
    # Build the embedding and UMAP models here instead of relying on the unsupervised
    # cell having run: with RERUN_UNSUPERVISED_TOPIC_MODELLING = False these names
    # would otherwise be undefined on a fresh kernel.
    embedding_model = SentenceTransformer("mukaj/fin-mpnet-base")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # BERTopic's default UMAP parameters plus random_state = 0 for reproducibility

    # Run BERTopic to extract topics from the cleaned text, guided by the seed topics
    model = BERTopic(
        verbose=True,
        nr_topics=50,
        seed_topic_list=seed_topic_list,
        embedding_model=embedding_model,
        umap_model=umap_model
        )
    model.fit(clean_text)
    topics, probabilities = model.transform(clean_text)

    # Topic sizes as reported by the fitted model; topic -1 is treated as the outlier bucket below
    df_topics = model.get_topic_freq()
    print(df_topics.head(50))

    pct_outliers = df_topics.loc[df_topics['Topic'] == -1, 'Count'].iloc[0]/df_topics['Count'].sum() * 100
    print(f"Percentage of outliers: {pct_outliers:.2f}%")

    # Attach the per-document assignment; assumes clean_text is row-aligned with df_all_text_main
    df_all_text_main['topic_idx'] = topics

    print("---------------------TOPIC 0 ---------------------")
    print(model.get_topic(0))
    # Let's take a closer look at this topic
    for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 0, "text"].tolist():
        print("-----------------------------------------------------")
        print(text)

    print("---------------------TOPIC 1 ---------------------")
    print(model.get_topic(1))
    # Let's take a closer look at this topic
    for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == 1, "text"].tolist():
        print("-----------------------------------------------------")
        print(text)

    # We can use a Hugging Face text generation model along with BERTopic's representation_model wrapper and pass this into BERTopic to summarise the topics for us
    # Since we'll be using Microsoft Phi 4 later, we'll use it here too.
    generator = pipeline(
        "text-generation",
        model="microsoft/Phi-4-mini-instruct",
        model_kwargs={"torch_dtype": "auto"},
        device_map="auto",
    )

    # Create prompt template to pass into BERTopic TextGeneration wrapper
    # NOTE(review): the prompt talks about "customer reviews" although the documents come from
    # earnings-call material - looks like a leftover from another project; confirm the wording.
    prompt_template = """<|system|>You are a helpful assistant who can succinctly describe the main topic covered by a set of customer reviews provided to you<|end|><|user|>I have a topic that contains the following documents: \n[DOCUMENTS]
    The topic is described by the following keywords: [KEYWORDS] Based on the above information, can you give a short label of the topic?<|end|><|assistant|>"""
    # (A stray generator(prompt_template, ...) call was removed here: it ran a full
    # generation on the unfilled template and discarded the result.)

    # Let's create the BERTopic TextGeneration wrapper now
    representation_model = TextGeneration(
        model=generator,
        prompt=prompt_template,
        pipeline_kwargs={"max_new_tokens": 200},
        random_state=0,
    )

    # Now let's rerun the BERTopic model exactly as before except this time we'll include the Phi 4 representation model above which will provide clean, interpretable topic labels for us
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0) # BERTopic's default UMAP parameters plus random_state = 0 for reproducibility
    model = BERTopic(
        verbose=True,
        nr_topics= 50,
        seed_topic_list=seed_topic_list,
        embedding_model=embedding_model,
        umap_model=umap_model,
        representation_model=representation_model
    )
    model.fit(clean_text)
    topics, probabilities = model.transform(clean_text)

    # Let's print out the topic summaries generated by Phi 4.
    # The topic range is taken from the 'topic_idx' column written after the first fit above.
    topic_summaries_dict = defaultdict(list)
    max_topic_idx = df_all_text_main['topic_idx'].max()
    for i in range(max_topic_idx + 1):
        topic_size = model.get_topic_freq(i)
        topic_summary = model.topic_representations_[i][0][0]
        topic_summaries_dict["topic_idx"].append(i)
        topic_summaries_dict["reviews"].append(topic_size)
        topic_summaries_dict["summary"].append(topic_summary)
        print("-----------------------------------------------------")
        print(f"Topic {i}:")
        print(f"Number of reviews: {topic_size}")
        print(f"Summary: {topic_summary}")

    # One row per topic: index, document count and generated summary
    df_topic_summaries = pd.DataFrame(topic_summaries_dict)




In [25]:
if RERUN_GUIDED_TOPIC_MODELLING:
	# Plot the topics
	# A figure returned inside an `if` block is not auto-displayed by the notebook, so show it explicitly
	model.visualize_topics().show()

In [26]:
if RERUN_GUIDED_TOPIC_MODELLING:
	# Bar chart per topic; shown explicitly because values inside an `if` block are not auto-displayed
	model.visualize_barchart().show()

In [27]:
if RERUN_GUIDED_TOPIC_MODELLING:
	# Heatmap for the top 10 topics; shown explicitly because values inside an `if` block are not auto-displayed
	model.visualize_heatmap(top_n_topics=10).show()

#### **TOPIC MODELLING - SEMI-SUPERVISED - ZERO-SHOT**

In [28]:
# Broad risk category -> fine-grained topic labels.
# The labels are flattened into topic_mapping_df and used as the zero-shot topic list;
# the keys match the categories allowed by TopicLabel.broad_topic.
topic_mapping_dict = {
    "Capital Adequacy": [
        "Capital", "Tier 1 Capital", "Tier 2 Capital", "Risk-Weighted Assets", "Capital Buffer",
        "Regulatory Capital", "CET1 Ratio", "Capital Ratio", "Tangible Book Value Per Share",
        "Leverage Ratio", "Supplementary Leverage Ratio", "Leverage Exposure", "Capital Constraints", "Basel III"
    ],

    "Liquidity Risk": [
        "Liquidity Risk", "Cash Position", "Short-Term Funding", "Liquid Assets", "Deposit Outflows",
        "Liquidity Coverage Ratio", "Cash Reserves"
    ],

    "Profitability": [
        "Profitability", "Earnings Performance", "Revenue Growth", "Profit Margin", "Net Interest Margin",
        "Return On Equity", "Net Income", "Operating Income", "Earnings Per Share",
        "Return On Tangible Common Equity", "Efficiency Ratio"
    ],

    "Asset Quality and Credit Risk": [
        "Non-Performing Loans", "Credit Risk", "Loan Defaults", "Delinquencies", "Loan Loss Provisions",
        "Impairments", "Charge-Offs", "Allowance For Credit Losses", "Loan Write-Downs",
        "Reserve To Funded Loans", "Stage 3 Loans"
    ],

    "Macroeconomic Risk": [
        "Rate Hikes", "Monetary Policy", "Inflation", "Economic Slowdown", "Central Bank Policy",
        "Unemployment", "Tariffs", "Macroeconomic Outlook", "Geopolitical Risk"
    ],

    "Interest Rate Risk": [
        "Interest Rate Risk", "Banking Book Interest Rate Risk", "IRRBB", "Net Interest Margin Sensitivity",
        "Repricing Gap", "Duration Mismatch", "Yield Curve Exposure", "Interest Rate Sensitivity",
        "Rate Shock Scenarios", "Basis Risk"
    ],

    "Market and Volatility Risk": [
        "Market Volatility", "Value At Risk", "Trading Losses", "Asset Price Fluctuation",
        "Derivative Exposure", "Hedging Strategy", "Market Downturn"
    ],

    "Operational Risk": [
        "Cybersecurity Threat", "System Failure", "Fraud Risk", "Data Breach",
        "Internal Controls", "Technology Risk", "Operational Disruption"
    ],

    "Regulatory & Compliance Risk": [
        "Regulatory Requirements", "Compliance Risk", "Supervisory Review", "Basel Framework",
        "Reporting Standards", "Audit Finding", "Regulatory Oversight"
    ],

    "ESG and Reputation Risk": [
        "Sustainability Goals", "Climate Risk", "Reputation Risk", "Corporate Governance",
        "Social Responsibility", "Stakeholder Engagement", "Diversity And Inclusion", "Community Impact"
    ],

    "Strategic and Business Model Risk": [
        "Business Strategy", "Growth Plans", "Corporate Restructuring", "Core Business Focus",
        "Competitive Positioning", "Market Entry Strategy"
    ],

    "Legal Risk": [
        "Litigation Risk", "Lawsuit", "Legal Proceedings", "Class Action",
        "Settlement", "Regulatory Investigation", "Legal Exposure", "Contractual Dispute", "Fines And Penalties"
    ]
}

In [29]:
# Flatten the category -> labels mapping into one row per (risk_category, topic_label).
# Built from a list of records in one go rather than concatenating a one-row
# DataFrame per label, which copies the growing frame on every iteration.
topic_mapping_df = pd.DataFrame(
	[
		{"risk_category": risk_category, "topic_label": keyword}
		for risk_category, keywords in topic_mapping_dict.items()
		for keyword in keywords
	],
	columns=["risk_category", "topic_label"],
)
topic_mapping_df

Unnamed: 0,risk_category,topic_label
0,Capital Adequacy,Capital
1,Capital Adequacy,Tier 1 Capital
2,Capital Adequacy,Tier 2 Capital
3,Capital Adequacy,Risk-Weighted Assets
4,Capital Adequacy,Capital Buffer
...,...,...
101,Legal Risk,Settlement
102,Legal Risk,Regulatory Investigation
103,Legal Risk,Legal Exposure
104,Legal Risk,Contractual Dispute


In [30]:
# Create a Pydantic model for the topic label
# Used as the response schema when asking the LLM to label a topic.
# NOTE(review): documented with '#' comments rather than a class docstring because a
# docstring may end up in the generated schema - confirm before converting.
class TopicLabel(BaseModel):
	# Short free-text label for the topic
	topic_label: str
	# One of the fixed broad risk categories, or None when the topic is not a financial risk
	broad_topic: Optional[
		Literal[
			'Capital Adequacy',
			'Liquidity Risk',
			'Profitability',
			'Asset Quality and Credit Risk',
			'Macroeconomic Risk',
			'Interest Rate Risk',
			'Market and Volatility Risk',
			'Operational Risk',
			'Regulatory & Compliance Risk',
			'ESG and Reputation Risk',
			'Strategic and Business Model Risk',
			'Legal Risk',
		]
	]


In [31]:
def _make_topic_labelling_prompt(docs):
	"""Define a prompt for the topic labelling task."""
	return (
		"You are a financial risk analyst. Below is a topic extracted from earnings calls using topic modeling. Your task is to:\n"
		"1. topic_label: assign a concise and reusable topic label (2–4 words) that captures the subject without adding interpretations or qualifiers (e.g., avoid words like 'significant', 'concerning', or 'not meaningful'). Use neutral phrasing that could apply across quarters or contexts.\n"
		"2. broad_topic: classify the topic under one of the standard financial risk categories.\n"
		"If the topic does not reflect any financial risk (e.g., greetings, procedural comments), set 'broad_topic' to null.\n"
		"Avoid mentioning any specific banks in the topic labels e.g. don't mention Citigroup, JPMorgan etc.\n"
		"Here are the documents that make up the topic:\n"
		"\n---\n".join(docs) + "\n"
	)



In [32]:
def label_topic_with_llm(topic_idx, topic_docs, llm_backend, llm_model_name):
	"""
	Ask the LLM for a label and broad risk category for one topic.

	Builds the labelling prompt from topic_docs, invokes TopicLabellingLLM with the
	TopicLabel response schema, prints the outcome and returns it.

	Args:
		topic_idx: Identifier of the topic (only used in the printed message).
		topic_docs: Documents belonging to the topic.
		llm_backend: Backend passed through to TopicLabellingLLM.
		llm_model_name: Model name passed through to TopicLabellingLLM.

	Returns:
		Dict with keys "topic" (the label) and "risk_category" (the broad topic).
	"""
	labeller = TopicLabellingLLM(
		topic_labelling_prompt=_make_topic_labelling_prompt(topic_docs),
		response_schema=TopicLabel,
		backend=llm_backend,
		model_name=llm_model_name,
	)
	response = labeller.invoke()
	label, category = response.topic_label, response.broad_topic
	print(f"Topic {topic_idx}: {label} ({category})")
	return {"topic": label, "risk_category": category}

In [33]:
# Merge topics which are in the same cluster together
def merge_topics(df: pd.DataFrame) -> pd.DataFrame:
	"""
	Collapse the topics of one group onto a single representative topic.

	Intended to be used with `groupby(...).apply(...)`, one call per group.

	Args:
		df: Rows of one group. Must contain the columns 'cluster', 'topic_idx'
			and 'topic_label'. The cluster of the group is read from its first row.

	Returns:
		DataFrame indexed like `df` with the columns 'post_merge_topic_idx' and
		'post_merge_topic_label'. For cluster -1 (outliers) every row keeps its own
		topic; otherwise every row gets the most common topic index of the group
		together with the label that belongs to that index.
	"""
	output_cols = ['post_merge_topic_idx', 'post_merge_topic_label']
	if df.empty:
		# Nothing to merge; return an empty, correctly shaped frame instead of failing on iloc[0]
		return pd.DataFrame(columns=output_cols, index=df.index)
	if df['cluster'].iloc[0] == -1:
		# If the cluster is -1, it means it's an outlier, so we don't make any updates
		return df[['topic_idx', 'topic_label']].rename(columns={
			'topic_idx': 'post_merge_topic_idx',
			'topic_label': 'post_merge_topic_label'
		})
	# For other clusters, pick the most common topic index ...
	representative_idx = df['topic_idx'].mode()[0]
	# ... and the label that goes with that index. Taking the mode of the labels
	# independently can pair the index of one topic with the label of another
	# (e.g. when every topic occurs once, both modes just return the smallest value).
	representative_label = df.loc[df['topic_idx'] == representative_idx, 'topic_label'].iloc[0]
	return pd.DataFrame({
		'post_merge_topic_idx': [representative_idx]*len(df),
		'post_merge_topic_label': [representative_label]*len(df),
	}, index=df.index)

In [34]:
# Zero-shot topic modelling with BERTopic, LLM labelling of the remaining (unsupervised)
# topics, and merging of near-duplicate topic labels. The result is cached in
# "zero_shot_topic_modelling_with_merging.csv" and reloaded when the rerun flag is off.
if RERUN_ZERO_SHOT_TOPIC_MODELLING:
	# Now we'll use the topic mapping values in a zero shot topic modelling approach and later map back to the broader topic categories
	zeroshot_topic_list = topic_mapping_df['topic_label'].tolist()

	# Run BERTopic to extract topics from the cleaned text
	model = BERTopic(
		verbose=True,
		# nr_topics=200,
		zeroshot_topic_list=zeroshot_topic_list,
		zeroshot_min_similarity= 0.5,
		embedding_model=embedding_model,
		umap_model=umap_model
		)
	model.fit(clean_text)
	topics, probabilities = model.transform(clean_text)

	df_topics = model.get_topic_freq()

	# Identify topics with only a few documents
	minor_topics_list = df_topics.loc[df_topics['Count'] <= 3,"Topic"].tolist()
	print(minor_topics_list)

	# Share of documents in the outlier topic (-1). Falls back to 0 when there is no
	# outlier topic at all, instead of failing on an empty selection.
	outlier_counts = df_topics.loc[df_topics['Topic'] == -1, 'Count']
	pct_outliers = (outlier_counts.iloc[0] if not outlier_counts.empty else 0)/df_topics['Count'].sum() * 100
	print(f"Percentage of outliers: {pct_outliers:.2f}%")

	df_all_text_main['topic_idx'] = topics

	# Let's take a closer look at the first two topics
	for inspected_topic_idx in (0, 1):
		print(f"---------------------TOPIC {inspected_topic_idx} ---------------------")
		print(model.get_topic(inspected_topic_idx))
		for text in df_all_text_main.loc[df_all_text_main['topic_idx'] == inspected_topic_idx, "text"].tolist():
			print("-----------------------------------------------------")
			print(text)

	topic_summary_df = model.get_topic_info()
	print("Topic Summary DataFrame:")
	print(topic_summary_df)

	# Print number of zero shot topics
	print("Number of zero shot topics: ", len(zeroshot_topic_list))

	# Get unsupervised topics: non-outlier topics whose name is not one of the zero shot labels
	sel_bool = ((topic_summary_df['Topic'] >= 0) & (~topic_summary_df['Name'].isin(zeroshot_topic_list)))
	print("Unsupervised topics:")
	print(topic_summary_df.loc[sel_bool, ['Topic','Count','Name']])
	unsupervised_topics_list = topic_summary_df.loc[sel_bool, 'Topic'].tolist()

	df_all_text_main_unsupervised = df_all_text_main[df_all_text_main['topic_idx'].isin(unsupervised_topics_list)]

	# Collect one record per labelled topic and build the DataFrame once at the end
	# (growing a DataFrame with pd.concat inside the loop is quadratic).
	unsupervised_topic_records = []

	# Create a ThreadPoolExecutor to process docs concurrently
	with ThreadPoolExecutor(max_workers=10) as executor:
		futures = {}
		for unsupervised_topic_idx in unsupervised_topics_list:
			# Get the documents for the current topic
			topic_docs = df_all_text_main_unsupervised.loc[df_all_text_main_unsupervised['topic_idx'] == unsupervised_topic_idx, 'text'].tolist()
			future = executor.submit(
				label_topic_with_llm,
				topic_idx=unsupervised_topic_idx,
				topic_docs=topic_docs,
				llm_backend="gemini",
				llm_model_name="gemini-2.5-flash-preview-05-20"
				# llm_backend="openai",
				# llm_model_name="gpt-4o"
			)
			futures[future] = unsupervised_topic_idx

		# Wait for all futures to complete
		for future in as_completed(futures):
			try:
				topic_idx = futures[future]
				res = future.result()  # This will raise an exception if the processing failed
				unsupervised_topic_records.append({
					"topic_idx": topic_idx,
					"topic_label": res['topic'],
					"risk_category": res['risk_category'],
				})
			except Exception as e:
				# Best effort: a failed topic is reported and simply left unlabelled
				print(f"Error processing {futures[future]}: {e}")

	# Build the labels DataFrame, sorted by topic_idx
	unsupervised_topics_df = (
		pd.DataFrame(unsupervised_topic_records, columns=["topic_idx", "topic_label", "risk_category"])
		.sort_values(by='topic_idx')
		.reset_index(drop=True)
	)

	# Zeroshot topic idxs and names, renamed for clarity, with the columns of
	# topic_mapping_df attached via the shared 'topic_label' column.
	# (rename returns a new frame; inplace=True on a .loc selection can trigger SettingWithCopyWarning)
	zeroshot_topics_df = (
		topic_summary_df.loc[topic_summary_df['Name'].isin(zeroshot_topic_list), ['Topic', 'Name']]
		.rename(columns={'Topic': 'topic_idx', 'Name': 'topic_label'})
		.merge(topic_mapping_df, on='topic_label', how='left')
	)

	# Combine the zeroshot and unsupervised topics into a single DataFrame
	all_topics_df = pd.concat([zeroshot_topics_df, unsupervised_topics_df], ignore_index=True)

	# Let's reduce the number of topics by merging similar topic labels through clustering
	# Aid the topic merging process by replacing acronyms with their full forms
	# Apply the acronym replacement to the topic column
	all_topics_df['topic_label'] = all_topics_df['topic_label'].apply(lambda x: replace_acronyms(x, glossary_dict))

	# Perform clustering to identify similar topic labels.
	# A dedicated variable is used for the label embedder so that `embedding_model`
	# (passed to BERTopic at the top of this cell) is not overwritten; otherwise
	# re-running the cell would hand BERTopic a different model than the first run.
	label_embedding_model_name = "mukaj/fin-mpnet-base"
	# label_embedding_model_name = "FinLang/finance-embeddings-investopedia"
	label_embedding_model = SentenceTransformer(label_embedding_model_name)

	all_topic_labels = all_topics_df['topic_label'].tolist()
	all_topic_label_embeddings = label_embedding_model.encode(all_topic_labels, normalize_embeddings=True)

	distance_matrix = cosine_distances(all_topic_label_embeddings)

	dbscan = DBSCAN(eps=0.25, min_samples=2, metric='precomputed') # A small eps (0.25) ensures that only very similar topic labels are clustered together
	cluster_labels = dbscan.fit_predict(distance_matrix)

	# Merge topics which are in the same cluster (and risk category) together
	all_topics_df['cluster'] = cluster_labels
	merged_df = all_topics_df.groupby(['cluster', 'risk_category'], group_keys=False, dropna=False).apply(merge_topics).reset_index(drop=True)
	merged_topics_df = pd.concat([all_topics_df.reset_index(drop=True), merged_df], axis=1)

	# Print number of unique topics before and after merging
	print("Number of unique topics before merging:", all_topics_df['topic_idx'].nunique())
	print("Number of unique topics after merging:", merged_topics_df['post_merge_topic_idx'].nunique())

	# Merge the main DataFrame with the merged topics DataFrame
	df_all_text_topics = df_all_text_main.merge(
		merged_topics_df[['topic_idx', 'topic_label', 'post_merge_topic_idx', 'post_merge_topic_label', 'risk_category']],
		on='topic_idx',
		how='left'
	)

	# Save the DataFrame with topics to a CSV file
	df_all_text_topics.to_csv("zero_shot_topic_modelling_with_merging.csv", index=False)

else:
	# Load the DataFrame with topics from the CSV file
	df_all_text_topics = pd.read_csv("zero_shot_topic_modelling_with_merging.csv")









#### **MULTI-TOPIC LABELLING**

In [35]:
# Multi-topic labelling: score every text chunk against every merged topic label by
# embedding similarity, roll the scores up to risk categories, and produce
# sentiment-weighted and per-quarter aggregates. Results are written to CSV.
if RERUN_MULTI_TOPIC_LABELLING:
	embedding_model_name = "mukaj/fin-mpnet-base"
	embedding_model = SentenceTransformer(embedding_model_name)
	text_embeddings = embedding_model.encode(df_all_text_topics['text'].tolist())
	topic_labels = df_all_text_topics['post_merge_topic_label'].dropna().unique().tolist()
	topic_embeddings = embedding_model.encode(topic_labels)
	# One row per text chunk, one column per topic label
	similarities_arr = cosine_similarity(text_embeddings, topic_embeddings)
	similarities_df = pd.DataFrame(similarities_arr, columns=topic_labels)
	# Similarities at or below this value are treated as "not relevant" and zeroed
	threshold = 0.35
	# similarities_df_flags = (similarities_df > threshold)*1
	# df_all_text_topics_flags = df_all_text_topics.copy()
	# df_all_text_topics_flags = pd.concat([df_all_text_topics_flags, similarities_df_flags], axis=1)
	# df_all_text_topics_flags.to_csv("df_all_text_topics_flags.csv", index=False)

	similarities_df_relevance = similarities_df[(similarities_df > threshold)].fillna(0).copy()
	df_all_text_topics_relevance = df_all_text_topics.copy()
	# NOTE(review): this column-wise concat aligns on the index, so it assumes
	# df_all_text_topics has the default 0..n-1 index — confirm against the previous cell.
	df_all_text_topics_relevance = pd.concat([df_all_text_topics_relevance, similarities_df_relevance], axis=1)

	# Let's aggregate topic relevancy to the risk category level.
	# Missing labels are dropped: they have no similarity column, so selecting them below would raise a KeyError.
	risk_category_mapping = {
		cat: labels.dropna().unique().tolist()
		for cat, labels in df_all_text_topics_relevance.groupby('risk_category')['post_merge_topic_label']
	}

	for risk_category, cols_to_agg in risk_category_mapping.items():
		new_col_name = "RISK CATEGORY: " + risk_category
		df_all_text_topics_relevance[new_col_name] = df_all_text_topics_relevance[cols_to_agg].sum(axis=1)

	# Save the DataFrame with topic relevance to a CSV file
	df_all_text_topics_relevance.to_csv("multi_topic_modelling_with_relevance.csv", index=False)

	# Column names for subtopics and risk categories
	subtopic_cols = similarities_df.columns.tolist()
	risk_category_cols = [col for col in df_all_text_topics_relevance.columns if col.startswith("RISK CATEGORY")]

	# # Normalize the topic relevance scores so rows add up to 1
	# df_all_text_topics_relevance_normalized = df_all_text_topics_relevance.copy()

	# df_all_text_topics_relevance_normalized[subtopic_cols] = df_all_text_topics_relevance_normalized[subtopic_cols].div(
	# 	df_all_text_topics_relevance_normalized[subtopic_cols].sum(axis=1), axis=0
	# ).fillna(0)
	# df_all_text_topics_relevance_normalized[risk_category_cols] = df_all_text_topics_relevance_normalized[risk_category_cols].div(
	# 	df_all_text_topics_relevance_normalized[risk_category_cols].sum(axis=1), axis=0
	# ).fillna(0)

	# # Save the normalized DataFrame to a csv file
	# df_all_text_topics_relevance_normalized.to_csv("multi_topic_modelling_with_relevance_normalized.csv", index=False)

	# Aggregate by bank and reporting period - two ways to aggregate using mean of normalized topic relevance scores and sum of raw topic relevance scores
	# df_all_text_topics_relevance_quarter_norm_agg = df_all_text_topics_relevance_normalized.groupby(
	# 	['bank', 'reporting_period']
	# )[subtopic_cols + risk_category_cols].mean().reset_index()
	df_all_text_topics_relevance_quarter_abs_agg = df_all_text_topics_relevance.groupby(
		['bank', 'reporting_period']
	)[subtopic_cols + risk_category_cols].sum().reset_index()

	# Save the aggregated DataFrame to a csv file
	# df_all_text_topics_relevance_quarter_agg.to_csv("multi_topic_modelling_with_relevance_quarter_norm_agg.csv", index=False)
	df_all_text_topics_relevance_quarter_abs_agg.to_csv("multi_topic_modelling_with_relevance_quarter_abs_agg.csv", index=False)

	# Add sentiment weighting to the topic relevance scores
	df_all_text_topics_relevance_sentiment = df_all_text_topics_relevance.copy()
	# df_all_text_topics_relevance_sentiment = df_all_text_topics_relevance_normalized.copy()
	# Signed weight per row: negative sentiment flips the sign, neutral zeroes the scores
	df_all_text_topics_relevance_sentiment['sentiment_score'] = df_all_text_topics_relevance_sentiment['sentiment'].replace(
		{'Negative': -1, 'Neutral': 0, 'Positive': 1}
	)
	df_all_text_topics_relevance_sentiment[subtopic_cols] = df_all_text_topics_relevance_sentiment[subtopic_cols].mul(
		df_all_text_topics_relevance_sentiment['sentiment_score'], axis=0
	)
	df_all_text_topics_relevance_sentiment[risk_category_cols] = df_all_text_topics_relevance_sentiment[risk_category_cols].mul(
		df_all_text_topics_relevance_sentiment['sentiment_score'], axis=0
	)

	# Save the DataFrame with sentiment weighting to a csv file
	df_all_text_topics_relevance_sentiment.to_csv("multi_topic_modelling_with_relevance_sentiment.csv", index=False)
	# df_all_text_topics_relevance_sentiment.to_csv("multi_topic_modelling_with_relevance_normalized_sentiment.csv", index=False)

	# Aggregate by bank and reporting period
	df_all_text_topics_relevance_sentiment_quarter_agg = df_all_text_topics_relevance_sentiment.groupby(
		['bank', 'reporting_period']
	)[subtopic_cols + risk_category_cols].mean().reset_index()


	# Save the aggregated DataFrame to a csv file
	df_all_text_topics_relevance_sentiment_quarter_agg.to_csv("multi_topic_modelling_with_relevance_sentiment_quarter_agg.csv", index=False)
	# df_all_text_topics_relevance_sentiment_quarter_agg.to_csv("multi_topic_modelling_with_relevance_normalized_sentiment_quarter_agg.csv", index=False)

else:
	# NOTE(review): apart from "multi_topic_modelling_with_relevance.csv", the files read
	# below ("..._normalized", "..._quarter_agg", "..._normalized_sentiment*") are not
	# written by the rerun branch above, where the normalisation code is commented out.
	# The two branches can therefore yield different data under the same variable names,
	# and these reads need files from an earlier version of the cell. Decide which
	# variant is intended and align the two branches.

	# Load the DataFrame with topic relevance from the csv file
	df_all_text_topics_relevance = pd.read_csv("multi_topic_modelling_with_relevance.csv")

	# Load the normalized DataFrame with topic relevance from the csv file
	df_all_text_topics_relevance_normalized = pd.read_csv("multi_topic_modelling_with_relevance_normalized.csv")

	# Load the aggregated DataFrame with topic relevance from the csv file
	df_all_text_topics_relevance_quarter_agg = pd.read_csv("multi_topic_modelling_with_relevance_quarter_agg.csv")

	# Load the DataFrame with sentiment weighting from the csv file
	df_all_text_topics_relevance_sentiment = pd.read_csv("multi_topic_modelling_with_relevance_normalized_sentiment.csv")

	# Load the aggregated DataFrame with sentiment weighting from the csv file
	df_all_text_topics_relevance_sentiment_quarter_agg = pd.read_csv("multi_topic_modelling_with_relevance_normalized_sentiment_quarter_agg.csv")

	# Re-derive the column lists that the rerun branch defines. The Q & A aggregation
	# cell uses `subtopic_cols + risk_category_cols`, so without these a fresh kernel
	# running from cache would hit a NameError there.
	risk_category_cols = [col for col in df_all_text_topics_relevance.columns if col.startswith("RISK CATEGORY")]
	subtopic_cols = [
		label for label in df_all_text_topics['post_merge_topic_label'].dropna().unique().tolist()
		if label in df_all_text_topics_relevance.columns
	]







Columns (3,4) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (3,4) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (3,4) have mixed types. Specify dtype option on import or set low_memory=False.



#### **Q & A Analysis**

In [36]:
# Create a Pydantic model for the Q & A label.
# It is passed as `response_schema` to QuestionAnswerTaggingLLM, and the result of
# `.invoke()` is read through the `q_and_a_label` field.
class QandALabel(BaseModel):
	# Classification of one transcript chunk; restricted to the three labels described in the tagging prompt.
	q_and_a_label: Literal['Question', 'Answer', 'Other']

In [37]:
def _make_q_and_a_labelling_prompt(chunk, role):
    """Define a prompt for the Q & A labelling task."""
    return (
        "You are a financial analyst. Classify the following text chunk from a financial earnings transcript as either:\n"
        "- Question: A question asked by an external participant (e.g. analyst or investor)\n"
        "- Answer: A firm representative answering a question\n"
        "- Other: Any other remarks, like greetings or transitions\n"
        "Only classify something as a Question if it's asked by an analyst or external participant.\n\n"
        "Here's the chunk:\n"
        f"{role}: {chunk.strip()}"
        # "Examples:\n"
        # "Role: Analyst\n"
        # "Text: Can you explain the increase in net interest income this quarter?\n"
        # "--> Label: Question\n\n"
        # "Role: CFO\n"
        # "Text: Yes, that's mainly driven by higher rates and repricing of deposits.\n"
        # "--> Label: Answer\n\n"
        # "Role: Operator\n"
        # "Text: Next up, we have James from Barclays.\n"
        # "--> Label: Other\n\n"
        # f"Role: {role}\n"
        # f"Text: {chunk.strip()}\n"
        # "--> Label:"
    )

In [38]:
def label_question_answer_with_llm(chunk, role, llm_backend, llm_model_name):
	"""Classify one transcript chunk as Question, Answer or Other using the LLM.

	Args:
		chunk: Text of the transcript chunk.
		role: Role of the speaker of the chunk.
		llm_backend: Backend name forwarded to QuestionAnswerTaggingLLM.
		llm_model_name: Model name forwarded to QuestionAnswerTaggingLLM.

	Returns:
		The `q_and_a_label` value of the LLM output.
	"""
	tagger = QuestionAnswerTaggingLLM(
		q_and_a_tagging_prompt=_make_q_and_a_labelling_prompt(chunk, role),
		response_schema=QandALabel,
		backend=llm_backend,
		model_name=llm_model_name,
	)
	q_and_a_label = tagger.invoke().q_and_a_label
	# Echo the chunk together with its label so the tagging can be followed while it runs
	print(f"{role}: {chunk.strip()} --> Label: {q_and_a_label}")
	return q_and_a_label

In [52]:
# Q & A tagging: label every Q & A chunk as Question / Answer / Other with the LLM, group
# the chunks into question-and-answer blocks per earnings call, and aggregate text and
# topic scores per block. Both the tagged and the aggregated frames are cached as CSV.
if RERUN_Q_AND_A_TAGGING:
	df_q_and_a_all = df_all_text_topics_relevance.copy()
	df_q_and_a_all = df_q_and_a_all.loc[(df_q_and_a_all['section'] == "Q and A")].reset_index(drop=True)
	# Check document type is all "transcript"
	assert (df_q_and_a_all['document_type'] == "transcript").all(), "Expected transcript for all entries"

	# Drop rows where the host or "other" is speaking.
	# The reset index is assigned back; a bare `reset_index(drop=True)` call has no effect.
	df_q_and_a_all = df_q_and_a_all[~df_q_and_a_all['role'].isin(['Host', 'Other'])].reset_index(drop=True)

	# Initialize a series to hold the Q & A labels (object dtype because it is filled with strings)
	q_and_a_labels_srs = pd.Series(index=df_q_and_a_all.index, dtype=object)

	# Create a ThreadPoolExecutor to process docs concurrently
	with ThreadPoolExecutor(max_workers=50) as executor:
		futures = {}
		for row_idx, chunk, role in df_q_and_a_all[['text','role']].itertuples():
			future = executor.submit(
				label_question_answer_with_llm,
				chunk=chunk,
				role=role,
				llm_backend="gemini",
				llm_model_name="gemini-2.5-flash-preview-05-20"
			)
			futures[future] = row_idx

		# Wait for all futures to complete
		for future in as_completed(futures):
			try:
				row_idx = futures[future]
				label = future.result()  # This will raise an exception if the processing failed
				q_and_a_labels_srs.loc[row_idx] = label
			except Exception as e:
				# Best effort: a failed chunk is reported and keeps a missing label
				print(f"Error processing {futures[future]}: {e}")

	df_q_and_a_all['q_and_a_label'] = q_and_a_labels_srs
	# section_col_idx = df_q_and_a_all.columns.get_loc('section')
	# df_q_and_a_all.insert(section_col_idx+1, 'q_and_a_label', q_and_a_labels_srs)

	# Assign a running block number within each call: a new block starts at an analyst
	# question that does not directly follow another analyst question row.
	for grp_idx, df_q_and_a in df_q_and_a_all.groupby(['bank', 'reporting_period']):
		bank, reporting_period = grp_idx
		# Initialize
		q_and_a_block_tag = []
		previous_tag = 0
		previous_was_analyst_question = False
		num_consecutive_analyst_statements = 0
		for _, row in df_q_and_a.iterrows():
			role = row['role']
			q_and_a_category = row['q_and_a_label']
			if (role == "Analyst") and (q_and_a_category == "Question") and not previous_was_analyst_question:
				previous_tag += 1
				if num_consecutive_analyst_statements > 0:
					# If the previous statements were from an analyst, we'll reassign those e.g. could have been categorised as Other but are actually the context for the question in the current row
					for i in range(1, num_consecutive_analyst_statements + 1):
						q_and_a_block_tag[-i] = previous_tag
				q_and_a_block_tag.append(previous_tag)
				num_consecutive_analyst_statements += 1
				previous_was_analyst_question = True
			else:
				if role == "Analyst":
					num_consecutive_analyst_statements += 1
				else:
					num_consecutive_analyst_statements = 0
				# Placeholder; forward-filled below so the row inherits the current block number
				q_and_a_block_tag.append(pd.NA)
				previous_was_analyst_question = False

		# Rows before the first question of a call stay missing and are dropped before aggregation
		q_and_a_block_srs = pd.Series(q_and_a_block_tag, index=df_q_and_a.index).ffill()
		sel_bool = ((df_q_and_a_all['bank'] == bank) & (df_q_and_a_all['reporting_period'] == reporting_period))
		df_q_and_a_all.loc[sel_bool, 'q_and_a_block'] = q_and_a_block_srs

	# Save the DataFrame with Q & A labels to a csv file
	df_q_and_a_all.to_csv("q_and_a_tagged.csv", index=False)

	# Aggregate within each Q & A block
	df_q_and_a_all_agg = df_q_and_a_all.copy()
	df_q_and_a_all_agg = df_q_and_a_all_agg.dropna(subset=['q_and_a_block']).reset_index(drop=True)
	# Prefix each chunk with "<speaker> (<role>): " whenever the speaker changes within a block,
	# so that the concatenated block text reads like a dialogue.
	updated_text_srs = pd.Series(index=df_q_and_a_all_agg.index, dtype=object)
	for _, df_block in df_q_and_a_all_agg.groupby(['bank', 'reporting_period', 'q_and_a_block']):
		previous_speaker = ""
		for row_idx, row in df_block.iterrows():
			speaker = row['speaker']
			role = row['role']
			text = row['text']
			if speaker != previous_speaker:
				updated_text_srs.loc[row_idx] = f"{speaker} ({role}): {text}\n"
			else:
				updated_text_srs.loc[row_idx] = f"{text}\n"
			previous_speaker = speaker

	df_q_and_a_all_agg['text'] = updated_text_srs

	# Derive the topic score columns from the data at hand rather than from variables
	# that only exist when the multi-topic labelling cell was re-run in this kernel session.
	qa_risk_category_cols = [col for col in df_q_and_a_all_agg.columns if col.startswith("RISK CATEGORY")]
	qa_subtopic_cols = [
		label for label in df_all_text_topics['post_merge_topic_label'].dropna().unique().tolist()
		if label in df_q_and_a_all_agg.columns
	]

	first_cols = ['page', 'section', 'reporting_period', 'date_of_earnings_call', 'bank', 'document_type']
	topic_cols = qa_subtopic_cols + qa_risk_category_cols
	# "sum" on the text column concatenates the chunk strings of a block
	sum_cols = ['text'] + topic_cols
	grp_cols = ['bank', 'reporting_period', 'q_and_a_block']

	agg_dict = {col: "first" for col in first_cols}
	agg_dict.update({col: "sum" for col in sum_cols})

	df_q_and_a_all_agg = df_q_and_a_all_agg.groupby(grp_cols).agg(agg_dict)[['text'] + first_cols + topic_cols].reset_index(drop=True)

	# Add a column for source, e.g. "2023Q1" becomes "Q1, 2023" in the citation string
	reporting_period_split = df_q_and_a_all_agg['reporting_period'].str.split("Q")
	reporting_period_clean = "Q" + reporting_period_split.str.get(1) + ", " + reporting_period_split.str.get(0)
	source_srs = df_q_and_a_all_agg['bank'] + ", " + reporting_period_clean + " Earnings Call Transcript, Page " + df_q_and_a_all_agg['page'].astype(str)
	# Get document_type col index
	document_type_col_idx = df_q_and_a_all_agg.columns.get_loc('document_type')
	# Insert the source column after the document_type column
	df_q_and_a_all_agg.insert(document_type_col_idx + 1, 'source', source_srs)
	# Save the aggregated DataFrame with Q & A labels to a csv file
	df_q_and_a_all_agg.to_csv("q_and_a_aggregated.csv", index=False)

else:
	# Load the DataFrame with Q & A labels from the csv file
	df_q_and_a_all = pd.read_csv("q_and_a_tagged.csv")
	# Load the aggregated DataFrame with Q & A labels from the csv file
	df_q_and_a_all_agg = pd.read_csv("q_and_a_aggregated.csv")



In [54]:
# Create a Pydantic model for the evasiveness score.
# It is passed as `response_schema` to EvasivenessTaggingLLM, and the result of
# `.invoke()` is read through these two field names.
class EvasivenessScore(BaseModel):
	# Required integer rating, validated to lie in 1..5 (ge=1, le=5). The scoring prompt
	# defines the scale from 1 = very direct to 5 = very evasive.
	evasiveness_score: int = Field(...,ge=1,le=5)
	# Free-text explanation of the rating (the scoring prompt asks for 1–3 sentences).
	justification: str

In [55]:
def _make_q_and_a_evasiveness_scoring_prompt(q_and_a_text):
    """Define a prompt for evasiveness scoring task."""
    return (
        "You are a financial communication analyst. You are analyzing the clarity and directness of responses in earnings call transcripts of bank firms.\n"
        "Below is a Q&A block from an earnings call. Your task is to:\n"
        "Assess the **evasion level** of the answer(s) provided by management or executives in response to the analyst's question(s).\n"
		"Use the following 5-point scale to rate **evasion**:\n"
		"- **1 (Very Direct):** The answer is clear, specific, and fully addresses the question.\n"
		"- **2 (Mostly Direct):** The answer mostly addresses the question, with some minor vagueness or deflection.\n"
		"- **3 (Neutral):** The answer is somewhat vague or generic, and only partially addresses the question.\n"
		"- **4 (Evasive):** The answer avoids the question with general statements or redirects.\n"
		"- **5 (Very Evasive):** The answer clearly avoids the question, changes topic, or provides no relevant information.\n"
        "In addition to the numeric score, provide a brief explanation (1–3 sentences) justifying why the answer was rated at that level. Be specific about how the response addressed or avoided the question.\n\n"
        f"Here is the Q&A block:\n"
        f"{q_and_a_text.strip()}"
    )

In [56]:
def assign_evasiveness_score_with_llm(q_and_a_text, llm_backend, llm_model_name):
	"""Rate the evasiveness of the answers in one Q&A block using the LLM.

	Args:
		q_and_a_text: Text of the Q&A block to score.
		llm_backend: Backend name forwarded to EvasivenessTaggingLLM.
		llm_model_name: Model name forwarded to EvasivenessTaggingLLM.

	Returns:
		dict: {"evasiveness_score": ..., "justification": ...} taken from the LLM output.
	"""
	scorer = EvasivenessTaggingLLM(
		evasiveness_tagging_prompt=_make_q_and_a_evasiveness_scoring_prompt(q_and_a_text),
		response_schema=EvasivenessScore,
		backend=llm_backend,
		model_name=llm_model_name,
	)
	verdict = scorer.invoke()
	# Echo the block and its rating so the scoring can be followed while it runs
	print("---------------------------------------")
	print(q_and_a_text.strip())
	print(f"Evasiveness Score: {verdict.evasiveness_score}, Justification: {verdict.justification}")
	return dict(
		evasiveness_score=verdict.evasiveness_score,
		justification=verdict.justification,
	)

In [57]:
# Evasiveness scoring: rate every aggregated Q & A block with the LLM and cache the result as CSV.
if RERUN_EVASIVE_ANSWER_DETECTION:
	# Empty results frame, one row per Q & A block; rows whose LLM call fails stay missing
	evasiveness_score_df = pd.DataFrame(index=df_q_and_a_all_agg.index, columns=['evasiveness_score', 'justification'])
	# Score the blocks concurrently
	with ThreadPoolExecutor(max_workers=25) as executor:
		# Map every submitted future back to the row it belongs to
		futures = {
			executor.submit(
				assign_evasiveness_score_with_llm,
				q_and_a_text=q_and_a_text,
				llm_backend="gemini",
				llm_model_name="gemini-2.5-flash-preview-05-20"
			): row_idx
			for row_idx, q_and_a_text in df_q_and_a_all_agg['text'].items()
		}

		# Store the results as the futures finish
		for future in as_completed(futures):
			try:
				row_idx = futures[future]
				res = future.result()  # This will raise an exception if the processing failed
				for result_field in ('evasiveness_score', 'justification'):
					evasiveness_score_df.loc[row_idx, result_field] = res[result_field]
			except Exception as e:
				print(f"Error processing {futures[future]}: {e}")

	# Attach the scores and justifications to the aggregated Q & A blocks
	df_q_and_a_all_agg_evasiveness = df_q_and_a_all_agg.join(evasiveness_score_df, how='left')

	# Save the DataFrame with evasiveness scores to a csv file
	df_q_and_a_all_agg_evasiveness.to_csv("q_and_a_evasiveness_scores.csv", index=False)
else:
	# Load the DataFrame with evasiveness scores from the csv file
	df_q_and_a_all_agg_evasiveness = pd.read_csv("q_and_a_evasiveness_scores.csv")



---------------------------------------
Ken Usdin (Analyst): Thanks. Hey, Mark, just a follow-up on the credit. So you mentioned obviously that you moved your part of your Current Expected Credit Losses adjustment a little bit in your weightings, and you had previously talked about getting towards normalized card losses, I think you'd said by around the end of the year. So can you just given the changes that we're seeing ahead of us and definitely saw some normalization happen this quarter, can you just, are you still on line for that getting to that 3-3.5% and 5-5.5% in the respective card businesses by around year-end this year?
Mark Mason (CFO): Yeah, year-end, early next year, yes. We're still kind of on track, on trend for that. Again, I'd expect that they pick up a little bit after that before they start tapering down. But the answer to your question, Ken, is yes. That's still the timeline, fourth quarter, early 2024 reaching those normalized levels.
Evasiveness Score: 1, Justifi

In [58]:
df_q_and_a_all_agg_evasiveness

Unnamed: 0,text,page,section,reporting_period,date_of_earnings_call,bank,document_type,source,Macroeconomic Outlook,Systemic Stability & Regulation,...,RISK CATEGORY: Interest Rate Risk,RISK CATEGORY: Liquidity Risk,RISK CATEGORY: Macroeconomic Risk,RISK CATEGORY: Market and Volatility Risk,RISK CATEGORY: Operational Risk,RISK CATEGORY: Profitability,RISK CATEGORY: Regulatory & Compliance Risk,RISK CATEGORY: Strategic and Business Model Risk,evasiveness_score,justification
0,"Glenn Schorr (Analyst): Hi, thank you, a simpl...",9,Q and A,2023Q1,2023-04-14,Citigroup,transcript,"Citigroup, Q1, 2023 Earnings Call Transcript, ...",0.509946,0.0,...,1.003364,0.000000,0.999674,1.119637,0.362235,5.709532,0.000000,3.122807,4,The CFO acknowledges the strong Q1 but largely...
1,Glenn Schorr (Analyst): I appreciate that. May...,9,Q and A,2023Q1,2023-04-14,Citigroup,transcript,"Citigroup, Q1, 2023 Earnings Call Transcript, ...",0.000000,0.0,...,2.783724,6.176254,0.000000,0.409973,0.000000,0.749640,0.000000,0.803981,2,The management directly addressed the analyst'...
2,"Mike Mayo (Analyst): Hi, Jane, I challenged yo...",10,Q and A,2023Q1,2023-04-14,Citigroup,transcript,"Citigroup, Q1, 2023 Earnings Call Transcript, ...",0.000000,0.0,...,0.353618,0.723926,0.374837,1.634003,0.000000,8.027205,0.000000,0.000000,4,The CEO's response confirms the strong fee gro...
3,Mike Mayo (Analyst): Okay. And then as it rela...,11,Q and A,2023Q1,2023-04-14,Citigroup,transcript,"Citigroup, Q1, 2023 Earnings Call Transcript, ...",0.000000,0.0,...,3.124435,0.000000,0.000000,0.000000,0.000000,2.394076,0.000000,0.000000,4,The CFO acknowledges the opportunity from rate...
4,"Betsy Graseck (Analyst): Hi, good morning. I k...",11,Q and A,2023Q1,2023-04-14,Citigroup,transcript,"Citigroup, Q1, 2023 Earnings Call Transcript, ...",0.000000,0.0,...,0.817347,0.390393,0.000000,0.736496,0.455433,4.189883,0.000000,6.050321,4,The management provided a general overview of ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,"Mike Mayo (Analyst): No, and you also mentione...",18,Q and A,2025Q1,2025-04-11,JPMorgan,transcript,"JPMorgan, Q1, 2025 Earnings Call Transcript, P...",0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3,The CEO confirms the premise of freeing up hun...
395,Mike Mayo (Analyst): One short...\nJamie Dimon...,18,Q and A,2025Q1,2025-04-11,JPMorgan,transcript,"JPMorgan, Q1, 2025 Earnings Call Transcript, P...",0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.774205,0.000000,5,The CEO's response is highly general and argum...
396,Mike Mayo (Analyst): One short follow-up. Just...,18,Q and A,2025Q1,2025-04-11,JPMorgan,transcript,"JPMorgan, Q1, 2025 Earnings Call Transcript, P...",0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.410289,0.000000,0.000000,2,Jamie Dimon directly acknowledges the analyst'...
397,Mike Mayo (Analyst): All right. Thank you.\nGl...,18,Q and A,2025Q1,2025-04-11,JPMorgan,transcript,"JPMorgan, Q1, 2025 Earnings Call Transcript, P...",0.000000,0.0,...,0.000000,0.402061,0.000000,2.208702,0.419099,3.340518,0.000000,3.497499,2,The management directly answers whether differ...


#### **VISUALISATIONS**