In [None]:
# ==============================================================================
# 1. SETUP: INSTALL AND IMPORT LIBRARIES
# ==============================================================================
# Install required libraries. BERTopic[all] includes sentence-transformers,
# UMAP, HDBSCAN, and other core dependencies. 'contractions' helps with text cleaning.
!pip install bertopic[all] --quiet
!pip install contractions --quiet
!pip install PyPDF2 --quiet
!pip install spacy
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
import re
import torch




# Text preprocessing libraries
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Core modeling libraries
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

# Download necessary NLTK data for preprocessing
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

print("Setup complete. Libraries are installed and loaded.")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Setup complete. Libraries are installed and loaded.


In [None]:
!ls "/content/drive/MyDrive"


 2022
 Cambridge
'Colab Notebooks'
 IMG_20240424_203450.jpg
'Max_Hands_3.1.3_Exploring_ethics_and_inclusivity_in_data science.gdoc'
'Topic m odeling.drawio'
'Topic m odeling.drawio.png'
'Warner Bros Answers'


In [None]:
from google.colab import drive
import os
import glob

# Step 1: Mount Google Drive
drive.mount('/content/drive')  # Authorise in the popup

# Needed to read the folders and iterate
from google.colab import drive
from googleapiclient.discovery import build
from google.colab import auth
# import PyPDF2

# Will be needed to do anything with the files
import googleapiclient.http
from googleapiclient.http import MediaIoBaseDownload
import os
import io


auth.authenticate_user()
service = build('drive', 'v3')

# List of folder IDs to search through
folder_ids = [
    "1y1-7h6JbgFJ5l4lGiz2kFNhhzeUTsaQ6" #files to process
]

def get_files_from_folder(folder_id):
    """Retrieve all files from a specific folder"""
    try:
        files = service.files().list(
            q=f"'{folder_id}' in parents",
            fields="files(id, name, mimeType)"
        ).execute()
        return files.get('files', [])
    except Exception as e:
        print(f"Error accessing folder {folder_id}: {e}")
        return []

all_files = []
folder_file_counts = {}

for folder_id in folder_ids:
    folder_files = get_files_from_folder(folder_id)
    folder_file_counts[folder_id] = len(folder_files)
    all_files.extend(folder_files)

print(f"Total folders searched: {len(folder_ids)}")
print(f"Total files found: {len(all_files)}")
print(f"\nFiles per folder:")
for folder_id, count in folder_file_counts.items():
    print(f"  {folder_id}: {count} files")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total folders searched: 1
Total files found: 24

Files per folder:
  1y1-7h6JbgFJ5l4lGiz2kFNhhzeUTsaQ6: 24 files


In [None]:
for file in all_files:
  print(file['name'])

1q23-earnings-transcript.pdf
2q23-earnings-transcript.pdf
jpm-3q23-earnings-call-transcript.pdf
jpm-4q23-earnings-call-transcript.pdf
jpm-1q24-earnings-call-transcript.pdf
jpm-2q24-earnings-call-transcript-final.pdf
jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf
4q24-earnings-transcript.pdf
1q25-earnings-transcript.pdf
UBS_Q3_2023_Earnings_Call_Remarks.pdf
UBS_Q2_2023_Earnings_Call_Remarks.pdf
UBS_Q1_2023_Earnings_Call_Remarks.pdf
UBS_Q4_2023_Earnings_Call_Remarks.pdf
UBS_Q2_2024_Earnings_Call_Remarks.pdf
UBS_Q1_2025_Earnings_Call_Remarks.pdf
UBS_Q3_2024_Earnings_Call_Remarks.pdf
UBS_Q1_2024_Earnings_Call_Remarks.pdf
UBS_Q4_2024_Earnings_Call_Remarks.pdf
Citibank_Q1_2025_Transcript.pdf
Citibank_Q4_2024_Transcript.pdf
Citibank_Q3_2024_Transcript.pdf
Citibank_Q2_2024_Transcript.pdf
Citibank_Q4_2023_Transcript.pdf
Citibank_Q3_2023_Transcript.pdf


In [None]:
# Convert all_files to a dataframe
import pandas as pd
df = pd.DataFrame(all_files)
df.head()

# output df as csv
df.to_csv('all_files.csv', index=False)

In [None]:
def read_pdf_to_text(service, file_id, file_name=None):
    try:
        # Download the PDF file
        request = service.files().get_media(fileId=file_id)
        file_content = io.BytesIO()

        # Download file content
        downloader = MediaIoBaseDownload(file_content, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()

        file_content.seek(0)

        # Extract text using PyPDF2
        pdf_reader = PyPDF2.PdfReader(file_content)
        full_text = ""

        for page_num, page in enumerate(pdf_reader.pages, 1):
            try:
                page_text = page.extract_text()
                if page_text:
                    full_text += page_text + "\n"
            except Exception as page_error:
                print(f"Warning: Error reading page {page_num}: {page_error}")

        return full_text.strip()

    except Exception as e:
        error_msg = f"Error reading PDF"
        if file_name:
            error_msg += f" '{file_name}'"
        error_msg += f": {str(e)}"
        print(error_msg)
        return ""

In [None]:
import PyPDF2

# Iterate through files and process PDFs - really just a placeholder for iteration
max_iterations = 25 # added for testing
pdf_results = []

files = all_files
current_count = 0

for file in files:
    if file['mimeType'] == 'application/pdf':
        current_count += 1
        if current_count > max_iterations:
            break

        print(f"Processing: {file['name']}")

        pdf_text = read_pdf_to_text(service, file['id'],file['name'])

        result = {
            'name': file['name'],
            'id': file['id'],
            'text': pdf_text,
        }

        pdf_results.append(result)

Processing: 1q23-earnings-transcript.pdf
Processing: 2q23-earnings-transcript.pdf
Processing: jpm-3q23-earnings-call-transcript.pdf
Processing: jpm-4q23-earnings-call-transcript.pdf
Processing: jpm-1q24-earnings-call-transcript.pdf
Processing: jpm-2q24-earnings-call-transcript-final.pdf
Processing: jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf
Processing: 4q24-earnings-transcript.pdf
Processing: 1q25-earnings-transcript.pdf
Processing: UBS_Q3_2023_Earnings_Call_Remarks.pdf
Processing: UBS_Q2_2023_Earnings_Call_Remarks.pdf
Processing: UBS_Q1_2023_Earnings_Call_Remarks.pdf
Processing: UBS_Q4_2023_Earnings_Call_Remarks.pdf
Processing: UBS_Q2_2024_Earnings_Call_Remarks.pdf
Processing: UBS_Q1_2025_Earnings_Call_Remarks.pdf
Processing: UBS_Q3_2024_Earnings_Call_Remarks.pdf
Processing: UBS_Q1_2024_Earnings_Call_Remarks.pdf
Processing: UBS_Q4_2024_Earnings_Call_Remarks.pdf
Processing: Citibank_Q1_2025_Transcript.pdf
Processing: Citibank_Q4_2024_Transcript.pdf
Processing: Citi

In [None]:
# Summary
print(f"\nSUMMARY:")
if pdf_results:
    print(f"Total PDFs processed: {len(pdf_results)}")
    for result in pdf_results:
        print(result)
else:
    print("No PDFs were processed.")


SUMMARY:
Total PDFs processed: 24
{'name': '1q23-earnings-transcript.pdf', 'id': '1LxZaEcbfhKDkwzZkmn3TBgCYf5ZBpveg', 'text': "1Q 23 F I NANCI AL  RE SULT S  \nEARNINGS CALL TRANSCRIPT  \nApril 14, 2023 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n   \n \n  \n  \n \n \n2 \n \nMANAGEMENT DISCUSSION SECTION  \n ................................ ................................ ................................ ................................ ................................ ................................ ................................ ......................   \n \nOperator:  Good morning, ladies and gentlemen. Welcome to JPMorgan Chase’s First Quarter 2023 Earnings Call. This call is being recorded . \nYour line will be muted for the duration of the call. We will now go live to the presentation. Please stand by.  \n \nAt this time, I would like to turn the call over to JPMorgan Chase's Chairman and CEO, Jamie D imon, and Chief Financial Officer, Jeremy \nBarnum.  \n

In [None]:
# ==============================================================================
# 3. TEXT PREPROCESSING & CHUNKING
# ==============================================================================
# --- Integration: Chunk sentences before preprocessing ---
# This preserves the semantic coherence within each chunk.
# Instead of flattening all text, we process each document individually to maintain its identity.
# We will create a list of dictionaries, where each entry represents a chunk and its source document.

print("--- Starting Chunking and Preprocessing ---")
all_chunks_with_metadata = []
chunk_size = 2 # Number of sentences per chunk

for doc in pdf_results:
    doc_name = doc['name']
    doc_text = doc['text']

    # Split the document text into sentences
    sentences = doc_text.split('.')
    sentences = [s.strip() for s in sentences if len(s.strip()) > 5] # Basic sentence cleaning

    if not sentences:
        continue

    # Create chunks from sentences
    chunks = [" ".join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]

    # Store each chunk with its source document name
    for chunk_text in chunks:
        all_chunks_with_metadata.append({
            'doc_name': doc_name,
            'chunk_text': chunk_text
        })

# Convert the list of chunks into a pandas DataFrame for easier manipulation
import pandas as pd
df_chunks = pd.DataFrame(all_chunks_with_metadata)

print(f"Created {len(df_chunks)} total chunks from {len(pdf_results)} documents.")
print("DataFrame with document-chunk mapping created:")
print(df_chunks.head())


--- Starting Chunking and Preprocessing ---
Created 7815 total chunks from 24 documents.
DataFrame with document-chunk mapping created:
                       doc_name  \
0  1q23-earnings-transcript.pdf   
1  1q23-earnings-transcript.pdf   
2  1q23-earnings-transcript.pdf   
3  1q23-earnings-transcript.pdf   
4  1q23-earnings-transcript.pdf   

                                          chunk_text  
0  1Q 23 F I NANCI AL  RE SULT S  \nEARNINGS CALL...  
1  Welcome to JPMorgan Chase’s First Quarter 2023...  
2  Your line will be muted for the duration of th...  
3  Please stand by At this time, I would like to ...  
4  Barnum, please go ahead Jeremy Barnum  \nChief...  


In [None]:
# import nltk
# # Download necessary NLTK data for preprocessing
# nltk.download('punkt', quiet=True)
# nltk.download('stopwords', quiet=True)
# # Ensure the specific English averaged perceptron tagger is downloaded
# nltk.download('averaged_perceptron_tagger_eng', quiet=True)
# nltk.download('wordnet', quiet=True)

# # Remove the unnecessary punkt_tab download
# # nltk.download('punkt_tab')

# # --- Recommendation Integration: Expand stopwords and use POS tagging ---
# stop_words = set(stopwords.words("english"))
# custom_stopwords = {
#     "quarter", "call", "conference", "today", "thank", "forward", "statement",
#     "morning", "presentation", "result", "financial", "welcome", "good", "company",
#     "operator", "question", "analyst", "year", "inc", "ltd"
# }
# all_stopwords = stop_words.union(custom_stopwords)
# lemmatizer = WordNetLemmatizer()

# def preprocess_text(doc):
#     """
#     Cleans, tokenizes, and lemmatizes text, keeping only relevant parts of speech.
#     - Expands contractions.
#     - Removes speaker tags and special characters.
#     - Filters for nouns, verbs, and adjectives (POS tagging).
#     - Lemmatizes and removes stopwords.
#     """
#     # 1. Expand contractions (e.g., "isn't" -> "is not")
#     text = contractions.fix(doc)

#     # 2. Remove speaker tags (e.g., "JANE FRASER:") and make lowercase
#     text = re.sub(r"^[a-z\s\.-]+:", "", text.lower())

#     # 3. Remove special characters and digits
#     text = re.sub(r"[^a-z\s]", "", text)


#     # 4. Tokenize and apply Part-of-Speech (POS) tagging
#     words = word_tokenize(text)
#     pos_tagged_words = pos_tag(words)

#     # 5. Keep only nouns (NN), verbs (VB), and adjectives (JJ)
#     # This focuses the model on the most meaningful terms.
#     filtered_words = [
#         word for word, tag in pos_tagged_words
#         if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ')
#     ]

#     # 6. Lemmatize and remove stopwords
#     lemmatized_words = [
#         lemmatizer.lemmatize(word) for word in filtered_words if word not in all_stopwords and len(word) > 2
#     ]

#     return " ".join(lemmatized_words)

# # Apply preprocessing to the document chunks
# print("\nApplying preprocessing to all chunks in the DataFrame...")
# df_chunks['preprocessed_chunk'] = df_chunks['chunk_text'].apply(preprocess_text)

# # Also, remove any empty rows that might result from preprocessing
# df_chunks.dropna(subset=['preprocessed_chunk'], inplace=True)
# df_chunks = df_chunks[df_chunks['preprocessed_chunk'] != '']

# print("Preprocessing complete. DataFrame is now ready for modeling.")
# print(df_chunks.head())

In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import re
import nltk
import contractions
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# NLTK Downloads
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

# Initialise NLP Tools
stop_words = set(stopwords.words("english"))
custom_stopwords = {
    "quarter", "call", "conference", "today", "thank", "forward", "statement",
    "morning", "presentation", "result", "financial", "welcome", "good", "company",
    "operator", "question", "analyst", "year", "inc", "ltd"
}
all_stopwords = stop_words.union(custom_stopwords)
lemmatizer = WordNetLemmatizer()

# Load SpaCy with custom EntityRuler
nlp = spacy.load("en_core_web_sm")

# Add EntityRuler before NER in SpaCy pipeline
ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

blacklisted_entities = [
    "jpmorgan", "jp morgan", "jpmorgan chase", "j.p. morgan", "jpmorganchase",
    "goldman sachs", "goldman", "morgan stanley", "citigroup", "citi", "hsbc",
    "jeremy", "barnum", "graseck", "jennifer"
]

patterns = [{"label": "BLACKLISTED", "pattern": [{"LOWER": name.lower()}]} for name in blacklisted_entities]
ruler.add_patterns(patterns)


In [None]:
def preprocess_text(doc):
    """
    Cleans, tokenises, removes blacklisted entities (ORG, PERSON, custom),
    lemmatises and keeps only relevant POS for topic modelling.
    """
    # 1. Expand contractions
    text = contractions.fix(doc)

    # 2. Replace underscores and hyphens early (n-gram remnants)
    text = text.replace("_", " ").replace("-", " ")

    # 3. Apply SpaCy NER + EntityRuler
    spacy_doc = nlp(text)

    # 4. Remove tokens that are part of PERSON or BLACKLISTED entities
    text_filtered = " ".join([
        token.text for token in spacy_doc
        if not any(ent.label_ in {"PERSON", "BLACKLISTED"} and token.text in ent.text for ent in spacy_doc.ents)
    ])

    # 5. Remove speaker labels / tags at start of lines
    text_filtered = re.sub(r"^[a-z\s\.-]+:", "", text_filtered.lower())

    # 6. Remove special characters and numbers
    text_filtered = re.sub(r"[^a-z\s]", "", text_filtered)

    # 7. Tokenise and POS tag
    words = word_tokenize(text_filtered)
    pos_tagged_words = pos_tag(words)

    # 8. Keep only Nouns, Verbs, Adjectives
    filtered_words = [
        word for word, tag in pos_tagged_words
        if tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ')
    ]

    # 9. Lemmatise and remove stopwords
    lemmatised_words = [
        lemmatizer.lemmatize(word.lower()) for word in filtered_words
        if word.lower() not in all_stopwords and len(word) > 2
    ]

    return " ".join(lemmatised_words)


In [None]:
print("\nApplying NER-enhanced preprocessing to all chunks in the DataFrame...")
df_chunks['preprocessed_chunk'] = df_chunks['chunk_text'].apply(preprocess_text)

# Drop empty rows
df_chunks.dropna(subset=['preprocessed_chunk'], inplace=True)
df_chunks = df_chunks[df_chunks['preprocessed_chunk'].str.strip() != '']

print("Preprocessing complete. DataFrame is now ready for modelling.")
print(df_chunks.head())



Applying NER-enhanced preprocessing to all chunks in the DataFrame...
Preprocessing complete. DataFrame is now ready for modelling.
                       doc_name  \
0  1q23-earnings-transcript.pdf   
1  1q23-earnings-transcript.pdf   
2  1q23-earnings-transcript.pdf   
3  1q23-earnings-transcript.pdf   
4  1q23-earnings-transcript.pdf   

                                          chunk_text  \
0  1Q 23 F I NANCI AL  RE SULT S  \nEARNINGS CALL...   
1  Welcome to JPMorgan Chase’s First Quarter 2023...   
2  Your line will be muted for the duration of th...   
3  Please stand by At this time, I would like to ...   
4  Barnum, please go ahead Jeremy Barnum  \nChief...   

                                  preprocessed_chunk  
0  nanci sult earnings transcript april managemen...  
1                      chase first earnings recorded  
2                           line muted duration live  
3  please stand time like turn chase chairman ceo...  
4                         please chief offic

In [None]:
test_text = "Jeremy Barnum of JPMorgan Chase met with Citi and Goldman Sachs executives in New York."



# Run SpaCy pipeline
spacy_doc = nlp(test_text)

# Print detected entities
for ent in spacy_doc.ents:
    print(ent.text, ent.label_)


Jeremy BLACKLISTED
Barnum BLACKLISTED
JPMorgan BLACKLISTED
Citi BLACKLISTED
Goldman BLACKLISTED
New York GPE


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import numpy as np

# ==============================================================================
# 4. MODEL INITIALIZATION
# ==============================================================================

# --- Embedding Model ---
embedding_model = SentenceTransformer("ProsusAI/finbert")

# --- HDBSCAN for clustering with an adjusted min_cluster_size ---
hdbscan_model = HDBSCAN(
    min_cluster_size=15,  # Start here and adjust as needed
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# --- UMAP for dimensionality reduction ---
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# --- Vectorizer to include n-grams ---
vectorizer_model = CountVectorizer(ngram_range=(1, 3))  # Use 1, 2, and 3-word phrases, This has provided a more verstile topic modeling list, mono-grams yielded overly generalised results.

# --- Representation model for longer topic descriptions ---
representation_model = KeyBERTInspired(top_n_words=8)  # Aim for 8 words per topic

# --- Initialize BERTopic with all components ---
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True
)



In [None]:
# If you find too many documents are labeled as outliers (-1), this can help.
# new_topics = topic_model.reduce_outliers(preprocessed_docs, topics, strategy="probabilities")
# topic_model.update_topics(preprocessed_docs, topics=new_topics)
# print("Outlier reduction applied.")

In [None]:
# ==============================================================================
# 5. FIT TOPIC MODEL TO DOCUMENT CHUNKS
# ==============================================================================
# List of preprocessed documents (chunks)
documents_to_fit = df_chunks['preprocessed_chunk'].tolist()

print("\n--- Fitting BERTopic model on all chunks ---")
topics, probs = topic_model.fit_transform(documents_to_fit)

# ==============================================================================
# 6. INTEGRATE TOPIC RESULTS INTO DATAFRAME
# ==============================================================================

# Add topic IDs to the DataFrame
df_chunks['topic_id'] = topics

# ✅ FIX IMPLEMENTED HERE:
# Instead of assigning the entire probs array (which is 2D), we extract max probability per chunk
df_chunks['topic_probability'] = np.max(probs, axis=1)  # Now each row gets a single float value

# to keep the entire vector, uncomment the next line
df_chunks['topic_prob_vector'] = probs.tolist()

print("\n✅ Topic modelling complete. Results mapped back to DataFrame:")
print(df_chunks.head())

2025-06-18 14:42:41,162 - BERTopic - Embedding - Transforming documents to embeddings.



--- Fitting BERTopic model on all chunks ---


Batches:   0%|          | 0/243 [00:00<?, ?it/s]

2025-06-18 14:42:54,165 - BERTopic - Embedding - Completed ✓
2025-06-18 14:42:54,166 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-18 14:43:09,198 - BERTopic - Dimensionality - Completed ✓
2025-06-18 14:43:09,199 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-18 14:43:18,087 - BERTopic - Cluster - Completed ✓
2025-06-18 14:43:18,094 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-18 14:43:29,573 - BERTopic - Representation - Completed ✓



✅ Topic modelling complete. Results mapped back to DataFrame:
                       doc_name  \
0  1q23-earnings-transcript.pdf   
1  1q23-earnings-transcript.pdf   
2  1q23-earnings-transcript.pdf   
3  1q23-earnings-transcript.pdf   
4  1q23-earnings-transcript.pdf   

                                          chunk_text  \
0  1Q 23 F I NANCI AL  RE SULT S  \nEARNINGS CALL...   
1  Welcome to JPMorgan Chase’s First Quarter 2023...   
2  Your line will be muted for the duration of th...   
3  Please stand by At this time, I would like to ...   
4  Barnum, please go ahead Jeremy Barnum  \nChief...   

                                  preprocessed_chunk  topic_id  \
0  nanci sult earnings transcript april managemen...        -1   
1                      chase first earnings recorded        -1   
2                           line muted duration live        -1   
3  please stand time like turn chase chairman ceo...        -1   
4                         please chief officer chase       

In [None]:
# ==============================================================================
# 7. AGGREGATE TOPICS PER DOCUMENT (New Section)
# ==============================================================================
from collections import Counter

def analyze_topics_per_document(df, topic_model, top_n=7):
    """
    Groups topics by the original document and prints a summary.

    Args:
        df (pd.DataFrame): DataFrame containing 'doc_name', 'topic_id'.
        topic_model (BERTopic): The fitted BERTopic model.
        top_n (int): The number of top topics to display for each document.
    """
    print("\n" + "="*50)
    print("--- TOPIC ANALYSIS PER DOCUMENT ---")
    print("="*50)

    # Get topic names for easier interpretation
    topic_info = topic_model.get_topic_info()
    topic_id_to_name = dict(zip(topic_info['Topic'], topic_info['Name']))

    # Group by the original document name
    grouped = df.groupby('doc_name')

    for doc_name, group in grouped:
        print(f"\nDOCUMENT: {doc_name}")

        # Count noise chunks (topic_id == -1)
        noise_count = (group['topic_id'] == -1).sum()
        total_chunks = len(group)

        # Count the frequency of each topic in this document
        topic_counts = Counter(group['topic_id'])

        # Separate noise from actual topics for clarity
        if -1 in topic_counts:
            del topic_counts[-1]

        print(f"  - Total Chunks: {total_chunks}")
        print(f"  - Chunks classified as Noise (Outliers): {noise_count} ({noise_count/total_chunks:.1%})")

        if not topic_counts:
            print("  - No dominant topics found (all chunks were noise).")
            continue

        # Get the most common topics for this document
        most_common_topics = topic_counts.most_common(top_n)

        print(f"\n  - Top {len(most_common_topics)} Dominant Topics:")
        for topic_id, count in most_common_topics:
            topic_name = topic_id_to_name.get(topic_id, f"Unknown Topic {topic_id}")
            # Get average probability for this topic within this document
            avg_prob = group[group['topic_id'] == topic_id]['topic_probability'].mean()

            print(f"    - Topic {topic_id} ('{topic_name}'):")
            print(f"      - Frequency: {count} chunks ({count/total_chunks:.1%})")
            if avg_prob is not None:
                print(f"      - Avg. Probability: {avg_prob:.2f}")

# Run the analysis function
analyze_topics_per_document(df_chunks, topic_model)


--- TOPIC ANALYSIS PER DOCUMENT ---

DOCUMENT: 1q23-earnings-transcript.pdf
  - Total Chunks: 320
  - Chunks classified as Noise (Outliers): 107 (33.4%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 27 chunks (8.4%)
      - Avg. Probability: 0.82
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 9 chunks (2.8%)
      - Avg. Probability: 0.39
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (2.8%)
      - Avg. Probability: 0.60
    - Topic 20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief'):
      - Frequency: 7 chunks (2.2%)
      - Avg. Probability: 0.75
    - Topic 11 ('11_moving part going_ask little bit_get thing_going spend'):
      - Frequency: 6 chunks (1.9%)
      - Avg. Probability: 0.54
    - Topic 24 ('24_rbc next come_next com

In [None]:
# Get the DataFrame with topic info
topic_info_df = topic_model.get_topic_info()

# Create dictionary: topic_id (int) → topic description (str)
topic_dict = dict(zip(
    topic_info_df.Topic,  # numeric topic IDs
    topic_info_df.Name    # descriptive strings
))

# Check example
print(topic_dict[6])


6_make sense_business guiding_big picture_product level reprice


In [None]:
heatmap_data = df_chunks.groupby(['doc_name', 'topic_id'])['topic_probability'].mean().unstack(fill_value=0)
freq_data = df_chunks.groupby(['doc_name', 'topic_id']).size().reset_index(name='count')


In [None]:
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
from ipywidgets import widgets, interact

# ---------- 1. Heatmap: Topic Distribution per Document ----------
# topic_dict is your dictionary {topic_id: description}

topic_ids = heatmap_data.columns.tolist()
topic_labels = [topic_dict.get(t, str(t)) for t in topic_ids]

fig = go.Figure(
    data=go.Heatmap(
        z=heatmap_data.values,
        x=topic_labels,
        y=heatmap_data.index.tolist(),
        text=[
            [
                f"Document: {doc}<br>Topic: {topic_dict.get(topic, topic)}<br>Probability: {prob:.4f}"
                for topic, prob in zip(topic_ids, row)
            ]
            for doc, row in zip(heatmap_data.index.tolist(), heatmap_data.values)
        ],
        hoverinfo="text",
        colorscale="Reds",
        colorbar=dict(title="Avg. Topic Probability")
    )
)

fig.update_layout(
    title="Topic Distribution Across Documents",
    xaxis_title="Topics",
    xaxis= dict(showticklabels=False),
    yaxis_title="Document"
)
# fig.update_layout(
#     xaxis=dict(
#         showticklabels=False
#     )
# )

fig.show()


In [None]:
topic_model.visualize_topics()


In [None]:
# Contains n-grams 1-3
results ="""
==================================================
--- TOPIC ANALYSIS PER DOCUMENT ---
==================================================

DOCUMENT: 1q23-earnings-transcript.pdf
  - Total Chunks: 320
  - Chunks classified as Noise (Outliers): 107 (33.4%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 27 chunks (8.4%)
      - Avg. Probability: 0.82
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 9 chunks (2.8%)
      - Avg. Probability: 0.39
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (2.8%)
      - Avg. Probability: 0.60
    - Topic 20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief'):
      - Frequency: 7 chunks (2.2%)
      - Avg. Probability: 0.75
    - Topic 11 ('11_moving part going_ask little bit_get thing_going spend'):
      - Frequency: 6 chunks (1.9%)
      - Avg. Probability: 0.54
    - Topic 24 ('24_rbc next come_next come rbc_come line rbc_line rbc capital'):
      - Frequency: 6 chunks (1.9%)
      - Avg. Probability: 0.31
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 5 chunks (1.6%)
      - Avg. Probability: 0.48

DOCUMENT: 1q25-earnings-transcript.pdf
  - Total Chunks: 369
  - Chunks classified as Noise (Outliers): 137 (37.1%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 22 chunks (6.0%)
      - Avg. Probability: 0.80
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 18 chunks (4.9%)
      - Avg. Probability: 0.39
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 11 chunks (3.0%)
      - Avg. Probability: 0.49
    - Topic 57 ('57_thanks chief officer_right chief officer_chase thanks chief_chief officer yeah'):
      - Frequency: 8 chunks (2.2%)
      - Avg. Probability: 0.91
    - Topic 35 ('35_truist security_truist security okay_proceed truist security_security truist security'):
      - Frequency: 8 chunks (2.2%)
      - Avg. Probability: 0.13
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.37
    - Topic 24 ('24_rbc next come_next come rbc_come line rbc_line rbc capital'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.63

DOCUMENT: 2q23-earnings-transcript.pdf
  - Total Chunks: 345
  - Chunks classified as Noise (Outliers): 125 (36.2%)

  - Top 7 Dominant Topics:
    - Topic 20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief'):
      - Frequency: 17 chunks (4.9%)
      - Avg. Probability: 0.60
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 10 chunks (2.9%)
      - Avg. Probability: 0.79
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (2.6%)
      - Avg. Probability: 0.71
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 8 chunks (2.3%)
      - Avg. Probability: 0.31
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 7 chunks (2.0%)
      - Avg. Probability: 0.35
    - Topic 29 ('29_chief officer chase_chief officer hey_chief officer thanks_chief officer well'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.67
    - Topic 8 ('8_higher revolving balance_revenue driven higher_revenue increased driven_higher deposit spread'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.26

DOCUMENT: 4q24-earnings-transcript.pdf
  - Total Chunks: 263
  - Chunks classified as Noise (Outliers): 94 (35.7%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 19 chunks (7.2%)
      - Avg. Probability: 0.92
    - Topic 8 ('8_higher revolving balance_revenue driven higher_revenue increased driven_higher deposit spread'):
      - Frequency: 7 chunks (2.7%)
      - Avg. Probability: 0.42
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 7 chunks (2.7%)
      - Avg. Probability: 0.45
    - Topic 11 ('11_moving part going_ask little bit_get thing_going spend'):
      - Frequency: 6 chunks (2.3%)
      - Avg. Probability: 0.39
    - Topic 29 ('29_chief officer chase_chief officer hey_chief officer thanks_chief officer well'):
      - Frequency: 6 chunks (2.3%)
      - Avg. Probability: 0.01
    - Topic 68 ('68_kind flow fund_level reprice happen_basel iii endgame_policy rate'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 0.81
    - Topic 50 ('50_bofa security_nii market_nii_bofa'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.32

DOCUMENT: Citibank_Q1_2025_Transcript.pdf
  - Total Chunks: 365
  - Chunks classified as Noise (Outliers): 146 (40.0%)

  - Top 7 Dominant Topics:
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 8 chunks (2.2%)
      - Avg. Probability: 0.44
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 7 chunks (1.9%)
      - Avg. Probability: 0.49
    - Topic 47 ('47_come line open_coming line wolfe_come line line_first come well'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.75
    - Topic 14 ('14_integration everything control_drive growth_innovating_integration everything'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 1.00
    - Topic 31 ('31_statement based management_available download website_document occur manner_guidance gave'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.62
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.06
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.63

DOCUMENT: Citibank_Q2_2024_Transcript.pdf
  - Total Chunks: 343
  - Chunks classified as Noise (Outliers): 127 (37.0%)

  - Top 7 Dominant Topics:
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 14 chunks (4.1%)
      - Avg. Probability: 0.58
    - Topic 102 ('102_line open please_open please_line open_please line open'):
      - Frequency: 8 chunks (2.3%)
      - Avg. Probability: 0.97
    - Topic 8 ('8_higher revolving balance_revenue driven higher_revenue increased driven_higher deposit spread'):
      - Frequency: 7 chunks (2.0%)
      - Avg. Probability: 0.12
    - Topic 12 ('12_common dividend share_proportion cash dividend_dividend accrual_repurchase drove reduction'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.52
    - Topic 89 ('89_infrastructure simplify process_infrastructure reducing modernizing_faster decision making_layer fostering clear'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.85
    - Topic 26 ('26_growth expect_narrow gap_achievable_resiliency'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 0.24
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 0.28

DOCUMENT: Citibank_Q3_2023_Transcript.pdf
  - Total Chunks: 329
  - Chunks classified as Noise (Outliers): 128 (38.9%)

  - Top 7 Dominant Topics:
    - Topic 14 ('14_integration everything control_drive growth_innovating_integration everything'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.48
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.35
    - Topic 17 ('17_share growth_market share_new business_winning new'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.32
    - Topic 77 ('77_integrate permissioned tokenized_investing commercial digital_implemented cloud based_commercial card volume'):
      - Frequency: 6 chunks (1.8%)
      - Avg. Probability: 0.85
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 0.25
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 0.45
    - Topic 69 ('69_interconnected business align_integration need model_improvement continue optimize_progress advancing strategy'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 1.00

DOCUMENT: Citibank_Q3_2024_Transcript.pdf
  - Total Chunks: 362
  - Chunks classified as Noise (Outliers): 134 (37.0%)

  - Top 7 Dominant Topics:
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (2.5%)
      - Avg. Probability: 0.43
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 8 chunks (2.2%)
      - Avg. Probability: 0.31
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 7 chunks (1.9%)
      - Avg. Probability: 0.08
    - Topic 27 ('27_usefulness_investment needed_deliberate path_progress made'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.37
    - Topic 26 ('26_growth expect_narrow gap_achievable_resiliency'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.37
    - Topic 102 ('102_line open please_open please_line open_please line open'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 1.00
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.57

DOCUMENT: Citibank_Q4_2023_Transcript.pdf
  - Total Chunks: 395
  - Chunks classified as Noise (Outliers): 159 (40.3%)

  - Top 7 Dominant Topics:
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 12 chunks (3.0%)
      - Avg. Probability: 0.32
    - Topic 9 ('9_net ppa adjustment_risk weight foreign_total risk weighted_excluding prior net'):
      - Frequency: 11 chunks (2.8%)
      - Avg. Probability: 0.59
    - Topic 8 ('8_higher revolving balance_revenue driven higher_revenue increased driven_higher deposit spread'):
      - Frequency: 10 chunks (2.5%)
      - Avg. Probability: 0.45
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 7 chunks (1.8%)
      - Avg. Probability: 0.33
    - Topic 14 ('14_integration everything control_drive growth_innovating_integration everything'):
      - Frequency: 7 chunks (1.8%)
      - Avg. Probability: 0.87
    - Topic 21 ('21_estimate increase fluctuate_fluctuate target illustrative_increase fluctuate target_increase fluctuate'):
      - Frequency: 6 chunks (1.5%)
      - Avg. Probability: 0.22
    - Topic 74 ('74_delivered rotce_rotce_revenue delivered rotce_service delivered rotce'):
      - Frequency: 5 chunks (1.3%)
      - Avg. Probability: 0.72

DOCUMENT: Citibank_Q4_2024_Transcript.pdf
  - Total Chunks: 338
  - Chunks classified as Noise (Outliers): 142 (42.0%)

  - Top 7 Dominant Topics:
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 10 chunks (3.0%)
      - Avg. Probability: 0.33
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.21
    - Topic 54 ('54_positive operating leverage_generated positive operating_strong positive_increase strong positive'):
      - Frequency: 6 chunks (1.8%)
      - Avg. Probability: 0.39
    - Topic 14 ('14_integration everything control_drive growth_innovating_integration everything'):
      - Frequency: 6 chunks (1.8%)
      - Avg. Probability: 0.70
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 6 chunks (1.8%)
      - Avg. Probability: 0.37
    - Topic 81 ('81_improving business mix_improve return time_record progress confident_improvement underlying driver'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 1.00
    - Topic 74 ('74_delivered rotce_rotce_revenue delivered rotce_service delivered rotce'):
      - Frequency: 5 chunks (1.5%)
      - Avg. Probability: 1.00

DOCUMENT: UBS_Q1_2023_Earnings_Call_Remarks.pdf
  - Total Chunks: 285
  - Chunks classified as Noise (Outliers): 121 (42.5%)

  - Top 7 Dominant Topics:
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (3.2%)
      - Avg. Probability: 0.59
    - Topic 19 ('19_bank america jefferies_america bank america_bank america merrill_bank america bank'):
      - Frequency: 7 chunks (2.5%)
      - Avg. Probability: 0.74
    - Topic 44 ('44_slide message everyone_sergio ermotti com_ermotti com investor_page sergio ermotti'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.50
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.05
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.54
    - Topic 14 ('14_integration everything control_drive growth_innovating_integration everything'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.45
    - Topic 23 ('23_strong capital liquidity_attractive capital return_maintaining balance sheet_capital liquidity strength'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.61

DOCUMENT: UBS_Q1_2024_Earnings_Call_Remarks.pdf
  - Total Chunks: 270
  - Chunks classified as Noise (Outliers): 109 (40.4%)

  - Top 7 Dominant Topics:
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 8 chunks (3.0%)
      - Avg. Probability: 0.07
    - Topic 44 ('44_slide message everyone_sergio ermotti com_ermotti com investor_page sergio ermotti'):
      - Frequency: 6 chunks (2.2%)
      - Avg. Probability: 0.83
    - Topic 50 ('50_bofa security_nii market_nii_bofa'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 0.26
    - Topic 13 ('13_reduction stranded cost_cost reduction_low multiplier_gross cost save'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.53
    - Topic 54 ('54_positive operating leverage_generated positive operating_strong positive_increase strong positive'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.60
    - Topic 84 ('84_achieved additional gross_cost save compared_cost save realized_additional gross cost'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 1.00
    - Topic 79 ('79_afternoon_april_letter_llc'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 1.00

DOCUMENT: UBS_Q1_2025_Earnings_Call_Remarks.pdf
  - Total Chunks: 235
  - Chunks classified as Noise (Outliers): 102 (43.4%)

  - Top 7 Dominant Topics:
    - Topic 7 ('7_plan sort_economic scenario accepted_set measure_outcome think'):
      - Frequency: 7 chunks (3.0%)
      - Avg. Probability: 0.21
    - Topic 23 ('23_strong capital liquidity_attractive capital return_maintaining balance sheet_capital liquidity strength'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.30
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.31
    - Topic 21 ('21_estimate increase fluctuate_fluctuate target illustrative_increase fluctuate target_increase fluctuate'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.76
    - Topic 60 ('60_majority understand rationale_iversified portfolio think_fact emotion mindful_follow lending environment'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 1.00
    - Topic 44 ('44_slide message everyone_sergio ermotti com_ermotti com investor_page sergio ermotti'):
      - Frequency: 3 chunks (1.3%)
      - Avg. Probability: 0.36
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 3 chunks (1.3%)
      - Avg. Probability: 0.39

DOCUMENT: UBS_Q2_2023_Earnings_Call_Remarks.pdf
  - Total Chunks: 354
  - Chunks classified as Noise (Outliers): 150 (42.4%)

  - Top 7 Dominant Topics:
    - Topic 9 ('9_net ppa adjustment_risk weight foreign_total risk weighted_excluding prior net'):
      - Frequency: 12 chunks (3.4%)
      - Avg. Probability: 0.69
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 11 chunks (3.1%)
      - Avg. Probability: 0.42
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 9 chunks (2.5%)
      - Avg. Probability: 0.11
    - Topic 33 ('33_interpretation regulatory regime_ilm finalization_ilm finalization guidance_intermediate holding xpected'):
      - Frequency: 7 chunks (2.0%)
      - Avg. Probability: 0.47
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.21
    - Topic 80 ('80_legacy segment beginning_investment platform recalibrate_investment capability gwm_group resource excluding'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.85
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.08

DOCUMENT: UBS_Q2_2024_Earnings_Call_Remarks.pdf
  - Total Chunks: 238
  - Chunks classified as Noise (Outliers): 99 (41.6%)

  - Top 7 Dominant Topics:
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 7 chunks (2.9%)
      - Avg. Probability: 0.62
    - Topic 15 ('15_result material webcast_replay available www_material webcast replay_available website please'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.56
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.10
    - Topic 13 ('13_reduction stranded cost_cost reduction_low multiplier_gross cost save'):
      - Frequency: 4 chunks (1.7%)
      - Avg. Probability: 0.53
    - Topic 44 ('44_slide message everyone_sergio ermotti com_ermotti com investor_page sergio ermotti'):
      - Frequency: 3 chunks (1.3%)
      - Avg. Probability: 0.67
    - Topic 73 ('73_improved resource discipline_profitability progress integration_improved risk management_enhanced scale capability'):
      - Frequency: 3 chunks (1.3%)
      - Avg. Probability: 0.72
    - Topic 23 ('23_strong capital liquidity_attractive capital return_maintaining balance sheet_capital liquidity strength'):
      - Frequency: 3 chunks (1.3%)
      - Avg. Probability: 0.03

DOCUMENT: UBS_Q3_2023_Earnings_Call_Remarks.pdf
  - Total Chunks: 282
  - Chunks classified as Noise (Outliers): 118 (41.8%)

  - Top 7 Dominant Topics:
    - Topic 5 ('5_comment made comment_guidance gave_guidance given_comment made'):
      - Frequency: 11 chunks (3.9%)
      - Avg. Probability: 0.50
    - Topic 22 ('22_transaction strategic initiative_future development goal_goal intention achieve_performance statement relating'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.19
    - Topic 44 ('44_slide message everyone_sergio ermotti com_ermotti com investor_page sergio ermotti'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.61
    - Topic 13 ('13_reduction stranded cost_cost reduction_low multiplier_gross cost save'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.25
    - Topic 9 ('9_net ppa adjustment_risk weight foreign_total risk weighted_excluding prior net'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.63
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.08
    - Topic 33 ('33_interpretation regulatory regime_ilm finalization_ilm finalization guidance_intermediate holding xpected'):
      - Frequency: 5 chunks (1.8%)
      - Avg. Probability: 0.05

DOCUMENT: UBS_Q3_2024_Earnings_Call_Remarks.pdf
  - Total Chunks: 267
  - Chunks classified as Noise (Outliers): 97 (36.3%)

  - Top 7 Dominant Topics:
    - Topic 12 ('12_common dividend share_proportion cash dividend_dividend accrual_repurchase drove reduction'):
      - Frequency: 7 chunks (2.6%)
      - Avg. Probability: 0.34
    - Topic 63 ('63_record revenue prime_outperformed global fee_strong performance_capability acquired credit'):
      - Frequency: 6 chunks (2.2%)
      - Avg. Probability: 0.86
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 6 chunks (2.2%)
      - Avg. Probability: 0.39
    - Topic 53 ('53_improvement market sentiment_improved valuation consistency_gained market share_revenue increased leveraged'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 1.00
    - Topic 90 ('90_positive inflow region_positive operating leverage_position remains strong_lcr question profitability'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.78
    - Topic 46 ('46_including entity credit_including purchase price_including buyback_including buyback part'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.31
    - Topic 17 ('17_share growth_market share_new business_winning new'):
      - Frequency: 4 chunks (1.5%)
      - Avg. Probability: 0.76

DOCUMENT: UBS_Q4_2023_Earnings_Call_Remarks.pdf
  - Total Chunks: 408
  - Chunks classified as Noise (Outliers): 166 (40.7%)

  - Top 7 Dominant Topics:
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 13 chunks (3.2%)
      - Avg. Probability: 0.22
    - Topic 2 ('2_persistent inflation high_earnings january copyright_persistent inflation_headwind'):
      - Frequency: 12 chunks (2.9%)
      - Avg. Probability: 0.09
    - Topic 13 ('13_reduction stranded cost_cost reduction_low multiplier_gross cost save'):
      - Frequency: 10 chunks (2.5%)
      - Avg. Probability: 0.05
    - Topic 39 ('39_driver cost reduction_simplification stranded cost_cost reduction gwm_associated organizational simplification'):
      - Frequency: 8 chunks (2.0%)
      - Avg. Probability: 0.18
    - Topic 7 ('7_plan sort_economic scenario accepted_set measure_outcome think'):
      - Frequency: 7 chunks (1.7%)
      - Avg. Probability: 0.45
    - Topic 58 ('58_maintain total liquidity_liquidity resource_resilient_liquidity funding'):
      - Frequency: 6 chunks (1.5%)
      - Avg. Probability: 0.69
    - Topic 9 ('9_net ppa adjustment_risk weight foreign_total risk weighted_excluding prior net'):
      - Frequency: 6 chunks (1.5%)
      - Avg. Probability: 0.38

DOCUMENT: UBS_Q4_2024_Earnings_Call_Remarks.pdf
  - Total Chunks: 366
  - Chunks classified as Noise (Outliers): 135 (36.9%)

  - Top 7 Dominant Topics:
    - Topic 4 ('4_high net worth_optimizing_revenue growth_global wealth'):
      - Frequency: 27 chunks (7.4%)
      - Avg. Probability: 0.28
    - Topic 13 ('13_reduction stranded cost_cost reduction_low multiplier_gross cost save'):
      - Frequency: 12 chunks (3.3%)
      - Avg. Probability: 0.38
    - Topic 23 ('23_strong capital liquidity_attractive capital return_maintaining balance sheet_capital liquidity strength'):
      - Frequency: 8 chunks (2.2%)
      - Avg. Probability: 0.06
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 7 chunks (1.9%)
      - Avg. Probability: 0.49
    - Topic 15 ('15_result material webcast_replay available www_material webcast replay_available website please'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.86
    - Topic 75 ('75_performance reflects strength_half growth business_high market adjusted_gwm seeing progress'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.70
    - Topic 12 ('12_common dividend share_proportion cash dividend_dividend accrual_repurchase drove reduction'):
      - Frequency: 6 chunks (1.6%)
      - Avg. Probability: 0.37

DOCUMENT: jpm-1q24-earnings-call-transcript.pdf
  - Total Chunks: 349
  - Chunks classified as Noise (Outliers): 124 (35.5%)

  - Top 7 Dominant Topics:
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 17 chunks (4.9%)
      - Avg. Probability: 0.60
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 13 chunks (3.7%)
      - Avg. Probability: 0.30
    - Topic 34 ('34_chief officer chase_chief officer sorry_chief officer yeah_nii chief officer'):
      - Frequency: 9 chunks (2.6%)
      - Avg. Probability: 0.86
    - Topic 10 ('10_trade war_trying make_economy think_going lot'):
      - Frequency: 7 chunks (2.0%)
      - Avg. Probability: 0.48
    - Topic 24 ('24_rbc next come_next come rbc_come line rbc_line rbc capital'):
      - Frequency: 6 chunks (1.7%)
      - Avg. Probability: 0.74
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.24
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 5 chunks (1.4%)
      - Avg. Probability: 0.65

DOCUMENT: jpm-2q24-earnings-call-transcript-final.pdf
  - Total Chunks: 292
  - Chunks classified as Noise (Outliers): 108 (37.0%)

  - Top 7 Dominant Topics:
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 9 chunks (3.1%)
      - Avg. Probability: 0.61
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 7 chunks (2.4%)
      - Avg. Probability: 0.88
    - Topic 67 ('67_regulatory requirement_market development_acquisition credit_liquidity'):
      - Frequency: 7 chunks (2.4%)
      - Avg. Probability: 1.00
    - Topic 29 ('29_chief officer chase_chief officer hey_chief officer thanks_chief officer well'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.37
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 6 chunks (2.1%)
      - Avg. Probability: 0.54
    - Topic 57 ('57_thanks chief officer_right chief officer_chase thanks chief_chief officer yeah'):
      - Frequency: 5 chunks (1.7%)
      - Avg. Probability: 0.51
    - Topic 51 ('51_appreciate_sure_expense_complicated'):
      - Frequency: 5 chunks (1.7%)
      - Avg. Probability: 0.61

DOCUMENT: jpm-3q23-earnings-call-transcript.pdf
  - Total Chunks: 330
  - Chunks classified as Noise (Outliers): 109 (33.0%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 18 chunks (5.5%)
      - Avg. Probability: 0.92
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 10 chunks (3.0%)
      - Avg. Probability: 0.41
    - Topic 20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief'):
      - Frequency: 8 chunks (2.4%)
      - Avg. Probability: 0.36
    - Topic 34 ('34_chief officer chase_chief officer sorry_chief officer yeah_nii chief officer'):
      - Frequency: 8 chunks (2.4%)
      - Avg. Probability: 0.53
    - Topic 6 ('6_make sense_business guiding_big picture_product level reprice'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.62
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.62
    - Topic 16 ('16_least nii_net new money_interest bearing_end day'):
      - Frequency: 7 chunks (2.1%)
      - Avg. Probability: 0.32

DOCUMENT: jpm-4q23-earnings-call-transcript.pdf
  - Total Chunks: 268
  - Chunks classified as Noise (Outliers): 97 (36.2%)

  - Top 7 Dominant Topics:
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 10 chunks (3.7%)
      - Avg. Probability: 0.56
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 9 chunks (3.4%)
      - Avg. Probability: 0.41
    - Topic 20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief'):
      - Frequency: 8 chunks (3.0%)
      - Avg. Probability: 0.52
    - Topic 25 ('25_credit cost net_credit cost reflecting_net charge offs_expense credit cost'):
      - Frequency: 6 chunks (2.2%)
      - Avg. Probability: 0.38
    - Topic 36 ('36_tax margin revenue_revenue tax margin_revenue revenue expense_margin revenue'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 0.44
    - Topic 59 ('59_yeah thanks_yeah pretty_yeah delighted_yeah question'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 0.62
    - Topic 34 ('34_chief officer chase_chief officer sorry_chief officer yeah_nii chief officer'):
      - Frequency: 5 chunks (1.9%)
      - Avg. Probability: 0.70

DOCUMENT: jpmc-third-quarter-2024-earnings-conference-call-transcript.pdf
  - Total Chunks: 378
  - Chunks classified as Noise (Outliers): 136 (36.0%)

  - Top 7 Dominant Topics:
    - Topic 1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief'):
      - Frequency: 23 chunks (6.1%)
      - Avg. Probability: 0.81
    - Topic 0 ('0_say couple thing_guess say couple_guess let try_couple thing let'):
      - Frequency: 16 chunks (4.2%)
      - Avg. Probability: 0.41
    - Topic 3 ('3_see little bit_bit higher_bit less_little bit line'):
      - Frequency: 12 chunks (3.2%)
      - Avg. Probability: 0.32
    - Topic 11 ('11_moving part going_ask little bit_get thing_going spend'):
      - Frequency: 9 chunks (2.4%)
      - Avg. Probability: 0.28
    - Topic 7 ('7_plan sort_economic scenario accepted_set measure_outcome think'):
      - Frequency: 7 chunks (1.9%)
      - Avg. Probability: 0.59
    - Topic 18 ('18_model run bank_look suitability_merged commitment said_mentioned think prepared'):
      - Frequency: 7 chunks (1.9%)
      - Avg. Probability: 0.19
    - Topic 24 ('24_rbc next come_next come rbc_come line rbc_line rbc capital'):
      - Frequency: 5 chunks (1.3%)
      - Avg. Probability: 0.14
"""

In [None]:
import io
import requests

import torch
import json
import pandas as pd

In [None]:
## Label each TOPIC FOR HEAT GRAPH

user_prompt_label = """
You are a senior financial risk analyst and regulatory supervisor with deep expertise in global banking, capital markets, and financial policy. You are also proficient in interpreting the outputs of BERTopic and similar NLP models, especially when applied to unstructured Q&A transcripts from earnings calls of globally systemically important banks (G-SIBs).

You are tasked with assigning clear, human-readable labels to machine-generated topics from BERTopic.

### Context:
The topics are extracted from Q&A transcripts. Each topic includes an ID and a set of representative key phrases. Many of these phrases may contain noise or repeated forms, and some may reflect executives’ names or institution identifiers (e.g. 'jpmorgan chase', 'citigroup', etc.). These *do not* need to be included in the label.

### Your Task:
For each topic listed below, generate a *concise*, *descriptive*, and *domain-relevant* label that captures the **underlying theme or issue**. The label should:
- Be 2–5 words long (no more than 7)
- Be general enough to summarise the key idea
- Avoid listing keywords or repeating phrases from the topic
- Remove institution-specific names (e.g., "Goldman", "Citi") unless absolutely core to the theme
- Focus on business concepts, risks, regulatory matters, or strategic actions

### Output format:
Return a markdown table with two columns:

| Topic ID & Keywords | Topic Label |
|---------------------|-------------|
| `[ID] | `[Concise, human-readable label]` |

Here are the topics:
{insert_topic_string_here}
"""


In [None]:
from google import genai

client = genai.Client(api_key="AIzaSyAHoHvbRuaNiy3BHzMkHSRM-Qy5OufxBAQ")
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"{user_prompt_label} {results}"
)
# Use the response as a JSON string.
print(response.text)

```markdown
| Topic ID & Keywords | Topic Label |
|---------------------|-------------|
| `1 ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief')` | Executive Officer Dialogue |
| `10 ('10_trade war_trying make_economy think_going lot')` | Macroeconomic Concerns |
| `0 ('0_say couple thing_guess say couple_guess let try_couple thing let')` | Introductory Remarks |
| `20 ('20_chief officer chase_chief officer_chase interesting_chase interesting chief')` | Officer Acknowledgement |
| `11 ('11_moving part going_ask little bit_get thing_going spend')` | Future Expenditure Discussion |
| `24 ('24_rbc next come_next come rbc_come line rbc_line rbc capital')` | RBC Analyst Introduction |
| `3 ('3_see little bit_bit higher_bit less_little bit line')` | Slight Variance Discussion |
| `57 ('57_thanks chief officer_right chief officer_chase thanks chief_chief officer yeah')` | Officer Gratitude Expression |
| `35 ('35_truist security_truist security ok

In [None]:
import re

# Step 1: Extract each line that looks like a table row
lines = response.text.strip().split("\n")[3:]  # skip the headers

# Step 2: Build dictionary
topic_label_dict = {}

for line in lines:
    match = re.match(r"\|\s*`?(\d+)[^`]*`?\s*\|\s*(.*?)\s*\|", line)
    if match:
        topic_id = int(match.group(1))
        label = match.group(2).strip()
        topic_label_dict[topic_id] = label

# Step 3: Check result
for k, v in topic_label_dict.items():
    print(f"{k}: {v}")


1: Executive Officer Dialogue
10: Macroeconomic Concerns
0: Introductory Remarks
20: Officer Acknowledgement
11: Future Expenditure Discussion
24: RBC Analyst Introduction
3: Slight Variance Discussion
57: Officer Gratitude Expression
35: Truist Security Introduction
6: Strategic Business Overview
8: Revolving Balances and Revenue
29: Officer Salutations
68: Basel III Endgame & Repricing
50: NII Market Discussion
2: Inflationary Pressures
47: Analyst Line Opening
14: Integration & Growth Initiatives
31: Guidance & Documentation Availability
4: High Net Worth Clients
5: Forward Guidance Commentary
102: Call Line Instruction
12: Dividends and Share Repurchases
89: Infrastructure Modernization Efforts
26: Growth Expectations and Resiliency
17: Market Share Expansion
77: Digital Asset & Cloud Investment
69: Strategic Alignment and Integration
27: Investment and Progress Update
9: Risk Weighted Assets
21: Fluctuating Estimates Discussion
74: Return on Tangible Common Equity
54: Positive Ope

In [None]:
topic_dict = topic_label_dict ## LLM labelled topics
## Filter to only labelled topic IDs (assuming topic_dict has int keys)
labelled_topic_ids = [tid for tid in heatmap_data.columns if tid in topic_dict]

# Filter heatmap_data to only include labelled topics
filtered_heatmap_data = heatmap_data[labelled_topic_ids]

# Prepare labels for these topics
topic_labels = [topic_dict[tid] for tid in labelled_topic_ids]

# Build the heatmap
fig = go.Figure(
    data=go.Heatmap(
        z=filtered_heatmap_data.values,
        x=topic_labels,
        y=filtered_heatmap_data.index.tolist(),
        text=[
            [
                f"Document: {doc}<br>Topic: {topic_dict[topic]}<br>Probability: {prob:.4f}"
                for topic, prob in zip(labelled_topic_ids, row)
            ]
            for doc, row in zip(filtered_heatmap_data.index.tolist(), filtered_heatmap_data.values)
        ],
        hoverinfo="text",
        colorscale="Reds",
        colorbar=dict(title="Avg. Topic Probability")
    )
)

fig.update_layout(
    title="Top Topic Distribution Across Documents (Labelled)",
    xaxis_title="Topics",
    xaxis=dict(
        tickangle=45,
        tickfont=dict(size=10)
    ),
    yaxis_title="Document"
)

fig.show()


In [None]:
user_prompt = """ { You are an expert financial risk analyst with extensive knowledge of global banking operations, regulatory frameworks (like those of the PRA), and financial market dynamics. You are also highly proficient in interpreting the output of advanced natural language processing models, specifically BERTopic. Your primary goal is to extract structured, actionable intelligence from unstructured earnings call Q&A transcripts to inform supervisory judgment.

Your task is to analyze the provided BERTopic output, which summarizes key themes extracted from Bank of England G-SIB earnings call Q&A transcripts. For each document's topic analysis, you will perform the following:

**I. Per-Document Topic Analysis:**

For each listed "Dominant Topic" within a document, create a row in a table with the following columns:

| Topic ID & Keywords | Topic Label | Risk Category Mapping & Justification | Early Warning Signal Assessment & Potential Indicators | Confidence Level |
|---------------------|-------------|---------------------------------------|-------------------------------------------------------|------------------|
| `[ID] ('[ID]_[keyword1]_[keyword2]')` | `[Concise, human-readable label]` | `[List identified risk categories, no justification]` | `[Briefly suggest how this topic could be an early warning signal, suggesting specific linguistic cues or shifts if possible]` | `[High/Medium/Low]` |

**Detailed Guidelines for Table Columns:**

* **Topic Label:** Assign a concise, human-readable label that accurately reflects the core theme. If ambiguous, state "Ambiguous Topic: [Keywords]".
* **Risk Category Mapping & Justification:**
    * Identify *all plausible* types of banking risks this topic might indicate or be related to.
    * Use the following risk categories, providing a brief justification for each mapping based on the topic keywords and the context of a banking Q&A:
        * **Credit Risk:** (e.g., loan defaults, credit quality deterioration, counterparty risk, specific asset classes like CRE, wholesale, retail)
        * **Market Risk:** (e.g., interest rate risk, FX risk, equity price risk, commodity price risk, trading book exposures)
        * **Operational Risk:** (e.g., fraud, cyber security breaches, system failures, human error, process breakdowns, data management, IT infrastructure)
        * **Liquidity Risk:** (e.g., funding stability, deposit fluctuations, collateral management, contingent liquidity)
        * **Reputational Risk:** (e.g., negative public perception, brand damage, loss of customer trust, ESG controversies)
        * **Strategic Risk:** (e.g., business model changes, M&A, new market entry/exit, competitive landscape, capital allocation, risk appetite shifts)
        * **Systemic Risk:** (e.g., interconnectedness, contagion, macro-financial stability concerns)
        * **Compliance/Regulatory Risk:** (e.g., breaches of laws/rules/regulations, fines, sanctions, new regulatory requirements, supervisory scrutiny)
        * **Emerging Risk:** (e.g., climate change, geopolitical events, new technologies like AI/DLT, crypto assets)
* **Early Warning Signal Assessment & Potential Indicators:** Assess the topic's potential as an early warning signal for supervisory judgment. Explain *why* and *how* insights from this topic could provide an early indication of emerging risks or institutional change. Specifically suggest what *types of language patterns or shifts* within associated text might serve as these signals (e.g., increased mentions of "provisions," "stress tests," "tightening standards," "regulatory inquiries," "cyber incidents," "deposit outflows," "specific market volatility").
* **Confidence Level:** Assign a confidence level (High, Medium, Low) to your risk identification and early warning signal assessment, based on the clarity, specificity, and relevance of the provided keywords for the topic.

**II. Cross-Document Comparison and Overarching Insights:**

1.  **Trend Analysis (if multiple documents provided):** If analyses from multiple documents (e.g., different quarters) are provided, identify any significant shifts or trends in dominant topics, their frequencies, and associated risk types between the documents. Discuss what these trends might imply about changes in G-SIB focus, evolving challenges, or emerging risks over time. Highlight any topics that appear to gain or lose prominence.
2.  **PRA Supervisory Implications:** Summarize the overarching insights that the PRA could gain from this type of topic analysis to strengthen their oversight of complex institutions and improve their responsiveness to emerging risks. Specifically, how can this qualitative analysis complement traditional quantitative data?
}
"""

In [None]:
from google import genai

client = genai.Client(api_key="")
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=f"{user_prompt} {results}"
)
# Use the response as a JSON string.
print(response.text)

Okay, I'm ready to analyze the BERTopic output from the provided earnings call transcripts. I'll proceed document by document, creating the requested tables and then provide the cross-document analysis and PRA supervisory implications.

**I. Per-Document Topic Analysis:**

**DOCUMENT: 1q23-earnings-transcript.pdf**

|---------------------|-------------|---------------------------------------|-------------------------------------------------------|------------------|
| `[1] ('1_chief executive officer_chief officer chase_executive officer chase_executive officer chief')` | Executive Officer Remarks/Q&A | None meaningfully identified. |  Difficult to assess as it's a generic identifier. Monitoring the sentiment and content around these mentions could be useful (see other, more specific topics). | Low |
| `[10] ('10_trade war_trying make_economy think_going lot')` | Macroeconomic Concerns: Trade War & Economy | Strategic Risk, Market Risk, Systemic Risk | Increased discussion and negative