In [76]:
from collections import Counter, defaultdict
import re
import time
from concurrent.futures import ThreadPoolExecutor
import random
import string
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from bertopic import BERTopic
from bertopic.representation import TextGeneration
from umap import UMAP
from hdbscan import HDBSCAN

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [77]:
#### **SET GLOBAL VARIABLES**

In [78]:
# Make sure the working directory is set correctly
# NOTE(review): os.chdir("../..") is relative to the current directory, so
# re-running this cell moves two levels further up each time — run it once
# per kernel session.
print("Initial working directory: ", os.getcwd())
os.chdir("../..")
print("New working directory: ", os.getcwd())

Initial working directory:  C:\
New working directory:  C:\


In [79]:
# Locations of the input files, anchored at the current working directory.
# Paths are assembled with os.path.join so the separators match the host OS
# (concatenating with "/" produced mixed separators such as 'C:\\/data/...').
ROOT_FPATH = os.getcwd()
DATA_FOLDER = "data"
AGGREGATED_DATA_FOLDER_NAME = "aggregated"
transcripts_FNAME = "transcripts.parquet"
transcripts_FPATH = os.path.join(ROOT_FPATH, DATA_FOLDER, AGGREGATED_DATA_FOLDER_NAME, transcripts_FNAME)
GLOSSARY_FNAME = "glossary_dictionary_citigroup.csv"
GLOSSARY_FPATH = os.path.join(ROOT_FPATH, DATA_FOLDER, GLOSSARY_FNAME)

In [80]:
# Read in the aggregated transcripts (parquet) and display the resulting frame
df_transcripts = pd.read_parquet(transcripts_FPATH)
df_transcripts

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\/data/aggregated/transcripts.parquet'

#### 😊 Sentiment Analysis

In [None]:
# Count missing values in the 'text' column to check we don't have any missing data
df_transcripts['text'].isna().sum()

In [None]:
# We'll use finbert-tone for sentiment analysis: a version of BERT fine-tuned for financial sentiment analysis
model_name = "yiyanghkust/finbert-tone"

In [None]:
# Check the max token length of the model (read from its config)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
print(model.config.max_position_embeddings)

In [None]:
# Find the longest text in the dataset, measured in tokens.
# The tokenizer is loaded through `model_name` so it always matches the model
# whose maximum length was checked above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_tokens = df_transcripts["text"].apply(
    lambda x: len(tokenizer(x, truncation=False)["input_ids"])
).max()
print("Estimated max tokens in dataset:", max_tokens)

In [None]:
# Split the texts into two roughly equal halves, one per GPU (2x GPUs)
text_list = df_transcripts['text'].tolist()
midpoint = len(text_list) // 2
text_list_split1, text_list_split2 = text_list[:midpoint], text_list[midpoint:]

In [None]:
def run_sentiment_analysis_on_gpu(text_subset, model_name, device_id, batch_size=32):
    """
    Run sentiment analysis on a subset of text using a specified model and device.

    Args:
        text_subset: List of text strings to classify.
        model_name: Model identifier passed to `from_pretrained` for both the
            tokenizer and the sequence-classification model.
        device_id: Value passed to the pipeline's `device` argument.
        batch_size: Number of texts handed to the classifier per call.

    Returns:
        The classifier outputs of all batches, concatenated in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    clf = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device_id)

    results = []
    total = len(text_subset)

    for start in range(0, total, batch_size):
        batch = text_subset[start:start + batch_size]
        # Texts longer than 512 tokens are truncated instead of raising
        results.extend(clf(batch, truncation=True, max_length=512))
        # Cap the counter so the last (partial) batch never reports more than `total`
        processed = min(start + batch_size, total)
        print(f"Device {device_id}: Processed {processed} reviews out of {total}")

    return results

In [None]:
# Run sentiment analysis on both subsets in parallel using ThreadPoolExecutor
# (one worker thread per split: split 1 -> device 0, split 2 -> device 1)
with ThreadPoolExecutor(max_workers=2) as executor:
    future_0 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split1, model_name, 0)
    future_1 = executor.submit(run_sentiment_analysis_on_gpu, text_list_split2, model_name, 1)

    # .result() blocks until the corresponding worker has finished
    results_0 = future_0.result()
    results_1 = future_1.result()
    # Concatenate in split order so `results` lines up with `text_list`
    results = results_0 + results_1

## General banks

In [None]:
# Postprocess results (keep only the predicted label) and add them to the DataFrame
results_clean = [d['label'] for d in results]
df_transcripts['sentiment'] = results_clean
# Sentiment over time: rows = reporting_period, columns = sentiment label, values = row counts
sentiment_summary = df_transcripts.groupby(['reporting_period', 'sentiment']).size().unstack(fill_value=0)

# Step 1: Create a mapping from reporting_period to date_of_call
# We take the earliest call date per reporting_period (they are usually the same)
period_to_date = df_transcripts.groupby('reporting_period')['date_of_call'].min()

# Step 2: Reorder the index of sentiment_summary by the corresponding call date
sentiment_summary['date_of_call'] = sentiment_summary.index.map(period_to_date)
sentiment_summary = sentiment_summary.sort_values('date_of_call')

# Step 3: Drop the helper column so it is not plotted as a sentiment class
sentiment_summary = sentiment_summary.drop(columns='date_of_call')

# Step 4: Plot the stacked counts in chronological order
sentiment_summary.plot(kind='bar', stacked=True, figsize=(12, 6),
                       colormap='coolwarm', title='Sentiment by Quarter')
plt.ylabel("Sentence Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## By Bank

In [None]:
import matplotlib.pyplot as plt

# Sentiment classes in stacking (bottom-to-top) order and the color of each.
# Defined once, outside the loop, because they are the same for every bank.
sentiment_order = ['Negative', 'Neutral', 'Positive']
colors = {
    'Negative': '#1f77b4',  # Blue
    'Neutral': '#708090',   # Slate gray
    'Positive': '#2ca02c'   # Green
}

# Count rows per (reporting_period, bank, sentiment) and pivot the sentiment
# categories into columns
sentiment_summary = (
    df_transcripts.groupby(['reporting_period', 'bank', 'sentiment'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Add date_of_call for chronological sorting
date_map = df_transcripts.groupby('reporting_period')['date_of_call'].min()
sentiment_summary['date_of_call'] = sentiment_summary['reporting_period'].map(date_map)

# Sort by date
sentiment_summary = sentiment_summary.sort_values('date_of_call')

# Format labels
sentiment_summary['pretty_label'] = sentiment_summary['reporting_period'].str.replace('_', ' ')

# Plot per bank
banks = sentiment_summary['bank'].unique()

for bank in banks:
    bank_data = sentiment_summary[sentiment_summary['bank'] == bank]

    # Index by label, drop helper columns, and force the column set/order to
    # `sentiment_order`. reindex adds a zero column for a class that never
    # occurs (instead of raising KeyError) and guarantees that the counts,
    # the percentages and the colors all refer to the same columns.
    bank_data_plot = (
        bank_data.set_index('pretty_label')
        .drop(columns=['reporting_period', 'bank', 'date_of_call'])
        .reindex(columns=sentiment_order, fill_value=0)
    )

    # Share of each sentiment class within a period, in percent
    percent_data = bank_data_plot.div(bank_data_plot.sum(axis=1), axis=0) * 100

    # Plot with custom colors
    ax = bank_data_plot.plot(
        kind='bar',
        stacked=True,
        figsize=(12, 6),
        color=[colors[s] for s in sentiment_order],
        title=f'Sentiment by Quarter - {bank}'
    )

    plt.ylabel("Sentence Count")
    plt.xlabel("Period")
    plt.xticks(rotation=45)

    # Annotate each stacked segment with its percentage, centred vertically
    for idx, (count_row, percent_row) in enumerate(zip(bank_data_plot.values, percent_data.values)):
        cumulative = 0
        for count, value in zip(count_row, percent_row):
            if value > 5:  # Show only if > 5% to avoid clutter
                ax.text(idx, cumulative + count / 2, f"{value:.0f}%",
                        ha='center', va='center', fontsize=10, color='white')
            cumulative += count

    plt.tight_layout()
    plt.show()

In [None]:
# Preview the first rows (rich notebook display instead of a plain-text print)
df_transcripts.head()

In [None]:
# Export the transcripts for the app. The path is anchored at ROOT_FPATH and
# the folder is created if missing, so the export does not depend on the
# current working directory or on the folder already existing.
app_data_dir = os.path.join(ROOT_FPATH, DATA_FOLDER, "app")
os.makedirs(app_data_dir, exist_ok=True)
df_transcripts.to_csv(os.path.join(app_data_dir, "data_nlp_analysis.csv"), index=False)


## 🧠 Topic Modeling

In [None]:
def clean_text(text_list_raw):
    """
    Cleans a list of raw text by converting to lowercase and
    filtering out English stop words.

    Entries that are not strings (e.g. None / NaN), are empty, or are the
    literal string "nan" are skipped, so the result can be shorter than the
    input list.

    Args:
        text_list_raw: List of raw text strings.

    Returns:
        text_list_clean: List of cleaned text strings (tokens re-joined with
        single spaces).
    """
    stop_words = set(stopwords.words('english'))
    text_list_clean = []

    for text in text_list_raw:
        # Treat non-strings (None, float NaN), "" and "nan" as missing values
        if not isinstance(text, str) or not text or text.lower() == "nan":
            continue
        word_tokens = word_tokenize(text.lower())
        text_list_clean.append(" ".join(w for w in word_tokens if w not in stop_words))

    return text_list_clean

In [None]:
# Conduct some basic text cleaning
# NOTE(review): this rebinds the name `clean_text` from the function to the
# cleaned list, so this cell fails if it is re-run without first re-running
# the cell that defines the function.
clean_text = clean_text(df_transcripts['text'].tolist())

In [None]:
# We'll use Fin-MPNET-Base as the embedding model in BERTopic as it has been tuned on financial data
embedding_model_name = "mukaj/fin-mpnet-base"
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
# BERTopic uses UMAP, a dimensionality reduction technique, to reduce the dimensionality of the embeddings before clustering
# UMAP introduces stochastic behaviour so we'll set a random seed for reproducibility
# https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#preventing-stochastic-behavior
# These are the default parameters for UMAP used in _bertopic.py with the additional parameter of random_state = 0
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=0,
)

Let's take a look at the top topics, ranked by the number of documents they are assigned to

In [None]:
# Run BERTopic to extract topics from the cleaned text
# NOTE: `model` previously held the FinBERT classifier; from here on it refers to the BERTopic model,
# and `clean_text` is the cleaned list of texts (not the cleaning function)
model = BERTopic(verbose=True, embedding_model=embedding_model, umap_model=umap_model)
model.fit(clean_text)
# Assign the cleaned texts to the fitted topics
topics, probabilities = model.transform(clean_text)

## Investor Questions vs. Bank Responses

In [None]:
import pandas as pd

# Load the transcripts from the parquet file
df = pd.read_parquet(transcripts_FPATH)

# Filter to Q&A section only
df_qa = df[df['section'].str.contains("Q and A", case=False, na=False)].copy()

# Reset index for safe positional iteration
df_qa = df_qa.reset_index(drop=True)

# Define roles considered as management
management_roles = ['CEO', 'CFO', 'Management', 'Executive']
# Roles whose turns are never treated as a question
non_question_roles = ['Host'] + management_roles

# Prepare output list
qa_pairs = []

# Slide through transcript row-by-row to detect Q -> A
i = 0
while i < len(df_qa) - 1:
    row = df_qa.iloc[i]
    next_row = df_qa.iloc[i + 1]

    # Both rows must belong to the same call; otherwise the last turn of one
    # transcript could be paired with the first turn of the next one.
    same_call = (
        row['bank'] == next_row['bank']
        and row['reporting_period'] == next_row['reporting_period']
    )
    # Heuristic: if current is not host/management, and next is management → Q&A
    is_question = row['role'] not in non_question_roles
    is_answer = next_row['role'] in management_roles

    if same_call and is_question and is_answer:
        qa_pairs.append({
            'question': row['text'],
            'answer': next_row['text'],
            'speaker_q': row['speaker'],
            'speaker_a': next_row['speaker'],
            'year': pd.to_datetime(row['date_of_call']).year,
            'bank': row['bank'],
            'reporting_period': row['reporting_period']
        })
        i += 2  # Skip to the next pair
    else:
        i += 1  # Slide forward if no match

# Convert to DataFrame
df_qa_pairs = pd.DataFrame(qa_pairs)

# Preview
print(df_qa_pairs.head())


In [None]:
# --- Glossary Expansion for Acronyms ---
# Load the glossary CSV and build a {Term: Definition} lookup
glossary_df = pd.read_csv(GLOSSARY_FPATH)
glossary_dict = glossary_df.set_index('Term')['Definition'].to_dict()

def expand_acronyms(text, glossary):
    """
    Replace every glossary term found in `text` with its definition.

    Terms are matched case-insensitively and only at word boundaries. They are
    applied one after another in the glossary's iteration order, so text
    inserted by an earlier term can be matched by a later one.

    Args:
        text: String to expand.
        glossary: Mapping of term -> definition.

    Returns:
        The text with all matched terms replaced by their definitions.
    """
    for term, definition in glossary.items():
        pattern = r'\b' + re.escape(term) + r'\b'
        # Use a replacement function so the definition is inserted literally.
        # As a plain replacement string, backslashes in it would be parsed as
        # regex escapes / group references and could raise re.error.
        text = re.sub(pattern, lambda _match: definition, text, flags=re.IGNORECASE)
    return text

# Expand acronyms on both sides of every Q&A pair (values are cast to str first)
for qa_column in ('question', 'answer'):
    df_qa_pairs[qa_column] = df_qa_pairs[qa_column].apply(
        lambda value: expand_acronyms(str(value), glossary_dict)
    )

In [None]:
# Libraries
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import re
from umap import UMAP
#from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance

# Initialization
# The VADER lexicon is needed by SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
embedding_model = SentenceTransformer("mukaj/fin-mpnet-base")

# Input: df_qa_pairs built in the cells above
# Columns read here: 'question' and 'answer'

# --- Topic Modeling on Questions ---
representation_model = MaximalMarginalRelevance(diversity=0.7, top_n_words=10)
# Alternative representation that was tried (needs the KeyBERTInspired import):
#representation_model = KeyBERTInspired()

questions = df_qa_pairs['question'].astype(str).tolist()
# Fixed random_state keeps the UMAP reduction reproducible
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0)
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, nr_topics=25, representation_model=representation_model,
verbose=True)
# NOTE: `topics` is overwritten here (it previously held the topics of the full-transcript model)
topics, _ = topic_model.fit_transform(questions)
df_qa_pairs['bertopic_topic'] = topics

# --- Sentiment Analysis on Answers ---
# Store the 'compound' score returned by the analyzer for each answer
df_qa_pairs['sentiment'] = df_qa_pairs['answer'].apply(lambda x: sia.polarity_scores(str(x))['compound'])


In [None]:
# Attach the BERTopic-generated name of each assigned topic id.
# The topic-info table is built once and used as a lookup, instead of being
# rebuilt for every row.
topic_names = topic_model.get_topic_info().set_index("Topic")["Name"]
df_qa_pairs['bertopic_label'] = df_qa_pairs['bertopic_topic'].map(topic_names)

In [None]:
# Preview the first few BERTopic labels
print(df_qa_pairs['bertopic_label'].head())

In [None]:
# --- Expanded Topic Mapping with Semantic Similarity ---
# Predefined categories, each described by a list of representative keywords/phrases
topic_mapping_dict = {
    "Capital Adequacy": ["capital", "tier 1 capital", "tier 2 capital", "risk-weighted assets", "capital buffer", "regulatory capital", "cet1 ratio", "capital ratio", "tangible book value", "leverage ratio", "basel iii"],
    "Liquidity Risk": ["liquidity risk", "cash position", "short-term funding", "deposit outflows", "liquidity coverage ratio"],
    "Profitability": ["profitability", "earnings", "revenue", "return on equity", "net income", "efficiency ratio"],
    "Asset Quality and Credit Risk": ["non-performing", "loan loss", "credit risk", "charge-offs", "impairments"],
    "Macroeconomic Risk": ["inflation", "recession", "monetary policy", "macroeconomic"],
    "Interest Rate Risk": ["interest rate", "yield curve", "rate hikes"],
    "Market and Volatility Risk": ["volatility", "market risk", "trading losses"],
    "Operational Risk": ["cybersecurity", "system failure", "fraud", "technology risk"],
    "Regulatory & Compliance Risk": ["compliance", "regulatory", "basel", "audit", "oversight"],
    "ESG and Reputation Risk": ["esg", "sustainability", "climate", "governance", "reputation"],
    "Strategic and Business Model Risk": ["strategy", "restructuring", "market entry"],
    "Legal Risk": ["litigation", "lawsuit", "legal", "settlement"]
}

# Prepare keyword embeddings: one tensor of keyword embeddings per category,
# computed once so they can be reused for every question
expanded_topic_map = {
    topic: embedding_model.encode([kw.lower() for kw in kws], convert_to_tensor=True)
    for topic, kws in topic_mapping_dict.items()
}

def semantic_map_question(question, threshold=0.75):
    """
    Map a question to the predefined category whose keywords it resembles most.

    The lower-cased question is embedded and compared (cosine similarity) with
    the keyword embeddings of every category in `expanded_topic_map`; a
    category is scored by its best-matching keyword.

    Args:
        question: Question text.
        threshold: Minimum similarity a category must reach to be eligible.

    Returns:
        Name of the highest-scoring eligible category, or "Unmapped" if none
        qualifies.
    """
    q_vec = embedding_model.encode(question.lower(), convert_to_tensor=True)
    winner = None
    top_score = 0.0
    for category, keyword_vecs in expanded_topic_map.items():
        candidate = float(util.cos_sim(q_vec, keyword_vecs).max())
        # Must clear the threshold AND strictly beat the best score so far
        if candidate >= threshold and candidate > top_score:
            winner, top_score = category, candidate
    if winner:
        return winner
    return "Unmapped"

# Map each question to a predefined category; note that the threshold passed here (0.2)
# overrides the function default (0.75)
df_qa_pairs['semantic_topic'] = df_qa_pairs['question'].apply(lambda q: semantic_map_question(q, threshold=0.2))
# Fall back to the BERTopic label (prefixed with "BERTopic ") when no category matched
df_qa_pairs['final_topic'] = df_qa_pairs.apply(lambda row: row['semantic_topic'] if row['semantic_topic'] != "Unmapped" else f"BERTopic {row['bertopic_label']}", axis=1)

In [None]:
# Show the ten most frequent semantic topics and their counts
print(df_qa_pairs['semantic_topic'].value_counts().head(10))

In [None]:
# Show how many Q&A pairs fall into each final topic
print(df_qa_pairs['final_topic'].value_counts())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np

# --- Helper functions ---
def get_neutral_pastel_palette(n):
    """
    Return `n` pastel colors as (r, g, b) tuples of floats in [0, 1].

    The colors come from a fixed, hand-picked list that avoids red and green
    (those are reserved for the sentiment labels). When `n` exceeds the length
    of that list the colors are reused cyclically, so exactly `n` colors are
    always returned.

    Args:
        n: Number of colors required.

    Returns:
        List of `n` RGB tuples.
    """
    # Build from seaborn + matplotlib pastel-safe sets (manual override to avoid red/green)
    safe_hex_colors = [
        "#aec6cf", "#cfcfc4", "#fdfd96", "#b39eb5", "#ffb347",
        "#dda0dd", "#b0e0e6", "#cdb5cd", "#fab57a", "#d1cfe2",
        "#e6e6fa", "#f5deb3", "#ccccff", "#e0bbE4", "#f7cac9"
    ]

    def _hex_to_rgb(hex_color):
        # "#rrggbb" -> (r, g, b) scaled to 0..1; done here so the helper does
        # not depend on a matplotlib.colors import being present
        return tuple(int(hex_color[i:i + 2], 16) / 255 for i in (1, 3, 5))

    return [_hex_to_rgb(safe_hex_colors[i % len(safe_hex_colors)]) for i in range(n)]


def sentiment_text_color(score):
    """
    Pick the label color for a sentiment score.

    Returns 'green' for scores of 0.05 or more, 'red' for scores of -0.05 or
    less, and 'black' for everything in between.
    """
    if score >= 0.05:
        return 'green'
    return 'red' if score <= -0.05 else 'black'

# Month-day of the last day of each quarter
quarter_month_map = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}
def quarter_to_date(qr):
    """
    Convert a reporting-period label such as "Q1_2023" into its quarter-end date.

    Args:
        qr: Label of the form "<quarter>_<year>". A quarter part that is not
            Q1-Q4 falls back to 12-31.

    Returns:
        pandas Timestamp of the quarter end, or pd.NaT when the label is
        missing or cannot be parsed.
    """
    try:
        q, y = qr.split('_')
        return pd.to_datetime(f"{y}-{quarter_month_map.get(q, '12-31')}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. None or NaN);
        # ValueError: wrong number of "_" parts or an unparseable date.
        # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return pd.NaT

# --- Processing ---
# Derive a date from each reporting period (via quarter_to_date) so periods can be
# ordered chronologically; note this is a derived date, stored under 'date_of_call'
df_qa_pairs['date_of_call'] = df_qa_pairs['reporting_period'].apply(quarter_to_date)
df_qa_pairs['quarter'] = df_qa_pairs['reporting_period']

# Reporting periods sorted by their derived date
period_order = (
    df_qa_pairs.groupby('quarter')['date_of_call']
    .min()
    .sort_values()
    .index.tolist()
)

# One fixed color per topic, shared by the charts of all banks
unique_topics = sorted(df_qa_pairs['final_topic'].unique())
topic_colors = dict(zip(unique_topics, get_neutral_pastel_palette(len(unique_topics))))

# --- Loop per bank ---
for bank in df_qa_pairs['bank'].unique():
    bank_df = df_qa_pairs[df_qa_pairs['bank'] == bank]

    # Per (quarter, topic): number of questions and mean answer sentiment
    topic_summary = (
        bank_df.groupby(['quarter', 'final_topic'])
        .agg(count=('final_topic', 'size'), avg_sentiment=('sentiment', 'mean'))
        .reset_index()
    )

    # Share of each topic among the questions of its quarter
    topic_summary['total'] = topic_summary.groupby('quarter')['count'].transform('sum')
    topic_summary['proportion'] = topic_summary['count'] / topic_summary['total']
    # Ordered categorical so quarters follow `period_order` rather than string order
    topic_summary['quarter'] = pd.Categorical(topic_summary['quarter'], categories=period_order, ordered=True)

    pivot_df = topic_summary.pivot(index='quarter', columns='final_topic', values='proportion').fillna(0)
    sentiment_lookup = topic_summary.set_index(['quarter', 'final_topic'])['avg_sentiment']

    plt.figure(figsize=(14, 8))
    # Legend entries, one per topic that was actually drawn
    topic_patches = {}

    # Draw the stacked bars manually: one segment per (quarter, topic)
    for quarter in pivot_df.index:
        bottom = 0
        for topic in pivot_df.columns:
            height = pivot_df.loc[quarter, topic]
            if height > 0:
                sentiment = sentiment_lookup.get((quarter, topic), 0)
                face_color = topic_colors[topic]
                label_color = sentiment_text_color(sentiment)

                plt.bar(quarter, height, bottom=bottom, color=face_color)
                # Only label segments taller than 3% to keep the text readable
                if height > 0.03:
                    plt.text(quarter, bottom + height / 2, f"{sentiment:+.2f}",
                             ha='center', va='center', fontsize=8, color=label_color, fontweight='bold')

                bottom += height
                if topic not in topic_patches:
                    topic_patches[topic] = mpatches.Patch(color=face_color, label=topic)

    plt.title(f"{bank} – Topic Proportions per Quarter\n(Topic's question of the analyst vs the bank sentimental response)")
    plt.ylabel("Proportion of Questions")
    plt.xticks(rotation=45)
    plt.legend(handles=topic_patches.values(), title="Topics", bbox_to_anchor=(1.01, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Risk Mapping ---
# Maps a final topic to a supervisory risk category. Topics without an entry here
# (e.g. 'Profitability' and the "BERTopic ..." fallback topics) end up as 'Other'.
risk_map = {
    'Capital Adequacy': 'Capital Risk',
    'Liquidity Risk': 'Liquidity Risk',
    'Asset Quality and Credit Risk': 'Credit Risk',
    'Interest Rate Risk': 'Market Risk',
    'Market and Volatility Risk': 'Market Risk',
    'Regulatory & Compliance Risk': 'Regulatory Risk',
    'Operational Risk': 'Operational Risk',
    'Macroeconomic Risk': 'Macro Risk',
    'ESG and Reputation Risk': 'Reputation Risk',
    'Strategic and Business Model Risk': 'Business Model Risk',
    'Legal Risk': 'Legal Risk'
}
df_qa_pairs['supervisory_risk'] = df_qa_pairs['final_topic'].map(risk_map).fillna('Other')

# --- Convert quarters to datetime ---
# Month-day of the last day of each quarter
quarter_month_map = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}

def quarter_to_date(qr):
    """
    Convert a reporting-period label such as "Q1_2023" into its quarter-end date.

    Args:
        qr: Label of the form "<quarter>_<year>". A quarter part that is not
            Q1-Q4 falls back to 12-31.

    Returns:
        pandas Timestamp of the quarter end, or pd.NaT when the label is
        missing or cannot be parsed.
    """
    try:
        q, y = qr.split('_')
        return pd.to_datetime(f"{y}-{quarter_month_map.get(q, '12-31')}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. None or NaN);
        # ValueError: wrong number of "_" parts or an unparseable date.
        # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return pd.NaT

# Derived date per reporting period, used only to order the quarters chronologically
df_qa_pairs['date_of_call'] = df_qa_pairs['reporting_period'].apply(quarter_to_date)
df_qa_pairs['quarter'] = df_qa_pairs['reporting_period']

# --- Sentiment aggregation ---
# Mean sentiment and number of Q&A pairs per (risk category, quarter)
risk_summary = (
    df_qa_pairs.groupby(['supervisory_risk', 'quarter', 'date_of_call'])['sentiment']
    .agg(['mean', 'count'])
    .reset_index()
)

# --- Create a categorical order for quarter based on date ---
# Quarters whose date could not be derived (NaT) are left out of the ordering
quarter_order = (
    risk_summary.dropna(subset=['date_of_call'])
    .drop_duplicates(subset=['quarter'])
    .sort_values('date_of_call')['quarter']
    .tolist()
)

risk_summary['quarter'] = pd.Categorical(risk_summary['quarter'], categories=quarter_order, ordered=True)

# --- Plotting each risk separately with reporting_period on x-axis ---
sns.set(style="whitegrid")
unique_risks = risk_summary['supervisory_risk'].unique()

# One line chart per risk category
for risk in unique_risks:
    plt.figure(figsize=(10, 4))
    risk_data = risk_summary[risk_summary['supervisory_risk'] == risk].sort_values('quarter')
    sns.lineplot(data=risk_data, x='quarter', y='mean', marker='o')
    plt.title(f"Average Sentiment Over Time – {risk}")
    plt.ylabel("Mean Sentiment")
    plt.xlabel("Reporting Period")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


### Sentiment Distribution per Bank per Quarter

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# --- Step 1: Create sentiment_label ---
def classify_sentiment(score):
    """
    Turn a numeric sentiment score into a label.

    Scores of 0.2 or more are "Positive", scores of -0.2 or less are
    "Negative", and everything in between is "Neutral".
    """
    label = "Neutral"
    if score >= 0.2:
        label = "Positive"
    elif score <= -0.2:
        label = "Negative"
    return label

# Bucket the numeric sentiment score of each answer into Positive / Negative / Neutral
df_qa_pairs['sentiment_label'] = df_qa_pairs['sentiment'].apply(classify_sentiment)

# --- Step 2: Ensure correct time ordering ---
# Month-day of the last day of each quarter
quarter_month_map = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}

def quarter_to_date(qr):
    """
    Convert a reporting-period label such as "Q1_2023" into its quarter-end date.

    Args:
        qr: Label of the form "<quarter>_<year>". A quarter part that is not
            Q1-Q4 falls back to 12-31.

    Returns:
        pandas Timestamp of the quarter end, or pd.NaT when the label is
        missing or cannot be parsed.
    """
    try:
        q, y = qr.split('_')
        return pd.to_datetime(f"{y}-{quarter_month_map.get(q, '12-31')}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. None or NaN);
        # ValueError: wrong number of "_" parts or an unparseable date.
        # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return pd.NaT

# Derived date per reporting period, used only to order the quarters chronologically
df_qa_pairs['date_of_call'] = df_qa_pairs['reporting_period'].apply(quarter_to_date)
df_qa_pairs['quarter'] = df_qa_pairs['reporting_period']
# Make 'quarter' an ordered categorical whose categories are sorted by derived date.
# NOTE: from here on df_qa_pairs['quarter'] is categorical, which later groupbys on it will see.
df_qa_pairs['quarter'] = pd.Categorical(
    df_qa_pairs['quarter'],
    categories=sorted(df_qa_pairs['quarter'].dropna().unique(), key=quarter_to_date),
    ordered=True
)

# --- Step 3: Plot sentiment per bank ---
for bank in df_qa_pairs['bank'].unique():
    subset = df_qa_pairs[df_qa_pairs['bank'] == bank]
    if subset.empty:
        continue

    # Rows = quarter, columns = sentiment label, values = number of answers
    sentiment_trend = (
        subset.groupby('quarter')['sentiment_label']
        .value_counts()
        .unstack(fill_value=0)
        .sort_index()
    )

    sentiment_trend.plot(
        kind='bar',
        stacked=True,
        figsize=(10, 5),
        colormap='coolwarm'
    )
    plt.title(f"Sentiment Distribution Over Time – {bank}")
    plt.ylabel("Sentence Count")
    plt.xlabel("Reporting Period")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Preview the first rows (rich notebook display instead of a plain-text print)
df_qa_pairs.head()

### 4. Topic Drift (KL Divergence Between Quarters) 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define KL divergence
def kl_divergence(p, q):
    """
    Kullback-Leibler divergence sum(p * log(p / q)) between two distributions.

    A small constant (1e-10) is added to every entry of both inputs first, so
    zero entries do not lead to log(0) or a division by zero.

    Args:
        p: Array-like distribution.
        q: Array-like distribution with the same length as `p`.

    Returns:
        The divergence of `p` from `q` as a scalar.
    """
    eps = 1e-10
    p_arr = np.asarray(p) + eps
    q_arr = np.asarray(q) + eps
    log_ratio = np.log(p_arr / q_arr)
    return np.sum(p_arr * log_ratio)

# Prepare KL divergence per bank
kl_results = []

for bank, bank_df in df_qa_pairs.groupby('bank'):
    # observed=True: when 'quarter' is categorical, keep only the quarters this
    # bank actually has. Otherwise unobserved quarters become all-zero rows,
    # their proportions become NaN (0/0) and so does the divergence.
    pivot = bank_df.groupby(['quarter', 'final_topic'], observed=True).size().unstack(fill_value=0)
    # Order the quarters chronologically explicitly, so consecutive rows are
    # consecutive periods even if 'quarter' is a plain string column.
    pivot = pivot.loc[sorted(pivot.index, key=quarter_to_date)]
    pivot_prop = pivot.div(pivot.sum(axis=1), axis=0)

    # Skip banks with < 2 quarters
    if len(pivot_prop) < 2:
        continue

    # Divergence of each quarter's topic mix from the previous quarter's
    for i in range(1, len(pivot_prop)):
        prev_dist = pivot_prop.iloc[i - 1]
        curr_dist = pivot_prop.iloc[i]
        kl_val = kl_divergence(prev_dist, curr_dist)
        kl_results.append({
            'bank': bank,
            'quarter': pivot_prop.index[i],
            'KL Divergence': kl_val
        })

# Create DataFrame (explicit columns so an empty result still has the expected schema)
kl_df = pd.DataFrame(kl_results, columns=['bank', 'quarter', 'KL Divergence'])

# --- Sort quarter categorically for proper order ---
# Month-day of the last day of each quarter
quarter_month_map = {'Q1': '03-31', 'Q2': '06-30', 'Q3': '09-30', 'Q4': '12-31'}
def quarter_to_date(qr):
    """
    Convert a reporting-period label such as "Q1_2023" into its quarter-end date.

    Args:
        qr: Label of the form "<quarter>_<year>". A quarter part that is not
            Q1-Q4 falls back to 12-31.

    Returns:
        pandas Timestamp of the quarter end, or pd.NaT when the label is
        missing or cannot be parsed.
    """
    try:
        q, y = qr.split('_')
        return pd.to_datetime(f"{y}-{quarter_month_map.get(q, '12-31')}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string (e.g. None or NaN);
        # ValueError: wrong number of "_" parts or an unparseable date.
        # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return pd.NaT

# Derive a date per quarter label and use it to build the chronological category order
kl_df['quarter_date'] = kl_df['quarter'].apply(quarter_to_date)
ordered_quarters = (
    kl_df[['quarter', 'quarter_date']]
    .drop_duplicates()
    .sort_values('quarter_date')['quarter']
    .tolist()
)
# Ordered categorical so the x-axis follows time rather than string order
kl_df['quarter'] = pd.Categorical(kl_df['quarter'], categories=ordered_quarters, ordered=True)

# --- Plot ---
# Grouped bars: one group per quarter, one bar per bank
plt.figure(figsize=(12, 6))
sns.barplot(
    data=kl_df,
    x='quarter',
    y='KL Divergence',
    hue='bank',
    palette='tab10'  # Use 'tab10', 'Set2', or 'Dark2' for clear distinctions
)

plt.title("Topic Distribution Drift Over Time by Bank", fontsize=14)
plt.xlabel("Quarter", fontsize=12)
plt.ylabel("KL Divergence", fontsize=12)
plt.xticks(rotation=45)
plt.legend(title='Bank', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Keep only the rows mapped to a predefined category (drop the "BERTopic ..." fallback topics).
# NOTE: this overwrites df_qa_pairs, so every cell below (including the exports) uses the filtered frame.
df_qa_pairs = df_qa_pairs[~df_qa_pairs['final_topic'].str.startswith("BERTopic")]

In [None]:
# --- 5. Peer Comparison ---
# Mean answer sentiment per bank (rows) and topic (columns)
peer_avg = (
    df_qa_pairs
    .groupby(['bank', 'final_topic'])['sentiment']
    .mean()
    .unstack()
)
# Diverging colormap centred at 0 so negative and positive sentiment are told apart
heatmap_ax = sns.heatmap(peer_avg, annot=True, center=0, cmap='RdYlGn', fmt=".2f")
heatmap_ax.set_title("Topic Sentiment by Bank")
plt.ylabel("Banks")
plt.xlabel("Topics")
plt.tight_layout()
plt.show()

In [None]:
# Preview the first rows (rich notebook display instead of a plain-text print)
df_qa_pairs.head()

In [None]:
# Export the Q&A pairs as CSV for the app. The path is anchored at ROOT_FPATH
# and the folder is created if missing, so the export does not depend on the
# current working directory or on the folder already existing.
app_data_dir = os.path.join(ROOT_FPATH, DATA_FOLDER, "app")
os.makedirs(app_data_dir, exist_ok=True)
df_qa_pairs.to_csv(os.path.join(app_data_dir, "data_qa_pairs.csv"), index=False)

In [None]:
# Export the Q&A pairs as parquet for the app. The path is anchored at
# ROOT_FPATH and the folder is created if missing, so the export does not
# depend on the current working directory or on the folder already existing.
app_data_dir = os.path.join(ROOT_FPATH, DATA_FOLDER, "app")
os.makedirs(app_data_dir, exist_ok=True)
df_qa_pairs.to_parquet(os.path.join(app_data_dir, "data_qa_pairs.parquet"), index=False)