# 📊 Bank Earnings Sentiment & Topic Analysis

This notebook analyzes sentence-level sentiment and latent topics in bank earnings call transcripts, then relates the discovered topics to regulatory mentions.

## 📥 Load Data

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# One row per transcript sentence; the cells below rely on the 'sentence',
# 'quarter' and 'year' columns.
DATA_PATH = "transcript_sentences.csv"
df = pd.read_csv(DATA_PATH)
df['quarter_year'] = df['quarter'] + " " + df['year'].astype(str)

# Remove empty rows: besides missing values, also drop sentences that are empty
# or whitespace-only, which would otherwise be scored as "neutral" and inflate
# every per-quarter count downstream.
has_text = df['sentence'].notna() & df['sentence'].astype(str).str.strip().ne("")
# .copy() gives later cells an independent frame to add columns to.
df = df.loc[has_text].copy()


## 😊 Sentiment Analysis

In [None]:

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# The VADER lexicon must be available locally before the analyzer is built.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return VADER's compound polarity score for ``text`` (coerced to str)."""
    return sia.polarity_scores(str(text))['compound']


def _sentiment_label(score):
    """Bucket a compound score: > 0.2 positive, < -0.2 negative, else neutral."""
    if score > 0.2:
        return 'positive'
    if score < -0.2:
        return 'negative'
    return 'neutral'


# Score every sentence, then derive a categorical label from the score.
df['sentiment_score'] = df['sentence'].apply(_compound_score)
df['sentiment_label'] = df['sentiment_score'].apply(_sentiment_label)


In [None]:

# Sentiment over time
# Count sentences per (quarter label, sentiment label); labels become columns.
sentiment_summary = df.groupby(['quarter_year', 'sentiment_label']).size().unstack(fill_value=0)

# groupby sorts the "<quarter> <year>" labels as plain strings, which interleaves
# years (e.g. "Q1 2022", "Q1 2023", "Q2 2022"). Reorder the rows by
# (year, quarter) so the x-axis actually reads as a timeline.
# Assumes the raw 'quarter' values sort in calendar order (e.g. "Q1" < "Q2").
quarter_order = (
    df[['year', 'quarter', 'quarter_year']]
    .dropna(subset=['quarter_year'])
    .drop_duplicates(subset='quarter_year')
    .sort_values(['year', 'quarter'])['quarter_year']
    .tolist()
)
sentiment_summary = sentiment_summary.loc[quarter_order]

sentiment_summary.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='coolwarm', title='Sentiment by Quarter')
plt.ylabel("Sentence Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 🧠 Topic Modeling (LDA)

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the sentences: English stop words removed and the
# vocabulary capped at 1000 terms.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
)
X = vectorizer.fit_transform(df['sentence'].astype(str))

# Five-topic LDA; the fixed random_state keeps the fit repeatable across runs.
lda = LatentDirichletAllocation(
    n_components=5,
    random_state=0,
)
topics = lda.fit_transform(X)

# Tag each sentence with the index of its highest-weighted topic.
dominant_topic = topics.argmax(axis=1)
df['topic'] = dominant_topic


In [None]:

# Show top words per topic
# Each row of lda.components_ holds one topic's weight for every vocabulary term.
feature_names = vectorizer.get_feature_names_out()
for topic_idx, term_weights in enumerate(lda.components_):
    # Indices sorted by descending weight, keeping the ten strongest terms.
    strongest = term_weights.argsort()[::-1][:10]
    top_words = [feature_names[term_id] for term_id in strongest]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")


In [None]:

# Topic distribution
# Count sentences per (quarter label, dominant topic); topics become columns.
topic_dist = df.groupby(['quarter_year', 'topic']).size().unstack(fill_value=0)

# groupby sorts the "<quarter> <year>" labels as plain strings, which interleaves
# years (e.g. "Q1 2022", "Q1 2023", "Q2 2022"). Reorder the rows by
# (year, quarter) so the bars appear in chronological order.
# Assumes the raw 'quarter' values sort in calendar order (e.g. "Q1" < "Q2").
quarter_order = (
    df[['year', 'quarter', 'quarter_year']]
    .dropna(subset=['quarter_year'])
    .drop_duplicates(subset='quarter_year')
    .sort_values(['year', 'quarter'])['quarter_year']
    .tolist()
)
topic_dist = topic_dist.loc[quarter_order]

topic_dist.plot(kind='bar', stacked=True, colormap='tab10', figsize=(12,6), title="Topics by Quarter")
plt.ylabel("Sentence Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 🏛️ Regulatory Mentions by LDA Topic

In [None]:

# Regulatory mentions by topic
# Flag columns are those whose name starts with "Mentions "; a cell counts as a
# mention when its value is exactly "Yes".
reg_cols = [col for col in df.columns if str(col).startswith("Mentions ")]
if not reg_cols:
    # Fail early with a clear message instead of an obscure error inside seaborn.
    raise ValueError('No columns starting with "Mentions " found in df; nothing to map to topics.')
df['topic'] = df['topic'].astype(int)

# Number of "Yes" flags per regulatory column within each topic
# (vectorised groupby-sum instead of a per-group Python lambda).
topic_reg_summary = (
    df[reg_cols]
    .eq("Yes")
    .groupby(df["topic"])
    .sum()
    .sort_index()
)

# Normalize by row for proportion. A topic with no mentions at all has a zero
# row total; show it as 0.00 rather than a blank (NaN) heatmap cell.
row_totals = topic_reg_summary.sum(axis=1)
topic_reg_percent = (
    topic_reg_summary
    .div(row_totals.where(row_totals > 0), axis=0)
    .fillna(0.0)
)

# Heatmap of regulatory-topic associations
# (seaborn and matplotlib are already imported in the notebook's first cell).
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(topic_reg_percent, annot=True, cmap="Blues", fmt=".2f", ax=ax)
# No emoji in the title: matplotlib's default font has no glyph for it, which
# produces a "Glyph missing from font" warning and an empty box in the figure.
ax.set_title("Regulatory Theme Association by Topic")
ax.set_ylabel("LDA Topic")
ax.set_xlabel("Regulatory Phrase")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
