<a href="https://colab.research.google.com/github/tongnet/fin7047_2026spr/blob/main/hw3_solutions" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the firm-level data set (Colab upload location).
file_path = "/content/simulated_firm_data.csv"
df = pd.read_csv(file_path)

# Keep every numeric column, whatever its integer/float width, then drop rows
# that have a missing value in any of those columns (PCA cannot handle NaNs).
X = df.select_dtypes(include="number").dropna()

# Standardize each column to zero mean / unit variance so that no variable
# dominates the components merely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Run PCA, keeping the first three components
pca = PCA(n_components=3)
pcs = pca.fit_transform(X_scaled)

# Create DataFrame for PCs.
# Reuse X's index so every PC row keeps the label of the row it was computed
# from; without it, rows removed by dropna() would shift the labels and make
# pc_df impossible to join back onto df.
pc_df = pd.DataFrame(pcs, columns=["PC1", "PC2", "PC3"], index=X.index)

# Print the first and the last few rows
print("Principal Components (first 5 rows):")
print(pc_df.head())
print(pc_df.tail())

# Share of total variance captured by each retained component
print("\nExplained variance ratios:")
print(pca.explained_variance_ratio_)

# Total share of variance captured by the three components together
print("\nCumulative explained variance:")
print(pca.explained_variance_ratio_.sum())

Principal Components (first 5 rows):
        PC1       PC2       PC3
0  1.885322  1.895347 -0.158241
1  0.467251  1.799759 -0.197252
2  0.873079 -1.199182  1.252787
3  1.208380  0.334479  1.468426
4 -0.196309  0.271125 -0.771677
         PC1       PC2       PC3
95 -1.157671 -0.248921 -1.065813
96  2.178971  0.381476 -0.827152
97 -0.081869 -0.776206  1.100086
98  0.875799 -0.199454  0.021838
99  0.486240  1.367389  1.493126

Explained variance ratios:
[0.25068495 0.20897311 0.17250812]

Cumulative explained variance:
0.6321661847254972


In [4]:
# LDA topic modeling (5 topics) with basic preprocessing
# Works with: pip install scikit-learn

import re
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation

# -----------------------------
# 1) Input documents
#    Toy corpus of 15 one-sentence documents; each list element is
#    treated as one document by the vectorizer and the LDA model below.
# -----------------------------
docs = [
    "I enjoy eating burgers and fries on the weekend.",
    "Rising interest rates are slowing down economic growth.",
    "Pasta and sandwiches are popular lunch choices for many people.",
    "The central bank introduced new measures to control inflation.",
    "Advances in artificial intelligence are reshaping many industries.",
    "Financial markets reacted negatively to the latest economic news.",
    "Machine learning is improving how companies analyze large datasets.",
    "The global economy is struggling with higher energy prices.",
    "I like trying different types of pizza with various toppings.",
    "Government spending programs aim to support unemployed workers.",
    "Technology firms are investing heavily in AI research.",
    "Stock prices fell sharply after the earnings announcement.",
    "Automation is changing the future of work across sectors.",
    "Economic uncertainty has reduced consumer confidence.",
    "New regulations were introduced to stabilize the financial system."
]

# -----------------------------
# 2) Preprocessing function
#    - lowercase
#    - remove non-letters
#    - remove stopwords
#    - drop tokens of one or two characters (so "AI" is discarded too)
# -----------------------------
# Copy scikit-learn's English stop-word collection into a set; preprocess()
# reads this module-level name for its membership checks.
stopwords = set(ENGLISH_STOP_WORDS)

def preprocess(text: str) -> str:
    """Normalize one document into a space-separated string of content words.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens that appear in the module-level ``stopwords`` set, or that are one
    or two characters long, are discarded (this also removes short acronyms
    such as "AI").

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens, in their original order, joined by single
        spaces. An empty string if nothing survives.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
    kept = []
    for word in letters_only.split():
        # Skip very short tokens and stop words.
        if len(word) <= 2 or word in stopwords:
            continue
        kept.append(word)
    return " ".join(kept)

# Clean every raw document before counting terms.
docs_clean = list(map(preprocess, docs))

# -----------------------------
# 3) Bag-of-words document-term matrix
#    - ngram_range=(1, 2): count single words and adjacent word pairs
#    - min_df=1: keep terms that occur in just one document (tiny corpus)
# -----------------------------
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X = vectorizer.fit_transform(docs_clean)
vocab = vectorizer.get_feature_names_out()

# -----------------------------
# 4) Fit a 5-topic LDA model
#    - batch learning with a fixed seed so reruns give the same topics
# -----------------------------
n_topics = 5
lda = LatentDirichletAllocation(
    n_components=n_topics, learning_method="batch", random_state=42
)
lda.fit(X)

# -----------------------------
# 5) Display topics
# -----------------------------
def print_topics(model, feature_names, top_n=8):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            holds one topic's weights over the vocabulary.
        feature_names: Indexable sequence mapping a column position in
            ``components_`` to its term.
        top_n: Number of terms to show per topic.

    Output:
        One line per topic, preceded by a blank line, with topics numbered
        from 1 and terms ordered from highest to lowest weight.
    """
    for topic_no, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to rank the heaviest terms first.
        ranked = weights.argsort()[::-1]
        labels = ", ".join(feature_names[j] for j in ranked[:top_n])
        print(f"\nTopic {topic_no}: {labels}")

print("=== Top terms per topic ===")
print_topics(lda, vocab, top_n=8)

# -----------------------------
# 6) Per-document topic mixture
#    doc_topic has one row per document and one column per topic.
# -----------------------------
doc_topic = lda.transform(X)

print("\n=== Dominant topic per document ===")
for doc_no, (text, mixture) in enumerate(zip(docs, doc_topic), start=1):
    # The dominant topic is the one with the largest share for this document.
    best = mixture.argmax()
    print(f"Doc {doc_no:02d}: Topic {best + 1} (p={mixture[best]:.3f}) | {text}")


=== Top terms per topic ===

Topic 1: economic, unemployed, workers, spending programs, spending, support, support unemployed, programs

Topic 2: financial, companies analyze, datasets, improving companies, learning improving, machine learning, improving, large datasets

Topic 3: prices, stock, stock prices, fell, earnings announcement, fell sharply, earnings, sharply earnings

Topic 4: economic news, financial markets, latest economic, negatively latest, negatively, markets reacted, markets, latest

Topic 5: introduced, new, types pizza, toppings, trying, trying different, types, pizza various

=== Dominant topic per document ===
Doc 01: Topic 4 (p=0.920) | I enjoy eating burgers and fries on the weekend.
Doc 02: Topic 2 (p=0.920) | Rising interest rates are slowing down economic growth.
Doc 03: Topic 5 (p=0.933) | Pasta and sandwiches are popular lunch choices for many people.
Doc 04: Topic 5 (p=0.943) | The central bank introduced new measures to control inflation.
Doc 05: Topic 4 (