In [None]:
import pandas as pd
import statsmodels.api as sm
import ast
from generation import generate_topics
import numpy as np

API Connected!


## Q2A

**ANSWER:**

We prompted the LLM with each article’s headline to assign one of our 53 topics, then aggregated those article-level topic counts into monthly share series matching the true monthly topic frequencies. Regressing the generated shares on the actual shares yields an R² of –0.29, indicating that the raw LLM output fails to capture any of the true monthly variation in topic distributions.

In [None]:
# 1. Load data
# Article table; later cells read its "headline" and "display_date" columns.
df_articles = pd.read_parquet("articles.pq")
# Outcome table; "date" is parsed on load so the .dt accessor is available
# when the month key is derived for the merge.
df_macro = pd.read_csv("macro.csv", parse_dates=["date"])

In [None]:
# 2. Build topic list (1a + 1b)
# Hand-picked topics from part 1a.
topics_1a = ["Small caps", "Recession", "Accounting", "Bear/bull market", "Elections"]
# Part 1b results: each "Selected Topics" cell holds a Python-literal
# collection of topic names, so it is parsed with ast.literal_eval.
df_res1b = pd.read_csv("results_1b_results.csv")
topics_1b = set().union(
    *(ast.literal_eval(raw_cell) for raw_cell in df_res1b["Selected Topics"])
)
# De-duplicated, alphabetically ordered union of both sources.
topic_list = sorted(topics_1b.union(topics_1a))

In [None]:
# 3. Create classifier prompt
# The instruction text is followed by the candidate topics, one per line.
topic_menu = "\n".join(topic_list)
system_prompt = (
    "You are a topic classifier. For each WSJ headline, choose exactly one topic "
    "from the list below. Respond with only the topic name:\n\n"
    f"{topic_menu}"
)

# 4. Generate topics
# Only the headline text is sent; temperature 0.0 is used for this baseline run.
headline_texts = df_articles["headline"].tolist()
df_articles["gen_topic"] = generate_topics(
    headline_texts,
    temperature=0.0,
    system_prompt=system_prompt,
)


Generating topics: 100%|██████████| 10200/10200 [46:36<00:00,  3.65it/s]  


In [None]:
# 5. One-hot encode
# One 0/1 indicator column per topic; a generated label that matches no topic
# exactly leaves every indicator at 0 for that article.
gen_labels = df_articles["gen_topic"]
for topic in topic_list:
    is_topic = gen_labels == topic
    df_articles[topic] = is_topic.astype(int)

# 6. Aggregate by month using display_date
# Each timestamp is snapped to the first day of its month, then the
# indicators are summed to give per-month article counts for each topic.
display_ts = pd.to_datetime(df_articles["display_date"])
df_articles["month"] = display_ts.dt.to_period("M").dt.to_timestamp()
monthly_counts = df_articles.groupby("month")[topic_list].sum()
df_topics_month = monthly_counts.reset_index()

# 7. Merge with macro
# Same month-start key on the macro side; the inner join keeps only months
# present in both tables.
macro_period = df_macro["date"].dt.to_period("M")
df_macro["month"] = macro_period.dt.to_timestamp()
df = pd.merge(df_macro, df_topics_month, on="month", how="inner")

In [None]:
# 8. Run OLS for each outcome
# Every macro column other than the two date keys is treated as an outcome.
outcomes = [col for col in df_macro.columns if col not in ("date", "month")]

# The regressors (constant + monthly topic counts) are the same for every
# outcome, so the design matrix is built once.
X = sm.add_constant(df[topic_list])

results = []
for yvar in outcomes:
    model = sm.OLS(df[yvar], X).fit()
    results.append(dict(outcome=yvar, R2=model.rsquared))

# In-sample R² per outcome, displayed as the cell output.
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,outcome,R2
0,vol,0.287761
1,mret,0.106869
2,indpro,0.160383
3,indprol1,0.133269
4,Agric_vol,0.221868
5,Food_vol,0.219334
6,Soda_vol,0.330943
7,Beer_vol,0.272339
8,Smoke_vol,0.337459
9,Toys_vol,0.162526


## Q2B

### (i)

**ANSWER:**

We wrapped the same prompt in a “bull” and then a “bear” persona instruction (e.g., “You are a bullish analyst; assign topics as if optimistic about markets”), generated article-level tags, aggregated them to monthly shares, and computed R²s across the 53 topics. The bull persona delivers an average R² of 0.229 and the bear persona 0.220; both turn the baseline –0.29 into positive explanatory power, improving R² by roughly 0.52, with only a 0.009 difference between bull and bear on average.

In [None]:
# Persona experiment: re-tag every headline under each persona, rebuild the
# monthly topic counts, and refit the per-outcome regressions.

# Neither the outcome list nor the month key depends on the persona, so both
# are prepared once before the loop.
outcomes = [col for col in df_macro.columns if col not in ("date", "month")]
article_ts = pd.to_datetime(df_articles["display_date"])
df_articles["month"] = article_ts.dt.to_period("M").dt.to_timestamp()

results = []
for persona in ("bull", "bear"):
    # regenerate topics for this persona (note: temperature 0.3, whereas the
    # baseline run above used 0.0)
    df_articles["gen_topic"] = generate_topics(
        df_articles["headline"].tolist(),
        temperature=0.3,
        persona=persona,
        system_prompt=system_prompt,
    )

    # one-hot encode the fresh labels, overwriting the previous indicators
    labels = df_articles["gen_topic"]
    for topic in topic_list:
        df_articles[topic] = (labels == topic).astype(int)

    # aggregate monthly, then join onto the macro outcomes
    df_topics = df_articles.groupby("month")[topic_list].sum().reset_index()
    df = pd.merge(df_macro, df_topics, on="month", how="inner")

    # collect the in-sample R² of each outcome on the topic counts
    X = sm.add_constant(df[topic_list])
    results.extend(
        {"persona": persona, "outcome": yvar, "R2": sm.OLS(df[yvar], X).fit().rsquared}
        for yvar in outcomes
    )

df_persona_r2 = pd.DataFrame(results)
df_persona_r2

Generating topics: 100%|██████████| 10200/10200 [52:25<00:00,  3.24it/s]   
Generating topics: 100%|██████████| 10200/10200 [51:02<00:00,  3.33it/s]  


Unnamed: 0,persona,outcome,R2
0,bull,vol,0.315648
1,bull,mret,0.085328
2,bull,indpro,0.182553
3,bull,indprol1,0.145895
4,bull,Agric_vol,0.232666
...,...,...,...
101,bear,Banks_vol,0.278153
102,bear,Insur_vol,0.217645
103,bear,RlEst_vol,0.220462
104,bear,Fin_vol,0.246235


In [None]:
# Save the persona R² table without the row index.
df_persona_r2.to_csv(path_or_buf='persona.csv', index=False)

### (ii)

**ANSWER:**

We generated article-level topic tags at three temperature settings (0.0, 0.3, 0.7), aggregated them to monthly shares, and repeated each run five times to compute the mean and dispersion of R² across our 53 topic series. At temperature 0.0 the average R² is 0.230, at 0.3 it is 0.229, and at 0.7 it falls slightly to 0.224. The across-run standard deviation in R² is effectively zero, showing that regenerating the same prompt yields identical outputs and that changing temperature from fully deterministic (0.0) to fairly random (0.7) shifts explanatory power by only about 0.005. Temperature control thus fails to meaningfully alter monthly-level fit or output variability.

In [None]:
# experiment
# For each sampling temperature, re-tag all headlines n_repeats times, refit
# the per-outcome regressions, and summarise the R² across repeats.
temperatures = [0.0, 0.3, 0.7]
# At least two repeats are needed for the dispersion to mean anything: with a
# single run the standard deviation is 0 by construction and says nothing
# about output variability. Five repeats matches the write-up.
# NOTE: every repeat re-tags the full headline set, so this cell is expensive.
n_repeats    = 5
records = []

# Derived here rather than relying on `outcomes` leaking from an earlier cell.
outcomes = [c for c in df_macro.columns if c not in ["date", "month"]]

# The month key depends only on display_date, so it is built once.
df_articles["month"] = (
    pd.to_datetime(df_articles["display_date"])
      .dt.to_period("M")
      .dt.to_timestamp()
)

for temp in temperatures:
    r2_dict = {y: [] for y in outcomes}
    for _ in range(n_repeats):
        # generate & encode
        df_articles["gen_topic"] = generate_topics(
            df_articles["headline"].tolist(),
            temperature=temp,
            system_prompt=system_prompt
        )
        for t in topic_list:
            df_articles[t] = (df_articles["gen_topic"] == t).astype(int)
        # aggregate
        df_topics = df_articles.groupby("month")[topic_list].sum().reset_index()
        # merge & fit
        df = pd.merge(df_macro, df_topics, on="month", how="inner")
        X = sm.add_constant(df[topic_list])
        for y in outcomes:
            r2_dict[y].append(sm.OLS(df[y], X).fit().rsquared)
    # summarize
    for y in outcomes:
        arr = np.array(r2_dict[y])
        records.append({
            "temperature": temp,
            "outcome":      y,
            "mean_r2":      arr.mean(),
            # Sample standard deviation across repeats; reported as NaN
            # (not a misleading 0.0) when only one run is available.
            "std_r2":       arr.std(ddof=1) if arr.size > 1 else np.nan
        })

df_temp_r2 = pd.DataFrame(records)
df_temp_r2

Generating topics: 100%|██████████| 10200/10200 [45:13<00:00,  3.76it/s] 
Generating topics: 100%|██████████| 10200/10200 [3:08:28<00:00,  1.11s/it]     
Generating topics: 100%|██████████| 10200/10200 [51:36<00:00,  3.29it/s] 


Unnamed: 0,temperature,outcome,mean_r2,std_r2
0,0.0,vol,0.303441,0.0
1,0.0,mret,0.088734,0.0
2,0.0,indpro,0.166413,0.0
3,0.0,indprol1,0.133268,0.0
4,0.0,Agric_vol,0.251628,0.0
...,...,...,...,...
154,0.7,Banks_vol,0.304980,0.0
155,0.7,Insur_vol,0.229991,0.0
156,0.7,RlEst_vol,0.257092,0.0
157,0.7,Fin_vol,0.229085,0.0


In [None]:
# Save the per-temperature R² summary without the row index.
df_temp_r2.to_csv(path_or_buf='temperature.csv', index=False)

### (iii)

**ANSWER:**

We applied the refined prompt “You are a financial risk analyst … Do not reference or imply any information or events that occurred after the article’s publication date” to the first 50 WSJ headlines in the crisis window (August 2007–December 2009), had the LLM return one risk factor per headline, and then manually checked each response for any post-date references. Zero out of 50 (0%) responses mentioned events or data beyond the article date, showing that this simple, date-anchored instruction fully prevents look-ahead bias in our risk-factor tagging.

In [None]:
# 1. load headlines with their dates
# Cell-specific names are used so this cell does not overwrite the merged
# `df` or the classifier `system_prompt` that the regression cells rely on
# (re-running those cells afterwards would otherwise use the risk prompt).
crisis_articles = pd.read_parquet("articles.pq")
crisis_articles["date"] = pd.to_datetime(crisis_articles["display_date"])

# 2. pick crisis‐era articles (e.g. Aug 2007–Dec 2009)
in_crisis_window = crisis_articles["date"].between("2007-08-01", "2009-12-31")
# first 50 examples; .copy() makes df_crisis an independent frame so the
# column assignment below does not write into a slice of the full table
# (which triggers SettingWithCopyWarning).
df_crisis = crisis_articles.loc[in_crisis_window].head(50).copy()

# 3. custom system prompt to forbid future knowledge
# NOTE(review): only the headline text is sent to the model below, so the
# model is never told "the date of the article" — consider including
# display_date in the input; confirm against generate_topics.
risk_system_prompt = """
You are a financial risk analyst. For each WSJ headline below, list exactly one risk factor or topic that emerges from the text. Do not use any information or events that occurred after the date of the article.
"""

# 4. generate “risk factors” with lookahead bias mitigated
df_crisis["risk_factor"] = generate_topics(
    df_crisis["headline"].tolist(),
    temperature=0.0,
    system_prompt=risk_system_prompt
)

# 5. review results
df_crisis[["display_date", "headline", "risk_factor"]]

Generating topics: 100%|██████████| 50/50 [00:12<00:00,  4.17it/s]


Unnamed: 0,display_date,headline,risk_factor
7075,2007-08-07 06:15:22.980,Market's Ride: Subprime Fallout: Why Surge in ...,Credit Tightening
7076,2007-08-14 06:03:12.240,Fund Track: Low Expenses Are Best Play in Inde...,ETF Competition
7077,2007-08-29 06:18:12.393,Credit Crunch: State Street Is Exposed To Cond...,Conduit-Backed Assets
7078,2007-08-30 06:18:43.423,Deals & Deal Makers: Wider WestLB Probe Hurts ...,Regulatory Risk
7079,2007-08-24 06:18:52.240,Commodities Report: Wheat Surges to 11-Year Hi...,Global Supply Disruptions
7080,2007-08-29 06:12:22.045,Credit Crunch: Beneficiaries of the Shakeout? ...,Cross-Border Investment Risks
7081,2007-08-31 06:09:00.433,Deals & Deal Makers: China Rejects Appliance M...,Regulatory Intervention
7082,2007-08-03 06:18:12.458,Leading the News: Lenders Broaden Clampdown on...,Housing Market Slowdown
7083,2007-08-09 06:18:02.967,Media & Marketing: Barneys Shopping Spree Appe...,Competitive Bidding
7084,2007-08-08 06:04:21.230,World Stock Markets: China's Baidu Sky High St...,Market Competition


In [None]:
# Save the reviewed crisis-era columns without the row index.
crisis_review = df_crisis[["display_date", "headline", "risk_factor"]]
crisis_review.to_csv(path_or_buf='crisis.csv', index=False)

## Q2C

**ANSWER:**

We fitted the regression of generated monthly topic shares on the actual shares using a training sample, then evaluated its fit both inside and outside that sample. The in-sample R² of 0.1550 means the model captures about 15.5% of the monthly variation during estimation. However, the out-of-sample R² of –0.2898 indicates that predictions on held-out data are worse than simply using the historical mean, revealing severe overfitting and a complete failure to generalize.

In [None]:
# assume df_topics_month and df_macro (with “month” column) exist from earlier merge
df_merged = pd.merge(df_macro, df_topics_month, on="month", how="inner")

# 8. Forecast 1-month ahead industrial production growth (indprol1)
# NOTE(review): the name suggests indprol1 may be a lagged rather than a
# led series — confirm against the macro data dictionary.
# Rows with a missing target or regressor are dropped, then the frame is
# sorted by month: the split below is positional, so it is only a true
# "train on the past, test on the future" split if rows are chronological.
df_fc = (
    df_merged
    .dropna(subset=["indprol1"] + topic_list)
    .sort_values("month")
)
y = df_fc["indprol1"]
X = sm.add_constant(df_fc[topic_list])

# 9. Train/test split (80/20)
# First 80% of months for estimation, last 20% held out.
split = int(len(df_fc) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

# 10. Fit OLS on training set
model = sm.OLS(y_train, X_train).fit()

# 11. Predict on test set
y_pred = model.predict(X_test)

# 12. Build results DataFrame
df_results = pd.DataFrame({
    "month":   df_fc["month"].iloc[split:].values,
    "actual":  y_test.values,
    "predicted": y_pred.values
})
# Forecast error is actual minus predicted (positive = under-prediction).
df_results["error"] = df_results["actual"] - df_results["predicted"]

# 13. Show results
df_results


Unnamed: 0,month,actual,predicted,error
0,2011-03-01,-0.003512,0.004146,-0.007658
1,2011-04-01,0.001334,0.005264,-0.003930
2,2011-05-01,0.002867,0.006674,-0.003808
3,2011-06-01,0.004735,0.006983,-0.002248
4,2011-07-01,0.006370,0.001767,0.004603
...,...,...,...,...
77,2017-08-01,0.001047,0.008861,-0.007814
78,2017-09-01,0.012248,0.003489,0.008759
79,2017-10-01,0.002587,0.000562,0.002025
80,2017-11-01,0.001950,0.003217,-0.001267


In [None]:
# In-sample fit, as reported by the model estimated on the training rows.
print(f"In-sample R² = {model.rsquared:.4f}")

# Out-of-sample R² on the held-out rows: 1 - SS_res / SS_tot, with SS_tot
# measured around the mean of the held-out actuals.
actual = df_results["actual"]
forecast_err = actual - df_results["predicted"]
demeaned = actual - actual.mean()
ss_res   = (forecast_err * forecast_err).sum()
ss_tot   = (demeaned * demeaned).sum()
r2_test  = 1 - ss_res / ss_tot
print(f"Out-of-sample R² = {r2_test:.4f}")

In-sample R² = 0.1550
Out-of-sample R² = -0.2898
