# Market Sentiment Model
End-to-end notebook that builds a daily sentiment index from provided headlines, aligns it with price data, and produces trade signals ready to combine with Jake and Malcolm's strategy.


## 1) Imports
Using pandas_datareader for market data and VADER with a small finance lexicon extension for sentiment.


In [61]:
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import yfinance as yf
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

pd.options.display.float_format = "{:.4f}".format


## 2) Config
Adjust ticker, dates, thresholds, and paths as needed. News is loaded from the provided CSV in `Data/news_headlines.csv`.


In [62]:

# Core parameters
TICKER = "SPY"  # proxy for broad US market
START_DATE = "2008-01-01"
END_DATE = "2024-12-31"

# Sentiment controls
POS_THRESHOLD = 0.20
NEG_THRESHOLD = -0.20
MIN_HEADLINES = 3
ROLLING_Z_WINDOW = 30  # trading days

# Data locations
NEWS_PATH = Path("../../Data/news_headlines.csv")


## 3) Price data via yfinance (with optional CSV cache)
Uses yfinance to avoid pandas_datareader/distutils issues. If network blocked, point to a cached CSV with date/close.


In [None]:


def fetch_prices(ticker: str, start: str, end: str, cached_csv: Path | None = None) -> pd.DataFrame:
    if cached_csv is not None and cached_csv.exists():
        px = pd.read_csv(cached_csv, parse_dates=["date"])
        if "close" not in px.columns:
            raise ValueError("Cached price file must include 'date' and 'close' columns.")
        return px[["date", "close"]].sort_values("date").reset_index(drop=True)

    start_dt = pd.to_datetime(start)
    end_dt = pd.to_datetime(end)
    px = yf.download(ticker, start=start_dt, end=end_dt, progress=False)

    if px.empty:
        raise ValueError("yfinance returned no data. Provide cached_csv with date/close columns.")

    # If MultiIndex columns (e.g. fields × tickers), select this ticker
    if isinstance(px.columns, pd.MultiIndex):
        # typical yfinance layout: level 0 = field, level 1 = ticker
        if ticker in px.columns.get_level_values(1):
            px = px.xs(ticker, axis=1, level=1)
        else:
            # fallback: just take the first ticker if something is weird
            px = px.xs(px.columns.levels[1][0], axis=1, level=1)

    px = px.rename(columns=str.lower)

    # Populate close from available candidates
    if "close" not in px.columns and "adj close" in px.columns:
        px["close"] = px["adj close"]

    if "close" not in px.columns:
        # Try capitalized variants before giving up
        for cand in ["Close", "Adj Close"]:
            if cand in px.columns:
                px["close"] = px[cand]
                break

    if "close" not in px.columns:
        raise ValueError(f"Price data missing 'close' column after download. Columns: {list(px.columns)}")

    prices = px[["close"]].reset_index().rename(columns={"Date": "date", "index": "date"})
    prices["date"] = pd.to_datetime(prices["date"])
    return prices

prices = fetch_prices(TICKER, START_DATE, END_DATE)
prices.tail()


  px = yf.download(ticker, start=start_dt, end=end_dt, progress=False)


Price,date,close
4273,2024-12-23,589.5244
4274,2024-12-24,596.0769
4275,2024-12-26,596.1167
4276,2024-12-27,589.8416
4277,2024-12-30,583.1106


## 4) Load headline data
The CSV from `Data/news_headlines.csv` is expected to have at least columns `Title` and `Date`. Any `CP`/price column is kept for reference but not required.


In [65]:

def load_headlines(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Standardise column names
    rename_map = {"Title": "headline", "Date": "date"}
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    if "date" not in df.columns or "headline" not in df.columns:
        raise ValueError("Headline data must include 'Date' and 'Title' columns.")

    df["date"] = pd.to_datetime(df["date"])
    df["headline"] = df["headline"].astype(str).str.strip()
    df = df.dropna(subset=["date", "headline"])
    return df


headlines = load_headlines(NEWS_PATH)
headlines.head()


Unnamed: 0,headline,date,CP
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16
2,2008 predictions for the S&P 500,2008-01-02,1447.16
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18


## 5) Sentiment model with finance slang extension
Extends VADER with a few retail/finance terms to improve polarity detection.


In [66]:

def build_analyzer() -> SentimentIntensityAnalyzer:
    sia = SentimentIntensityAnalyzer()
    finance_lexicon = {
        "mooning": 3.2,
        "rekt": -3.4,
        "bagholder": -2.4,
        "bagholders": -2.4,
        "diamond hands": 2.2,
        "paper hands": -2.0,
        "buy the dip": 1.8,
        "dead cat": -2.1,
        "rocket": 2.4,
        "to the moon": 2.6,
    }
    sia.lexicon.update(finance_lexicon)
    return sia


sia = build_analyzer()


## 6) Score headlines
Compute VADER compound scores per headline.


In [67]:

def score_headlines(df: pd.DataFrame, analyzer: SentimentIntensityAnalyzer) -> pd.DataFrame:
    scored = df.copy()
    scored["compound"] = scored["headline"].apply(lambda txt: analyzer.polarity_scores(txt)["compound"])
    return scored


scored_headlines = score_headlines(headlines, sia)
scored_headlines.head()


Unnamed: 0,headline,date,CP,compound
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16,0.0
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16,-0.2732
2,2008 predictions for the S&P 500,2008-01-02,1447.16,0.0
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16,0.0
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18,0.6249


## 7) Daily aggregation
Capture mean, extremes, count, and an optional rolling z-score for the mean sentiment.


In [68]:

def aggregate_daily(df: pd.DataFrame, window: int = ROLLING_Z_WINDOW) -> pd.DataFrame:
    grouped = (
        df.groupby("date")["compound"]
        .agg(["mean", "max", "min", "count"])
        .reset_index()
        .rename(columns={
            "mean": "compound_mean",
            "max": "compound_max",
            "min": "compound_min",
            "count": "headline_count",
        })
    )

    # Collect all headlines per day (single string, separated by ||)
    headline_text = (
        df.groupby("date")["headline"]
        .apply(lambda s: " || ".join(s.astype(str)))
        .reset_index()
    )
    grouped = grouped.merge(headline_text, on="date", how="left")

    def last_point_z(series: pd.Series) -> float:
        if series.shape[0] < 5:
            return np.nan
        std = series.std(ddof=0)
        if std == 0:
            return 0.0
        return (series.iloc[-1] - series.mean()) / std

    grouped["compound_z"] = grouped["compound_mean"].rolling(window, min_periods=5).apply(last_point_z, raw=False)
    return grouped


daily_sentiment = aggregate_daily(scored_headlines)
print(daily_sentiment.head())


        date  compound_mean  compound_max  compound_min  headline_count  \
0 2008-01-02        -0.0911        0.0000       -0.2732               3   
1 2008-01-03         0.0000        0.0000        0.0000               1   
2 2008-01-07         0.6249        0.6249        0.6249               1   
3 2008-01-09         0.2048        0.6597       -0.2500               2   
4 2008-01-10         0.0000        0.0000        0.0000               1   

                                            headline  compound_z  
0  JPMorgan Predicts 2008 Will Be "Nothing But Ne...         NaN  
1  U.S. Stocks Higher After Economic Data, Monsan...         NaN  
2  U.S. Stocks Climb As Hopes Increase For More F...         NaN  
3  How Investing in Intangibles -- Like Employee ...         NaN  
4  U.S. Stocks Zigzag Higher As Bernanke Speech S...     -0.5737  


## 8) Signal construction
Build a daily sentiment score that blends average and extremes, then convert to buy/sell/hold using thresholds. Filters out days with too few headlines or conflicted sentiment.


In [69]:

def construct_signals(df: pd.DataFrame) -> pd.DataFrame:
    signals = df.copy()
    signals["final_score"] = (
        signals["compound_mean"]
        + 0.25 * signals["compound_max"]
        + 0.25 * signals["compound_min"]
    )

    signals["too_few_headlines"] = signals["headline_count"] < MIN_HEADLINES
    signals["conflicting"] = (signals["compound_max"] > 0) & (signals["compound_min"] < 0)

    def classify(row):
        if row["too_few_headlines"] or row["conflicting"]:
            return "hold"
        if row["final_score"] >= POS_THRESHOLD:
            return "buy"
        if row["final_score"] <= NEG_THRESHOLD:
            return "sell"
        return "hold"

    signals["sentiment_signal"] = signals.apply(classify, axis=1)
    return signals


sentiment_signals = construct_signals(daily_sentiment)
print(sentiment_signals.head())

out_path = Path("headline_compound.csv")
sentiment_signals.to_csv(out_path, index=False)
print(f"Saved daily sentiment with signals and headlines to {out_path}")


        date  compound_mean  compound_max  compound_min  headline_count  \
0 2008-01-02        -0.0911        0.0000       -0.2732               3   
1 2008-01-03         0.0000        0.0000        0.0000               1   
2 2008-01-07         0.6249        0.6249        0.6249               1   
3 2008-01-09         0.2048        0.6597       -0.2500               2   
4 2008-01-10         0.0000        0.0000        0.0000               1   

                                            headline  compound_z  final_score  \
0  JPMorgan Predicts 2008 Will Be "Nothing But Ne...         NaN      -0.1594   
1  U.S. Stocks Higher After Economic Data, Monsan...         NaN       0.0000   
2  U.S. Stocks Climb As Hopes Increase For More F...         NaN       0.9374   
3  How Investing in Intangibles -- Like Employee ...         NaN       0.3073   
4  U.S. Stocks Zigzag Higher As Bernanke Speech S...     -0.5737       0.0000   

   too_few_headlines  conflicting sentiment_signal  
0        

## 9) Align sentiment with SPY returns (no look-ahead)
Merge daily sentiment with SPY prices, compute returns, and shift signals/scores by one day for validation.


In [71]:

def align_sentiment_returns(sentiment: pd.DataFrame, prices: pd.DataFrame) -> pd.DataFrame:
    merged = prices.merge(sentiment, on="date", how="left").sort_values("date")
    merged["spy_return"] = merged["close"].pct_change()

    # shift sentiment one day forward to avoid look-ahead
    for col in ["final_score", "sentiment_signal"]:
        merged[f"{col}_lag1"] = merged[col].shift(1)

    return merged


merged_df = align_sentiment_returns(sentiment_signals, prices)
merged_df.tail()
merged_df.head()


Unnamed: 0,date,close,compound_mean,compound_max,compound_min,headline_count,headline,compound_z,final_score,too_few_headlines,conflicting,sentiment_signal,spy_return,final_score_lag1,sentiment_signal_lag1
0,2008-01-02,104.0849,-0.0911,0.0,-0.2732,3.0,"JPMorgan Predicts 2008 Will Be ""Nothing But Ne...",,-0.1594,False,False,hold,,,
1,2008-01-03,104.0347,0.0,0.0,0.0,1.0,"U.S. Stocks Higher After Economic Data, Monsan...",,0.0,True,False,hold,-0.0005,-0.1594,hold
2,2008-01-04,101.4851,,,,,,,,,,,-0.0245,0.0,hold
3,2008-01-07,101.399,0.6249,0.6249,0.6249,1.0,U.S. Stocks Climb As Hopes Increase For More F...,,0.9374,True,False,hold,-0.0008,,
4,2008-01-08,99.7615,,,,,,,,,,,-0.0161,0.9374,hold


## 10) Correlation and simple regression
Evaluate whether sentiment relates to same-day or next-day SPY returns.


In [78]:

from math import sqrt
import numpy as np

def simple_regression(y, x):
    # Adds intercept; returns beta and t-stat
    X = np.column_stack([np.ones(len(x)), x])
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)
    y_hat = X @ beta
    resid = y - y_hat
    dof = max(len(x) - X.shape[1], 1)
    sigma2 = (resid @ resid) / dof
    cov_beta = sigma2 * np.linalg.inv(X.T @ X)
    se = np.sqrt(np.diag(cov_beta))
    t_stats = beta / se
    return beta, t_stats

# drop NaNs for analysis
analysis_df = merged_df.dropna(subset=["spy_return", "final_score", "final_score_lag1"])

# correlations
corr_same = analysis_df[["spy_return", "final_score"]].corr().iloc[0,1]
corr_next = analysis_df[["spy_return", "final_score_lag1"]].corr().iloc[0,1]

beta_same, t_same = simple_regression(
    analysis_df["spy_return"].values,
    analysis_df["final_score"].values,
)

beta_next, t_next = simple_regression(
    analysis_df["spy_return"].values,
    analysis_df["final_score_lag1"].values,
)

display({
    "corr_same_day": corr_same,
    "corr_next_day": corr_next,
    "beta_same_day": beta_same.tolist(),
    "t_same_day": t_same.tolist(),
    "beta_next_day": beta_next.tolist(),
    "t_next_day": t_next.tolist(),
})


{'corr_same_day': np.float64(0.12100227722853701),
 'corr_next_day': np.float64(0.019510885092469323),
 'beta_same_day': [0.0002840866603005102, 0.004653361273274365],
 't_same_day': [1.3928142387086206, 6.906355625665744],
 'beta_next_day': [0.0004560114551580251, 0.0007585992773132331],
 't_next_day': [2.2185340766833965, 1.1056359859898877]}

## 11) Returns by sentiment bucket
Compare average SPY returns when prior-day sentiment is positive vs. negative.


In [73]:

merged_df["sentiment_bucket"] = pd.cut(
    merged_df["final_score_lag1"],
    bins=[-np.inf, -0.2, 0.2, np.inf],
    labels=["bearish", "neutral", "bullish"],
)

bucket_stats = (
    merged_df.dropna(subset=["spy_return", "sentiment_bucket"])
    .groupby("sentiment_bucket")["spy_return"]
    .agg(["mean", "std", "count"])
)
print(bucket_stats)


                    mean    std  count
sentiment_bucket                      
bearish          -0.0003 0.0146    605
neutral           0.0007 0.0116   1932
bullish           0.0005 0.0100    970


  .groupby("sentiment_bucket")["spy_return"]


## 12) Export merged dataset for teammates
Provides aligned sentiment + SPY returns for downstream strategy work.


In [None]:


merged_path = Path("models/sentiment_model/sentiment_with_spy.csv")
merged_df.to_csv(merged_path, index=False)
print(f"Saved merged sentiment/price data to {merged_path}")


## 13) Coverage & distribution checks
Quick summary of headline coverage and distribution of sentiment buckets (uses merged_df).


In [76]:

coverage = {
    "date_range": (merged_df["date"].min(), merged_df["date"].max()),
    "headline_days": merged_df["date"].nunique(),
    "avg_headlines_per_day": merged_df["headline_count"].mean(),
}
print(coverage)

bucket_counts = merged_df["sentiment_bucket"].value_counts(dropna=False)
print("Sentiment bucket counts:)
print(bucket_counts)


SyntaxError: unterminated string literal (detected at line 9) (4102714778.py, line 9)

## 14) SPY return for a given day + that day's headlines + sentiment scores

In [84]:
def inspect_day(target_date, prices_df, headline_df, sentiment_df):
    target_date = pd.to_datetime(target_date).normalize()

    # Need both this day and previous day
    px_today = prices_df[prices_df["date"] == target_date]
    px_prev  = prices_df[prices_df["date"] == target_date - pd.Timedelta(days=1)]

    if px_today.empty:
        print(f"No SPY price data for {target_date.date()}")
        return

    if px_prev.empty:
        ret = np.nan  # no previous price
    else:
        ret = (px_today["close"].iloc[0] / px_prev["close"].iloc[0]) - 1

    # === SENTIMENT ===
    sent_row = sentiment_df[sentiment_df["date"] == target_date]
    if sent_row.empty:
        print(f"No sentiment data for {target_date.date()}")
        return
    sent_row = sent_row.iloc[0]

    # === SHOW HEADLINES ===
    day_headlines = headline_df[headline_df["date"] == target_date].copy()
    if not day_headlines.empty:
        print("---- 📰 Headlines & Compound Scores ----")
        display(day_headlines[["headline", "compound"]])


def build_daily_summary(prices, sentiment, headlines):
    # compute daily returns first
    prices = prices.sort_values("date").copy()
    prices["spy_return"] = prices["close"].pct_change()

    # group headlines into a single string per day
    grouped_headlines = (
        headlines.groupby("date")["headline"]
        .apply(lambda s: " || ".join(s.astype(str)))
        .reset_index()
        .rename(columns={"headline": "headlines"})
    )

    # merge everything
    merged = prices.merge(sentiment, on="date", how="left")\
                   .merge(grouped_headlines, on="date", how="left")

    return merged.sort_values("date")

daily_summary = build_daily_summary(prices, sentiment_signals, scored_headlines)
daily_summary.head()
daily_summary.to_csv('daily_summary.csv', index=False)



## 15) Quick takeaways (manual)
Use the outputs above to describe whether sentiment has any predictive power on SPY returns and how coverage limitations might affect reliability.


## 16) Weekly / Fortnightly / Monthly aggregation
Aggregate sentiment to coarser horizons (weighted by headline counts) for higher-level validation.


In [None]:

from typing import Dict

def aggregate_period(sent_df: pd.DataFrame, freq: str) -> pd.DataFrame:
    df = sent_df.copy()
    df = df.sort_values("date").set_index("date")

    def agg_block(sub: pd.DataFrame) -> pd.Series:
        total = sub["headline_count"].sum()
        def wavg(col):
            return (sub[col] * sub["headline_count"]).sum() / total if total > 0 else np.nan
        return pd.Series(
            {
                "compound_mean_wt": wavg("compound_mean"),
                "final_score_wt": wavg("final_score"),
                "compound_max": sub["compound_max"].max(),
                "compound_min": sub["compound_min"].min(),
                "headline_count": total,
                "start_date": sub.index.min(),
                "end_date": sub.index.max(),
            }
        )

    grouped = df.resample(freq).apply(agg_block)
    grouped = grouped.reset_index().rename(columns={"date": "period_end"})
    return grouped

freq_map = {
    "W-FRI": "weekly",
    "2W-FRI": "fortnightly",
    "M": "monthly",
}

aggregated_periods: Dict[str, pd.DataFrame] = {}
for freq, label in freq_map.items():
    aggregated_periods[label] = aggregate_period(sentiment_signals, freq)

aggregated_periods["weekly"].head()


## 17) Period returns & correlations
Align period sentiment with SPY period returns and check correlations/regressions using lagged sentiment (no look-ahead).


In [None]:

    def aggregate_returns(price_df: pd.DataFrame, freq: str) -> pd.DataFrame:
        p = price_df.sort_values("date").set_index("date")["close"]
        grp = p.resample(freq).agg(["first", "last"])
        grp["period_return"] = grp["last"] / grp["first"] - 1
        return grp.reset_index().rename(columns={"date": "period_end"})

    def corr_reg_for_period(sent_period: pd.DataFrame, returns_period: pd.DataFrame, label: str):
        merged = returns_period.merge(sent_period, on="period_end", how="inner").sort_values("period_end")
        merged["final_score_wt_lag1"] = merged["final_score_wt"].shift(1)

        df = merged.dropna(subset=["period_return", "final_score_wt", "final_score_wt_lag1"])
        corr_same = df[["period_return", "final_score_wt"]].corr().iloc[0,1]
        corr_next = df[["period_return", "final_score_wt_lag1"]].corr().iloc[0,1]

        beta_same, t_same = simple_regression(df["period_return"].values, df["final_score_wt"].values)
        beta_next, t_next = simple_regression(df["period_return"].values, df["final_score_wt_lag1"].values)

        print(f"
== {label.capitalize()} ==")
        print({
            "rows": len(df),
            "corr_same_period": corr_same,
            "corr_next_period": corr_next,
            "beta_same": beta_same.tolist(),
            "t_same": t_same.tolist(),
            "beta_next": beta_next.tolist(),
            "t_next": t_next.tolist(),
        })
        return merged

    aggregated_returns: Dict[str, pd.DataFrame] = {}
    merged_period_outputs: Dict[str, pd.DataFrame] = {}

    for freq, label in freq_map.items():
        rets = aggregate_returns(prices, freq)
        aggregated_returns[label] = rets
        merged_period_outputs[label] = corr_reg_for_period(
            aggregated_periods[label],
            rets,
            label,
        )


## 18) Export aggregated datasets
Save period-level sentiment + returns for teammates.


In [None]:

for label, df_out in merged_period_outputs.items():
    out_path = Path(f"models/sentiment_model/sentiment_with_spy_{label}.csv")
    df_out.to_csv(out_path, index=False)
    print(f"Saved {label} sentiment/price data to {out_path}")
