In [1]:
# --- Imports & params ---
import os, json, pathlib
import numpy as np
import pandas as pd

# Optional dep (market data)
try:
    import yfinance as yf
except Exception:
    yf = None

# Base I/O
DATA_DIR = pathlib.Path("data")
ART_DIR  = pathlib.Path("artifacts")
FIG_DIR  = pathlib.Path("reports/figures")
for p in [DATA_DIR, ART_DIR, FIG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Experiment switches
USE_MARKET        = True     # SPY/VIX context
USE_FUNDAMENTALS  = False    # placeholder
USE_NEWS          = False    # placeholder

# Run settings
TICKER = "AAPL"
START, END = "2015-01-01", "2023-12-31"


In [2]:
# --- Load the base features/labels from Phase 2 ---
base_csv = DATA_DIR / "df_nb02.csv"
if not base_csv.exists():
    raise FileNotFoundError("Expected Phase-2 output at data/df_nb02.csv. Run notebook 02 first.")

df = pd.read_csv(base_csv)

# Ensure tz-naive datetime
if "date" not in df.columns:
    raise KeyError("'date' column missing in df_nb02.csv")
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)
df = df.dropna(subset=["date"]).reset_index(drop=True)

print(df.shape)
df.head(3)


(2078, 16)


Unnamed: 0,date,open,high,low,close,volume,ret1,ret5,ret10,vol10,volz,rsi14,macd,macd_signal,ret_next,y
0,2015-02-20,28.654267,28.850317,28.527283,28.850317,195793600,0.008174,0.024039,0.079707,0.009271,-0.593114,78.737186,1.055491,0.908931,0.027027,1
1,2015-02-23,28.966155,29.630045,28.885953,29.630045,283896400,0.027027,0.046585,0.118305,0.009183,0.22753,82.379122,1.123915,0.951928,-0.00624,0
2,2015-02-24,29.616682,29.763719,29.222357,29.445139,276912400,-0.00624,0.033951,0.103992,0.010687,0.131432,78.926668,1.149966,0.991536,-0.025573,0


In [3]:
# --- Market context via alignment (no merges) ---

def fetch_close_series(ticker: str, start: str, end: str) -> pd.Series:
    """
    Return a pd.Series of adjusted Close with a tz-naive DatetimeIndex.
    Works whether yfinance returns single-level or MultiIndex columns.
    """
    if yf is None:
        raise ImportError("Please `pip install yfinance` to enable USE_MARKET=True.")

    r = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
    if r is None or r.empty:
        raise ValueError(f"No data for {ticker} in {start}..{end}")

    # tz-naive datetime index
    idx = pd.to_datetime(r.index, errors="coerce")
    try:
        if getattr(idx, "tz", None) is not None:
            idx = idx.tz_localize(None)
    except Exception:
        idx = pd.to_datetime(idx, errors="coerce").tz_localize(None)

    # Handle MultiIndex or single Index
    if isinstance(r.columns, pd.MultiIndex):
        close = r.xs("Close", axis=1, level=0)
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
    else:
        close = r["Close"]

    s = pd.Series(np.asarray(close).reshape(-1), index=idx, name="Close")
    s = s.sort_index()
    s = s[~s.index.duplicated(keep="last")]
    return s

if USE_MARKET:
    # Build a tz-naive DatetimeIndex for df
    dti = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)

    spy = fetch_close_series("SPY", START, END)
    vix = fetch_close_series("^VIX", START, END)

    # Align by label (same-day); no merges
    df["spy_close"] = dti.map(spy)
    df["vix_close"] = dti.map(vix)

    # Context features
    df["mkt_ret1"] = df["spy_close"].pct_change(1)
    df["mkt_ret5"] = df["spy_close"].pct_change(5)
    df["vix_chg1"] = df["vix_close"].pct_change(1)

    # Keep rows with all context features present
    df = df.dropna(subset=["spy_close","vix_close","mkt_ret1","mkt_ret5","vix_chg1"]).reset_index(drop=True)

print(df.shape)
df.filter(["date","spy_close","mkt_ret1","mkt_ret5","vix_close","vix_chg1"]).head(5)


(2073, 21)


Unnamed: 0,date,spy_close,mkt_ret1,mkt_ret5,vix_close,vix_chg1
0,2015-02-27,175.449524,-0.003406,-0.002746,13.34,-0.040978
1,2015-03-02,176.557251,0.006314,0.003693,13.04,-0.022489
2,2015-03-03,175.832626,-0.004104,-0.003258,13.86,0.062883
3,2015-03-04,175.091415,-0.004215,-0.006615,14.23,0.026696
4,2015-03-05,175.28299,0.001094,-0.004352,14.04,-0.013352


In [4]:
# --- Quality checks ---
req = ["spy_close","vix_close","mkt_ret1","mkt_ret5","vix_chg1"]
present = [c for c in req if c in df.columns]
missing = [c for c in req if c not in df.columns]

print("Shape:", df.shape)
print("Present:", present)
print("Missing:", missing)

if present:
    print("\nNulls in context cols:")
    print(df[present].isna().sum())

    print("\nReturn stats:")
    print(df[["mkt_ret1","mkt_ret5","vix_chg1"]].describe().T)


Shape: (2073, 21)
Present: ['spy_close', 'vix_close', 'mkt_ret1', 'mkt_ret5', 'vix_chg1']
Missing: []

Nulls in context cols:
spy_close    0
vix_close    0
mkt_ret1     0
mkt_ret5     0
vix_chg1     0
dtype: int64

Return stats:
           count      mean       std       min       25%       50%       75%  \
mkt_ret1  2073.0  0.000541  0.011881 -0.109424 -0.004063  0.000705  0.006289   
mkt_ret5  2073.0  0.002641  0.024245 -0.179693 -0.007323  0.004858  0.015463   
vix_chg1  2073.0  0.003509  0.088699 -0.259057 -0.046203 -0.007418  0.038741   

               max  
mkt_ret1  0.090603  
mkt_ret5  0.173581  
vix_chg1  1.155979  


In [5]:
# --- Save experiment output (separate file so we don't touch Phase-2) ---
out_name = "df_nb06_market.csv" if USE_MARKET else "df_nb06_base.csv"
out_path = DATA_DIR / out_name
df.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: data\df_nb06_market.csv


In [6]:
# --- Record storage format (quiet) ---
meta_path = DATA_DIR / "storage_format.json"
record = {"path": str(out_path), "format": "csv"}

try:
    if meta_path.exists():
        meta = json.load(open(meta_path, "r", encoding="utf-8"))
        if isinstance(meta, dict):
            meta = [meta]
    else:
        meta = []
    # replace or append our record
    meta = [m for m in meta if m.get("path") != record["path"]] + [record]
    json.dump(meta, open(meta_path, "w", encoding="utf-8"), indent=2)
except Exception:
    json.dump([record], open(meta_path, "w", encoding="utf-8"), indent=2)

print("Updated:", meta_path)


Updated: data\storage_format.json
