# Phase 2 — Data Build (modern snapshot)
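Fetch daily OHLCV for the configured ticker with yfinance, engineer baseline features and a next-day direction label, and write `data/df_nb02.csv` (plus a Parquet copy when a Parquet engine is available).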


In [None]:
# --- Imports & params ---
import warnings, json
from pathlib import Path
import numpy as np
import pandas as pd

try:
    import yfinance as yf
except Exception:
    yf = None

# Output directories (relative to the working directory), created up front so
# later cells can write into them without checking.
DATA = Path("data")
ART = Path("artifacts")
FIG = Path("reports/figures")
for p in (DATA, ART, FIG):
    p.mkdir(parents=True, exist_ok=True)

# Symbol and download window.
TICKER = "AAPL"
START = "2015-01-01"
# END is tomorrow's calendar date in Los Angeles local time, as an ISO string.
# NOTE(review): presumably "+1 day" is there so today's bar is included by the
# downloader's end bound — confirm against yfinance's `end` semantics.
_now_la = pd.Timestamp.now(tz="America/Los_Angeles")
END = (_now_la + pd.Timedelta(days=1)).date().isoformat()
USE_MARKET = True

# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

build_params = dict(TICKER=TICKER, START=START, END=END, USE_MARKET=USE_MARKET)
print("Build params →", build_params)


In [None]:
# Run once if the import failed.
# `%pip` installs into the environment of the running kernel; the import below
# then rebinds `yf`, which the imports cell sets to None when the import fails.
# NOTE(review): the install is unpinned, so the yfinance version can differ
# between runs — consider pinning a version for reproducibility.
%pip install yfinance
import yfinance as yf
print("yfinance", yf.__version__)


In [None]:
# --- Feature helpers (no external TA libs) ---
def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Relative Strength Index built from exponentially smoothed gains and losses.

    Gains and losses are the positive and negative parts of the one-step
    difference of ``series``; each is smoothed with an exponentially weighted
    mean using ``alpha = 1 / window`` and ``adjust=False``.

    Parameters
    ----------
    series : pd.Series
        Input values (cast to float).
    window : int, default 14
        Smoothing window; sets ``alpha = 1 / window``.

    Returns
    -------
    pd.Series
        RSI on a 0-100 scale, aligned with ``series``. The first element is NaN
        (no difference available). Rows where the smoothed loss is zero and the
        smoothed gain is positive are 100; rows where both are zero stay NaN.
    """
    s = series.astype(float)
    delta = s.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    roll_up = up.ewm(alpha=1/window, adjust=False).mean()
    roll_down = down.ewm(alpha=1/window, adjust=False).mean()
    # Zero losses are swapped for NaN so the ratio never divides by zero.
    rs = roll_up / roll_down.replace(0, np.nan)
    out = 100 - (100 / (1 + rs))
    # No losses at all but some gains: RSI saturates at 100 rather than being
    # undefined. Without this, such rows come out as NaN.
    only_gains = (roll_down == 0) & (roll_up > 0)
    return out.mask(only_gains, 100.0)

def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """Return the MACD line and its signal line as a ``(macd_line, macd_sig)`` tuple.

    The MACD line is the fast EMA minus the slow EMA of ``series`` (cast to
    float); the signal line is an EMA of the MACD line. All EMAs use
    ``span=<n>`` with ``adjust=False``.
    """
    def _ema(values: pd.Series, span: int) -> pd.Series:
        # Recursive (non-adjusted) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    prices = series.astype(float)
    line = _ema(prices, fast) - _ema(prices, slow)
    return line, _ema(line, signal)

def zscore(s: pd.Series, win: int = 20) -> pd.Series:
    """Rolling z-score of ``s`` over a full window of ``win`` observations.

    Each value is centred on the rolling mean and scaled by the rolling
    population standard deviation (``ddof=0``). Positions without a complete
    window, or with a zero standard deviation, are NaN.
    """
    window = s.rolling(win, min_periods=win)
    center = window.mean()
    # A zero spread would divide by zero, so it is mapped to NaN first.
    spread = window.std(ddof=0).replace(0, np.nan)
    return (s - center) / spread


In [None]:
# --- Download base OHLCV for the main ticker ---
# Fail fast when the guarded import in the first cell left `yf` as None.
if yf is None:
    raise ImportError("Please `pip install yfinance` to run this notebook.")

raw = yf.download(TICKER, start=START, end=END, auto_adjust=True, progress=False)
no_data = raw is None or raw.empty
if no_data:
    raise ValueError(f"No price data for {TICKER} in {START}..{END}")

# Normalize index to tz-naive dates
idx = pd.to_datetime(raw.index, errors="coerce")
try:
    is_tz_aware = getattr(idx, "tz", None) is not None
    if is_tz_aware:
        idx = idx.tz_localize(None)
except Exception:
    # Best-effort fallback: re-parse the index and try stripping the tz again.
    idx = pd.to_datetime(idx, errors="coerce").tz_localize(None)

# Work on a copy so `raw` keeps the download exactly as received.
px = raw.copy()
px.index = idx
px = px.sort_index()
# When a timestamp repeats, keep only its last row.
is_repeat = px.index.duplicated(keep="last")
px = px.loc[~is_repeat]

first_day = px.index.min().date()
last_day = px.index.max().date()
print(px.shape, "rows →", first_day, "→", last_day)
px.tail(3)


In [None]:
# --- Normalize yfinance columns to 1-D Series (handles MultiIndex) ---
def get_price_series(df, field: str, ticker: str = None):
    """Extract one price field from ``df`` as a numeric 1-D Series.

    Handles both flat columns (``df[field]``) and two-level MultiIndex columns
    where ``field`` lives on level 0 or level 1.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are either flat or a MultiIndex.
    field : str
        Column label to extract, e.g. ``"Close"``.
    ticker : str, optional
        When the cross-section still has several columns, the column with this
        label is selected; if it is None or absent, the first column is used.

    Returns
    -------
    pd.Series
        The selected column passed through ``pd.to_numeric(errors="coerce")``,
        so unparseable entries become NaN.

    Raises
    ------
    KeyError
        If ``field`` is not present (flat columns), or is found on neither of
        the first two column levels (MultiIndex). Previously the MultiIndex case
        fell through with ``None`` and failed later with an unrelated error.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return pd.to_numeric(df[field], errors="coerce")

    s = None
    # Look on level 0 first and fall back to level 1 only when level 0 does not
    # contain the field; the first level that has it wins.
    for level in range(min(2, df.columns.nlevels)):
        if field in df.columns.get_level_values(level):
            s = df.xs(field, axis=1, level=level)
            break
    if s is None:
        raise KeyError(f"{field!r} not found in column levels 0/1: {list(df.columns)[:10]}")

    if isinstance(s, pd.DataFrame):
        # Several columns remain after the cross-section: pick the requested
        # ticker, or default to the first column.
        if ticker is not None and ticker in s.columns:
            s = s[ticker]
        else:
            s = s.iloc[:, 0]
    return pd.to_numeric(s, errors="coerce")

# Pull each OHLCV field out of the (possibly MultiIndex) download as a 1-D Series.
open_s, high_s, low_s, close_s, volume_s = (
    get_price_series(px, field, TICKER)
    for field in ("Open", "High", "Low", "Close", "Volume")
)

# Assemble a flat float frame on the download's index, then sort it and drop
# any row that has a missing value in any field.
px_clean = pd.DataFrame(
    {
        "Open":   open_s.astype(float),
        "High":   high_s.astype(float),
        "Low":    low_s.astype(float),
        "Close":  close_s.astype(float),
        "Volume": volume_s.astype(float),
    },
    index=px.index,
)
px_clean = px_clean.sort_index().dropna()

clean_first = px_clean.index.min().date()
clean_last = px_clean.index.max().date()
print("px_clean:", px_clean.shape, "rows →", clean_first, "→", clean_last)
px_clean.head()


In [None]:
# --- Feature engineering (returns, vol, RSI, MACD, volume stats) + label ---
# Rebuilt from px_clean on every run, so re-executing this cell is idempotent.
df = pd.DataFrame({
    "date":   px_clean.index,
    "open":   px_clean["Open"].values,
    "high":   px_clean["High"].values,
    "low":    px_clean["Low"].values,
    "close":  px_clean["Close"].values,
    "volume": px_clean["Volume"].values,
})

# Simple returns
df["ret1"]  = df["close"].pct_change()
df["ret5"]  = df["close"].pct_change(5)
df["ret10"] = df["close"].pct_change(10)

# Rolling volatility (10d)
df["vol10"] = df["ret1"].rolling(10, min_periods=10).std(ddof=0)

# Volume z-score (20d)
df["volz"] = zscore(df["volume"], win=20)

# RSI(14) and MACD(12,26,9)
df["rsi14"] = rsi(df["close"], window=14)
df["macd"], df["macd_signal"] = macd(df["close"], fast=12, slow=26, signal=9)

# Tag ticker
df["ticker"] = TICKER

# === Label (next-day direction) ===
# Computed BEFORE any rows are dropped, so shift(-1) always refers to the very
# next row of px_clean. Shifting after dropna() would pair a row with a
# non-adjacent session whenever an interior row had been removed.
df["ret_next"] = df["ret1"].shift(-1)

# Drop warm-up NaNs and the last row (its next-day return is unknown)
df = df.dropna().reset_index(drop=True)
df["y"] = (df["ret_next"] > 0).astype(int)

print("Engineered:", df.shape, "| first:", df['date'].min().date(), "| last:", df['date'].max().date(), "| y rate:", round(float(df['y'].mean()),3))
df.head(3)


In [None]:
# === NB2 VERIFY (robust) ===
# This cell sits BEFORE the save cell, so it can only inspect a snapshot left by
# a previous run. It must not fail on a fresh checkout and must not rebind `df`:
# the save cell below writes `df`, and loading the old CSV into that name would
# write stale data back instead of the freshly engineered frame.
from pathlib import Path
import json, pandas as pd, numpy as np

DATA = Path("data"); ART = Path("artifacts")
csv_path = DATA / "df_nb02.csv"
parq_path = DATA / "df_nb02.parquet"
lp_path = DATA / "label_params.json"
feat_path = ART / "feature_list.json"

# Feature list sanity (no leaks)
feats = []
if feat_path.exists():
    with open(feat_path, "r", encoding="utf-8") as fh:
        feats = json.load(fh)
    bad = set(feats) & {"y","ret_next","date","ticker","spy_close","vix_close"}
    print("Features count:", len(feats))
    print("Leaky/bad cols inside features:", bad)
else:
    print("Feature list not found →", feat_path, "(feature checks skipped)")

if not csv_path.exists():
    print("No previous snapshot at", csv_path, "→ pre-save verification skipped.")
else:
    # Load the previous snapshot under its own name (see note at the top).
    df_saved = pd.read_csv(csv_path, parse_dates=["date"]).sort_values("date").reset_index(drop=True)
    print("NB2 dataset:", df_saved.shape, "|", df_saved["date"].min().date(), "→", df_saved["date"].max().date())
    print("NaNs total:", int(df_saved.isna().sum().sum()), "| duplicate dates:", int(df_saved["date"].duplicated().sum()))

    # Label check: y should equal sign of NEXT-day ret1
    label_col = next((c for c in ["y","target","label","y_next_up"] if c in df_saved.columns), None)
    ret_next = df_saved["ret1"].shift(-1)
    mask = ret_next.notna()  # the last row has no next-day return to compare against
    y_from_ret = (ret_next[mask] > 0).astype(int).values
    y_true = df_saved.loc[mask, label_col].astype(int).values if label_col else None
    match = (y_true == y_from_ret).mean() if y_true is not None else np.nan
    print("Label matches next-day(ret1>0):", None if np.isnan(match) else round(float(match), 3))

    # Quick preview (only feature names that are actually columns of the frame)
    print("\nPreview:")
    keep = ["date"] + [c for c in feats[:5] if c in df_saved.columns] + ([label_col] if label_col else [])
    print(df_saved[keep].head())

# Files present?
print("Files exist →",
      "df_nb02.csv:", csv_path.exists(),
      "| parquet:", parq_path.exists(),
      "| label_params.json:", lp_path.exists(),
      "| feature_list.json:", feat_path.exists())


In [None]:
# --- Save outputs (robust) ---
# CSV is the primary output; the frame's index is not written.
out_csv = DATA / "df_nb02.csv"
df.to_csv(out_csv, index=False)
csv_bytes = out_csv.stat().st_size
print("Saved CSV:", out_csv, "| bytes:", csv_bytes)

def _has_fastparquet():
    try:
        import fastparquet  # noqa: F401
        return True
    except Exception:
        return False

def _sanitize_periods(df_: pd.DataFrame) -> pd.DataFrame:
    for c in df_.columns:
        if pd.api.types.is_period_dtype(df_[c]):
            df_[c] = df_[c].astype(str)
    return df_

# Parquet copy is best-effort: try pyarrow first, then fastparquet, and fall
# back to CSV-only if neither works. Period columns are stringified on a copy.
out_parq = DATA / "df_nb02.parquet"
df_parq = _sanitize_periods(df.copy())
saved_parquet = False
try:
    import pyarrow as pa  # noqa: F401
    try:
        # Deliberate best-effort: failure to unregister is ignored.
        # NOTE(review): presumably a workaround for a "pandas.period" extension
        # type registration clash — confirm it is still needed.
        pa.unregister_extension_type("pandas.period")
    except Exception:
        pass
    df_parq.to_parquet(out_parq, index=False, engine="pyarrow")
    saved_parquet = True
    print("Saved Parquet (pyarrow):", out_parq, "| bytes:", out_parq.stat().st_size)
except Exception as e:
    print("pyarrow failed →", e)
    if _has_fastparquet():
        try:
            df_parq.to_parquet(out_parq, index=False, engine="fastparquet")
            saved_parquet = True
            print("Saved Parquet (fastparquet):", out_parq, "| bytes:", out_parq.stat().st_size)
        except Exception as e2:
            print("fastparquet also failed →", e2)

if not saved_parquet:
    print("Parquet save skipped (CSV written).")

# Quick QA
dts = pd.to_datetime(df["date"])
print("Rows:", len(df), "| date span:", dts.min().date(), "→", dts.max().date())
print("Columns:", list(df.columns))
print("NaNs total:", int(df.isna().sum().sum()))
print("\nret1 describe:\n", df["ret1"].describe().to_string())

# Record storage format
meta_path = DATA / "storage_format.json"
record = {"path": str(out_csv), "format": "csv"}

# Read any existing metadata. An unreadable or corrupt file is treated as empty
# so the record is still written. Files are opened with context managers so
# handles are closed (and writes flushed) deterministically.
meta = []
if meta_path.exists():
    try:
        with open(meta_path, "r", encoding="utf-8") as fh:
            meta = json.load(fh)
    except (OSError, ValueError):  # ValueError covers JSON and unicode decode errors
        meta = []
if isinstance(meta, dict):
    meta = [meta]
elif not isinstance(meta, list):
    meta = []

# Replace any earlier entry for the same path; entries that are not dicts are dropped.
meta = [m for m in meta if isinstance(m, dict) and m.get("path") != record["path"]] + [record]
with open(meta_path, "w", encoding="utf-8") as fh:
    json.dump(meta, fh, indent=2)
print("Updated:", meta_path)


In [None]:
# === NB2 VERIFY (robust) ===
# Post-save check: reload the CSV from disk and confirm that the label and the
# optional feature list are consistent with it.
from pathlib import Path
import json, pandas as pd, numpy as np

DATA = Path("data"); ART = Path("artifacts")
csv_path = DATA / "df_nb02.csv"
parq_path = DATA / "df_nb02.parquet"
lp_path = DATA / "label_params.json"
feat_path = ART / "feature_list.json"

# Load dataset
df = pd.read_csv(csv_path, parse_dates=["date"]).sort_values("date").reset_index(drop=True)
print("NB2 dataset:", df.shape, "|", df["date"].min().date(), "→", df["date"].max().date())
print("NaNs total:", int(df.isna().sum().sum()), "| duplicate dates:", int(df["date"].duplicated().sum()))

# Label check: y should equal sign of NEXT-day ret1
label_col = next((c for c in ["y","target","label","y_next_up"] if c in df.columns), None)
ret_next = df["ret1"].shift(-1)
mask = ret_next.notna()  # the last row has no next-day return to compare against
y_from_ret = (ret_next[mask] > 0).astype(int).values
y_true = df.loc[mask, label_col].astype(int).values if label_col else None
match = (y_true == y_from_ret).mean() if y_true is not None else np.nan
print("Label matches next-day(ret1>0):", None if np.isnan(match) else round(float(match), 3))

# Feature list sanity (no leaks)
# NOTE(review): no visible cell writes artifacts/feature_list.json, so it is
# treated as optional here instead of failing with FileNotFoundError.
feats = []
if feat_path.exists():
    with open(feat_path, "r", encoding="utf-8") as fh:
        feats = json.load(fh)
    bad = set(feats) & {"y","ret_next","date","ticker","spy_close","vix_close"}
    print("Features count:", len(feats))
    print("Leaky/bad cols inside features:", bad)
else:
    print("Feature list not found →", feat_path, "(feature checks skipped)")

# Files present?
print("Files exist →",
      "df_nb02.csv:", csv_path.exists(),
      "| parquet:", parq_path.exists(),
      "| label_params.json:", lp_path.exists(),
      "| feature_list.json:", feat_path.exists())

# Quick preview (only feature names that are actually columns of the frame)
print("\nPreview:")
keep = ["date"] + [c for c in feats[:5] if c in df.columns] + ([label_col] if label_col else [])
print(df[keep].head())
