In [2]:
# Notebook bootstrap: pull in the project's notebook helpers, run the shared
# setup, emit the document title block, then call hide_input().
from yourproj.utils import setup_notebook, doc_title, section, subsection, hide_input

# Result of the project's setup helper; kept in `paths` for later cells.
# NOTE(review): presumably a collection of project paths — confirm in yourproj.utils.
paths = setup_notebook()

doc_title(
    "Project Template: End-to-End AI/ML Pipeline",
    "A reusable notebook scaffold for DS projects — ingestion, preprocessing, EDA, modeling, training, evaluation.",
)

hide_input()

<IPython.core.display.Javascript object>

## Table of Contents
- [Project Introduction](#project-introduction)
- [Data Ingest](#data-ingest)
- [Preprocess & Feature Engineering](#preprocess--feature-engineering)
- [Exploratory Data Analysis](#exploratory-data-analysis)
- [Models](#models)
- [Training](#training)
- [Evaluation](#evaluation)


In [2]:
# Emit a section banner followed by a subsection banner using the helpers
# imported from yourproj.utils in the first cell (each takes a title and a
# short description).
# NOTE(review): the section title reads "01 — EDA" while the subsection is
# "Data Ingest" and the cell sits above the Project Introduction — confirm the
# intended wording and placement.
section("01 — EDA", "Quick checks; heavy logic lives in src/yourproj.")
subsection("Data Ingest", "Load raw CSVs into DataFrames")

## Project Introduction
**Goal:** Describe the problem you are solving, who benefits, and how success is measured.

**Scope:** Outline the boundaries (what is in/out) and key assumptions.

**KPIs & Metrics:** Define primary metrics (e.g., accuracy, F1, AUC, MAPE) and business KPIs.

**Risks & Constraints:** Data availability/quality, latency/cost constraints, privacy/compliance.


# Show the project's config module inline as syntax-highlighted Python.
# The file path is relative, so it is resolved against the kernel's working directory.
from IPython.display import Code
# NOTE(review): `display` is not imported in this cell; this relies on the
# IPython kernel already providing it — confirm if the notebook is also run as a script.
display(Code(filename="src/yourproj/config.py", language="python"))

## Data Ingest
**Sources:** List raw sources (files, DBs, APIs), ownership, and access paths.

**Schema:** Note important fields, time columns, identifiers, expected volumes.

**Freshness:** Update cadence, backfills, and late data handling.

**Data Quality:** Missing values, duplicates, outliers; checks you enforce.


<details><summary><code>src/yourproj/ingest.py</code></summary>

```python
from __future__ import annotations
import pandas as pd
from pathlib import Path

def load_prices(path: Path) -> pd.DataFrame:
    """Read the prices CSV, or return an empty frame if the file is absent.

    Args:
        path: Location of the prices CSV.

    Returns:
        The parsed CSV with ``date`` parsed as datetimes when ``path`` exists;
        otherwise an empty DataFrame carrying the expected price columns.
    """
    if Path(path).exists():
        return pd.read_csv(path, parse_dates=['date'])
    # Missing file: hand back a zero-row frame with the expected schema.
    expected_columns = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume']
    return pd.DataFrame(columns=expected_columns)

def load_earnings(path: Path) -> pd.DataFrame:
    """Read the earnings CSV, or return an empty frame if the file is absent.

    Args:
        path: Location of the earnings CSV.

    Returns:
        The parsed CSV with ``announce_datetime`` parsed as datetimes when
        ``path`` exists; otherwise an empty DataFrame carrying the expected
        earnings columns.
    """
    if Path(path).exists():
        return pd.read_csv(path, parse_dates=['announce_datetime'])
    # Missing file: hand back a zero-row frame with the expected schema.
    expected_columns = ['ticker', 'announce_datetime', 'bmo_amc', 'eps_actual', 'eps_estimate']
    return pd.DataFrame(columns=expected_columns)
```
</details>


## Preprocess & Feature Engineering
**Cleaning:** Standardize types, handle missing values, de-duplication, scaling.

**Transformations:** Windowing, aggregations, joins, label construction.

**Feature Catalog:** Keep a short list of features with rationale and leakage checks.


In [4]:
# Run the preprocess step, then the feature-engineering step, both driven by
# the same experiment config file.
# NOTE(review): the output recorded for this cell is an ImportError —
# `ensure_dir` could not be imported from `yourproj.utils`, which both
# preprocess.py and features.py import — so neither step ran in the saved run.
import sys
from pathlib import Path
# Add <cwd>/src to the import path so the `yourproj` package can be imported.
sys.path.append(str(Path.cwd() / 'src'))
from yourproj.preprocess import main as preprocess_main
from yourproj.features import main as features_main
# Experiment config passed to both steps (path is relative to the working directory).
CONFIG = 'configs/exp_baseline.yaml'
preprocess_main(CONFIG)
features_main(CONFIG)


ImportError: cannot import name 'ensure_dir' from 'yourproj.utils' (/home/azureuser/ai_project/src/yourproj/utils/__init__.py)

<details><summary><code>src/yourproj/preprocess.py</code></summary>

```python
from __future__ import annotations
import pandas as pd
from pathlib import Path
from .config import load_config
from .utils import ensure_dir

def align_events(prices: pd.DataFrame, earnings: pd.DataFrame) -> pd.DataFrame:
    """Attach the event-day close and the following close to each earnings row.

    For every announcement an event date ``t0_date`` is derived: the
    announcement's own calendar day when ``bmo_amc == 'BMO'``, otherwise the
    next calendar day. Prices are then left-joined on ``(ticker, t0_date)`` to
    add ``close_t0`` (close on ``t0_date``) and ``close_t1`` (the next close in
    ``prices`` for the same ticker after ``t0_date``).

    Args:
        prices: Frame with at least ``ticker``, ``date`` and ``close``.
        earnings: Frame with at least ``ticker``, ``announce_datetime`` and
            ``bmo_amc``.

    Returns:
        One row per earnings row (more if ``prices`` has duplicate
        ``(ticker, date)`` keys) with ``t0_date``, ``close_t0`` and ``close_t1``
        appended. If either input is empty, a zero-row frame that still carries
        the earnings columns plus those three columns.
    """
    if earnings.empty or prices.empty:
        # Keep the output schema even when there is nothing to align: a
        # column-less frame is written to CSV without a header, and reading it
        # back with parse_dates=['t0_date'] then fails.
        schema = list(dict.fromkeys([*earnings.columns, 't0_date', 'close_t0', 'close_t1']))
        return pd.DataFrame(columns=schema)

    p = prices.copy()
    p['date'] = pd.to_datetime(p['date']).dt.tz_localize(None)

    e = earnings.copy()
    e['announce_datetime'] = pd.to_datetime(e['announce_datetime']).dt.tz_localize(None)
    # Vectorised form of the per-row rule: midnight of the announcement day,
    # plus one calendar day for anything that is not flagged 'BMO'
    # (missing flags compare as "not BMO", as a row-wise == check would).
    days_to_t0 = pd.to_timedelta(e['bmo_amc'].ne('BMO').astype(int), unit='D')
    e['t0_date'] = e['announce_datetime'].dt.normalize() + days_to_t0
    # NOTE(review): t0_date is a calendar-day shift. If it lands on a date that
    # is absent from `prices` for that ticker, the left joins below leave
    # close_t0/close_t1 as NaN — confirm whether snapping to the next available
    # price date is wanted.

    # Attach close_t0 and close_t1 for labeling convenience
    m = e.merge(p[['ticker', 'date', 'close']].rename(columns={'date': 't0_date', 'close': 'close_t0'}),
                on=['ticker', 't0_date'], how='left')
    # Next close per ticker: sort chronologically, then look one row ahead.
    p = p.sort_values(['ticker', 'date'])
    p['close_t1'] = p.groupby('ticker')['close'].shift(-1)
    m = m.merge(p[['ticker', 'date', 'close_t1']].rename(columns={'date': 't0_date'}),
                on=['ticker', 't0_date'], how='left')
    return m

def main(config_path: str = "configs/exp_baseline.yaml"):
    """Load raw prices and earnings, align them, and write ``events.csv``.

    Args:
        config_path: Path handed to ``load_config``. The resulting config is
            read for ``paths['raw_prices']``, ``paths['raw_earnings']`` and
            ``paths['artifacts']``.
    """
    cfg = load_config(config_path)
    # Imported here, inside the function, rather than at module import time.
    from .ingest import load_prices, load_earnings

    aligned = align_events(
        load_prices(Path(cfg.paths['raw_prices'])),
        load_earnings(Path(cfg.paths['raw_earnings'])),
    )

    events_csv = Path(cfg.paths['artifacts']) / "events.csv"
    # presumably creates the artifacts directory if missing — verify in yourproj.utils
    ensure_dir(events_csv.parent)
    aligned.to_csv(events_csv, index=False)
    print(f"Saved aligned events to {events_csv} (rows={len(aligned)})")

if __name__ == "__main__":
    import sys
    # Optional first CLI argument overrides the default config path.
    cli_config = sys.argv[1:2]
    main(cli_config[0] if cli_config else "configs/exp_baseline.yaml")
```
</details>


<details><summary><code>src/yourproj/features.py</code></summary>

```python
from __future__ import annotations
import pandas as pd
from pathlib import Path
from .config import load_config
from .utils import ensure_dir
from .labels import make_day_ahead_label

def build_structured_features(events: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``events`` with a ``surprise`` column added.

    ``surprise`` is ``eps_actual - eps_estimate`` when both columns are
    present; otherwise it is filled with ``0.0``. The input frame is not
    modified.
    """
    feats = events.copy()
    actual_col, estimate_col = 'eps_actual', 'eps_estimate'
    if actual_col in feats.columns and estimate_col in feats.columns:
        feats['surprise'] = feats[actual_col] - feats[estimate_col]
        return feats
    # One or both EPS columns are missing: fall back to a constant.
    feats['surprise'] = 0.0
    return feats

def main(config_path: str = "configs/exp_baseline.yaml"):
    """Read ``events.csv``, add the label and features, and write ``features.csv``.

    Args:
        config_path: Path handed to ``load_config``. The resulting config is
            read for ``paths['artifacts']``, which holds both the input
            ``events.csv`` and the output ``features.csv``.
    """
    cfg = load_config(config_path)
    artifacts_dir = Path(cfg.paths['artifacts'])
    events_csv = artifacts_dir / "events.csv"
    features_csv = artifacts_dir / "features.csv"

    # Label first, then derive features from the labelled frame.
    labeled = make_day_ahead_label(pd.read_csv(events_csv, parse_dates=['t0_date']))
    feature_frame = build_structured_features(labeled)

    # presumably creates the artifacts directory if missing — verify in yourproj.utils
    ensure_dir(features_csv.parent)
    feature_frame.to_csv(features_csv, index=False)
    print(f"Saved features to {features_csv} (rows={len(feature_frame)})")

if __name__ == "__main__":
    import sys
    # Optional first CLI argument overrides the default config path.
    cli_config = sys.argv[1:2]
    main(cli_config[0] if cli_config else "configs/exp_baseline.yaml")
```
</details>


<details><summary><code>src/yourproj/labels.py</code></summary>

```python
import pandas as pd

def make_day_ahead_label(events: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``events`` with the binary day-ahead label ``y_d1``.

    ``y_d1`` is 1 when ``close_t1 > close_t0`` and 0 otherwise. It is ``<NA>``
    when either close is missing, or when the frame lacks one of the two close
    columns. The column is always nullable ``Int64``; the input is not modified.

    Args:
        events: Frame that may contain ``close_t0`` and ``close_t1``.

    Returns:
        A copy of ``events`` with ``y_d1`` added.
    """
    ev = events.copy()
    if 'close_t0' in ev and 'close_t1' in ev:
        # A comparison against NaN evaluates to False, which would label an
        # event with an unknown close as "down" (0). Mask those rows to <NA>
        # so a later dropna on the label removes them.
        both_known = ev['close_t0'].notna() & ev['close_t1'].notna()
        went_up = (ev['close_t1'] > ev['close_t0']).astype('Int64')
        ev['y_d1'] = went_up.where(both_known, pd.NA)
    else:
        # Same nullable integer dtype as the branch above, all missing.
        ev['y_d1'] = pd.array([pd.NA] * len(ev), dtype='Int64')
    return ev
```
</details>


In [None]:
# EDA setup: make <cwd>/src importable, import the plotting/data libraries and
# the project's pipeline functions, and define the raw input locations.
import sys
from pathlib import Path
# Add <cwd>/src to the import path so the `yourproj` package can be imported.
sys.path.append(str(Path.cwd() / "src"))

import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display  # so display(...) works in both notebook & script

from yourproj.ingest import load_prices, load_earnings
from yourproj.preprocess import align_events
from yourproj.labels import make_day_ahead_label
from yourproj.features import build_structured_features

# Raw CSV inputs, relative to the working directory.
# NOTE(review): the pipeline modules read these locations from the config
# (cfg.paths['raw_prices'] / ['raw_earnings']) — confirm the two stay in sync.
RAW_PRICES = Path("data/raw/prices.csv")
RAW_EARNINGS = Path("data/raw/earnings.csv")

## Exploratory Data Analysis
Use this section to explore distributions, correlations, and seasonality, and to run sanity checks on the data. Capture insights and hypotheses to guide modeling.


### Load data

In [None]:
# Load both raw inputs (each loader returns an empty frame if its file is missing),
# then preview the first rows of each.
prices = load_prices(RAW_PRICES)
earnings = load_earnings(RAW_EARNINGS)
display(prices.head(), earnings.head())

### Align + Label

In [None]:
# Same three steps the pipeline modules run, applied in memory:
# align earnings with prices -> add the y_d1 label -> add the surprise feature.
events = align_events(prices, earnings)
labeled = make_day_ahead_label(events)
feats = build_structured_features(labeled)
# Preview the first rows of the resulting feature frame.
display(feats.head())

### Example plot

In [None]:
# Histogram of the earnings surprise, saved as a PDF under artifacts/figures.
Path("artifacts/figures").mkdir(parents=True, exist_ok=True)
if not feats.empty and "surprise" in feats:
    # Draw on a figure created here rather than on pyplot's implicit "current"
    # figure, so an already-open figure (e.g. when run as a script) cannot
    # end up receiving the histogram.
    fig, ax = plt.subplots()
    feats["surprise"].hist(bins=30, ax=ax)
    ax.set_title("Earnings Surprise Distribution")
    ax.set_xlabel("EPS surprise (actual - estimate)")
    ax.set_ylabel("Number of events")
    fig.tight_layout()
    fig.savefig("artifacts/figures/surprise_dist.pdf")
    plt.show()

## Models
**Baselines:** Start with simple baselines (mean/last value, logistic/linear).

**Candidates:** Consider tree-based (RF/XGBoost), linear, neural, or specialized models.

**Hyperparameters:** Document search space and defaults.


<details><summary><code>src/yourproj/models.py</code></summary>

```python
from __future__ import annotations
from sklearn.linear_model import LogisticRegression

def train_logreg(X, y):
    """Fit a logistic-regression classifier (``max_iter=200``) and return it.

    Args:
        X: Feature matrix passed straight to ``LogisticRegression.fit``.
        y: Target values passed straight to ``LogisticRegression.fit``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(max_iter=200)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X, y)
```
</details>


## Training
**Strategy:** Train/validation/test split strategy, cross-validation, and seeds.

**Reproducibility:** Deterministic seeds, config files, and artifact logging.

**Compute:** Hardware, runtime, and cost notes.


In [None]:
# Train model using generated features
import sys
from pathlib import Path
sys.path.append(str(Path.cwd() / 'src'))
from yourproj.train import main as train_main
CONFIG = 'configs/exp_baseline.yaml'
train_main(CONFIG)


<details><summary><code>src/yourproj/train.py</code></summary>

```python
from __future__ import annotations
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from .config import load_config
from .utils import ensure_dir, save_json
from .models import train_logreg

def chrono_split(df: pd.DataFrame, time_col: str, test_size=0.25):
    """Split ``df`` into an earlier train part and a later test part.

    Rows are ordered by ``time_col``; the last ``round(len(df) * test_size)``
    rows form the test part, with at least one row always held out.

    Args:
        df: Frame to split.
        time_col: Column to sort by before splitting.
        test_size: Fraction of rows placed in the test part.

    Returns:
        A ``(train, test)`` tuple of frames in sorted order.
    """
    ordered = df.sort_values(by=time_col)
    # Hold out at least one row, even for tiny frames or a zero fraction.
    holdout = max(1, int(round(len(ordered) * test_size)))
    train_part = ordered.iloc[:-holdout]
    test_part = ordered.iloc[-holdout:]
    return train_part, test_part

def main(config_path: str = "configs/exp_baseline.yaml"):
    """Train the logistic-regression baseline and write ``metrics.json``.

    Reads ``features.csv`` from the artifacts directory, keeps rows where the
    target and every model feature are present, splits chronologically on
    ``t0_date``, fits the model on the earlier part and scores it on the later
    part (accuracy at a 0.5 threshold, plus AUC when both classes occur).

    Args:
        config_path: Path handed to ``load_config``. The resulting config is
            read for ``paths['artifacts']`` and for ``train['target']``,
            ``train['features']`` and ``train['test_size']``.

    Raises:
        ValueError: If no rows remain for training after filtering and splitting.
    """
    cfg = load_config(config_path)
    feats_path = Path(cfg.paths['artifacts']) / "features.csv"
    feats = pd.read_csv(feats_path, parse_dates=['t0_date'])
    target = cfg.train['target']
    Xcols = cfg.train['features']  # assumed to be a list of column names — TODO confirm
    # Drop rows missing the label or any model input: the estimator rejects
    # NaN in X, so filtering on the target alone lets a single missing feature
    # value abort training.
    feats = feats.dropna(subset=[target, *Xcols])
    train, test = chrono_split(feats, "t0_date", cfg.train['test_size'])
    if train.empty:
        # chrono_split always holds out at least one row, so a very small
        # frame can leave nothing to fit on; fail with a clear message.
        raise ValueError(
            f"No training rows available from {feats_path} after dropping missing values and splitting"
        )
    Xtr, ytr = train[Xcols].values, train[target].astype(int).values
    Xte, yte = test[Xcols].values, test[target].astype(int).values
    model = train_logreg(Xtr, ytr)
    # Probability of the positive class, thresholded at 0.5 for accuracy.
    prob = model.predict_proba(Xte)[:,1]
    pred = (prob >= 0.5).astype(int)
    metrics = {
        "accuracy": float(accuracy_score(yte, pred)),
        # AUC is undefined when the test part contains a single class.
        "auc": float(roc_auc_score(yte, prob)) if len(set(yte))>1 else None,
        "n_test": int(len(yte)),
        "features": Xcols,
    }
    out = Path(cfg.paths['artifacts']) / "metrics.json"
    # presumably creates the artifacts directory if missing — verify in yourproj.utils
    ensure_dir(out.parent)
    save_json(metrics, out)
    print("Saved metrics to", out, metrics)

if __name__ == "__main__":
    import sys
    # Optional first CLI argument overrides the default config path.
    main(sys.argv[1] if len(sys.argv)>1 else "configs/exp_baseline.yaml")
```
</details>


## Evaluation
**Metrics:** Primary and secondary metrics; calibration and confidence intervals if applicable.

**Error Analysis:** Segment performance by cohort/time; investigate failure modes.

**Reporting:** Figures/tables for stakeholders and decision makers.


In [None]:
# Evaluate model and save simple plots
import sys
from pathlib import Path
sys.path.append(str(Path.cwd() / 'src'))
from yourproj.eval import main as eval_main
CONFIG = 'configs/exp_baseline.yaml'
eval_main(CONFIG)


<details><summary><code>src/yourproj/eval.py</code></summary>

```python
from __future__ import annotations
from pathlib import Path
import json
import matplotlib.pyplot as plt
from .config import load_config

def main(config_path: str = "configs/exp_baseline.yaml"):
    """Print the saved metrics and save a bar chart of accuracy/AUC.

    Reads ``metrics.json`` from the artifacts directory, prints its contents,
    and, if ``accuracy`` and/or ``auc`` hold numeric values, writes a bar chart
    to ``<artifacts>/figures/metrics.pdf``.

    Args:
        config_path: Path handed to ``load_config``. The resulting config is
            read for ``paths['artifacts']``.
    """
    cfg = load_config(config_path)
    artifacts_dir = Path(cfg.paths['artifacts'])
    with open(artifacts_dir / "metrics.json") as fh:
        metrics = json.load(fh)
    print("Metrics:", metrics)

    # Keep only the plottable scores; a null AUC (stored as None) is skipped
    # because it is not numeric.
    plottable = {}
    for name, value in metrics.items():
        if name in ["accuracy", "auc"] and isinstance(value, (int, float)):
            plottable[name] = value
    if not plottable:
        return

    plt.bar(list(plottable.keys()), list(plottable.values()))
    plt.title("Metrics")
    fig_path = artifacts_dir / "figures" / "metrics.pdf"
    fig_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig_path)
    print("Saved", fig_path)

if __name__ == "__main__":
    import sys
    # Optional first CLI argument overrides the default config path.
    cli_config = sys.argv[1:2]
    main(cli_config[0] if cli_config else "configs/exp_baseline.yaml")
```
</details>
