In [6]:
# from google.colab import drive
# drive.flush_and_unmount()           # ignore errors if already unmounted

# If the drive cannot be remounted, delete the stale mount point (command below) and then remount.
# rm -rf /content/drive


In [7]:
# Colab only: attach Google Drive so the course repo lives on persistent storage.
from google.colab import drive

# force_remount=True requests a fresh mount even if one is already registered.
drive.mount(
    '/content/drive',
    force_remount=True,
)



Mounted at /content/drive


In [8]:
# --- Repository coordinates: adjust these two for YOUR repo ---
REPO_OWNER = "kadkins3880"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

# --- Locations derived from the values above (Colab + Google Drive layout) ---
BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

# Office-computer alternative: uncomment all three lines to override the values above.
# REPO_NAME  = "lectureNotes"   # e.g., on my office computer
# BASE_DIR = r"E:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 4160 Productivity Tools" # on my office computer
# CLONE_DIR  = f"{BASE_DIR}\{REPO_NAME}"

import os, pathlib

# The parent folder must exist before the repo is cloned into it.
os.makedirs(BASE_DIR, exist_ok=True)


In [9]:
import os, subprocess, shutil, pathlib

# Clone on the first run; afterwards reuse the existing checkout.
# Git is invoked through subprocess with an argument list (not `!git clone ...`)
# so a CLONE_DIR containing spaces stays a single argument, and a failed clone
# raises here instead of surfacing later as a confusing chdir error.
if not pathlib.Path(CLONE_DIR).exists():
    clone = subprocess.run(
        ["git", "clone", REPO_URL, CLONE_DIR],
        capture_output=True,
        text=True,
    )
    # Echo both captured streams so git's messages still show up in the cell output.
    print((clone.stdout + clone.stderr).strip())
    clone.check_returncode()  # raises subprocess.CalledProcessError on a non-zero exit

# An existing folder is used as-is. To refresh it, run one of these manually
# from inside the checkout:
# !git status
# !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


## Session 17 — Feature Timing, Biases & Leakage

### Learning goals

By the end of class, students can:

1.  Explain and **avoid look-ahead** and **survivorship** biases.
2.  Freeze and use a **static ticker universe** chosen from the **train window** (not the whole history).
3.  Define labels correctly (e.g., **t+1** and **t+5**) and verify them with tests.
4.  Add **leakage tests** that recompute trusted features and fail on any future-peek.

------------------------------------------------------------------------

## Agenda

-   what leakage looks like; examples; how it sneaks in

-   survivorship bias (today's constituents ≠ past reality); freezing a universe

-   label definitions (t+1, multi-step) and alignment rules

-   **In-class lab**:

    1.  Freeze a static universe from the first split's train window
    2.  Add leakage tests that recompute known-good features
    3.  Add multi-step labels (e.g., t+5) with tests

-   Wrap-up & homework brief

------------------------------------------------------------------------

### What is data leakage?

-   **Look-ahead leakage:** using any info from *t+1* or later to compute features at *t*, or scaling/normalizing train and validation together.
-   **Common culprits:** `shift(-1)` in features, global scaling fit on full data, forward-fill across split boundaries, using today's close to predict today's close.

### Survivorship bias

-   Using **today's index membership** to pick tickers for the past ⇒ drops delisted/removed names ⇒ **optimistically biased** results.
-   **Cure:** freeze a **static universe** from the **training window** (e.g., all tickers with ≥ 252 observations by the end of the first train window). Save it and **filter by it** for all future experiments.

### Label definitions (be explicit)

-   **t+1 log return**: `r_1d = log_return.shift(-1)` per ticker (your Session-9 label).
-   **t+5 log return** (multi-step): `r_5d = log_return.shift(-1) + … + log_return.shift(-5)` per ticker.
-   Rules: labels come from the **future**; features come from **≤ t**. Splits with an **embargo** reduce adjacency leakage.

------------------------------------------------------------------------

## In-class lab (Colab-friendly)

> Run each block as its own cell. Update `REPO_NAME` as needed.

### 0) Setup & load data (with fallbacks)


In [10]:
import os, pathlib, numpy as np, pandas as pd
from pathlib import Path

# Project folders used by the lab, created relative to the current working directory.
project_dirs = ("data/raw", "data/processed", "data/static", "reports", "scripts", "tests")
for folder in map(Path, project_dirs):
    folder.mkdir(parents=True, exist_ok=True)
print("Working dir:", os.getcwd())

# Returns table: read the cached parquet when present, otherwise synthesize a small one and cache it.
rpath = Path("data/processed/returns.parquet")
if not rpath.exists():
    # Fallback data: seeded random-walk prices for 8 tickers over 360 business days.
    rng = np.random.default_rng(0)
    dates = pd.bdate_range("2022-01-03", periods=360)
    n_days = len(dates)
    frames = []
    for symbol in ("AAPL","MSFT","GOOGL","AMZN","NVDA","TSLA","META","NFLX"):
        shocks = rng.normal(0, 0.012, size=n_days).astype("float32")
        price = 100 * np.exp(np.cumsum(shocks))
        # The first log return is undefined, hence the leading NaN.
        log_ret = np.r_[np.nan, np.diff(np.log(price))].astype("float32")
        one = pd.DataFrame({
            "date": dates,
            "ticker": symbol,
            "adj_close": price.astype("float32"),
            "log_return": log_ret,
        })
        # Target: next row's return (shift(-1) pulls the following row up to t).
        one["r_1d"] = one["log_return"].shift(-1)
        one["weekday"] = one["date"].dt.weekday.astype("int8")
        one["month"] = one["date"].dt.month.astype("int8")
        frames.append(one)
    # dropna removes each ticker's first row (no return) and last row (no target).
    returns = pd.concat(frames, ignore_index=True).dropna().reset_index(drop=True)
    returns["ticker"] = returns["ticker"].astype("category")
    returns.to_parquet(rpath, index=False)
else:
    returns = pd.read_parquet(rpath)

# Load features_v1 or construct minimal lags for tests
fpath = Path("data/processed/features_v1.parquet")
if fpath.exists():
    feats = pd.read_parquet(fpath).sort_values(["ticker","date"]).reset_index(drop=True)
else:
    feats = returns.sort_values(["ticker","date"]).copy()
    # observed=True: `ticker` may be categorical here; stating it explicitly avoids
    # pandas' FutureWarning about the changing default and skips unused categories.
    for k in [1,2,3]:
        # positive shift = look back k rows within the ticker (a lag, never a peek ahead)
        feats[f"lag{k}"] = feats.groupby("ticker", observed=True)["log_return"].shift(k)
    # groupby(...).rolling(...) yields a (ticker, original index) MultiIndex; dropping
    # level 0 restores the original labels so the assignment aligns row for row.
    feats["roll_mean_20"] = feats.groupby("ticker", observed=True)["log_return"].rolling(20, min_periods=20).mean().reset_index(level=0, drop=True)
    feats["roll_std_20"]  = feats.groupby("ticker", observed=True)["log_return"].rolling(20, min_periods=20).std().reset_index(level=0, drop=True)
    # 1e-8 keeps the division finite when the rolling std is zero.
    feats["zscore_20"]    = (feats["log_return"] - feats["roll_mean_20"]) / (feats["roll_std_20"] + 1e-8)
    # Drop warm-up rows whose lags/rolling stats are still NaN.
    feats = feats.dropna().reset_index(drop=True)

# Harmonize types and row order so both tables can be filtered and compared consistently.
def _tidy(frame):
    """Coerce `date` to datetime and `ticker` to category in place; return a (ticker, date)-sorted copy."""
    frame["date"] = pd.to_datetime(frame["date"])
    frame["ticker"] = frame["ticker"].astype("category")  # category type is faster
    return frame.sort_values(["ticker","date"]).reset_index(drop=True)

returns, feats = _tidy(returns), _tidy(feats)
returns.head(3), feats.head(3)

Working dir: /content/drive/MyDrive/dspt25/STAT4160


(        date ticker  log_return      r_1d  weekday  month
 0 2020-01-01   AAPL         NaN  0.002987        2      1
 1 2020-01-02   AAPL    0.002987 -0.002741        3      1
 2 2020-01-03   AAPL   -0.002741 -0.008906        4      1,
         date ticker  log_return      r_1d  weekday  month      lag1      lag2  \
 0 2020-01-29   AAPL   -0.018417 -0.002351        2      1 -0.012895 -0.019012   
 1 2020-01-30   AAPL   -0.002351 -0.012675        3      1 -0.018417 -0.012895   
 2 2020-01-31   AAPL   -0.012675  0.002713        4      1 -0.002351 -0.018417   
 
        lag3  roll_mean_20  roll_std_20  zscore_20  ewm_mean_20  ewm_std_20  \
 0 -0.004576     -0.004086     0.008476  -1.690830    -0.005252    0.009304   
 1 -0.019012     -0.004353     0.008324   0.240455    -0.004976    0.008875   
 2 -0.012895     -0.004849     0.008517  -0.918756    -0.005709    0.008745   
 
    exp_mean   exp_std  adj_close   volume  
 0 -0.004086  0.008476  92.154846  1598707  
 1 -0.004003  0.008270  9


### What happens to the index after `groupby`

1. **`groupby("ticker")["log_return"]`**
   This creates groups by ticker but doesn't change the DataFrame yet—it's a grouped Series.

2. **`.rolling(20, min_periods=20).mean()`**
   The rolling mean is computed **within each ticker**. The result is a **Series whose index is a MultiIndex** with two levels:

   * **Level 0** = the group key(s) (here, `'ticker'`)
   * **Level 1** = the *original index* of `feats` (e.g., your DateTimeIndex or RangeIndex)

   So at this point the result is indexed like `(ticker, original_index)`.

3. **`.reset_index(level=0, drop=True)`**
   This removes (drops) **level 0** of the MultiIndex (i.e., the `'ticker'` level), leaving you with just the **original index**.
   Dropping that level ensures the resulting Series lines up 1:1 with `feats`'s index, so you can safely assign it as a new column.

> **"What is level 0?"**
> In a pandas **MultiIndex**, *level 0* is simply the **outermost** index level. In this case, it's the `'ticker'` level that was introduced by `groupby`.

---

## Minimal example (with tiny window to make it easy to see)

```python
import pandas as pd

# Example data: 2 tickers over 6 days
feats = pd.DataFrame(
    {
        "ticker": ["A","A","A","B","B","B"],
        "log_return": [0.01, -0.02, 0.03, 0.10, 0.05, -0.02],
    },
    index=pd.date_range("2024-01-01", periods=6, freq="D"),
)
feats.index.name = "date"
```

### Rolling mean within each ticker (window=3 here just for display)

```python
s = feats.groupby("ticker")["log_return"].rolling(3, min_periods=1).mean()
```

**Index after groupby+rolling:** (note the two levels: `ticker` then `date`)

```
ticker  date
A       2024-01-01    0.010000
        2024-01-02   -0.005000
        2024-01-03    0.006667
B       2024-01-04    0.100000
        2024-01-05    0.075000
        2024-01-06    0.043333
```

Here, **level 0** is `'ticker'`. **Level 1** is the original index (`'date'`).

### Drop level 0 so it aligns with `feats`' index

```python
s_aligned = s.reset_index(level=0, drop=True)
```

Now the index is **just the original** (`date`):

```
date
2024-01-01    0.010000
2024-01-02   -0.005000
2024-01-03    0.006667
2024-01-04    0.100000
2024-01-05    0.075000
2024-01-06    0.043333
```

### Assign back

```python
feats["roll_mean_3"] = s_aligned
```

Result:

```
             ticker  log_return  roll_mean_3
date
2024-01-01      A        0.01       0.010000
2024-01-02      A       -0.02      -0.005000
2024-01-03      A        0.03       0.006667
2024-01-04      B        0.10       0.100000
2024-01-05      B        0.05       0.075000
2024-01-06      B       -0.02       0.043333
```

---

### Two small gotchas

* **Order vs. labels:** after `groupby(...).rolling(...).mean()`, the MultiIndex groups values by ticker. After you `reset_index(level=0, drop=True)`, the Series's **index labels** match `feats`' index, but the **order** may be grouped by ticker. That's okay—pandas aligns on labels when assigning. If you *also* want the original row order in that Series, you can do:

  ```python
  s_aligned = s.reset_index(level=0, drop=True).sort_index()
  ```

* **A simpler alternative:** you can avoid the MultiIndex entirely by using `transform`, which preserves the original index:

  ```python
  feats["roll_mean_20"] = (
      feats.groupby("ticker")["log_return"]
           .transform(lambda s: s.rolling(20, min_periods=20).mean())
  )
  ```




# 1) Freeze a **static universe** from the **first split's train window**

In [11]:
import numpy as np, pandas as pd

def make_rolling_origin_splits(dates, train_min=252, val_size=63, step=63, embargo=5):
    """Build expanding-window (rolling-origin) train/validation date splits.

    The unique dates are sorted. Every split trains on ``[first_date, train_end]``
    and validates on a later window that starts ``embargo + 1`` positions after
    ``train_end`` and spans ``val_size`` dates. ``train_end`` then advances by
    ``step`` dates for the next split.

    Parameters
    ----------
    dates : iterable of date-like
        Any values accepted by ``pd.to_datetime``; duplicates are ignored.
    train_min : int, default 252
        Number of unique dates in the first training window (>= 1).
    val_size : int, default 63
        Number of unique dates in each validation window (>= 1).
    step : int, default 63
        How many dates the training end moves forward between splits (>= 1).
    embargo : int, default 5
        Number of dates skipped between the training end and the validation start (>= 0).

    Returns
    -------
    list of tuple
        ``(train_start, train_end, val_start, val_end)`` per split; empty when the
        history is too short for even one split.

    Raises
    ------
    ValueError
        If ``train_min``, ``val_size`` or ``step`` is < 1, or ``embargo`` is negative.
        (``step < 1`` would never advance the window; ``train_min < 1`` would index
        from the end of the date array; a negative embargo would let validation
        overlap training.)
    """
    if train_min < 1 or val_size < 1 or step < 1:
        raise ValueError("train_min, val_size and step must all be >= 1")
    if embargo < 0:
        raise ValueError("embargo must be >= 0")

    u = np.array(sorted(pd.to_datetime(pd.Series(dates).unique())))
    splits = []
    # i is the position of the training end; the training start is always u[0].
    for i in range(train_min - 1, len(u), step):
        vs = i + embargo + 1      # first validation position, after the embargo gap
        ve = vs + val_size - 1    # last validation position (inclusive)
        if ve >= len(u):
            break                 # not enough dates left for a full validation window
        splits.append((u[0], u[i], u[vs], u[ve]))
    return splits

TRAIN_MIN = 80  # adjust this constant: number of dates in the first train window
splits = make_rolling_origin_splits(returns["date"], train_min=TRAIN_MIN, val_size=21, step=21, embargo=5)
assert len(splits) >= 1, "Not enough history for a first split."
a,b,c,d = splits[0]  # train start, train end, validation start, validation end
print("First train window:", a.date(), "→", b.date())

# Eligible = tickers with at least TRAIN_MIN rows by train_end (b)
train_slice = returns[returns["date"].between(a, b)]
# observed=True: `ticker` is categorical, so count only tickers that actually occur
# in the slice; this also avoids pandas' FutureWarning about the changing default.
counts = train_slice.groupby("ticker", observed=True).size()
eligible = counts[counts >= TRAIN_MIN].index.sort_values()  # keep just the ticker names, sorted
universe = pd.DataFrame({"ticker": eligible})
# The file name records the train-end date the universe was frozen at.
univ_name = f"data/static/universe_{b.date()}.csv"
universe.to_csv(univ_name, index=False)
print("Saved static universe:", univ_name, "| tickers:", len(universe))
universe.head()

First train window: 2020-01-01 ‚Üí 2020-04-21


  counts = train_slice.groupby("ticker").size()


Saved static universe: data/static/universe_2020-04-21.csv | tickers: 25


Unnamed: 0,ticker
0,AAPL
1,AMZN
2,BAC
3,CSCO
4,CVX


### `counts = train_slice.groupby("ticker").size()`

* `train_slice.groupby("ticker")` splits the DataFrame into groups by ticker symbol.
* `.size()` returns the **number of rows** in each group ‚Äî that is, how many data points each ticker has.
* The result is a **Series** indexed by ticker:

Example:

```python
import pandas as pd

train_slice = pd.DataFrame({
    "ticker": ["AAPL","AAPL","MSFT","MSFT","MSFT","NVDA"],
    "date": pd.date_range("2024-01-01", periods=6),
    "value": [1,2,3,4,5,6]
})

TRAIN_MIN = 3

counts = train_slice.groupby("ticker").size()
print(counts)
```

Output:

```
ticker
AAPL    2
MSFT    3
NVDA    1
dtype: int64
```

So:

* `'AAPL'` appears 2 times,
* `'MSFT'` 3 times,
* `'NVDA'` 1 time.

---

### `eligible = counts[counts >= TRAIN_MIN].index.sort_values()`

* `counts >= TRAIN_MIN` produces a Boolean mask:

  ```
  AAPL    False
  MSFT     True
  NVDA    False
  dtype: bool
  ```
* `counts[counts >= TRAIN_MIN]` filters the Series to include only the tickers satisfying the condition (here, at least 3 data points):

  ```
  ticker
  MSFT    3
  dtype: int64
  ```
* `.index` extracts just the tickers (`Index(["MSFT"], dtype=object)`).
* `.sort_values()` sorts that index alphabetically.

Final result:

```python
eligible
# Index(['MSFT'], dtype='object')
```



> From now on, **filter** your data to `universe` before modeling/evaluation.

# 2) Apply the static universe to your features

In [12]:
# Keep only rows whose ticker belongs to the frozen universe, then persist the result.
in_universe = feats["ticker"].isin(set(universe["ticker"]))
feats_static = feats.loc[in_universe].copy()
feats_static.to_parquet(
    "data/processed/features_v1_static.parquet",
    compression="zstd",
    index=False,
)
print("Wrote data/processed/features_v1_static.parquet", feats_static.shape)

Wrote data/processed/features_v1_static.parquet (3975, 18)


# 3) Add **leakage tests** that recompute trusted features & compare

Create a high-value test file that **fails** if any feature depends on future rows.

In [13]:
# tests/test_leakage_features.py
from __future__ import annotations # stop evaluating type annotations before Python 3.14, see more details below
import numpy as np, pandas as pd
import pytest

SAFE_ROLL = 20

@pytest.fixture(scope="session") # scope="session": built once per pytest run and shared by all tests
def df():
    """Load the feature table under test, sorted by (ticker, date) with a fresh index.

    The static-universe parquet is preferred; when it does not exist the plain
    features_v1 parquet is read instead (without checking that it exists).
    """
    import pandas as pd
    import pathlib
    candidates = (
        pathlib.Path("data/processed/features_v1_static.parquet"),
        pathlib.Path("data/processed/features_v1.parquet"),
    )
    source = candidates[0] if candidates[0].exists() else candidates[1]
    frame = pd.read_parquet(source).sort_values(["ticker","date"]).reset_index(drop=True)
    frame["date"] = pd.to_datetime(frame["date"])
    return frame

def test_label_definition_r1d(df):
    for tkr, g in df.groupby("ticker"):
        assert g["r_1d"].iloc[:-1].equals(g["log_return"].iloc[1:]), f"r_1d mismatch for {tkr}"

def _recompute_safe(g: pd.DataFrame) -> pd.DataFrame:
    """Rebuild reference features for one ticker using only rows up to and including t.

    ``g`` must contain ``log_return``; ``rsi_14`` is added only when ``adj_close``
    is also a column. The result shares ``g``'s index and holds lag1-3, the
    SAFE_ROLL-window rolling mean/std and z-score, expanding mean/std, and
    span-20 EWM mean/std.
    """
    ret = g["log_return"]

    # Trailing-window, expanding and exponentially weighted views of the same series.
    trailing = ret.rolling(SAFE_ROLL, min_periods=SAFE_ROLL)
    roll_mean, roll_sd = trailing.mean(), trailing.std()
    growing = ret.expanding(min_periods=SAFE_ROLL)  # window grows from the first row
    smoothed = ret.ewm(span=20, adjust=False)       # adjust=False: recursive formula

    cols = {f"lag{k}": ret.shift(k) for k in (1, 2, 3)}
    cols["roll_mean_20"] = roll_mean
    cols["roll_std_20"] = roll_sd
    cols["zscore_20"] = (ret - roll_mean) / (roll_sd + 1e-8)
    cols["exp_mean"] = growing.mean()
    cols["exp_std"] = growing.std()
    cols["ewm_mean_20"] = smoothed.mean()
    cols["ewm_std_20"] = smoothed.std()

    # RSI(14) needs prices, so it is computed only when adj_close is available.
    if "adj_close" in g:
        move = g["adj_close"].diff()
        gain = move.clip(lower=0).ewm(alpha=1/14, adjust=False).mean()
        loss = (-move.clip(upper=0)).ewm(alpha=1/14, adjust=False).mean()
        strength = gain / (loss + 1e-12)
        cols["rsi_14"] = 100 - (100/(1+strength))

    out = pd.DataFrame(index=g.index)
    for name, series in cols.items():
        out[name] = series
    return out

@pytest.mark.parametrize("col", ["lag1","lag2","lag3","roll_mean_20","roll_std_20","zscore_20","exp_mean","exp_std","ewm_mean_20","ewm_std_20","rsi_14"])
# run pytest once for each value in the list of "col"
def test_features_match_causal_recompute(df, col):
    """Fail if stored feature ``col`` differs from a recompute that only uses rows <= t.

    The test is skipped when ``col`` is not in the frame. Each ticker is compared
    separately, and only positions where both the stored and the recomputed value
    are finite are checked (warm-up NaNs are ignored). Tolerance is 1e-6.
    """
    if col not in df.columns:
        pytest.skip(f"{col} not present")
    # Compare per ticker to avoid cross-group alignment issues.
    # observed=True skips unused categories when `ticker` is categorical.
    # NOTE(review): exp_*, ewm_* and rsi_14 depend on the whole history; if the stored
    # frame dropped warm-up rows after the features were built, this recompute starts
    # from a different origin -- confirm against the feature builder.
    for tkr, g in df.groupby("ticker", sort=False, observed=True):
        ref = _recompute_safe(g)
        if col not in ref.columns:
            continue
        a = g[col].to_numpy()
        b = ref[col].to_numpy()
        # Allow NaNs at the start; compare where both finite
        mask = np.isfinite(a) & np.isfinite(b)   # elementwise and
        if not mask.any():
            continue
        diff = np.max(np.abs(a[mask] - b[mask]))  # mask guarantees finite inputs
        assert float(diff) <= 1e-6, f"{col} deviates from causal recompute for {tkr}: max |Δ|={diff}"

def test_no_feature_equals_target(df):
    y = df["r_1d"].to_numpy()
    for col in df.select_dtypes(include=["float32","float64"]).columns:
        if col in {"r_1d","log_return"}:
            continue
        x = df[col].to_numpy()
        # Proportion of exact equality (within tiny tol) should not be high
        eq = np.isfinite(x) & np.isfinite(y) & (np.abs(x - y) < 1e-12)
        assert eq.mean() < 0.8, f"Suspicious: feature {col} equals target too often"

`from __future__ import annotations` is (was) a per-module switch that **stops Python from immediately evaluating type annotations**. Instead, the raw text of each annotation is stored (as a string) and only interpreted later by tools like `typing.get_type_hints`. This was introduced by **PEP 563** and is known as *postponed (stringified) annotations*. ([Python Enhancement Proposals (PEPs)][1])

---

## Why it existed

Before Python 3.14, annotations were normally evaluated *eagerly* at definition time. That caused headaches like:

* **Forward references** failed unless you quoted them (`"User"`) or reordered code.
* **Import cycles** and **import-time work** triggered by annotations.

Opting into `from __future__ import annotations` solved this by storing `'User'` instead of looking up `User` immediately. ([Python documentation][2])

**Example (pre-3.14 behavior):**

```py
# without the future import (<=3.13 default)
def f(x: C) -> None: ...
class C: pass
# NameError at function definition time (C not yet defined)

# with the future import (3.7+)
from __future__ import annotations
def f(x: C) -> None: ...
class C: pass
print(f.__annotations__)         # {'x': 'C', 'return': 'None'}
```

(The same example and outcomes are described in the 3.14 docs' "Annotation semantics" section.) ([Python documentation][2])

---

## What's the status **today** (Python 3.14+)

Starting with **Python 3.14**, the default model changed to **deferred evaluation** (PEP 649 / PEP 749): annotations are computed lazily *when you ask for them*, not at import time—so forward references work out of the box, and you don't need this future import anymore. ([Python documentation][3])

> **Important:** In 3.14+, if you *do* keep `from __future__ import annotations`, you force the older **stringified** behavior in that module (for backward compatibility). The docs note this legacy switch "will eventually be deprecated and removed." ([Python documentation][2])

To inspect annotations under the new model, Python 3.14 adds **`annotationlib`** with `get_annotations()` and formats to get evaluated values, forward-refs, or strings:

```py
from annotationlib import get_annotations, Format
get_annotations(f, format=Format.VALUE)       # evaluated
get_annotations(f, format=Format.FORWARDREF)  # safe proxies
get_annotations(f, format=Format.STRING)      # strings
```

See the new library docs for details. ([Python documentation][2])

---

## When should you use it?

* **Targeting Python ≤ 3.13:**
  Use `from __future__ import annotations` if you want easy forward references and to avoid import-time evaluation of annotations. Tools can evaluate them later via `typing.get_type_hints`. ([Python Enhancement Proposals (PEPs)][1])
* **Targeting Python ≥ 3.14:**
  **Don't use it.** The default already defers evaluation; keeping the future import reverts you to stringified annotations and is slated for deprecation/removal. ([Python documentation][3])

---

## Where it must appear

Future statements are **per-module** and must be placed **at the very top** of the file (after the module docstring and other future statements, before any other code). Otherwise you'll get a `SyntaxError`. ([Python documentation][4])

---

## Quick cheat-sheet

* **Effect (3.7–3.13 with the future import):**
  `__annotations__` contains **strings**; evaluation is postponed. ([Python Enhancement Proposals (PEPs)][1])
* **Effect (3.14+ by default):**
  Annotations are **lazy** (computed on demand). Keep the future import only if you intentionally want strings—for legacy reasons. ([Python documentation][3])
* **Introspecting:**
  Use `typing.get_type_hints(...)` on older versions; on 3.14+ you can also use `annotationlib.get_annotations(..., format=...)`. ([Python documentation][2])



[1]: https://peps.python.org/pep-0563/ "PEP 563 – Postponed Evaluation of Annotations"
[2]: https://docs.python.org/3/library/annotationlib.html "annotationlib — Functionality for introspecting annotations — Python 3.14.0 documentation"
[3]: https://docs.python.org/3/whatsnew/3.14.html "What's new in Python 3.14"
[4]: https://docs.python.org/3/reference/simple_stmts.html "7. Simple statements"


##  1. A decorator is just a function that **takes another function (or class)** and **returns a new one**.

Think of it as:

> "Wrap a function with extra behavior — before, after, or around its execution — without changing its code."

---

### Basic example

```python
def my_decorator(func):
    def wrapper():
        print("Before the function runs")
        func()
        print("After the function runs")
    return wrapper
```

Now decorate a function:

```python
@my_decorator
def say_hello():
    print("Hello!")

say_hello()
```

Output:

```
Before the function runs
Hello!
After the function runs
```

---

### What actually happens under the hood

The line:

```python
@my_decorator
def say_hello():
    ...
```

is **syntactic sugar** for:

```python
def say_hello():
    ...
say_hello = my_decorator(say_hello)
```

So the name `say_hello` now **refers to the `wrapper` function** returned by `my_decorator`.

---

##  2. Decorators can add *extra logic* — logging, timing, validation, caching, etc.

Example: measure how long a function takes

```python
import time

def timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"{func.__name__} took {end - start:.3f}s")
        return result
    return wrapper

@timer
def slow_add(a, b):
    time.sleep(1)
    return a + b

slow_add(3, 4)
```

Output:

```
slow_add took 1.001s
```

---

## 3. Decorators can accept arguments too

If you want a decorator that itself takes arguments (e.g. a retry count, a logging level), you need an *extra layer of function nesting*:

```python
def repeat(n_times):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for _ in range(n_times):
                func(*args, **kwargs)
        return wrapper
    return decorator

@repeat(3)
def greet():
    print("Hello!")

greet()
```

Output:

```
Hello!
Hello!
Hello!
```

---

## 4. Preserving function metadata (`functools.wraps`)

Without help, the decorated function loses its original name and docstring:

```python
print(slow_add.__name__)  # 'wrapper'  😬
```

To fix that, use `functools.wraps(func)`:

```python
from functools import wraps

def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        ...
    return wrapper
```

Now:

```python
print(slow_add.__name__)  # 'slow_add' ✅
```

---

##  5. Decorators can also be used on classes or methods

For instance, applying `@classmethod`, `@staticmethod`, `@property`, or even custom decorators to modify class behavior.

Example:

```python
def uppercase(method):
    @wraps(method)
    def wrapper(*args, **kwargs):
        result = method(*args, **kwargs)
        return result.upper()
    return wrapper

class Greeter:
    @uppercase
    def greet(self):
        return "hello world"

print(Greeter().greet())  # 'HELLO WORLD'
```

---

##  6. Summary

| Concept                   | Meaning                                                          |
| ------------------------- | ---------------------------------------------------------------- |
| **Decorator**             | Function that takes another function/class and returns a new one |
| **`@decorator` syntax**   | Equivalent to `func = decorator(func)`                           |
| **Used for**              | Logging, validation, timing, caching, permissions, etc.          |
| **Use `functools.wraps`** | To keep original function's metadata                             |
| **Can be stacked**        | Multiple decorators can wrap each other                          |

---




In **pytest**, the `@pytest.fixture` decorator can take a `scope` argument that controls **how long the fixture object lives** — that is, how often pytest creates and destroys it.



###  Basic idea

```python
@pytest.fixture(scope="session")
def db_connection():
    print("Setting up database connection...")
    yield connect_to_db()
    print("Tearing down database connection...")
```

Here, `scope="session"` means:

> The fixture is **created once per entire test session**, shared across **all tests**, and **destroyed at the end** of the run.

So every test that depends on `db_connection` will reuse the same instance.

---

###  All possible scope levels

| Scope        | Lifetime                                            | Typical Use                                               |
| ------------ | --------------------------------------------------- | --------------------------------------------------------- |
| `"function"` | Default — new fixture per test function             | Isolated tests; avoids cross-test state                   |
| `"class"`    | One fixture per test class                          | Expensive setup reused by tests in same class             |
| `"module"`   | One fixture per Python module (file)                | Shared state across all tests in one file                 |
| `"package"`  | One fixture per package (folder with `__init__.py`) | Rarely used; persists for all tests in that package       |
| `"session"`  | One fixture for the entire pytest run               | Global resources (DB, API client, temporary folder, etc.) |

---

###  Lifetime & teardown sequence

Fixtures are **created lazily** — only when first requested — and **torn down in reverse order** of creation at the end of their scope.

For `scope="session"`, the fixture:

1. Is initialized once, the first time a test (or another fixture) requests it.
2. Is reused across all modules, classes, and functions.
3. Is finalized when pytest is exiting (after all tests complete).

---

###  Example of multiple scopes

```python
import pytest

@pytest.fixture(scope="session")
def db():
    print("Connecting to database")
    yield "DB_CONNECTION"
    print("Closing database")

@pytest.fixture(scope="module")
def dataset(db):
    print("Loading dataset from", db)
    yield "DATASET"
    print("Unloading dataset")

def test_a(dataset):
    print("Test A using", dataset)

def test_b(dataset):
    print("Test B using", dataset)
```

**Output (simplified):**

```
Connecting to database      ← once per session
Loading dataset from DB_CONNECTION
Test A using DATASET
Test B using DATASET
Unloading dataset           ← once per module
Closing database            ← once per session
```

---


## 🔹 The code defines two fixtures

### 1️⃣ `db` — session-scoped

```python
@pytest.fixture(scope="session")
def db():
    print("Connecting to database")
    yield "DB_CONNECTION"
    print("Closing database")
```

* `scope="session"` → created **once for the entire pytest run**.
* When first needed, pytest runs the setup part (before `yield`).
* The value `"DB_CONNECTION"` is passed to any test or fixture that depends on `db`.
* After *all* tests finish, pytest resumes after the `yield` → teardown runs: `"Closing database"`.

So think of it as a **global resource** ‚Äî e.g., one connection reused everywhere.

---

### 2️⃣ `dataset` — module-scoped

```python
@pytest.fixture(scope="module")
def dataset(db):
    print("Loading dataset from", db)
    yield "DATASET"
    print("Unloading dataset")
```

* `scope="module"` → created **once per test file (module)**.
* Depends on `db`, so pytest first ensures `db` is ready.
* The value it yields (`"DATASET"`) is passed to any test using `dataset`.
* When the module's tests finish, pytest resumes the fixture after the `yield` and runs its teardown code.

So within a file, all tests share one `"DATASET"`; but in a different test file, pytest would call this fixture again.

---

## The two test functions

```python
def test_a(dataset):
    print("Test A using", dataset)

def test_b(dataset):
    print("Test B using", dataset)
```

Both tests depend on the `dataset` fixture.

---

## 🔹 Execution flow step-by-step

1. **Pytest starts the session.**

   * It sees that `test_a` and `test_b` both require `dataset`, which in turn requires `db`.

2. **Before the first test, it resolves dependencies:**

   * No `db` yet ‚Üí create it:

     ```
     Connecting to database
     ```
   * No `dataset` yet (for this module) ‚Üí create it, using `db`:

     ```
     Loading dataset from DB_CONNECTION
     ```

3. **Run the tests (both share same `dataset`):**

   ```
   Test A using DATASET
   Test B using DATASET
   ```

4. **Module finishes ‚Üí teardown `dataset`:**

   ```
   Unloading dataset
   ```

5. **All tests finished ‚Üí teardown `db`:**

   ```
   Closing database
   ```

---

##  Why it behaves that way

* Pytest builds a **fixture dependency tree**.
* Fixtures with **larger scopes** live longer:

  ```
  session > package > module > class > function
  ```
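* When a fixture depends on another, pytest requires the *dependency* to have an equal or wider scope.
  (A wider-scoped fixture cannot depend on a narrower one — e.g. a `session` fixture requesting a `function` fixture raises `ScopeMismatch`. The reverse, as in this example, is fine.)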

So here:

* `dataset (module)` depends on `db (session)`
* `db` stays alive until the end of session.
* `dataset` stays alive until end of this module.

---

##  Summary of lifetime

| Fixture            | Scope      | Created                       | Destroyed                           |
| ------------------ | ---------- | ----------------------------- | ----------------------------------- |
| `db`               | `session`  | Once at first use in any test | After all tests finish              |
| `dataset`          | `module`   | Once per test file            | After all tests in that file finish |
| `test_a`, `test_b` | `function` | Called per test               | End after each test                 |

---

### Final output order (as printed)

```
Connecting to database               ← db setup (once)
Loading dataset from DB_CONNECTION   ← dataset setup (once per module)
Test A using DATASET                 ← test function
Test B using DATASET                 ← test function
Unloading dataset                    ← dataset teardown
Closing database                     ← db teardown (end of session)
```

---

**Exercise:** extend this example with a **function-scoped** fixture to see how its setup/teardown runs before and after *each* test.


###  When to use `scope="session"`

Use it when:

* Setup is **expensive** and can safely be **shared globally** (e.g. DB, Docker container, web server).
* Tests **don‚Äôt modify** shared state or are designed to handle concurrency.
* You want **fast test suites** that reuse a single resource.

Avoid it when:

* Tests rely on **isolation** or mutate the resource.
* The resource depends on per-test configuration.

---

### Summary

* `scope="session"` → fixture lives for the whole pytest run.
* Created **once**, shared **everywhere**, destroyed **at the very end**.
* Great for global connections, caches, or initialization steps.
* The other scopes are narrower; from shortest- to longest-lived they are `function`, `class`, `module`, `package`, and finally `session`.

---





## 1. `expanding(min_periods=SAFE_ROLL).std()`

### Meaning

This uses **expanding windows** ‚Äî cumulative statistics from the start up to the current point.

```python
out["exp_std"] = s.expanding(min_periods=SAFE_ROLL).std()
```

* `s.expanding(...)` ‚Üí returns an *Expanding* object that represents
  ‚Äúall data from the beginning up to this index‚Äù.
* `min_periods=SAFE_ROLL` ‚Üí until you have at least `SAFE_ROLL` observations,
  it returns `NaN`.

So at index *i*, this is:

$$
\mathrm{exp\_std}_i = \operatorname{std}(s_0, s_1, \dots, s_i)
\quad \text{(if } i+1 \ge \mathrm{SAFE\_ROLL}\text{)}
$$

### Example

If `SAFE_ROLL = 3` and
`s = [1, 2, 4, 8]`, then:

| index | values used  | std    |
| ----- | ------------ | ------ |
| 0     | (not enough) | NaN    |
| 1     | (not enough) | NaN    |
| 2     | [1, 2, 4]    | ≈ 1.53 |
| 3     | [1, 2, 4, 8] | ≈ 3.10 |

It’s a **growing window**: output starts at the first index that has at least `SAFE_ROLL` observations, and every later value uses all data seen so far.

---

## 2. `ewm(span=20, adjust=False).mean()`

### Meaning

This uses an **exponentially weighted moving average** (EWMA).

```python
out["ewm_mean_20"] = s.ewm(span=20, adjust=False).mean()
```

* `span=20` controls the *decay rate* (analogous to a 20-period moving average).
* Recent values get **more weight**, older values **less weight**, with exponential decay.
* `adjust=False` → makes it compute the recursive form:

$$
  y_t = (1 - \alpha) \, y_{t-1} + \alpha \, x_t,
  \quad \text{where } \alpha = \frac{2}{\text{span} + 1}.
 $$

If `span=20`, then `α ≈ 0.095`.

This is smoother and more responsive than a simple rolling mean.

---

### Difference from rolling mean

| Type                        | Uses fixed window?               | Weights           | Behavior                             |
| --------------------------- | -------------------------------- | ----------------- | ------------------------------------ |
| `rolling(window=20).mean()` | ✅ Fixed last 20 points           | Equal             | Sharp edges (drops oldest instantly) |
| `ewm(span=20).mean()`       | ❌ Infinite window                | Exponential decay | Smooth adaptation                    |
| `expanding().mean()`        | ❌ Growing window (all past data) | Equal             | Becomes stable slowly                |

---

### Example

If `s = [1, 2, 3, 4, 5]` and `span=2` (so `α = 2/3 ≈ 0.67`, using `adjust=False`):

| index | formula                 | result |
| ----- | ----------------------- | ------ |
| 0     | y₀ = 1                  | 1.00   |
| 1     | y₁ = 0.33·1 + 0.67·2    | 1.67   |
| 2     | y₂ = 0.33·1.67 + 0.67·3 | 2.56   |
| 3     | y₃ = 0.33·2.56 + 0.67·4 | 3.52   |
| 4     | y₄ = 0.33·3.52 + 0.67·5 | 4.51   |

---

##  Summary

| Feature                | Window Type                     | Formula                   | Description                                   |
| ---------------------- | ------------------------------- | ------------------------- | --------------------------------------------- |
| `expanding(...).std()` | Growing (all data)              | Std. of all points so far | Measures long-term volatility as sample grows |
| `ewm(span=20).mean()`  | Infinite with exponential decay | Weighted mean, Œ±=2/(20+1) | Tracks recent trend smoothly                  |

---




The general form of the  weighted mean is:

$$
\text{WM}_t = \frac{\sum_{i=0}^{t} w_i x_i}{\sum_{i=0}^{t} w_i},
$$

where the weights $ w_i $ **decay exponentially** as observations get older.

In Pandas:

```python
s.ewm(span=20, adjust=...).mean()
```

The key question is **how** those weights $ w_i $ are normalized and updated as new data arrive ‚Äî that‚Äôs what `adjust` controls.

---

##  When `adjust=True` (the default)

This is the **textbook (exact)** exponentially weighted formula:

$$
y_t = \frac{\sum_{i=0}^{t} (1 - \alpha)^{t - i} \, x_i}{\sum_{i=0}^{t} (1 - \alpha)^{t - i}},
$$
where
$$
\alpha = \frac{2}{\text{span} + 1}.
$$

Each point contributes explicitly with its weight.
Older points keep a geometric weight of $(1 - \alpha)^k$.

---

###  Interpretation

* It‚Äôs the **"batch"** version ‚Äî computes the weighted average over all points seen so far.
* The denominator (sum of weights) grows toward $1 / \alpha$ asymptotically.
* You can view it as ‚Äúthe exact result you‚Äôd get if you recomputed the whole weighted average from scratch each time‚Äù.

---

### Example

Let‚Äôs take a short sequence:

```python
import pandas as pd
s = pd.Series([10, 20, 30])
```

With `α = 0.5` (i.e. `span=3`, since α = 2/(span + 1)):

| t | xₜ | Weights (adjust=True)                              | Weighted mean               |
| - | -- | -------------------------------------------------- | --------------------------- |
| 0 | 10 | [1.0]                                              | 10.0                        |
| 1 | 20 | [0.5, 1.0] → normalized → [1/3, 2/3]               | (1/3)·10 + (2/3)·20 = 16.67 |
| 2 | 30 | [0.25, 0.5, 1.0] → normalized → [0.14, 0.29, 0.57] | ≈ 24.29                     |

---

##  When `adjust=False`

Now Pandas uses the **recursive (online)** formulation:

$$
y_t = (1 - \alpha) y_{t-1} + \alpha x_t.
$$

This does *not* renormalize all past weights each time ‚Äî instead, it uses the previous result as a smoothed state.

It gives a **slightly different series**, especially at the beginning, because the normalization is implicit.

---

###  Interpretation

* It‚Äôs the **"recursive / efficient"** form, commonly used in streaming or real-time settings.

* You can think of it as:

  > ‚ÄúUpdate the previous estimate by moving Œ± fraction toward the new observation.‚Äù

* After many points, both `adjust=True` and `adjust=False` converge to almost the same value, but the *initial few points differ*.

---

### Example continued

Same data, same α = 0.5:

| t | xₜ | Formula         | yₜ   |
| - | -- | --------------- | ---- |
| 0 | 10 | y₀ = x₀         | 10.0 |
| 1 | 20 | 0.5·10 + 0.5·20 | 15.0 |
| 2 | 30 | 0.5·15 + 0.5·30 | 22.5 |

Compare:

* `adjust=True` gave `[10.0, 16.67, 24.29]`
* `adjust=False` gave `[10.0, 15.0, 22.5]`

They’re close, but not identical — the *recursive version* lags slightly more at the beginning.

---

##  Summary

| Flag           | Formula                                                           | Type                                               | When to use                                                                              |
| -------------- | ----------------------------------------------------------------- | -------------------------------------------------- | ---------------------------------------------------------------------------------------- |
| `adjust=True`  | $$y_t = \frac{\sum (1-\alpha)^{t-i} x_i}{\sum (1-\alpha)^{t-i}}$$ | *Exact weighted average* (re-normalized each time) | When you want mathematically correct EWM or when comparing to textbook definitions       |
| `adjust=False` | $$y_t = (1-\alpha)y_{t-1} + \alpha x_t$$                          | *Recursive (online) update*                        | When streaming, efficiency, or aligning with real-time filters (EMA, technical analysis) |

---

###  Practical takeaways

* In finance or ML feature engineering, `adjust=False` is most common ‚Äî it mimics an *exponential moving average (EMA)* used in trading and real-time filters.
* In statistical modeling, `adjust=True` corresponds to the *pure exponential weighting formula* that reweights all history at each step.
* After a long time, both give nearly identical results; the difference is mainly in early periods.

---




##  1. `&` ‚Äî *bitwise AND operator*

In most languages (including **Python**, **C**, **C++**, **Java**, **JavaScript**):

* `&` performs a **bitwise AND** between two integers (or booleans, elementwise for arrays in NumPy/Pandas).
* It operates on **each bit individually**.

### Example (bitwise)

```python
a = 6  # binary: 110
b = 3  # binary: 011
print(a & b)   # 010  ‚Üí 2
```

|           a          |  b  | a & b |
| :------------------: | :-: | :---: |
|           1          |  0  |   0   |
|           1          |  1  |   1   |
|           0          |  1  |   0   |
| → result: `010₂ = 2` |     |       |

---

## 2. `&&` ‚Äî *logical AND operator*

* `&&` exists in **C, C++, Java, JavaScript**, etc.
* It performs **short-circuit logical AND**:
  evaluates the left-hand side first; only evaluates the right-hand side if needed.

### Example (logical)

```c
if (x > 0 && y > 0) {
    printf("Both positive\n");
}
```

Here, if `x > 0` is false, the second condition `y > 0` is **not even checked**.

---

##  3. Python **does not have `&&`**

In Python, logical operations use **English keywords**:

| C / Java | Python equivalent |
| -------- | ----------------- |
| `&&`     | `and`             |
| `\|\|`   | `or`              |
| `!`      | `not`             |

So:

```python
if x > 0 and y > 0:
    print("Both positive")
```

Python‚Äôs `and` also **short-circuits**, just like `&&`.

---

##  4. Mixing up `&` and `and` in Python

If you accidentally write:

```python
(x > 0) & (y > 0)
```

it *will* work, but **as a bitwise operation** ‚Äî not a logical short-circuit.

* For **scalars**, it behaves similarly but with stricter typing rules.
* For **NumPy / Pandas**, it‚Äôs **elementwise logical AND**.

### Example with Pandas/NumPy

```python
import pandas as pd
df = pd.DataFrame({"a": [1, -2, 3], "b": [2, 3, -1]})

mask = (df["a"] > 0) & (df["b"] > 0)
print(mask)
```

Output:

```
0     True
1    False
2    False
dtype: bool
```

Works ‚Äî because Pandas overloads `&` for elementwise logical AND.
But if you used `and`, it would error:

```python
(df["a"] > 0 and df["b"] > 0)
# ValueError: The truth value of a Series is ambiguous
```

---

##  5. Summary

| Operator           | Language         | Meaning                       | Short-circuits?                         | Typical use                  |
| ------------------ | ---------------- | ----------------------------- | --------------------------------------- | ---------------------------- |
| `&`                | All              | Bitwise AND / elementwise AND | ❌ No                                    | Integer or array bitwise ops |
| `&&`               | C, C++, Java, JS | Logical AND                   | ✅ Yes                                   | Boolean conditions           |
| `and`              | Python           | Logical AND                   | ✅ Yes                                   | Boolean conditions           |
| `\|`, `\|\|`, `or` | As above         | Bitwise OR / logical OR       | Similar distinction (`\|` ❌; `\|\|`, `or` ✅) | Same as the AND counterparts |

---

### TL;DR

* In **Python**, use:

  * `and` / `or` for logic
  * `&` / `|` for bitwise or elementwise logic (NumPy, Pandas)
* In **C-like languages**, use:

  * `&&` / `||` for logic
  * `&` / `|` for bitwise operations.




Run test now:

In [14]:
!pytest -q tests/test_leakage_features.py

[31mF[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31mF[0m[31mF[0m[31mF[0m[31mF[0m[33ms[0m[32m.[0m[31m                                                            [100%][0m
[31m[1m__________________________ test_label_definition_r1d ___________________________[0m

df =            date ticker  log_return  ...   exp_std  adj_close   volume
0    2020-01-29   AAPL   -0.018417  ...  0.00847...489  82.670891  1777769
3974 2020-09-07    XOM   -0.008826  ...  0.009480  81.944473  1272137

[3975 rows x 18 columns]

    [0m[94mdef[39;49;00m[90m [39;49;00m[92mtest_label_definition_r1d[39;49;00m(df):[90m[39;49;00m
        [94mfor[39;49;00m tkr, g [95min[39;49;00m df.groupby([33m"[39;49;00m[33mticker[39;49;00m[33m"[39;49;00m):[90m[39;49;00m
>           [94massert[39;49;00m g[[33m"[39;49;00m[33mr_1d[39;49;00m[33m"[39;49;00m].iloc[:-[94m1[39;49;00m].equals(g[[33m"[39;49;00m[33mlog_return[39;49;00m[33m"[39;49;00m].iloc[[94m1[39

> If a test fails, **fix the pipeline**, don’t weaken the test.

# 4) Add **multi‚Äëstep labels** (e.g., t+5) and tests

In [15]:
# save to scripts/make_multistep_labels.py
from __future__ import annotations
import pandas as pd, numpy as np
from pathlib import Path

def make_multistep(in_parquet="data/processed/returns.parquet", horizons=(5,),
                   out_parquet="data/processed/returns_multistep.parquet"):
    """Add forward-looking multi-step label columns and write the result to parquet.

    For every horizon ``H`` a column ``r_{H}d`` is added that holds, per ticker,
    the sum of the next ``H`` log returns (``shift(-1) ... shift(-H)``), i.e. the
    cumulative log return over the following ``H`` rows. The last ``H`` rows of
    each ticker are NaN because their future returns are not available.

    Parameters
    ----------
    in_parquet : str or path-like
        Parquet file to read; the code uses its ``ticker``, ``date`` and
        ``log_return`` columns.
    horizons : iterable of int
        Label horizons; each must be >= 1.
    out_parquet : str or path-like
        Destination file. Its parent directory is created if missing.

    Raises
    ------
    ValueError
        If any horizon is smaller than 1.
    """
    df = pd.read_parquet(in_parquet).sort_values(["ticker","date"]).reset_index(drop=True)
    # Group once: the grouping does not depend on the horizon.
    # observed=True does not change shift() results; it presumably silences the
    # warning this line emitted in the notebook output (categorical `ticker` and
    # the deprecated observed=False default) -- TODO confirm against the warning text.
    s = df.groupby("ticker", observed=True)["log_return"]
    for H in horizons:
        if H < 1:
            # Without this guard the accumulator stays None and an all-None
            # label column would be written silently.
            raise ValueError(f"horizon must be >= 1, got {H}")
        acc = None  # running sum of the shifted returns
        for h in range(1, H+1):
            sh = s.shift(-h)  # return realised h rows ahead, within the same ticker
            acc = sh if acc is None else (acc + sh)
        df[f"r_{H}d"] = acc
    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_parquet, compression="zstd", index=False)
    print(f"Wrote {out_parquet}", df.shape)

# Script entry point: build the labels only when this file is executed directly
# (e.g. `python scripts/make_multistep_labels.py`), not when it is imported.
if __name__ == "__main__": # see below for more explanation
    make_multistep()

  s = df.groupby("ticker")["log_return"]


Wrote data/processed/returns_multistep.parquet (4500, 7)


```python
if __name__ == "__main__":
    make_multistep()
```


##  1. What `__name__` is

Every Python file (module) automatically gets a built-in variable called `__name__`.

* When you **run a file directly** (like `python my_script.py`),
  Python sets `__name__ = "__main__"`.
* When you **import the file** from another module,
  `__name__` is set to that module‚Äôs name (e.g. `"my_script"`).

So:

| How the file is used  | Value of `__name__` |
| --------------------- | ------------------- |
| `python my_script.py` | `"__main__"`        |
| `import my_script`    | `"my_script"`       |

---

##  2. The purpose of the `if __name__ == "__main__":` block

It lets you define code that should **run only when the file is executed directly**,
and **not when imported**.

That‚Äôs very useful for separating:

* **Reusable code** (functions, classes) ‚Äî used by imports.
* **Executable script logic** ‚Äî e.g. calling `main()` or `make_multistep()`.

---

##  3. In the example

```python
if __name__ == "__main__":
    make_multistep()
```

means:

> ‚ÄúIf this script is run directly from the command line, call the function `make_multistep()`.‚Äù

If the same file is imported as a module inside another script (for example: `from scripts.make_multistep_labels import make_multistep`),
then this code **will not run automatically**.

---

###  Typical project pattern

```python
def make_multistep():
    # your logic here (e.g. compute multistep targets)
    ...

if __name__ == "__main__":
    make_multistep()
```

* When you run:

  ```bash
  python scripts/make_multistep_labels.py
  ```

  ‚Üí the function executes immediately.
* When you import the same file in another script or notebook,
  you can call `make_multistep()` manually, but the automatic execution is skipped.

---

##  4. Why it‚Äôs good practice

- Prevents unwanted execution when importing code.
- Makes the module both *importable* and *runnable*.
- Common in data pipelines and CLI tools ‚Äî e.g.:

```python
def main():
    parser = argparse.ArgumentParser()
    ...
    args = parser.parse_args()
    run_pipeline(args)

if __name__ == "__main__":
    main()
```

---

##  Summary

| Component                    | Meaning                                                                                      |
| ---------------------------- | -------------------------------------------------------------------------------------------- |
| `__name__`                   | Automatically set by Python to either `"__main__"` (if run) or the module name (if imported) |
| `if __name__ == "__main__":` | Conditional that runs code only when script executed directly                                |
| `make_multistep()`           | Your function ‚Äî called only in standalone execution                                          |
| Purpose                      | Separate script behavior from reusable definitions                                           |

---



In [16]:
!python scripts/make_multistep_labels.py

  s = df.groupby("ticker")["log_return"]
Wrote data/processed/returns_multistep.parquet (4500, 7)


Add a test for label correctness:

In [17]:
# save to tests/test_labels_multistep.py
import pandas as pd, numpy as np

def test_r5d_definition():
    df = pd.read_parquet("data/processed/returns_multistep.parquet").sort_values(["ticker","date"])
    if "r_5d" not in df.columns:
        return
    for tkr, g in df.groupby("ticker"):
        lr = g["log_return"]
        r5 = sum(lr.shift(-h) for h in range(1,6))
        diff = (g["r_5d"] - r5).abs().max()
        assert float(diff) < 1e-10, f"r_5d misdefined for {tkr} (max |Œî|={diff})"

In [18]:
!pytest -q tests/test_labels_multistep.py

[32m.[0m[32m                                                                        [100%][0m


## Homework (due before Session 18)

**Goal:** Document your evaluation protocol and ship a concise “leakage & bias” memo, plus a one‑command audit.

### Part A ‚Äî Generate a **protocol memo** (`reports/eval_protocol.md`)

In [19]:
# save to scripts/write_eval_protocol.py
from __future__ import annotations
import pandas as pd, numpy as np
from pathlib import Path
from datetime import date

def make_rolling_origin_splits(dates, train_min=252, val_size=63, step=63, embargo=5):
    """Build expanding-window (rolling-origin) train/validation date splits.

    The unique dates are sorted; every split trains from the first date up to a
    moving train end, skips ``embargo`` dates, then validates on the following
    ``val_size`` dates. The train end starts at position ``train_min - 1`` and
    advances by ``step`` positions until a validation window no longer fits.

    Returns a list of ``(train_start, train_end, val_start, val_end)`` tuples;
    the list is empty when not even the first split fits.
    """
    days = np.array(sorted(pd.to_datetime(pd.Series(dates).unique())))
    n_days = len(days)
    splits = []
    train_end = train_min - 1
    while train_end < n_days:
        train_first, train_last = days[0], days[train_end]
        val_start = train_end + embargo + 1   # first validation position after the embargo gap
        val_end = val_start + val_size - 1
        if val_end >= n_days:
            break
        splits.append((train_first, train_last, days[val_start], days[val_end]))
        train_end += step
    return splits

def main(train_min=80, val_size=21, step=21, embargo=5):
    """Write the evaluation-protocol memo to ``reports/eval_protocol.md``.

    Reads ``data/processed/returns.parquet``, builds rolling-origin splits with
    ``make_rolling_origin_splits`` and records, as Markdown, the first split,
    the static-universe file (if one exists under ``data/static``) and the
    label / leakage conventions.

    Parameters
    ----------
    train_min, val_size, step, embargo : int
        Split settings forwarded to ``make_rolling_origin_splits``. The memo
        prints these same values, so the document cannot drift from the
        settings that were actually used.

    Raises
    ------
    ValueError
        If the settings produce no split for the available dates.
    """
    ret = pd.read_parquet("data/processed/returns.parquet").sort_values(["ticker","date"])
    # The splitter's own defaults (train_min=252, ...) yield no split on this
    # dataset, hence the smaller default windows of this function.
    splits = make_rolling_origin_splits(ret["date"], train_min=train_min, val_size=val_size,
                                        step=step, embargo=embargo)
    if not splits:
        raise ValueError(
            f"No rolling-origin split fits the data (train_min={train_min}, val_size={val_size}, "
            f"step={step}, embargo={embargo}, unique dates={ret['date'].nunique()})"
        )
    a,b,c,d = splits[0]
    # Universe info
    univ_files = sorted(Path("data/static").glob("universe_*.csv")) # see below for more explanations
    univ = univ_files[-1] if univ_files else None # take the last file (highest name in sort order)
    # Without a universe file, fall back to the number of tickers in the returns data.
    univ_count = pd.read_csv(univ).shape[0] if univ else ret["ticker"].nunique()
    md = []
    md += ["# Evaluation Protocol (Leakage‑Aware)", ""]
    md += ["**Date:** " + date.today().isoformat(), ""]
    md += ["## Splits", f"- Train window (split 1): **{a.date()} → {b.date()}**",
           f"- Embargo: **{embargo}** business days", f"- Validation window: **{c.date()} → {d.date()}**",
           f"- Step between origins: **{step}** business days", ""]
    # NOTE(review): the 252-observation rule below is fixed text; it is not derived
    # from train_min or from the universe file -- confirm it matches how the
    # universe was actually built.
    md += ["## Static Universe", f"- Universe file: **{univ.name if univ else '(none)'}**",
           f"- Count: **{univ_count}** tickers",
           "- Selection rule: tickers with ≥252 obs by first train end; fixed for all splits.", ""]
    md += ["## Labels", "- `r_1d` = next‑day log return `log_return.shift(-1)` per ticker.",
           "- `r_5d` (if used) = sum of `log_return.shift(-1..-5)`.", ""]
    md += ["## Leakage Controls",
           "- Features computed from ≤ t only (rolling/ewm/expanding without negative shifts).",
           f"- No forward‑fill across split boundaries; embargo = {embargo} days.",
           "- Scalers/normalizers fit on TRAIN only.",
           "- Tests: `tests/test_leakage_features.py`, `tests/test_labels_multistep.py`.", ""]
    md += ["## Caveats",
           "- Educational dataset; not investment advice.",
           "- Survivorship minimized via static universe; still subject to data vendor quirks.", ""]
    Path("reports").mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the memo contains non-ASCII characters (→, ≥, ≤, ‑) that a
    # locale-default encoding may be unable to encode.
    Path("reports/eval_protocol.md").write_text("\n".join(md), encoding="utf-8")
    print("Wrote reports/eval_protocol.md")

# Script entry point: write the memo only when this file is run directly
# (e.g. `python scripts/write_eval_protocol.py`), not when it is imported.
if __name__ == "__main__":
    main()

Wrote reports/eval_protocol.md


In [20]:
# Demo: `+` concatenates two lists into a new list; `md += [...]` in main() relies on the same list concatenation.
["a", "b"]+["c"]

['a', 'b', 'c']

##  1Ô∏è‚É£ `Path("data/static").glob("universe_*.csv")`

* `Path(...)` comes from the **`pathlib`** module ‚Äî a modern replacement for `os.path`.
* `.glob("universe_*.csv")` searches that folder for **all files** whose names match the pattern `universe_*.csv`.

So if your folder `data/static` contains:

```
universe_2023.csv
universe_2024.csv
universe_2025.csv
readme.txt
```

then:

```python
list(Path("data/static").glob("universe_*.csv"))
```

returns something like:

```python
[PosixPath('data/static/universe_2023.csv'),
 PosixPath('data/static/universe_2024.csv'),
 PosixPath('data/static/universe_2025.csv')]
```

---

##  2Ô∏è‚É£ `sorted(...)`

```python
univ_files = sorted(Path("data/static").glob("universe_*.csv"))
```

* Converts the generator returned by `.glob()` into a list and sorts it alphabetically (lexicographically).
* Because filenames like `universe_2023.csv`, `universe_2024.csv` sort in chronological order, sorting ensures the latest one appears last ‚Äî assuming your filenames encode time or version numerically or lexicographically.

Result:

```python
univ_files = [
    PosixPath('data/static/universe_2023.csv'),
    PosixPath('data/static/universe_2024.csv'),
    PosixPath('data/static/universe_2025.csv')
]
```

---

##  3Ô∏è‚É£ `univ = univ_files[-1] if univ_files else None`

This is a **conditional expression** (a one-line `if` statement):

* If the list `univ_files` is **non-empty**, take its **last element** (`[-1]`).
* Otherwise (if no files matched), set `univ = None`.

So effectively:

```python
if univ_files:
    univ = univ_files[-1]
else:
    univ = None
```

---

##  4Ô∏è‚É£ What `univ` holds

* If files are found ‚Üí the latest file (based on sorted order).
  e.g. `PosixPath('data/static/universe_2025.csv')`
* If none are found ‚Üí `None`.

This allows later code to safely check:

```python
if univ is not None:
    df = pd.read_csv(univ)
else:
    print("No universe files found.")
```

---



In [21]:
!python scripts/write_eval_protocol.py

Wrote reports/eval_protocol.md


### Part B ‚Äî One‚Äëcommand **leakage audit** target

Append to your `Makefile`:

``` make
.PHONY: leakage-audit
leakage-audit: ## Run leakage & label tests; write eval protocol
	pytest -q tests/test_leakage_features.py tests/test_labels_multistep.py
	python scripts/write_eval_protocol.py
```

Then run:

``` bash
make leakage-audit
```

In [22]:
# refer to lec14-inclass.ipynb how to add this using Python code
# Appends a `leakage-audit` target to the Makefile unless the file already mentions it,
# then prints the resulting Makefile.

from pathlib import Path
mk = Path("Makefile")
# Read and write explicitly as UTF-8 so non-ASCII characters already in the
# Makefile survive regardless of the platform's default encoding.
text = mk.read_text(encoding="utf-8") if mk.exists() else ""
if "leakage-audit" not in text:
    # "\t" becomes a real TAB in the written file; make requires recipe lines to start with one.
    text += """

.PHONY: leakage-audit
leakage-audit: ## Run leakage & label tests; write eval protocol
\tpytest -q tests/test_leakage_features.py tests/test_labels_multistep.py
\tpython scripts/write_eval_protocol.py
"""
if not mk.exists() or mk.read_text(encoding="utf-8") != text:
    # Write only when the content changed (or the file is missing), so
    # re-running this cell does not touch an up-to-date Makefile.
    mk.write_text(text, encoding="utf-8")
print(mk.read_text(encoding="utf-8"))

# Makefile ‚Äî unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c
.ONESHELL:


PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.

In [23]:
%%bash
make leakage-audit

pytest -q tests/test_leakage_features.py tests/test_labels_multistep.py
python scripts/write_eval_protocol.py
F......FFFFs..                                                           [100%]
__________________________ test_label_definition_r1d ___________________________

df =            date ticker  log_return  ...   exp_std  adj_close   volume
0    2020-01-29   AAPL   -0.018417  ...  0.00847...489  82.670891  1777769
3974 2020-09-07    XOM   -0.008826  ...  0.009480  81.944473  1272137

[3975 rows x 18 columns]

    def test_label_definition_r1d(df):
        for tkr, g in df.groupby("ticker"):
>           assert g["r_1d"].iloc[:-1].equals(g["log_return"].iloc[1:]), f"r_1d mismatch for {tkr}"
E           AssertionError: r_1d mismatch for AAPL
E           assert False
E            +  where False = equals(1     -0.002351\n2     -0.012675\n3      0.002713\n4      0.001568\n5     -0.001869\n         ...   \n154   -0.011083\n155   -0.012161\n156    0.013355\n157   -0.005071\n158    0.002917

make: *** [Makefile:136: leakage-audit] Error 1


CalledProcessError: Command 'b'make leakage-audit\n'' returned non-zero exit status 2.