In [1]:
# Colab cell
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Adjust these two for YOUR repo
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [3]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [4]:
import subprocess, shutil
def check(cmd):
    try:
        out = subprocess.check_output(cmd, text=True)
        print(cmd[0], "OK")
    except Exception as e:
        print(cmd[0], "NOT FOUND")
check(["make", "--version"])
check(["rsync", "--version"])
check(["quarto", "--version"])

make OK
rsync OK
quarto NOT FOUND


In [5]:
# Install Quarto CLI (one-time per Colab runtime)
# !wget -q https://quarto.org/download/latest/quarto-linux-amd64.deb -O /tmp/quarto.deb
# !dpkg -i /tmp/quarto.deb || apt-get -y -f install >/dev/null && dpkg -i /tmp/quarto.deb
# !quarto --version

#Alternatively, save it to G-drive, and only need to download the first time. The size of  quarto-linux-amd64.deb is ~125Mb.
# Path to store the deb package
deb_path = "/content/drive/MyDrive/quarto-linux-amd64.deb"

# Download only if not already saved
!test -f $deb_path || wget -q https://quarto.org/download/latest/quarto-linux-amd64.deb -O $deb_path

# Install from Drive (fast, no re-download)
!dpkg -i $deb_path || apt-get -y -f install >/dev/null && dpkg -i $deb_path #-f: fix package dependency issues
!quarto --version

(Reading database ... 130485 files and directories currently installed.)
Preparing to unpack .../MyDrive/quarto-linux-amd64.deb ...
Unpacking quarto (1.7.33) over (1.7.33) ...
Setting up quarto (1.7.33) ...
(Reading database ... 130485 files and directories currently installed.)
Preparing to unpack .../MyDrive/quarto-linux-amd64.deb ...
Unpacking quarto (1.7.33) over (1.7.33) ...
Setting up quarto (1.7.33) ...
1.7.33


* **`auto_adjust=True`**

  * Adjusts prices for dividends and stock splits.
  * With this option:

    * `Open`, `High`, `Low`, `Close` are adjusted values.
    * Column `Adj Close` is dropped (since it would duplicate `Close`).
  * If `False`, you’ll get both `Close` (raw) and `Adj Close`.

* **`progress=False`**

  * Suppresses the progress bar output in the terminal.
  * Useful when running inside scripts or notebooks to keep output clean.

* `df.rename` changes column names.
* `columns=str.lower` means: apply the function `str.lower` to every column name.
* Example:
  Before: `["Open","High","Low","Close","Volume"]`
  After:  `["open","high","low","close","volume"]`

* After `reset_index()`: This turns the index into a normal column.

```python
ap = argparse.ArgumentParser()
```

* Creates an argument parser object.
* This object knows how to read and interpret arguments from the command line.


```python
args = ap.parse_args()
```

* Actually parses the arguments given when the script runs.
* Creates a namespace object (`args`) with attributes:

  * `args.tickers`
  * `args.start`
  * `args.end`
  * `args.out`

If you run:

```bash
./scripts/get_prices.py --tickers=mylist.csv --start=2019-01-01 --end=2022-12-31 --out=data/prices.csv
```

Then inside Python:

```python
args.tickers   # "mylist.csv"
args.start     # "2019-01-01"
args.end       # "2022-12-31"
args.out       # "data/prices.csv"
```

If you run without arguments:

```bash
./scripts/get_prices.py
```

Then you get the **defaults**:

* `tickers_25.csv`
* `2020-01-01`
* `""` (empty string, → treated as today)
* `data/raw/prices.csv`

Nice catch — that’s a little trick inside the synthetic-data generator.

* **`hash(t)`**

   * Built-in Python function.
   * Returns an integer hash value of the object `t` (here, the ticker symbol string, e.g. `"AAPL"`).
   * Same `t` in the same run → same hash.
   * But note: in different Python processes, `hash()` is *seeded randomly by default* (for security), so the raw number can vary between runs.


**In short:**
`hash(t) % 1000` maps each ticker to a number between 0–999, so each ticker gets its own random-generator seed for reproducible synthetic data.

* **`pd.to_datetime(df["date"])`**

   * Takes the `"date"` column (which might be strings like `"2020-01-02"` or Timestamps) and converts it into pandas `Datetime64[ns]` objects.
   * Example: `"2020-01-02"` → `Timestamp('2020-01-02 00:00:00')`.

* **`.dt` accessor**

   * Lets you pull datetime-specific components (like `.year`, `.month`, `.day`, `.weekday`, etc.) from a pandas datetime series.

* **`.date`**

   * Extracts the underlying Python `datetime.date` object from each `Timestamp`.
   * Drops the time-of-day information, leaving just the calendar date.

That’s the **standard Python entry-point idiom** — it controls when your `main()` function runs.

```python
if __name__ == "__main__":
    sys.exit(main())
```

1. **`__name__`**

   * A special variable set by Python.
   * If you **run a file directly** (e.g. `python get_prices.py`), then `__name__` is set to `"__main__"`.
   * If you **import that file as a module** (e.g. `import get_prices`), then `__name__` is set to the module name (`"get_prices"`).

2. **`if __name__ == "__main__":`**

   * Ensures the block only runs when the script is executed directly, **not** when imported.
   * This way, you can safely import functions like `fetch_yf` elsewhere without auto-running the downloader.

3. **`sys.exit(main())`**

   * Calls the `main()` function.
   * `main()` returns `None` unless you `return` something — but if it raised an error, `sys.exit()` would propagate a nonzero exit code to the shell.
   * Using `sys.exit()` makes it explicit that this script is a **command-line program**, with exit codes that tools/shells can check.


```python
import os, stat
os.chmod("scripts/get_prices.py", os.stat("scripts/get_prices.py").st_mode | stat.S_IEXEC)
```

1. **`os.stat("scripts/get_prices.py").st_mode`**

   * `os.stat` gets file metadata (permissions, size, etc.).
   * `.st_mode` is an integer bitmask that encodes the file’s permission bits (read, write, execute).
   * Example: `0o644` → means `rw-r--r--`.

2. **`| stat.S_IEXEC`**

   * `stat.S_IEXEC` is the “executable” bit.
   * The `|` is a bitwise OR — it adds that permission without removing the existing ones.
   * So if the file was `rw-r--r--` (644), it becomes `rwxr--r--` (744).

3. **`os.chmod("scripts/get_prices.py", new_mode)`**

   * Actually changes the file’s mode (permissions) to include “executable.”

By default, when you `open(...,"w").write(get_py)`, the file has normal text-file permissions (read/write).
But after this command, the file is **executable**, so you can run it directly from the shell:

```bash
./scripts/get_prices.py --start=2021-01-01 --end=2023-01-01
```

Instead of having to type:

```bash
python scripts/get_prices.py --start=2021-01-01 --end=2023-01-01
```

`stat.S_IEXEC` is the **execute bit for the owner only**.

* **Group execute:** use `stat.S_IXGRP`
* **Others execute:** use `stat.S_IXOTH`
* **All three at once:** use `stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH`

So, for example:

```python
os.chmod("scripts/get_prices.py",
         os.stat("scripts/get_prices.py").st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
```

would make it executable by everyone (`rwxr-xr-x`).


* In personal projects, usually only you need to run it.
* Keeping group/others non-executable is a small security measure.
* It mimics `chmod u+x file.py` (owner execute only), which is the most common.



* Starts with `#!` → tells the operating system: *“Use this program to interpret the rest of the file.”*


* `/usr/bin/env` is a small program that looks up another program in your **PATH**.

* By writing:

  ```bash
  #!/usr/bin/env python
  ```

  you’re saying: *“Find whatever `python` is in my PATH, and use that.”*

* This is more flexible than hardcoding:

  ```bash
  #!/usr/bin/python
  ```

  because:

  * Some systems call it `python3`
  * Some install it in `/usr/local/bin`
  * Some use Conda or virtual environments with different paths



In [6]:
from pathlib import Path
Path("scripts").mkdir(exist_ok=True)

get_py = r"""#!/usr/bin/env python
import argparse, sys, time
from pathlib import Path
import pandas as pd, numpy as np

def fetch_yf(ticker, start, end):
    import yfinance as yf
    df = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
    if df is None or df.empty:
        raise RuntimeError("empty")
    df = df.rename(columns=str.lower)[["close","volume"]]
    # --- minimal sanitize to avoid NaNs in adj_close/log_return ---
    df = df.sort_index()
    df = df[~df.index.duplicated(keep="last")]
    df = df.dropna(subset=["close"])
    df["volume"] = df["volume"].fillna(0)
    # --------------------------------------------------------------
    df.index.name = "date"
    df = df.reset_index()
    df["ticker"] = ticker
    return df[["ticker","date","close","volume"]]

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--tickers", default="tickers_25.csv")
    ap.add_argument("--start", default="2020-01-01")
    ap.add_argument("--end", default="")
    ap.add_argument("--out", default="data/raw/prices.csv")
    args = ap.parse_args()

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    tickers = pd.read_csv(args.tickers)["ticker"].dropna().unique().tolist()

    rows = []
    for t in tickers:
        try:
            df = fetch_yf(t, args.start, args.end or None)
        except Exception:
            # synthetic fallback
            idx = pd.bdate_range(args.start, args.end or pd.Timestamp.today().date())
            rng = np.random.default_rng(42 + hash(t)%1000)
            r = rng.normal(0, 0.01, len(idx))
            price = 100*np.exp(np.cumsum(r))
            vol = rng.integers(1e5, 5e6, len(idx))
            df = pd.DataFrame({"ticker": t, "date": idx, "close": price, "volume": vol})
        df["date"] = pd.to_datetime(df["date"]).dt.date
        df["adj_close"] = df["close"]
        df = df.drop(columns=["close"])
        df["log_return"] = np.log(df["adj_close"]).diff().fillna(0.0)
        rows.append(df)

    allp = pd.concat(rows, ignore_index=True)
    allp = allp[["ticker","date","adj_close","volume","log_return"]]
    allp.to_csv(out, index=False)
    print("Wrote", out, "rows:", len(allp))

if __name__ == "__main__":
    sys.exit(main())
"""
open("scripts/get_prices.py","w").write(get_py)
import os, stat
os.chmod("scripts/get_prices.py", os.stat("scripts/get_prices.py").st_mode | stat.S_IEXEC)
print("Created scripts/get_prices.py")

Created scripts/get_prices.py


* `.shift(k)`: moves values down by `k` rows within each group.

So for each stock:

* `lag1` = yesterday’s return
* `lag2` = return from 2 days ago
* `lag3` = return from 3 days ago

This ensures we can use past returns as predictors.

Example:

| ticker | date       | r\_1d | lag1  | lag2 |
| ------ | ---------- | ----- | ----- | ---- |
| AAPL   | 2020-01-02 | 0.02  | NaN   | NaN  |
| AAPL   | 2020-01-03 | -0.01 | 0.02  | NaN  |
| AAPL   | 2020-01-06 | 0.03  | -0.01 | 0.02 |


* `.rolling(args.roll, min_periods=args.roll//2)`:

  * Creates a moving window of length `args.roll` (say 30 days).
  * `min_periods=args.roll//2`: only compute a mean if at least half the window has non-missing data.
* `.reset_index(level=0, drop=True)`: because rolling adds a hierarchical index (ticker + date), we drop the extra ticker level and align back to the DataFrame.

This creates a smoothed return trend for each stock.



In [7]:
feat_py = r"""#!/usr/bin/env python
import argparse
from pathlib import Path
import pandas as pd, numpy as np

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", default="data/raw/prices.csv")
    ap.add_argument("--out", default="data/processed/features.parquet")
    ap.add_argument("--roll", type=int, default=20)
    args = ap.parse_args()

    df = pd.read_csv(args.input, parse_dates=["date"])
    df = df.sort_values(["ticker","date"])
    # groupwise lags
    df["r_1d"] = df["log_return"]
    for k in (1,2,3):
        df[f"lag{k}"] = df.groupby("ticker")["r_1d"].shift(k)
    df["roll_mean"] = (df.groupby("ticker")["r_1d"]
                         .rolling(args.roll, min_periods=args.roll//2).mean()
                         .reset_index(level=0, drop=True))
    df["roll_std"]  = (df.groupby("ticker")["r_1d"]
                         .rolling(args.roll, min_periods=args.roll//2).std()
                         .reset_index(level=0, drop=True))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Save compactly
    df.to_parquet(out, index=False)
    print("Wrote", out, "rows:", len(df))

if __name__ == "__main__":
    main()
"""
open("scripts/build_features.py","w").write(feat_py)
import os, stat
os.chmod("scripts/build_features.py", os.stat("scripts/build_features.py").st_mode | stat.S_IEXEC)
print("Created scripts/build_features.py")

Created scripts/build_features.py


```bash
ROOT="${1:-backups}"
```

* `$1` = the first argument passed to the script.
* `${1:-backups}` means:

  * If `$1` is provided → use it.
  * If `$1` is empty/missing → fall back to `"backups"`.

So:

* Run as `./myscript.sh /tmp/saves` → `ROOT="/tmp/saves"`.
* Run as `./myscript.sh` → `ROOT="backups"`.

```bash
STAMP="$(date +%Y%m%d-%H%M%S)"
```

* `$( ... )` runs a command and captures its output.
* `date +%Y%m%d-%H%M%S` formats the current date/time as:

  * `%Y` = year (2025)
  * `%m` = month (09)
  * `%d` = day (09)
  * `%H` = hour (10, 24-hour clock)
  * `%M` = minute
  * `%S` = second

Example result:

```
STAMP="20250909-103512"
```

```bash
INCLUDE=("data/processed" "reports" "docs")
```

* This is a Bash **array** with three entries.
* Each entry is a directory path you want to include in your backup.

So `${INCLUDE[@]}` expands to:

```
data/processed reports docs
```

```bash
if [[ -d "$src" ]]; then
```

* `-d` checks “does this path exist and is it a directory?”
* Prevents errors if one of the folders is missing.

```bash
rsync -avh --delete --exclude 'raw/' --exclude 'interim/' "$src"/ "$DEST/$src"/
```

* **`rsync`** = tool for efficient folder copying.
* Flags:

  * `-a` → archive mode (preserves permissions, timestamps, symlinks, etc.).
  * `-v` → verbose (print what’s happening).
  * `-h` → human-readable sizes.
* `--delete` → remove files in destination that no longer exist in source.
* `--exclude 'raw/' --exclude 'interim/'` → skip those subfolders.
* `"$src"/` → source directory, with trailing `/` meaning “copy contents, not the folder itself.”
* `"$DEST/$src"/` → destination path inside the backup directory. `rsync` can create the last directory in the path automatically.

But you must ensure the parent directories already exist (with `mkdir -p "$DEST"`).

So, e.g. if `DEST="backups/run-20250909-105530"` and `src="reports"`, then files are copied into:

```
backups/run-20250909-105530/reports/
```



In [8]:
backup_sh = r"""#!/usr/bin/env bash
# Sync selected artifacts to backups/<timestamp> using rsync.
# Usage: scripts/backup.sh [DEST_ROOT]
set -euo pipefail
ROOT="${1:-backups}"
STAMP="$(date +%Y%m%d-%H%M%S)"
DEST="${ROOT}/run-${STAMP}"
mkdir -p "$DEST"

# What to back up (adjust as needed)
INCLUDE=("data/processed" "reports" "docs")

for src in "${INCLUDE[@]}"; do
  if [[ -d "$src" ]]; then
    echo "Syncing $src -> $DEST/$src"
    rsync -avh --delete --exclude 'raw/' --exclude 'interim/' "$src"/ "$DEST/$src"/
  fi
done

echo "Backup complete at $DEST"
"""
open("scripts/backup.sh","w").write(backup_sh)
import os, stat
os.chmod("scripts/backup.sh", os.stat("scripts/backup.sh").st_mode | stat.S_IEXEC)
print("Created scripts/backup.sh")

Created scripts/backup.sh


* `SHELL := /bin/bash` — tell `make` to run recipe commands with **bash** (not `/bin/sh`). Good if you rely on Bash-isms (arrays, `[[ ]]`, brace-expansion, etc.).
* `.SHELLFLAGS := -eu -o pipefail -c`

  * `-e` → exit immediately if any command in a recipe fails (non-zero status).
  * `-u` → treat **unset variables as an error** (helps catch typos like `$DATA_RAWW`).
  * `-o pipefail` → in pipelines (`a | b | c`), fail the whole pipeline if **any** part fails (not just the last command).
  * `-c` → standard: execute the following command string.

### Tool “aliases”

```make
PY := python
QUARTO := quarto
```

* Variables for executables. If you need a specific interpreter, you can override:

  * `make REPORT PY=python3.11`
  * `make REPORT QUARTO=/opt/quarto/bin/quarto`

### Tunable parameters (with defaults)

```make
START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30
```

* `?=` is **conditional assignment**: set a default **only if not already set** in the environment or on the CLI.
* This lets you override at run time without editing the Makefile:

  * `make report START=2019-01-01 END=2024-12-31 ROLL=60`
  * Or `START=2019-01-01 make report`

### File path variables

```make
DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html
```

* `:=` is **immediate assignment**: evaluate right now. (Here it’s identical to `=`, but `:=` avoids surprises if the RHS referenced other vars that might change later.)
* These centralize paths so your rules can reference them consistently:

  ```make
  $(DATA_RAW)
  $(FEATS)
  $(REPORT)
  ```


### Tips / gotchas

* On macOS/WSL/Windows Git Bash, ensure `/bin/bash` exists; otherwise use the path your system provides (e.g., `/usr/bin/env bash` in recipes: `bash -eu -o pipefail -c '...'`).
* With `-u`, any missing var expansion in recipes will fail fast—helpful during development.
* Prefer `?=` for user-tunable params; prefer `:=` for fixed paths/commands.

```make
.DEFAULT_GOAL := help
```

* Tells `make` what to run when you just type `make` with no arguments.
* By default, `make` uses the **first target defined in the file**, which can be confusing if it’s something destructive.
* Here, you explicitly set the default to the `help` target.
  So:

  ```bash
  make
  ```

  is equivalent to:

  ```bash
  make help
  ```

```make
.PHONY: help all clean clobber qa report backup
```

* Declares that these are **not actual files**, just “commands” to run.
* Why? Because by default, `make` thinks targets represent files that should be built.

  * If a file named `report` existed, `make report` would do nothing (because the file is “already up to date”).
* Declaring them `.PHONY` forces `make` to always run their recipes.

So each of these (`help`, `all`, `clean`, `clobber`, `qa`, `report`, `backup`) will always run when called.

### 3. Typical roles of those targets

* **help**: print usage instructions (often lists all available targets).
* **all**: build everything (full pipeline).
* **clean**: remove temporary or generated files.
* **clobber**: stronger clean (maybe also remove large datasets, caches).
* **qa**: quality assurance checks (linting, validation, etc.).
* **report**: build analysis reports (Quarto, Rmarkdown, LaTeX, etc.).
* **backup**: run your backup script/rsync workflow.

* `.PHONY` ensures those targets run unconditionally, without being mistaken for filenames.


```make
help: ## Show help for each target
```

* Target is `help`.
* The `## ...` comment is special: it’s parsed by the `awk` command below.
* When you type `make help`, this recipe runs.

```make
@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} \
/^[a-zA-Z0-9_\-]+:.*##/ { \
  printf "  \033[36m%-18s\033[0m %s\n", $1, $2 \
}' $(MAKEFILE_LIST)
```

* **`@`** → suppresses echo of the command itself. Only prints what `awk` produces.
* **`awk`** processes your Makefile(s) (`$(MAKEFILE_LIST)` expands to the current Makefile and any included ones).

* **`FS = ":.*##"`** → Field Separator: split lines at the colon + `##` marker.

  * Example line:

    ```
    report: $(FEATS) ## Build the HTML report
    ```

    splits into:

    * `$1 = "report"`
    * `$2 = " Build the HTML report"`

* **Regex `/^[a-zA-Z0-9_\-]+:.*##/`**

  * Match any line that starts with a valid target name followed by a colon and containing `##`.
  * Ensures only documented targets show up.
  * `[a-zA-Z0-9_\-]+` → one or more characters that can be:

    * `a–z` (lowercase letters)

    * `A–Z` (uppercase letters)

    * `0–9` (digits)

    * `_` (underscore)

    * `-` (hyphen)
    → this matches a typical Make target name.

* `.` → a **dot** in regex means:
  “match any single character (except a newline, by default).”
  Examples:

  * `a.b` matches `acb`, `a9b`, `a-b`, etc.

* `*` → a **quantifier** meaning:
  “repeat the previous thing **zero or more times**.”

Put together:

* `.*` → “match **any sequence of characters**, including the empty string.”

After the `:`, you might have prerequisites, variables, whitespace, etc.
`.*` is just a “catch-all” to absorb whatever is there until the `##` marker.

So for:

```
report: $(FEATS) ## Build the report
```

* `[a-zA-Z0-9_\-]+` → `report`
* `:` → literal colon
* `.*` → `$(FEATS)` (the prerequisites part)
* `##` → start of the help text



* **`printf` with colors**

  * `\033[36m` = cyan (for target names).
  * `%-18s` = left-align names in 18-character column.
  * `\033[0m` = reset color.
  * `$1` = target name, `$2` = help text.

If your Makefile has:

```make
report: $(FEATS) ## Build the HTML report
clean:  ## Remove temporary files
backup: ## Back up data and reports
```

Running:

```bash
make help
```

Output:

```
Available targets:
  report             Build the HTML report
  clean              Remove temporary files
  backup             Back up data and reports
```

(with the target names in cyan).

**In short:**
This `help` target auto-extracts `##` comments after targets and prints a pretty help menu. It keeps your Makefile self-documenting.

###  Why `$$1` instead of `$1`?

* In **awk**, you’d normally write `$1` to mean “first field.”
* But in a **Makefile recipe**, `$` is special — it’s Make’s variable syntax.
* To get a literal `$` into the shell command, you need to escape it as `$$`.

* `MAKEFILE_LIST` is a **built-in Make variable**.
* It contains the list of all Makefiles that have been read, in the order they were parsed.
* The current Makefile is always included.

So when you write:

```make
awk ... $(MAKEFILE_LIST)
```

it means “run awk on this Makefile (and any included ones).”

That’s how the `help` target can scan your Makefile itself and pull out the `##` comments.


  * `$(DATA_RAW)` → expands to `data/raw/prices.csv` (your raw price data file).
  * `$(FEATS)` → expands to `data/processed/features.parquet` (your engineered features).
  * `report` → another target (probably builds your Quarto report).
  * `backup` → another target (runs your rsync backup script).

If any are missing or outdated, Make will run their recipes in the right order.

There’s no indented command block after `all:`.
That’s because `all` is just an **aggregate target** — it depends on others, but doesn’t do extra work itself.

**In short:**
`all` is a convenience target that ties together your **data download**, **feature generation**, **report building**, and **backup** steps. Running `make all` executes the whole workflow end to end.





### 1. Target

```make
$(DATA_RAW):
```

* `$(DATA_RAW)` expands to `data/raw/prices.csv` (from your earlier definition).
* So this rule builds that file.
* In Make terms: *“to produce `data/raw/prices.csv`, do the following …”*

---

### 2. Prerequisites

```make
scripts/get_prices.py tickers_25.csv
```

* These are the dependencies.
* Meaning: if either the script (`scripts/get_prices.py`) or the ticker list (`tickers_25.csv`) is newer than `data/raw/prices.csv`, then this target is out-of-date and should be rebuilt.
* If `data/raw/prices.csv` is missing entirely → it will also be rebuilt.

---

### 3. Recipe

```make
$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)
```

* `$(PY)` expands to `python` (from your earlier variable).
* So the command is:

  ```bash
  python scripts/get_prices.py --tickers tickers_25.csv --start 2020-01-01 --end 2025-08-01 --out data/raw/prices.csv
  ```
* This runs your stock-data downloader script, generating the CSV.

Nice — that’s the **report-building rule** in your pipeline. Let’s unpack it carefully.


```make
report: $(REPORT) ## Render Quarto EDA to docs/
```

* Declares `report` as a phony-style alias for the actual file target `$(REPORT)`.

So:

```bash
make report
```

actually means *“make sure `docs/reports/eda.html` is up to date.”*


```make
@test -f $(REPORT) || (echo "Report not generated." && exit 1)
```

* `@` suppresses the echo of the command itself.
* `test -f $(REPORT)` checks if the report file exists.
* If not, it prints an error and exits with code 1.
* This is a sanity check to fail loudly if Quarto didn’t actually produce the file.


```bash
rm -rf data/processed/*.parquet
```

* `rm` = remove (delete files).
* `-r` = recursive (needed if directories are involved).
* `-f` = force (don’t ask, don’t complain if the file doesn’t exist).
* `data/processed/*.parquet` = all `.parquet` files in `data/processed/`.

So this deletes all processed parquet files.

---

### 2. The `|| true` part

```bash
... || true
```

* `||` = “OR” in shell.
* `true` is a command that always succeeds (exit status 0).
* Meaning: if the `rm` command fails (e.g. no matching `.parquet` files → `rm` exits with error), the whole line still returns success.

---

### 3. Why this matters in Make

* With `set -e` (or `.SHELLFLAGS := -eu -o pipefail -c` in your Makefile), any nonzero exit code would cause `make` to stop.
* `rm` on a glob with no matches can exit with error (`No such file or directory`).
* Adding `|| true` ensures the target won’t fail just because there’s nothing to delete.



**Note**: in the Makefile, I changed `docs/` to `docs1/` to avoid output into the lecture notes folders `docs/`

In [23]:
makefile = r"""# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
    @awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
    $(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
    # Basic QA first
    scripts/qa_csv.sh $(DATA_RAW)
    $(PY) scripts/build_features.py --input $(DATA_RAW) --out $(FEATS) --roll $(ROLL)

report: $(REPORT) ## Render Quarto EDA to docs1/
$(REPORT): reports/eda.qmd _quarto.yml docs1/style.css
    $(QUARTO) render reports/eda.qmd -P symbol:AAPL -P start_date=$(START) -P end_date=$(END) -P rolling=$(ROLL) --output-dir docs1/
    @test -f $(REPORT) || (echo "Report not generated." && exit 1)

backup: ## Rsync selected artifacts to backups/<timestamp>/
    ./scripts/backup.sh

clean: ## Remove intermediate artifacts (safe)
    rm -rf data/interim
    rm -rf data/processed/*.parquet || true

clobber: clean ## Remove generated reports and backups (dangerous)
    rm -rf docs/reports || true
    rm -rf backups || true
"""
open("Makefile","w").write(makefile)
print(open("Makefile").read())

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
    @awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
    $(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
    # Basic QA first
    scripts/qa_csv.sh $(DATA_RAW)
    $(PY) scripts/build_features.py --input $(DATA_RAW) --out $(FEATS) --roll $(

1. `subprocess.check_output`

* From Python’s built-in [`subprocess`](https://docs.python.org/3/library/subprocess.html) module.
* Runs a command in a child process.
* Captures what the command prints to **stdout**.
* Returns it as a string (or bytes, if `text=True` isn’t set).
The `text=True` argument

* Converts the output from **bytes** to a regular Python **string**.
* Without it:

  ```python
  b"Available targets:\n  report   Build the HTML report\n..."
  ```
* With it:

  ```python
  "Available targets:\n  report   Build the HTML report\n..."
  ```



In [24]:
import subprocess, os, textwrap, sys
try:
  print(subprocess.check_output(["make", "help"], text=True))
except:
  print("cwd:", os.getcwd())
  print("make present?", shutil.which("make"))
  print("awk present?", shutil.which("awk"))
  print("Makefile exists?", os.path.exists("Makefile"))

  res = subprocess.run(["make", "help"], text=True, capture_output=True)
  print("returncode:", res.returncode)
  print("STDOUT:\n", res.stdout)
  print("STDERR:\n", res.stderr)

cwd: /content/drive/MyDrive/dspt25/STAT4160
make present? /usr/bin/make
awk present? /usr/bin/awk
Makefile exists? True
returncode: 2
STDOUT:
 
STDERR:
 Makefile:27: *** missing separator.  Stop.



```bash
perl -i -pe 's/^\h{4}(?=\S)/\t/' Makefile
```

### Pieces explained

* **`perl`** → use Perl for in-place text processing.
* **`-i`** → edit the file in place (overwrites `Makefile`).
* **`-pe`** →

  * `-p` loops over each line of the file,
  * `-e` executes the following code.
* **`s/.../.../`** → substitution regex.

#### Regex

* `^` → start of line.
* `\h{4}` → 4 horizontal whitespace characters (spaces or tabs, but usually spaces).
* `(?=\S)` → lookahead ensuring a non-whitespace character follows (so you don’t match lines that are just spaces).
* Replace that with `\t` → a single literal tab.
* `(...)` → parentheses = a **group**.
* `?=` → this makes it a **lookahead**.
* `\S` → “non-whitespace character” (the opposite of `\s`).

So effectively: **replace exactly 4 leading spaces before text with a tab.**






In [25]:
%%bash
# BACK UP FIRST
cp Makefile Makefile.bak
# Replace lines that BEGIN with 4 spaces by a single tab
perl -i -pe 's/^\h{4}(?=\S)/\t/' Makefile
cat Makefile

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py --input $(DATA_RAW) --out $(FEATS) --roll $(ROLL)

report: 

In [26]:
import subprocess, os, textwrap, sys
try:
  print(subprocess.check_output(["make", "help"], text=True))
except:
  print("cwd:", os.getcwd())
  print("make present?", shutil.which("make"))
  print("awk present?", shutil.which("awk"))
  print("Makefile exists?", os.path.exists("Makefile"))

  res = subprocess.run(["make", "help"], text=True, capture_output=True)
  print("returncode:", res.returncode)
  print("STDOUT:\n", res.stdout)
  print("STDERR:\n", res.stderr)

Available targets:
  [36mhelp              [0m  Show help for each target
  [36mall               [0m  Run the full pipeline and back up artifacts
  [36mreport            [0m  Render Quarto EDA to docs1/
  [36mbackup            [0m  Rsync selected artifacts to backups/<timestamp>/
  [36mclean             [0m  Remove intermediate artifacts (safe)
  [36mclobber           [0m  Remove generated reports and backups (dangerous)



In [27]:
!chmod +x scripts/qa_csv.sh scripts/backup.sh


In [14]:
!pip install papermill



In [28]:
# Fetch raw, build features, render report, back up artifacts
import subprocess
try:
  print(subprocess.check_output(["make", "all"], text=True))
except:
  import subprocess, os, shutil
  print("cwd:", os.getcwd())
  res = subprocess.run(["make", "all"], text=True, capture_output=True)
  print("returncode:", res.returncode)
  print("STDOUT:\n", res.stdout)
  print("STDERR:\n", res.stderr)



cwd: /content/drive/MyDrive/dspt25/STAT4160
returncode: 2
STDOUT:
 quarto render reports/eda.qmd -P symbol:AAPL -P start_date=2020-01-01 -P end_date=2025-08-01 -P rolling=30 --output-dir docs1/
Report not generated.

STDERR:
 
Starting python3 kernel...[ColabKernelApp] ERROR | No such comm target registered: quarto_kernel_setup
Done

Executing 'eda.quarto_ipynb'
  Cell 1/8: ''...Done
  Cell 2/8: ''...Done
  Cell 3/8: ''...Done
  Cell 4/8: ''...Done
  Cell 5/8: ''...Done
  Cell 6/8: ''...Done
  Cell 7/8: ''...Done
  Cell 8/8: ''...Done

[1mpandoc [22m
  to: html
  output-file: eda.html
  standalone: true
  title-prefix: Unified Stocks — EDA
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  toc: true
  number-sections: false
  variables: {}
  
[1mmetadata[22m
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  theme: cosmo
  title: Stock EDA
  execute-dir: /content/drive/MyDrive/dspt25/STAT4160/reports
  jupyter: 

The above error in the last line is expected because I changed the output to docs1/.

```bash
set -e
if ! command -v just >/dev/null 2>&1; then
  echo "just not found; skipping optional step."
  exit 0
fi
```

1. **`set -e`**

   * Tells the shell: *“exit immediately if any command fails (non-zero status).”*
   * Useful for scripts where you want to stop on the first error.

2. **`command -v just >/dev/null 2>&1`**

   * `command -v prog` is the POSIX-portable way to check if `prog` is available in `$PATH`.
   * Here it checks if `just` (the [Just command runner](https://github.com/casey/just)) is installed.
   * `>/dev/null 2>&1` discards both stdout and stderr, so no messages are printed.

In POSIX shells, each process has numbered **file descriptors**:

* `0` = stdin
* `1` = stdout
* `2` = stderr

```bash
2>&1
```

it means:
“Redirect file descriptor 2 (stderr) to the same destination as file descriptor 1 (stdout).”

So both stdout and stderr go to the same place (screen, file, pipe…).

`justfile` is from a different tool called **[Just](https://github.com/casey/just)**.


* Just is a **command runner** (like Make, but simpler).
* It’s designed to save and organize shell commands you use often.
* You install it (`cargo install just` or via package manager), then you create a file named `justfile` in your project.


```just
set shell := ["bash", "-eu", "-o", "pipefail", "-c"]
```

* `set shell := [...]` → tells `just` which program to use as the command shell.
* The list form means: explicitly pass the program and its arguments.

So:

1. **`bash`** → use Bash as the shell (instead of `/bin/sh`).
2. **`-e`** → exit immediately if any command fails.
3. **`-u`** → error if an unset variable is used.
4. **`-o pipefail`** → in a pipeline (`a | b | c`), fail if *any* command fails, not just the last one.
5. **`-c`** → tells Bash to run the following string as a command.

By default, `just` runs recipes in `/bin/sh`. On Linux/macOS, `/bin/sh` can be a minimal shell (dash), which doesn’t support all Bash features. Setting this makes recipes behave more like what you’d expect from a robust Bash script with safe defaults.

You’ve already seen something very similar in your Makefile:

```make
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c
```


* Use `$(START)`/`$(END)` if you’re writing a Makefile.
* Use `{{start}}`/`{{end}}` only in a Justfile recipe.






In [29]:
%%bash
set -e
if ! command -v just >/dev/null 2>&1; then
  echo "just not found; skipping optional step."
  exit 0
fi
cat > justfile << 'EOF'
# justfile — optional convenience recipes
set shell := ["bash", "-eu", "-o", "pipefail", "-c"]

start := "2020-01-01"
end   := "2025-08-01"
roll  := "30"

help:
\t@echo "Recipes: get-data, features, report, all, backup"

get-data:
\tpython scripts/get_prices.py --tickers tickers_25.csv --start {{start}} --end {{end}} --out data/raw/prices.csv

features:
\tbash -lc 'scripts/qa_csv.sh data/raw/prices.csv'
\tpython scripts/build_features.py --input data/raw/prices.csv --out data/processed/features.parquet --roll {{roll}}

report:
\tquarto render reports/eda.qmd -P symbol:AAPL -P start_date={{start}} -P end_date={{end}} -P rolling:{{roll}} --output-dir docs1/

all: get-data features report

backup:
\t./scripts/backup.sh
EOF
echo "Wrote justfile (optional)."

just not found; skipping optional step.


In [30]:
!cat justfile

cat: justfile: No such file or directory


The standard workflow for generating a new SSH key pair and making it usable with services like GitHub or your own servers.

```bash
ssh-keygen -t ed25519 -C "you@school.edu"
```

* `ssh-keygen` → program to create a new SSH key pair.
* `-t ed25519` → use the modern **Ed25519** algorithm (faster, shorter, more secure than RSA).
* **`-t`** = *type of key to create*.
* Common values:

  * `rsa` (older, 2048/4096-bit keys)
  * `ed25519` (modern, shorter, secure, recommended)
  * `ecdsa` (rare, not widely used anymore)
* So `-t ed25519` means “generate an Ed25519 keypair.”
* `-C "you@school.edu"` → add a comment label in the key (often your email).

**During the prompts:**

* *File location*: press **Enter** to accept the default `~/.ssh/id_ed25519`.
  (If that file already exists and you don’t want to overwrite it, pick another name like `~/.ssh/id_ed25519_github`.)
* *Passphrase*: recommended to add one (adds security in case someone steals your private key). Leave blank if you want passwordless, but less secure.

This creates two files:

* `~/.ssh/id_ed25519` → **private key** (keep secret, never share).
* `~/.ssh/id_ed25519.pub` → **public key** (safe to share).

### View the public key

```bash
cat ~/.ssh/id_ed25519.pub
```

* Prints the public key.
* You copy the whole line starting with `ssh-ed25519 ...` and paste it:

  * Into **GitHub → Settings → SSH and GPG keys → New SSH key**, or
  * Into a server’s `~/.ssh/authorized_keys` file.


### Test the key

Once added, test with:

```bash
ssh -T git@github.com
```

You should see something like:

```
Hi your-username! You've successfully authenticated, but GitHub does not provide shell access.
```

* **`-T`** = *disable pseudo-tty allocation*.
* Normally `ssh` gives you an interactive terminal session on the remote machine.
* With `-T`, you’re saying “don’t open a terminal, just run the command.”
* GitHub specifically tells you to use `ssh -T` when testing keys, because GitHub doesn’t provide shell access — you just want authentication to succeed.




In [None]:
### Run the followng code in a Terminal , and
 #Press enter to accept default path (~/.ssh/id_ed25519), set a passphrase (recommended)

ssh-keygen -t ed25519 -C "you@school.edu"

### Run the followng code in a Terminal , and
 Press enter to accept default path (~/.ssh/id_ed25519), set a passphrase (recommended)
```bash
ssh-keygen -t ed25519 -C "you@school.edu"
```

In [34]:
cat ~/.ssh/id_ed25519.pub   # copy this PUBLIC key where needed (GitHub/servers)

ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBHbSPan69Pcxm1N08IGYJFn3bv/xNqXDQ0aq+ByPAsJ you@school.edu


## non-interactive fix (works in Colab/terminals)

```bash
# 1) Ensure the .ssh dir exists and has safe perms
mkdir -p ~/.ssh && chmod 700 ~/.ssh

# 2) Generate the keypair without prompts:
#    -f = filename; -N = passphrase (empty "" here; set one if you want)
ssh-keygen -t ed25519 -C "you@school.edu" -f ~/.ssh/id_ed25519 -N ""

# 3) Show your PUBLIC key (copy this to GitHub/servers)
cat ~/.ssh/id_ed25519.pub
```

`~/` is a shell shortcut for “inside my home directory”, which in Colab means `/root/`.

In [5]:
%%bash
# 1) Ensure the .ssh dir exists and has safe perms
mkdir -p ~/.ssh && chmod 700 ~/.ssh

# 2) Generate the keypair without prompts:
#    -f = filename; -N = passphrase (empty "" here; set one if you want)
ssh-keygen -t ed25519 -C "you@school.edu" -f ~/.ssh/id_ed25519 -N ""

# 3) Show your PUBLIC key (copy this to GitHub/servers)
cat ~/.ssh/id_ed25519.pub


Generating public/private ed25519 key pair.
Your identification has been saved in /root/.ssh/id_ed25519
Your public key has been saved in /root/.ssh/id_ed25519.pub
The key fingerprint is:
SHA256:Idt7w12+YGj9CMbv0O9M2M5mgWOVxr9Qtcu+XYv2XkI you@school.edu
The key's randomart image is:
+--[ED25519 256]--+
|                 |
|                .|
|      . .    . .o|
|       + .    =o |
|      . S    +E..|
|         + =+B.o.|
|        . @.B.B.+|
|         + * @+==|
|           .==@=o|
+----[SHA256]-----+
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINEa/ANyxBiPcVaJV8x+qxGK75bJxYOV8KBserQxqOrr you@school.edu


Note: the public key include three parts sepated spaces. After having generated the public key, copy the public key into GitHub->settings->SSH keys->new SSH key

In [15]:
!ssh -T git@github.com

Hi ywanglab! You've successfully authenticated, but GitHub does not provide shell access.


In [16]:
%%bash
cat >> ~/.ssh/config <<'CFG'
Host github
  HostName github.com
  User git
  IdentityFile ~/.ssh/id_ed25519
  IdentitiesOnly yes
  AddKeysToAgent yes
CFG

chmod 600 ~/.ssh/config


In [17]:
!ssh -T github || true


Hi ywanglab! You've successfully authenticated, but GitHub does not provide shell access.


**`tmux`** stands for **terminal multiplexer**. It’s a tool that lets you manage multiple terminal sessions inside a single terminal window. Think of it as a "window manager for your terminal."

* You can start a program (like training a model) inside `tmux`.
* If you disconnect (close your laptop, lose SSH connection, etc.), the program keeps running in the background.
* Later, you can **reattach** and see the output as if you never left.
* You can split your terminal screen into **panes** (horizontal/vertical).
* You can also create multiple **windows** inside the same `tmux` session.
* This means you can run different processes (e.g., monitoring logs, editing code, running a server) side by side.


* `tmux new -s train` → start a new session named `train`.
* `Ctrl-b d` → detach (session continues running in background).
* `tmux ls` → list all active sessions.
* `tmux attach -t train` → reconnect to the session. `-t`: target session
* `tmux kill-session -t train` → end the session (kills all processes inside it).




In [None]:
%%bash
tmux new -s train              # start session "train"
# ... run your long job ...
# detach: press Ctrl-b then d
tmux ls                        # list sessions
tmux attach -t train           # reattach
tmux kill-session -t train     # end session

# Homework

```python
json.dump(
    {"model": "linear(lag1,lag2,lag3)",  # a string describing your model
     "test_mae": mae,                   # test mean absolute error (probably a float variable)
     "n_test": len(yte)},               # number of test samples
    f,                                  # file object, e.g., opened with open("results.json", "w")
    indent=2                            # pretty-print with 2 spaces indentation
)
```





In [19]:
train_py = r"""#!/usr/bin/env python
import argparse, json
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--features", default="data/processed/features.parquet")
    ap.add_argument("--out-metrics", default="reports/baseline_metrics.json")
    args = ap.parse_args()

    df = pd.read_parquet(args.features)
    # Train/test split by date (last 20% for test)
    df = df.dropna(subset=["lag1","lag2","lag3","r_1d"])
    n = len(df)
    split = int(n*0.8)
    Xtr = df[["lag1","lag2","lag3"]].iloc[:split].values
    ytr = df["r_1d"].iloc[:split].values
    Xte = df[["lag1","lag2","lag3"]].iloc[split:].values
    yte = df["r_1d"].iloc[split:].values

    model = LinearRegression().fit(Xtr, ytr)
    pred = model.predict(Xte)
    mae = float(mean_absolute_error(yte, pred))

    Path("reports").mkdir(exist_ok=True)
    with open(args.out_metrics, "w") as f:
        json.dump({"model":"linear(lag1,lag2,lag3)","test_mae":mae,"n_test":len(yte)}, f, indent=2)
    print("Wrote", args.out_metrics, "MAE:", mae)

if __name__ == "__main__":
    main()
"""
open("scripts/train_baseline.py","w").write(train_py)
import os, stat
os.chmod("scripts/train_baseline.py", os.stat("scripts/train_baseline.py").st_mode | stat.S_IEXEC)
print("Created scripts/train_baseline.py")

Created scripts/train_baseline.py


Append these to your `Makefile`:

``` make
# --- add after FEATS definition, near other targets ---

TRAIN_METRICS := reports/baseline_metrics.json

.PHONY: train
train: $(TRAIN_METRICS) ## Train toy baseline and write metrics

$(TRAIN_METRICS): scripts/train_baseline.py $(FEATS)
    $(PY) scripts/train_baseline.py --features $(FEATS) --out-metrics $(TRAIN_METRICS)

# Update 'all' to include 'train'
# all: $(DATA_RAW) $(FEATS) report backup   # OLD
# Replace with:
# all: $(DATA_RAW) $(FEATS) report train backup
```

In [20]:
makefile = r"""# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
    @awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
    $(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
    # Basic QA first
    scripts/qa_csv.sh $(DATA_RAW)
    $(PY) scripts/build_features.py --input $(DATA_RAW) --out $(FEATS) --roll $(ROLL)

# --- add after FEATS definition, near other targets ---

TRAIN_METRICS := reports/baseline_metrics.json

.PHONY: train
train: $(TRAIN_METRICS) ## Train toy baseline and write metrics

$(TRAIN_METRICS): scripts/train_baseline.py $(FEATS)
    $(PY) scripts/train_baseline.py --features $(FEATS) --out-metrics $(TRAIN_METRICS)

report: $(REPORT) ## Render Quarto EDA to docs1/
$(REPORT): reports/eda.qmd _quarto.yml docs1/style.css
    $(QUARTO) render reports/eda.qmd -P symbol:AAPL -P start_date=$(START) -P end_date=$(END) -P rolling=$(ROLL) --output-dir docs1/
    @test -f $(REPORT) || (echo "Report not generated." && exit 1)

backup: ## Rsync selected artifacts to backups/<timestamp>/
    ./scripts/backup.sh

clean: ## Remove intermediate artifacts (safe)
    rm -rf data/interim
    rm -rf data/processed/*.parquet || true

clobber: clean ## Remove generated reports and backups (dangerous)
    rm -rf docs/reports || true
    rm -rf backups || true
"""
open("Makefile","w").write(makefile)
print(open("Makefile").read())

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
    @awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
    $(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
    # Basic QA first
    scripts/qa_csv.sh $(DATA_RAW)
    $(PY) scripts/build_features

In [21]:
import subprocess, os, textwrap, sys
try:
  print(subprocess.check_output(["make", "help"], text=True))
except:
  print("cwd:", os.getcwd())
  print("make present?", shutil.which("make"))
  print("awk present?", shutil.which("awk"))
  print("Makefile exists?", os.path.exists("Makefile"))

  res = subprocess.run(["make", "help"], text=True, capture_output=True)
  print("returncode:", res.returncode)
  print("STDOUT:\n", res.stdout)
  print("STDERR:\n", res.stderr)

cwd: /content/drive/MyDrive/dspt25/STAT4160
make present? /usr/bin/make
awk present? /usr/bin/awk
Makefile exists? True
returncode: 2
STDOUT:
 
STDERR:
 Makefile:28: *** missing separator.  Stop.



In [22]:
%%bash
# BACK UP FIRST
cp Makefile Makefile.bak
# Replace lines that BEGIN with 4 spaces by a single tab
perl -i -pe 's/^\h{4}(?=\S)/\t/' Makefile
cat Makefile

# Makefile — unified-stocks
SHELL := /bin/bash
.SHELLFLAGS := -eu -o pipefail -c

PY := python
QUARTO := quarto

START ?= 2020-01-01
END   ?= 2025-08-01
ROLL  ?= 30

DATA_RAW := data/raw/prices.csv
FEATS    := data/processed/features.parquet
REPORT   := docs/reports/eda.html

# Default target
.DEFAULT_GOAL := help

.PHONY: help all clean clobber qa report backup

help: ## Show help for each target
	@awk 'BEGIN {FS = ":.*##"; printf "Available targets:\n"} /^[a-zA-Z0-9_\-]+:.*##/ {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

# all: $(DATA_RAW) $(FEATS) report backup ## Run the full pipeline and back up artifacts
all: $(DATA_RAW) $(FEATS) report train backup

$(DATA_RAW): scripts/get_prices.py tickers_25.csv
	$(PY) scripts/get_prices.py --tickers tickers_25.csv --start $(START) --end $(END) --out $(DATA_RAW)

$(FEATS): scripts/build_features.py $(DATA_RAW) scripts/qa_csv.sh
	# Basic QA first
	scripts/qa_csv.sh $(DATA_RAW)
	$(PY) scripts/build_features.py --input $(D

In [23]:
import subprocess, os, textwrap, sys
try:
  print(subprocess.check_output(["make", "help"], text=True))
except:
  print("cwd:", os.getcwd())
  print("make present?", shutil.which("make"))
  print("awk present?", shutil.which("awk"))
  print("Makefile exists?", os.path.exists("Makefile"))

  res = subprocess.run(["make", "help"], text=True, capture_output=True)
  print("returncode:", res.returncode)
  print("STDOUT:\n", res.stdout)
  print("STDERR:\n", res.stderr)

Available targets:
  [36mhelp              [0m  Show help for each target
  [36mtrain             [0m  Train toy baseline and write metrics
  [36mreport            [0m  Render Quarto EDA to docs1/
  [36mbackup            [0m  Rsync selected artifacts to backups/<timestamp>/
  [36mclean             [0m  Remove intermediate artifacts (safe)
  [36mclobber           [0m  Remove generated reports and backups (dangerous)



In [None]:
!chmod +x scripts/qa_csv.sh scripts/backup.sh


In [25]:
%%bash
set -euo pipefail
cd "/content/drive/MyDrive/dspt25/STAT4160"
make train
cat reports/baseline_metrics.json

python scripts/train_baseline.py --features data/processed/features.parquet --out-metrics reports/baseline_metrics.json
Wrote reports/baseline_metrics.json MAE: 0.00775245930297027
{
  "model": "linear(lag1,lag2,lag3)",
  "test_mae": 0.00775245930297027,
  "n_test": 885
}