In [1]:
# Colab-only cell: mount Google Drive at /content/drive.
from google.colab import drive

# force_remount=True asks Colab to mount again even if Drive is already mounted.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [2]:
import os, pathlib

# Repository coordinates: change these two values to point at YOUR repo.
REPO_OWNER = "kadkins3"
REPO_NAME = "STAT4160"  # e.g., STAT4160-team1

# Derived locations: the Drive folder holding the checkout, the checkout
# itself, and the HTTPS remote it is cloned from.
BASE_DIR = "/content/drive/MyDrive/dspt25"
CLONE_DIR = BASE_DIR + "/" + REPO_NAME
REPO_URL = "https://github.com/" + REPO_OWNER + "/" + REPO_NAME + ".git"

# Create the base folder (and any missing parents); a no-op when it already exists.
os.makedirs(BASE_DIR, exist_ok=True)


In [3]:
import os, subprocess, shutil, pathlib

# Clone the repo into CLONE_DIR on the first run; on later runs reuse the
# existing folder. Either way, finish with the kernel's working directory
# set to the checkout so relative paths in later cells resolve inside it.
if not pathlib.Path(CLONE_DIR).exists():
    # List-form arguments bypass the shell, so a URL or path containing
    # spaces or shell metacharacters is passed to git intact.
    clone = subprocess.run(
        ["git", "clone", REPO_URL, CLONE_DIR],
        capture_output=True,
        text=True,
    )
    # Echo git's messages into the cell output, then stop here on failure
    # instead of failing later on a missing directory.
    print(clone.stdout + clone.stderr, end="")
    clone.check_returncode()
elif not (pathlib.Path(CLONE_DIR) / ".git").exists():
    # The folder is reused as-is; warn rather than abort so a deliberate
    # non-git working copy still works.
    print("WARNING:", CLONE_DIR, "exists but does not look like a git checkout")
# Nothing is pulled automatically for an existing checkout. To sync by hand:
#   !git status
#   !git pull --rebase   # or: !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [4]:
# Source of scripts/train_baseline.py, kept as a raw string so the notebook can
# (re)generate the script. The script fits a linear regression of r_1d on
# lag1..lag3 and writes the test-set MAE to a JSON metrics file.
train_py = r"""#!/usr/bin/env python
# Toy baseline: linear regression of r_1d on lag1..lag3, scored by test MAE.
import argparse, json
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

FEATURE_COLS = ["lag1", "lag2", "lag3"]
TARGET_COL = "r_1d"

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--features", default="data/processed/features.parquet")
    ap.add_argument("--out-metrics", default="reports/baseline_metrics.json")
    args = ap.parse_args()

    df = pd.read_parquet(args.features)
    # Positional split: the first 80% of complete rows train, the last 20% test.
    # NOTE(review): this is only a split by date if the features file is
    # already in chronological order -- confirm against the feature builder.
    df = df.dropna(subset=FEATURE_COLS + [TARGET_COL])
    n = len(df)
    split = int(n*0.8)
    if split == 0:
        # An empty training slice would otherwise fail inside scikit-learn
        # with a much less helpful message.
        raise SystemExit(f"Need at least 2 complete rows to train and test; found {n} in {args.features}")
    Xtr = df[FEATURE_COLS].iloc[:split].values
    ytr = df[TARGET_COL].iloc[:split].values
    Xte = df[FEATURE_COLS].iloc[split:].values
    yte = df[TARGET_COL].iloc[split:].values

    model = LinearRegression().fit(Xtr, ytr)
    pred = model.predict(Xte)
    mae = float(mean_absolute_error(yte, pred))

    # Create the directory of the requested output file, not a fixed
    # "reports/", so a custom --out-metrics location also works.
    out_path = Path(args.out_metrics)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w") as f:
        json.dump({"model":"linear(lag1,lag2,lag3)","test_mae":mae,"n_test":len(yte)}, f, indent=2)
    print("Wrote", args.out_metrics, "MAE:", mae)

if __name__ == "__main__":
    main()
"""
import os, stat

SCRIPT_PATH = "scripts/train_baseline.py"
# Make sure scripts/ exists so the cell also works in a fresh checkout.
os.makedirs(os.path.dirname(SCRIPT_PATH), exist_ok=True)
# Context manager guarantees the file is flushed and closed before chmod.
with open(SCRIPT_PATH, "w") as fh:
    fh.write(train_py)
# Add the owner-execute bit so the script can be run directly via its shebang.
os.chmod(SCRIPT_PATH, os.stat(SCRIPT_PATH).st_mode | stat.S_IEXEC)
print("Created scripts/train_baseline.py")

Created scripts/train_baseline.py


In [5]:
%%bash
# Append the training rules to the Makefile in the current directory, once.
# The section marker doubles as a guard: re-running this cell would otherwise
# append a second copy of the rules and list `train` twice in `make help`.
set -euo pipefail
if grep -qF -- '# --- Training step (Part B) ---' Makefile 2>/dev/null; then
  echo "Makefile already contains the training step; nothing appended."
else
cat >> Makefile <<'EOF'

# --- Training step (Part B) ---

TRAIN_METRICS := reports/baseline_metrics.json

.PHONY: train
train: $(TRAIN_METRICS) ## Train toy baseline and write metrics

$(TRAIN_METRICS): scripts/train_baseline.py $(FEATS)
	$(PY) scripts/train_baseline.py --features $(FEATS) --out-metrics $(TRAIN_METRICS)

EOF
fi


In [6]:
%%bash
# Normalise the repo's shell scripts, run the training target, and print the
# metrics it wrote.
# Fail fast, like the `make help` cell: without this a failed `make train`
# would still be followed by `cat` of a stale metrics file.
set -euo pipefail
# Same location as CLONE_DIR from the configuration cell.
cd "/content/drive/MyDrive/dspt25/STAT4160"
# convert CRLF -> LF for all shell scripts (each file name is printed as it is processed)
find . -type f -name "*.sh" -print -exec sed -i 's/\r$//' {} \;
# ensure executable
chmod +x scripts/*.sh
# build the training target, then show the metrics file it produced
make train
cat reports/baseline_metrics.json

./homework/homework_5_KA/qa_csv.sh
./scripts/qa_csv.sh
./scripts/backup.sh
python scripts/get_prices.py --tickers tickers_25.csv --start 2020-01-01 --end 2025-08-01 --out data/raw/prices.csv
Wrote data/raw/prices.csv rows: 36450
# Basic QA first
scripts/qa_csv.sh data/raw/prices.csv
python scripts/build_features.py --input data/raw/prices.csv --out data/processed/features.parquet --roll 30
OK: data/raw/prices.csv passed basic CSV QA (36451 lines).
Wrote data/processed/features.parquet rows: 36450
python scripts/train_baseline.py --features data/processed/features.parquet --out-metrics reports/baseline_metrics.json
Wrote reports/baseline_metrics.json MAE: 0.007950903336381838
{
  "model": "linear(lag1,lag2,lag3)",
  "test_mae": 0.007950903336381838,
  "n_test": 7275
}



In [7]:
%%bash
# Print the Makefile's help listing from inside the repository checkout.
# Abort on the first error, on unset variables, and on failures inside pipelines.
set -euo pipefail
repo_dir="/content/drive/MyDrive/dspt25/STAT4160"
cd "$repo_dir"
make help

Available targets:
  [36mhelp              [0m  Show help for each target
  [36mtrain             [0m  Train toy baseline and write metrics
  [36mreport            [0m  Render Quarto EDA to docs1/
  [36mbackup            [0m  Rsync selected artifacts to backups/<timestamp>/
  [36mclean             [0m  Remove intermediate artifacts (safe)
  [36mclobber           [0m  Remove generated reports and backups (dangerous)
  [36mdb                [0m  Build/refresh SQLite database from CSVs
  [36msql-report        [0m  Generate a simple SQL-driven CSV summary
  [36mprices-parquet    [0m  Clean raw prices and save processed Parquet(s)
  [36mreturns-parquet   [0m  Build returns.parquet with r_1d + calendar features
  [36mtrain             [0m  Train toy baseline and write metrics


