In [2]:
# Colab cell
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Adjust these two for YOUR repo
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = f"{BASE_DIR}/{REPO_NAME}"
REPO_URL   = f"https://github.com/{REPO_OWNER}/{REPO_NAME}.git"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)


In [4]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [5]:
import argparse, sys, time
from pathlib import Path
import pandas as pd, numpy as np

def fetch_yf(ticker, start, end):
    import yfinance as yf
    df = yf.download(ticker, start=start, end=end, auto_adjust=True, progress=False)
    if df is None or df.empty:
        raise RuntimeError("empty")
    df = df.rename(columns=str.lower)[["close","volume"]]
    df.index.name = "date"
    df = df.reset_index()
    df["ticker"] = ticker
    return df[["ticker","date","close","volume"]]

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--tickers", default="tickers_25.csv")
    ap.add_argument("--start", default="2020-01-01")
    ap.add_argument("--end", default="")
    ap.add_argument("--out", default="data/raw/prices.csv")
    args = ap.parse_args()

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    tickers = pd.read_csv(args.tickers)["ticker"].dropna().unique().tolist()

    rows = []
    for t in tickers:
        try:
            df = fetch_yf(t, args.start, args.end or None)
        except Exception:
            # synthetic fallback
            idx = pd.bdate_range(args.start, args.end or pd.Timestamp.today().date())
            rng = np.random.default_rng(42 + hash(t)%1000)
            r = rng.normal(0, 0.01, len(idx))
            price = 100*np.exp(np.cumsum(r))
            vol = rng.integers(1e5, 5e6, len(idx))
            df = pd.DataFrame({"ticker": t, "date": idx, "close": price, "volume": vol})
        df["date"] = pd.to_datetime(df["date"]).dt.date
        df["adj_close"] = df["close"]
        df = df.drop(columns=["close"])
        df["log_return"] = np.log(df["adj_close"]).diff().fillna(0.0)
        rows.append(df)

    allp = pd.concat(rows, ignore_index=True)
    allp = allp[["ticker","date","adj_close","volume","log_return"]]
    allp.to_csv(out, index=False)
    print("Wrote", out, "rows:", len(allp))

if __name__ == "__main__":
    sys.exit(main())

usage: colab_kernel_launcher.py [-h] [--tickers TICKERS] [--start START]
                                [--end END] [--out OUT]
colab_kernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-4062556b-e48f-4176-aae0-6b7de20f3c96.json


SystemExit: 2

In [None]:
import argparse
from pathlib import Path
import pandas as pd, numpy as np

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", default="data/raw/prices.csv")
    ap.add_argument("--out", default="data/processed/features.parquet")
    ap.add_argument("--roll", type=int, default=20)
    args = ap.parse_args()

    df = pd.read_csv(args.input, parse_dates=["date"])
    df = df.sort_values(["ticker","date"])
    # groupwise lags
    df["r_1d"] = df["log_return"]
    for k in (1,2,3):
        df[f"lag{k}"] = df.groupby("ticker")["r_1d"].shift(k)
    df["roll_mean"] = (df.groupby("ticker")["r_1d"]
                         .rolling(args.roll, min_periods=args.roll//2).mean()
                         .reset_index(level=0, drop=True))
    df["roll_std"]  = (df.groupby("ticker")["r_1d"]
                         .rolling(args.roll, min_periods=args.roll//2).std()
                         .reset_index(level=0, drop=True))
    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Save compactly
    df.to_parquet(out, index=False)
    print("Wrote", out, "rows:", len(df))

if __name__ == "__main__":
    main()