In [21]:
# Colab cell: attach Google Drive to the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-mounts even if Drive is already attached.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [22]:
import os, pathlib

# --- Edit these two values to point at YOUR repo -----------------------------
REPO_OWNER = "ywanglab"
REPO_NAME  = "STAT4160"   # e.g., unified-stocks-team1

# Everything else is derived from the two values above.
BASE_DIR   = "/content/drive/MyDrive/dspt25"
CLONE_DIR  = "/".join((BASE_DIR, REPO_NAME))
REPO_URL   = "https://github.com/" + REPO_OWNER + "/" + REPO_NAME + ".git"

# Create the base folder (and any missing parents); no error if it already exists.
os.makedirs(BASE_DIR, exist_ok=True)


In [23]:
import os, subprocess, shutil, pathlib

if not pathlib.Path(CLONE_DIR).exists():
    !git clone {REPO_URL} {CLONE_DIR}
else:
    # If the folder exists, just ensure it's a git repo and pull latest
    os.chdir(CLONE_DIR)
    # !git status
    # !git pull --rebase # !git pull --ff-only
os.chdir(CLONE_DIR)
print("Working dir:", os.getcwd())

Working dir: /content/drive/MyDrive/dspt25/STAT4160


In [None]:

# Show the working-tree state of the repo in the current directory.
!git status

In [None]:
# Fetch remote changes and replay any local commits on top of them.
# The trailing shell comment keeps the fast-forward-only variant as an alternative.
!git pull --rebase # !git pull --ff-only

In [None]:
# Install Quarto CLI (one-time per Colab runtime)
# !wget -q https://quarto.org/download/latest/quarto-linux-amd64.deb -O /tmp/quarto.deb
# !dpkg -i /tmp/quarto.deb || apt-get -y -f install >/dev/null && dpkg -i /tmp/quarto.deb
# !quarto --version

#Alternatively, save it to G-drive, and only need to download the first time. The size of  quarto-linux-amd64.deb is ~125Mb.
# Path to store the deb package
deb_path = "/content/drive/MyDrive/quarto-linux-amd64.deb"

# Download only if not already saved
!test -f $deb_path || wget -q https://quarto.org/download/latest/quarto-linux-amd64.deb -O $deb_path

# Install from Drive (fast, no re-download)
!dpkg -i $deb_path || apt-get -y -f install >/dev/null && dpkg -i $deb_path #-f: fix package dependency issues
!quarto --version

(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 130482 files and directories currently installed.)
Preparing to unpack .../MyDrive/quarto-linux-amd64.deb ...
Unpacking quarto (1.7.33) over (1.7.33) ...
Setting up quarto (1.7.33) ...
(Reading database ... 130482 files and directories currently installed.)
Preparing to unpack .../MyDrive/quarto-linux-amd64.deb ...
Unpacking quarto (1.7.33) over (1.7.33) ...
Setting up quarto (1.7.33) ...
1.7.33


In [None]:
# Install ruamel.yaml for safe YAML edits
!pip -q install ruamel.yaml

In [None]:
# Create or update the project's _quarto.yml (requires ruamel.yaml to be installed).
from pathlib import Path

from ruamel.yaml import YAML

yaml = YAML()
cfg_path = Path("_quarto.yml")

# Load the existing config if there is one. An existing-but-empty file loads as
# None, so it is treated the same as a missing file and replaced by the default.
cfg = yaml.load(cfg_path.read_text(encoding="utf-8")) if cfg_path.exists() else None
if cfg is None:
    cfg = {"project": {"type": "website", "output-dir": "docs"},
           "website": {"title": "Unified Stocks", "navbar": {"left": [{"href":"index.qmd","text":"Home"}]}},
           "format":{"html":{"theme":"cosmo","toc":True}}}

# Add/ensure HTML features (existing keys are overwritten, others are kept)
html_opts = cfg.setdefault("format", {}).setdefault("html", {})
html_opts["toc"] = True
html_opts["code-fold"] = False
html_opts["toc-depth"] = 2
html_opts["page-navigation"] = True
html_opts["code-tools"] = True
html_opts["fig-cap-location"] = "bottom"
html_opts["tbl-cap-location"] = "top"
html_opts["css"] = "docs1/style.css"

# Execution defaults for all documents
exec_opts = cfg.setdefault("execute", {})
exec_opts["echo"] = True
exec_opts["warning"] = False
exec_opts["cache"] = True

# Freeze: deterministic rebuilds until the source changes
exec_opts["freeze"] = "auto"

# Bibliography
cfg["bibliography"] = "references.bib"

# Ensure navbar has EDA link (added only if no entry already points at it)
nav = cfg.setdefault("website", {}).setdefault("navbar", {}).setdefault("left", [])
if not any(item.get("href") == "reports/eda.qmd" for item in nav if isinstance(item, dict)):
    nav.append({"href": "reports/eda.qmd", "text": "EDA"})

# Write the config back and echo it. The context manager guarantees the file is
# flushed and closed before it is read again; the encoding is explicit because
# the config may contain non-ASCII text.
with cfg_path.open("w", encoding="utf-8") as cfg_file:
    yaml.dump(cfg, cfg_file)
print(cfg_path.read_text(encoding="utf-8"))

project:
  type: website
  output-dir: docs1

website:
  title: Unified Stocks — EDA
  navbar:
    left:
    - href: index.qmd
      text: Home
    - href: reports/eda.qmd
      text: EDA (parametrized)

format:
  html:
    theme: cosmo
    toc: true
    code-fold: false

    toc-depth: 2
    page-navigation: true
    code-tools: true
    fig-cap-location: bottom
    tbl-cap-location: top
    css: docs1/style.css
execute:
  echo: true
  cache: true
  freeze: auto
bibliography: references.bib



In [None]:
# BibTeX entries for the project's references.bib.
refs = r"""@book{hyndman-fpp3,
  title = {Forecasting: Principles and Practice},
  author = {Hyndman, Rob J. and Athanasopoulos, George},
  edition = {3},
  year = {2021},
  url = {https://otexts.com/fpp3/}
}
@misc{quarto-docs,
  title = {Quarto Documentation},
  author = {{Posit}},
  year = {2025},
  url = {https://quarto.org/}
}
@misc{yfinance,
  title = {yfinance: Yahoo! Finance market data downloader},
  author = {Ran Aroussi},
  year = {2024},
  url = {https://github.com/ranaroussi/yfinance}
}
"""

# Write the file, then read it back to confirm what landed on disk.
# `with` closes each handle deterministically instead of leaving it to the
# garbage collector.
with open("references.bib", "w", encoding="utf-8") as bib_file:
    bib_file.write(refs)
with open("references.bib", encoding="utf-8") as bib_file:
    print(bib_file.read())

@book{hyndman-fpp3,
  title = {Forecasting: Principles and Practice},
  author = {Hyndman, Rob J. and Athanasopoulos, George},
  edition = {3},
  year = {2021},
  url = {https://otexts.com/fpp3/}
}
@misc{quarto-docs,
  title = {Quarto Documentation},
  author = {{Posit}},
  year = {2025},
  url = {https://quarto.org/}
}
@misc{yfinance,
  title = {yfinance: Yahoo! Finance market data downloader},
  author = {Ran Aroussi},
  year = {2024},
  url = {https://github.com/ranaroussi/yfinance}
}



In [None]:
from textwrap import dedent

# Source text of reports/eda.qmd (a parameterized Quarto report).
#
# The template is a RAW string so that backslashes reach the file verbatim.
# In a raw string a backslash followed by a newline is NOT a line continuation:
# it stays in the text. The literal therefore opens with a plain newline, which
# is stripped afterwards so the YAML front matter starts on the first line.
#
# Cross-references are written as bare `@fig-...` / `@tbl-...`: Quarto already
# renders the "Figure" / "Table" prefix for them.
#
# NOTE(review): the `{SYMBOL}` placeholders in the captions and `{params.rolling}`
# in the "Rolling statistics" heading are plain text, not f-string fields; they
# appear to be rendered literally. Confirm whether interpolation was intended.
eda = dedent(r"""
---
title: "Stock EDA"
format:
  html:
    toc: true
    number-sections: false
execute-dir: "/content/drive/MyDrive/dspt25/STAT4160/reports"
execute:
  echo: true
  warning: false
  cache: true

jupyter: python3
params:
  symbol: "AAPL"
  start_date: "2018-01-01"
  end_date: ""
  rolling: 20
---

> *Educational use only — not trading advice.* Data pulled via **yfinance** [@yfinance].

This page is **parameterized**; see the **Parameters** section for usage.

## Setup parameters if using Python
```{python}
#| tags: [parameters]
# Default values (overridden by -P at render time)
SYMBOL = "AAPL"
START  = "2018-01-01"
END    = ""
ROLL   =  20
```
## Setup

```{python}
#| echo: true
#| message: false
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from pathlib import Path

# SYMBOL = params.get("symbol", "AAPL")
# START  = params.get("start_date", "2018-01-01")
# END    = params.get("end_date", "")
# ROLL   = int(params.get("rolling", 20))
if not END:
  END = pd.Timestamp.today().strftime("%Y-%m-%d")
```

## Download and tidy

```{python}
#| echo: true
try:
  data = yf.download(SYMBOL, start=START, end=END, auto_adjust=True, progress=False)
  if data is None or data.empty:
    # A failed download usually comes back as an empty frame rather than an
    # exception, so route it to the fallback explicitly.
    raise ValueError(f"no rows returned for {SYMBOL}")
except Exception:
  # Synthetic fallback
  idx = pd.bdate_range(START, END)
  rng = np.random.default_rng(42)
  ret = rng.normal(0, 0.01, len(idx))
  price = 100 * np.exp(np.cumsum(ret))
  vol = rng.integers(100_000, 5_000_000, len(idx))
  data = pd.DataFrame({"Close": price, "Volume": vol}, index=idx)

df = (data.rename(columns=str.lower)[["close","volume"]]
        .dropna()
        .assign(log_return=lambda d: np.log(d["close"]).diff()))
df["roll_mean"] = df["log_return"].rolling(ROLL, min_periods=ROLL//2).mean()
df["roll_vol"]  = df["log_return"].rolling(ROLL, min_periods=ROLL//2).std()
df = df.dropna()
```

## Price over time

```{python}
#| label: fig-price
#| fig-cap: "{SYMBOL} — Adjusted Close"
#| fig-alt: "Line chart showing adjusted close price of the selected stock over time."
#| echo: false
fig, ax = plt.subplots(figsize=(8,3))
ax.plot(df.index, df["close"])
ax.set_xlabel("Date"); ax.set_ylabel("Price")
fig.tight_layout()
```

As shown in **@fig-price**, prices vary over time with changing volatility.

## Return distribution

```{python}
#| label: fig-hist
#| fig-cap: "{SYMBOL} — Daily log return histogram"
#| fig-alt: "Histogram of daily log returns."
#| echo: false
fig, ax = plt.subplots(figsize=(6,3))
ax.hist(df["log_return"], bins=50, alpha=0.85)
ax.set_xlabel("log return"); ax.set_ylabel("count")
fig.tight_layout()
```

**@fig-hist** shows the return distribution; many assets exhibit heavy tails [@hyndman-fpp3, pp. 20–21].

## Rolling statistics (window = {params.rolling})

```{python}
#| label: fig-rolling
#| fig-cap: "{SYMBOL} — Rolling mean and volatility"
#| fig-alt: "Two line plots of rolling mean and rolling standard deviation of returns."
#| echo: false
fig, ax = plt.subplots(figsize=(8,3))
ax.plot(df.index, df["roll_mean"], label="rolling mean")
ax.plot(df.index, df["roll_vol"],  label="rolling std")
ax.set_xlabel("Date"); ax.set_ylabel("value")
ax.legend()
fig.tight_layout()
```

## Summary table

```{python}
#| label: tbl-summary
#| tbl-cap: "Summary statistics for {SYMBOL}."
#| echo: false
summary = pd.DataFrame({
    "n_days": [len(df)],
    "start": [df.index.min().date()],
    "end":   [df.index.max().date()],
    "mean_daily_ret": [df["log_return"].mean()],
    "std_daily_ret":  [df["log_return"].std()],
    "ann_vol_approx": [df["log_return"].std()*np.sqrt(252)]
})
summary
```

See **@tbl-summary** for overall statistics.

## Data dictionary

```{python}
#| label: tbl-dict
#| tbl-cap: "Data dictionary for computed columns."
#| echo: false
desc = {
  "close": "Adjusted closing price.",
  "volume": "Trading volume.",
  "log_return": "log(Close_t) − log(Close_{t−1}).",
  "roll_mean": f"Rolling mean of log returns (window={ROLL}).",
  "roll_vol": f"Rolling std of log returns (window={ROLL})."
}
# Build a mapping: first-level name -> combined dtype(s)
dtype_by_price = (
    df.dtypes                          # Series indexed by (Price, Ticker)
      .groupby(level=0)                # group by first level "Price"
      .apply(lambda s: " / ".join(sorted({str(dt) for dt in s})))
)

dd = pd.DataFrame({
    "column": list(desc.keys()),
    "dtype": [dtype_by_price.get(c, "MISSING") for c in desc.keys()],
    "description": list(desc.values())
})
dd
# dd = pd.DataFrame({
#   "column": list(desc.keys()),
#   "dtype": [str(df[c].dtype) for c in desc.keys()],
#   "description": list(desc.values()) })
# dd
```

## Parameters

This page accepts parameters: `symbol`, `start_date`, `end_date`, and `rolling`. You can re‑render with:

```
quarto render reports/eda.qmd \
  -P symbol:MSFT -P start_date:2019-01-01 -P end_date:2025-08-01 -P rolling:30
```

## References

""").lstrip("\n")

In [None]:
from pathlib import Path

# Write the report template to reports/eda.qmd.
eda_path = Path("reports/eda.qmd")
# Create reports/ if it does not exist yet, so the write cannot fail on a missing folder.
eda_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file; UTF-8 is explicit because the
# template contains non-ASCII characters (dashes, minus signs).
eda_path.write_text(eda, encoding="utf-8")
print("Wrote reports/eda.qmd with hygiene features.")

Wrote reports/eda.qmd with hygiene features.


In [None]:
from pathlib import Path

# Make sure the folder that will hold the stylesheet exists.
Path("docs1").mkdir(exist_ok=True)

# Stylesheet text, assembled one rule per line (each line newline-terminated).
css = "".join(
    rule + "\n"
    for rule in (
        "/* Increase base font and widen code blocks slightly */",
        "body { font-size: 1.02rem; }",
        "pre code { white-space: pre-wrap; }",
        "img { max-width: 100%; height: auto; }",
    )
)


In [None]:
from pathlib import Path

# Write the stylesheet to docs1/style.css.
css_path = Path("docs1/style.css")
# Create docs1/ if needed so this cell also works when run on its own.
css_path.parent.mkdir(parents=True, exist_ok=True)
# write_text closes the file for us, unlike a bare open(...).write(...).
css_path.write_text(css, encoding="utf-8")
print("Wrote docs1/style.css")

Wrote docs1/style.css


In [None]:
# !pip install jupyter-cache  # Run this cell if the package is missing.
# !pip install papermill      # Run this cell if papermill is missing.

In [None]:
# Render the Quarto project in the current directory, writing the output to docs1/.
!quarto render --output-dir docs1/

[ 1/28] notebooks/lec4-inclass.ipynb[39m[22m
[ 2/28] notebooks/system_check.ipynb[39m[22m
[ 3/28] notebooks/lec2_hw.ipynb[39m[22m
[ 4/28] notebooks/lec2-hw.ipynb[39m[22m
[ 5/28] notebooks/lec3-inclass.ipynb[39m[22m
[ 6/28] notebooks/lec3_code_inside_QMD_testing.ipynb[39m[22m
[ 7/28] notebooks/testing.ipynb[39m[22m
[ 8/28] notebooks/reproducibility_demo.ipynb[39m[22m
[ 9/28] notebooks/lec2_inclass.ipynb[39m[22m
[10/28] notebooks/lec4_code_in_QMD_testing.ipynb[39m[22m
[11/28] reports/eda-AAPL.qmd[39m[22m
[12/28] reports/eda-MSFT.qmd[39m[22m
[13/28] reports/eda-NVDA.qmd[39m[22m
[14/28] reports/eda.qmd[39m[22m

Starting python3 kernel...Done
[ColabKernelApp] ERROR | No such comm target registered: quarto_kernel_setup

Executing 'eda.quarto_ipynb'
  Cell 1/8: ''..............Done
  Cell 2/8: ''..............Done
  Cell 3/8: ''..............Done
  Cell 4/8: 'fig-price'.....Done
  Cell 5/8: 'fig-hist'......Done
  Cell 6/8: 'fig-rolling'...Done
  Cell 7/8: 'tbl-summ