# VaR Toolkit — Reusable Risk Analysis (Parametric / Historical / Monte Carlo)

This notebook computes Value-at-Risk (VaR), Expected Shortfall (ES), and basic backtesting.  
It is **reusable**: change a few parameters and rerun.

**Data sources (choose one):**
- **yfinance** (default, no token)
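- **TuShare** (optional, requires a token; set it in the `TUSHARE_TOKEN` or `TS_TOKEN` environment variable — a `.env` file only helps if you load it into the environment yourself)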
- CSV (optional fallback)

Outputs: figures in `./figures/` and a summary CSV in `./outputs/summary.csv`.


## 0. Parameters

In [None]:

# ---- User-editable settings ----
# Ticker in the chosen data source's notation,
# e.g. '600036.SS' (CMB); US tickers like 'AAPL'; HK '0700.HK'
SYMBOL = "600036.SS"
# Sample window as ISO dates (the window filter keeps both bounds)
START = "2018-01-01"
END = "2025-01-01"

# One of: "yfinance" (default) | "tushare" | "csv"
DATA_SOURCE = "yfinance"
# Only read when DATA_SOURCE == 'csv'
CSV_PATH = "zhaoshang.csv"

# Confidence levels and horizons (in days) evaluated for every VaR/ES method
CONF_LEVELS = [0.95, 0.99]
HORIZONS = [1, 10]

# Monte Carlo settings: RNG seed and number of simulated returns
MC_SEED = 42
MC_N = 100_000

# Output folders for figures and the summary CSV
FIG_DIR = "figures"
OUT_DIR = "outputs"
# --------------------------------

import os, math, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import norm

# Show floats with six decimals in DataFrame/Series output.
# The option lives in the `display` namespace; assigning to the non-existent
# `pd.options.display_float_format` is rejected by pandas with an OptionError.
pd.options.display.float_format = "{:.6f}".format
plt.rcParams["figure.figsize"] = (10, 6)

# Create the output folders up front so later savefig/to_csv calls find them.
Path(FIG_DIR).mkdir(parents=True, exist_ok=True)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

print("SYMBOL:", SYMBOL, "|", START, "→", END, "| Source:", DATA_SOURCE)


## 1. Data loading

In [None]:

def load_yfinance(symbol, start, end):
    """Download daily prices for ``symbol`` from Yahoo Finance via yfinance.

    Args:
        symbol: Yahoo ticker, e.g. ``'600036.SS'``, ``'AAPL'``, ``'0700.HK'``.
        start: Window start, passed straight to ``yf.download``.
        end: Window end, passed straight to ``yf.download``.

    Returns:
        DataFrame with exactly two columns, ``datetime`` and ``close``.
        ``close`` is the ``Adj Close`` column when the download provides one,
        otherwise the ``Close`` column.

    Raises:
        ImportError: If ``yfinance`` cannot be imported.
        ValueError: If the download is empty or has no usable price column.
    """
    try:
        import yfinance as yf
    except ImportError as exc:
        raise ImportError("yfinance not installed. `pip install yfinance`.") from exc

    # Request the separate "Adj Close" column explicitly: newer yfinance
    # releases adjust prices by default and then omit that column.
    data = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=False)
    if data is None or len(data) == 0:
        raise ValueError("No data from yfinance. Check symbol or dates.")

    # Some yfinance versions return (field, ticker) MultiIndex columns even
    # for a single ticker; keep only the field level so lookups by name work.
    if isinstance(data.columns, pd.MultiIndex):
        data = data.copy()
        data.columns = data.columns.get_level_values(0)

    data = data.reset_index()
    # After reset_index the former index is the first column (normally "Date").
    date_col = "Date" if "Date" in data.columns else data.columns[0]
    price_col = next((c for c in ("Adj Close", "Close") if c in data.columns), None)
    if price_col is None:
        raise ValueError("No data from yfinance. Check symbol or dates.")

    df = data[[date_col, price_col]].rename(columns={date_col: "datetime", price_col: "close"})
    return df

def load_tushare(symbol, start, end):
    """Load daily closing prices for an A-share from TuShare Pro.

    The API token is read from the ``TUSHARE_TOKEN`` environment variable,
    falling back to ``TS_TOKEN``. Nothing here loads a ``.env`` file, so the
    variable must already be present in the process environment.

    Args:
        symbol: Stock code. TuShare expects codes like ``'600036.SH'``; a
            Yahoo-style Shanghai suffix (``.SS``) is translated to ``.SH``.
        start: Window start; anything ``pd.Timestamp`` accepts.
        end: Window end; anything ``pd.Timestamp`` accepts.

    Returns:
        DataFrame with columns ``datetime`` and ``close``, sorted ascending
        by date with a fresh integer index.

    Raises:
        ValueError: If no token is configured or TuShare returns no rows.
        ImportError: If ``tushare`` cannot be imported.
    """
    token = os.getenv("TUSHARE_TOKEN", os.getenv("TS_TOKEN", "")).strip()
    if not token:
        raise ValueError("Missing TuShare token. Set TUSHARE_TOKEN in environment/.env.")
    try:
        import tushare as ts
    except ImportError as exc:
        raise ImportError("tushare not installed. `pip install tushare`.") from exc
    ts.set_token(token)
    pro = ts.pro_api(token)

    # TuShare takes dates as YYYYMMDD strings.
    s = pd.Timestamp(start).strftime("%Y%m%d")
    e = pd.Timestamp(end).strftime("%Y%m%d")

    # For A-shares TuShare uses ts_code like '600036.SH'. The notebook's
    # default SYMBOL uses the Yahoo suffix '.SS', so map it across instead of
    # passing it through unchanged.
    ts_code = symbol.strip().upper()
    if ts_code.endswith(".SS"):
        ts_code = ts_code[: -len(".SS")] + ".SH"

    df = pro.daily(ts_code=ts_code, start_date=s, end_date=e)
    if df is None or len(df) == 0:
        raise ValueError("No data from TuShare. Check ts_code and date window.")
    df["trade_date"] = pd.to_datetime(df["trade_date"], format="%Y%m%d", errors="coerce")
    df = df.rename(columns={"trade_date": "datetime"})[["datetime", "close"]]
    df = df.sort_values("datetime").reset_index(drop=True)
    return df

def load_csv(csv_path):
    """Read a price history from a CSV file into the common two-column layout.

    The date column is the first match among ``datetime``, ``trade_date``,
    ``date``, ``Date``; a ``trade_date`` column is parsed as ``YYYYMMDD``.
    The price column is the first match among ``close``, ``Close``,
    ``Adj Close``, ``adj_close``, ``AdjClose``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        DataFrame with columns ``datetime`` and ``close``, sorted by date,
        with rows containing missing values dropped and the index reset.

    Raises:
        ValueError: If no recognised date column or price column is present.
    """
    date_candidates = ("datetime", "trade_date", "date", "Date")
    price_candidates = ("close", "Close", "Adj Close", "adj_close", "AdjClose")

    frame = pd.read_csv(csv_path)

    # Locate and parse the date column first
    date_col = next((name for name in date_candidates if name in frame.columns), None)
    if date_col is None:
        raise ValueError("CSV must include one date column: datetime/trade_date/date/Date")
    if date_col == "trade_date":
        # Compact dates may have been read as integers, so go through str
        parsed = pd.to_datetime(frame[date_col].astype(str), format="%Y%m%d", errors="coerce")
    else:
        parsed = pd.to_datetime(frame[date_col], errors="coerce")
    frame[date_col] = parsed

    # Then locate the price column
    price_col = next((name for name in price_candidates if name in frame.columns), None)
    if price_col is None:
        raise ValueError("CSV must include a price column like close/Adj Close")

    tidy = frame[[date_col, price_col]].rename(columns={date_col: "datetime", price_col: "close"})
    tidy = tidy.sort_values("datetime")
    tidy = tidy.dropna()
    return tidy.reset_index(drop=True)

# Pick the loader selected by DATA_SOURCE; every loader returns a frame with
# the columns `datetime` and `close`.
if DATA_SOURCE == "yfinance":
    raw = load_yfinance(SYMBOL, START, END)
elif DATA_SOURCE == "tushare":
    raw = load_tushare(SYMBOL, START, END)
elif DATA_SOURCE == "csv":
    if not os.path.exists(CSV_PATH):
        raise FileNotFoundError(f"CSV not found: {CSV_PATH}")
    raw = load_csv(CSV_PATH)
else:
    raise ValueError("Unknown DATA_SOURCE. Use 'yfinance' | 'tushare' | 'csv'.")

# window filter (also keeps consistency between loaders); both bounds inclusive
raw = raw[(raw["datetime"] >= START) & (raw["datetime"] <= END)].copy().sort_values("datetime")

# Stop here with a clear message if the window holds no rows, rather than
# letting the summary line or later cells fail on an empty frame.
if raw.empty:
    raise ValueError(
        f"No rows between {START} and {END} for {SYMBOL} (source: {DATA_SOURCE})."
    )

print("Data window:", raw["datetime"].min().date(), "→", raw["datetime"].max().date(), "| points:", len(raw))
raw.head()


## 2. Returns and basic stats

In [None]:

# Work on a copy so the loaded frame `raw` is left untouched
df = raw.copy()
# Daily log-return ln(P_t / P_{t-1}); the first row has no predecessor, so it
# is NaN and is removed from `ret`
df["ret"] = np.log(df["close"] / df["close"].shift())
ret = df["ret"].dropna()

# Sample mean and sample standard deviation (ddof=1) of the daily returns
mu = ret.mean()
sig = ret.std(ddof=1)
# Annualize with 252 periods per year
ann_mu = mu * 252
ann_vol = sig * np.sqrt(252)

print(f"Daily mean: {mu:.6f}, Daily vol: {sig:.6f}")
print(f"Annualized return: {ann_mu:.4%}, Annualized volatility: {ann_vol:.4%}")
ret.describe()


## 3. VaR methods and ES

In [None]:

def var_parametric(mu, sigma, alpha=0.95, horizon_days=1, value=1.0):
    """Parametric (normal) Value-at-Risk.

    The mean is scaled linearly and the volatility by the square root of the
    horizon. The result is floored at zero.

    Args:
        mu: Mean return per day.
        sigma: Standard deviation of returns per day.
        alpha: Confidence level, strictly between 0 and 1.
        horizon_days: Holding period in days; must be positive.
        value: Position value the loss is expressed in.

    Returns:
        The VaR as a non-negative float, or NaN when the inputs produce a NaN
        (for example a NaN ``sigma`` from a too-short sample).

    Raises:
        ValueError: If ``alpha`` is not in (0, 1) or ``horizon_days`` <= 0.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha!r}")
    if horizon_days <= 0:
        raise ValueError(f"horizon_days must be positive, got {horizon_days!r}")

    z = norm.ppf(1 - alpha)
    mu_h = mu * horizon_days
    sig_h = sigma * math.sqrt(horizon_days)
    var = -(mu_h + z * sig_h) * value
    # max(0.0, nan) evaluates to 0.0, which would report "no risk" for
    # undefined inputs; pass the NaN through instead.
    if math.isnan(var):
        return float("nan")
    return float(max(0.0, var))

def var_historical(returns, alpha=0.95, horizon_days=1, value=1.0):
    """Historical-simulation Value-at-Risk.

    Takes the ``alpha`` quantile of the empirical loss distribution (losses
    are negated returns). For horizons other than one day, that 1-day
    quantile is scaled by the square root of the horizon.

    Args:
        returns: Return series; must support negation and ``.quantile``.
        alpha: Confidence level used as the loss quantile.
        horizon_days: Holding period in days.
        value: Position value the loss is expressed in.

    Returns:
        The VaR as a float. It is not floored at zero.
    """
    loss_quantile = (-returns).quantile(alpha)
    if horizon_days != 1:
        # square-root-of-time scaling of the 1-day quantile
        loss_quantile = loss_quantile * math.sqrt(horizon_days)
    return float(loss_quantile * value)

def var_monte_carlo(mu, sigma, alpha=0.95, horizon_days=1, value=1.0, n=100000, seed=42):
    """Monte Carlo Value-at-Risk under a normal return model.

    Draws ``n`` horizon returns from a normal distribution with mean
    ``mu * horizon_days`` and standard deviation
    ``sigma * sqrt(horizon_days)``, converts them to losses, and takes the
    ``alpha`` quantile.

    Args:
        mu: Mean return per day.
        sigma: Standard deviation of returns per day.
        alpha: Confidence level used as the loss quantile.
        horizon_days: Holding period in days.
        value: Position value the loss is expressed in.
        n: Number of simulated returns.
        seed: Seed for NumPy's default random generator, so results repeat.

    Returns:
        The simulated VaR as a float. It is not floored at zero.
    """
    generator = np.random.default_rng(seed)
    horizon_mean = mu * horizon_days
    horizon_std = sigma * math.sqrt(horizon_days)
    simulated = generator.normal(loc=horizon_mean, scale=horizon_std, size=n)
    simulated_losses = -simulated * value
    var_estimate = np.quantile(simulated_losses, alpha)
    return float(var_estimate)

def es_parametric(mu, sigma, alpha=0.95, horizon_days=1, value=1.0):
    """Parametric (normal) Expected Shortfall.

    Computes ``value * (-mu_h + pdf(z) / (1 - alpha) * sigma_h)`` with
    ``z = norm.ppf(1 - alpha)``, the mean scaled linearly and the volatility
    by the square root of the horizon. The result is floored at zero.

    Args:
        mu: Mean return per day.
        sigma: Standard deviation of returns per day.
        alpha: Confidence level, strictly between 0 and 1.
        horizon_days: Holding period in days; must be positive.
        value: Position value the loss is expressed in.

    Returns:
        The ES as a non-negative float, or NaN when the inputs produce a NaN
        (for example a NaN ``sigma`` from a too-short sample).

    Raises:
        ValueError: If ``alpha`` is not in (0, 1) or ``horizon_days`` <= 0.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha!r}")
    if horizon_days <= 0:
        raise ValueError(f"horizon_days must be positive, got {horizon_days!r}")

    z = norm.ppf(1 - alpha)
    mu_h = mu * horizon_days
    sig_h = sigma * math.sqrt(horizon_days)
    es = value * (-(mu_h) + (norm.pdf(z) / (1 - alpha)) * sig_h)
    # max(0.0, nan) evaluates to 0.0, which would report "no risk" for
    # undefined inputs; pass the NaN through instead.
    if math.isnan(es):
        return float("nan")
    return float(max(0.0, es))

# Build one row per (confidence level, horizon) pair, with the three VaR
# estimates and the parametric ES side by side.
rows = []
for a in CONF_LEVELS:
    for h in HORIZONS:
        row = dict(
            alpha=a,
            horizon_d=h,
            VaR_param=var_parametric(mu, sig, a, h),
            VaR_hist=var_historical(ret, a, h),
            VaR_mc=var_monte_carlo(mu, sig, a, h, n=MC_N, seed=MC_SEED),
            ES_param=es_parametric(mu, sig, a, h),
        )
        rows.append(row)
res = pd.DataFrame(rows)
res


## 4. Visualizations

In [None]:

# 4.1 Histogram of daily log-returns with the fitted normal density on top
# Normal density evaluated on 400 points spanning the observed return range,
# using the sample mean `mu` and sample volatility `sig`
x = np.linspace(ret.min(), ret.max(), 400)
pdf = (1/(sig*np.sqrt(2*np.pi))) * np.exp(-0.5*((x - mu)/sig)**2)

fig = plt.figure()
plt.hist(ret, bins=60, density=True, alpha=0.6, label="Empirical")
plt.plot(x, pdf, label="Normal fit")
plt.title("Daily Log-Returns — Empirical vs Normal")
plt.legend()
plt.grid(True)
plt.tight_layout()
# Save before show() so the file is written from the finished figure
plt.savefig(f"{FIG_DIR}/return_distribution.png", dpi=200)
plt.show()

# 4.2 Rolling volatility
roll_win = 60
# Rolling sample std of daily returns, annualized with sqrt(252); the first
# roll_win - 1 values are NaN because the window is not yet full
rolling_vol = ret.rolling(roll_win).std() * np.sqrt(252)
# `ret` keeps the integer row labels of `df`, so look up the matching dates
# and plot against calendar time instead of row numbers
rolling_dates = df.loc[rolling_vol.index, "datetime"]
fig = plt.figure()
plt.plot(rolling_dates.values, rolling_vol.values, label=f"Rolling Vol ({roll_win}d, annualized)")
plt.title("Rolling Volatility")
plt.legend(); plt.grid(True); plt.tight_layout()
plt.savefig(f"{FIG_DIR}/rolling_vol.png", dpi=200)
plt.show()

# 4.3 VaR comparison (1d, first alpha)
alpha = CONF_LEVELS[0] if CONF_LEVELS else 0.95
# ":g" keeps "95" for 0.95 but, unlike int(alpha*100), does not truncate
# levels such as 0.975 (-> "97.5")
alpha_pct = f"{alpha * 100:g}"
V_p = var_parametric(mu, sig, alpha, 1)
V_h = var_historical(ret, alpha, 1)
V_m = var_monte_carlo(mu, sig, alpha, 1, n=MC_N, seed=MC_SEED)

# Daily returns re-indexed by date so the x-axis shows calendar time
ret_ts = pd.Series(ret.values, index=df.loc[ret.index, "datetime"])
fig = plt.figure()
plt.plot(ret_ts, label="Daily Return")
# VaR is a positive loss, so the threshold on the return axis is -VaR.
# axhline does not advance the colour cycle, so give each threshold its own
# colour; otherwise all three share the colour of the return line.
plt.axhline(-V_p, linestyle="--", color="C1", label=f"Parametric VaR {alpha_pct}%")
plt.axhline(-V_h, linestyle="--", color="C2", label=f"Hist VaR {alpha_pct}%")
plt.axhline(-V_m, linestyle="--", color="C3", label=f"MC VaR {alpha_pct}%")
plt.title("Returns with VaR Thresholds (1d)")
plt.legend(); plt.grid(True); plt.tight_layout()
plt.savefig(f"{FIG_DIR}/var_comparison_1d_{alpha_pct}.png", dpi=200)
plt.show()


## 5. Backtesting (Parametric 1d, 95%)

In [None]:

alpha_bt = 0.95
# 1-day parametric VaR from the full-sample mean and volatility; the same
# threshold is applied to every year below
V_bt = var_parametric(mu, sig, alpha_bt, 1)
ret_ts = pd.Series(ret.values, index=df.loc[ret.index, "datetime"])

# Count, per calendar year, the days whose return fell below -VaR
bt_columns = ["year", "n_days", "exceed", "ratio"]
years = sorted(set(ret_ts.index.year))
rows_bt = []
for yr in years:
    # `years` is derived from the index, so every selection is non-empty
    sel = ret_ts[ret_ts.index.year == yr]
    exceed = (sel < -V_bt).sum()
    rows_bt.append({"year": int(yr), "n_days": int(len(sel)), "exceed": int(exceed), "ratio": exceed/len(sel)})
# Naming the columns keeps an empty sample from failing in sort_values("year")
bt = pd.DataFrame(rows_bt, columns=bt_columns).sort_values("year")
bt


## 6. Export

In [None]:

today = pd.Timestamp.today().strftime("%Y-%m-%d")
# One wide row: run metadata and basic statistics first, then one column per
# (method, confidence level, horizon)
sumrow = {
    "symbol": SYMBOL, "start": START, "end": END,
    "daily_mean": mu, "daily_vol": sig,
    "ann_return": ann_mu, "ann_vol": ann_vol,
    "generated_on": today, "source": DATA_SOURCE
}
for a in CONF_LEVELS:
    # ":g" keeps "95"/"99" for the usual levels but, unlike int(a*100), does
    # not truncate 0.975 to "97" (which would overwrite the 0.97 columns)
    pct = f"{a * 100:g}"
    for h in HORIZONS:
        tag = f"{pct}_{h}d"
        sumrow[f"VaR_param_{tag}"] = var_parametric(mu, sig, a, h)
        sumrow[f"VaR_hist_{tag}"]  = var_historical(ret, a, h)
        sumrow[f"VaR_mc_{tag}"]    = var_monte_carlo(mu, sig, a, h, n=MC_N, seed=MC_SEED)
        sumrow[f"ES_param_{tag}"]  = es_parametric(mu, sig, a, h)

# Make sure the target folder exists even if this cell is rerun after the
# folder was removed
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
out_path = f"{OUT_DIR}/summary.csv"
pd.DataFrame([sumrow]).to_csv(out_path, index=False, encoding="utf-8")
print("Summary saved to:", out_path)
print("Figures saved to:", FIG_DIR)



## Notes
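- Parametric VaR assumes normally distributed returns; if the tails are heavy, risk can be underestimated.
- Historical VaR is non-parametric and depends on the sample window; horizons longer than one day are obtained by scaling the 1-day quantile by √h.
- Monte Carlo VaR follows the assumed distribution of returns; here it simulates from the same normal model as the parametric method, so the two should agree up to sampling error. Parameter choice matters.
- Backtesting: for a 95% VaR, the theoretical exceedance rate is ≈ 5% of trading days. The threshold is estimated on the full sample, so this is an in-sample check.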
