In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [23]:
# Period names accepted by the notebook, mapped to the interval codes passed to yfinance.
INTERVAL_MAP = {'daily': '1d', 'weekly': '1wk', 'monthly': '1mo', '3month': '3mo'}

def data_prep(ticker, start="2010-01-01", end="2025-12-31", interval="daily", data=None):
    """
    Build the price table with derived return columns.

    Args:
        ticker: symbol to download (only used when `data` is None).
        start, end: date range of the download (only used when `data` is None).
        interval: 'daily', 'weekly', 'monthly' or '3month'; any other value is
            handed to yfinance unchanged, so raw codes such as '1d' also work.
        data: optional OHLC DataFrame to use instead of downloading; it must
            contain Open / High / Low / Close / Adj Close.

    Returns:
        A new DataFrame, sorted by index in descending order (newest first),
        holding the five price columns plus:
            C-C Returns: change of Adj Close relative to the following row
            H-L Returns: (High - Low) / Low of the same row
            O-C Returns: (Close - Open) / Open of the same row
            Randomness:  |C-C Returns - H-L Returns|
    """
    price_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close']

    def relative_change(value, base):
        # Fractional change of `value` with respect to `base`.
        return (value - base) / base

    if data is not None:
        source = data
    else:
        yf_interval = INTERVAL_MAP.get(interval, interval)
        source = yf.download(ticker, start=start, end=end, interval=yf_interval, auto_adjust=False)

    # Copy first so the caller's frame is never modified, then put the newest row on top.
    df = source[price_cols].copy().sort_index(ascending=False)

    adj_close = df['Adj Close']
    if isinstance(adj_close, pd.DataFrame):
        # The selection can be a frame rather than a Series (presumably with
        # multi-level column labels - confirm); keep its first column.
        adj_close = adj_close.iloc[:, 0]

    # With newest-first ordering the following row is the earlier period.
    df['C-C Returns'] = relative_change(adj_close, adj_close.shift(-1))
    df['H-L Returns'] = relative_change(df['High'], df['Low'])
    df['O-C Returns'] = relative_change(df['Close'], df['Open'])
    df['Randomness'] = (df['C-C Returns'] - df['H-L Returns']).abs()

    return df

In [22]:
# Short aliases accepted by `returns_type`, mapped to the return column they select.
RETURNS_MAP = {'cc': 'C-C Returns', 'hl': 'H-L Returns', 'oc': 'O-C Returns'}

def distribution_of_return(prices=None, ticker="SPY", start="2010-01-01", end="2025-12-31", interval="daily",
                        returns_type="cc", merge_bound_left=-0.02, merge_bound_right=0.02, bin_width=0.005, plot=True):
    """
    Full distribution analysis of one return series.

    Computes descriptive statistics, sign (positive / negative / zero) statistics,
    1-3 standard-deviation coverage compared with the normal distribution, a
    binned frequency table with merged tails and a Jarque-Bera normality test.
    The tables are printed/displayed and, optionally, a histogram with a normal
    fit and a Q-Q plot are drawn.

    Args:
        prices: DataFrame containing the requested return column. When None it
            is built with data_prep(ticker, start=..., end=..., interval=...).
        ticker, start, end: only used when `prices` is None.
        interval: 'daily', 'weekly', 'monthly' or '3month'; only used when
            `prices` is None.
        returns_type: 'cc', 'hl' or 'oc' (case-insensitive aliases of
            'C-C Returns', 'H-L Returns', 'O-C Returns') or a column name of
            `prices`.
        merge_bound_left: values below this bound share one left-tail bin.
        merge_bound_right: values at or above this bound share one right-tail
            bin (np.histogram closes only the last bin on the right, so a value
            exactly equal to the bound lands in the tail labelled "> bound").
        bin_width: target width of the bins between the two bounds. The number
            of bins is round(span / bin_width), so the effective width differs
            when the span is not a multiple of bin_width.
        plot: draw the histogram and Q-Q plot when True.

    Returns:
        dict with keys: returns (the cleaned Series), returns_type (column
        name), bins (bin edges incl. +-inf), stats_df, sign_stats, std_dev_df,
        freq_df, jb_stat, jb_pval.

    Raises:
        ValueError: if bin_width is not positive, the bounds are reversed, the
            requested column is missing, or the column has no non-missing value.
    """
    # Validate the binning parameters before any (possibly slow) download.
    if bin_width <= 0:
        raise ValueError(f"bin_width must be positive, got {bin_width}.")
    if merge_bound_left > merge_bound_right:
        raise ValueError(
            f"merge_bound_left ({merge_bound_left}) must not exceed merge_bound_right ({merge_bound_right})."
        )

    if prices is None:
        prices = data_prep(ticker, start=start, end=end, interval=interval)

    col = RETURNS_MAP.get(str(returns_type).lower(), returns_type)  # alias -> column, else use as given
    if col not in prices.columns:
        raise ValueError(f"Column '{col}' not found. Use returns_type='cc'|'hl'|'oc' or ensure prices has C-C/H-L/O-C Returns.")

    cc = prices[col].dropna()
    if isinstance(cc, pd.DataFrame):
        # Column selection may yield a one-column frame; reduce it to a Series.
        cc = cc.iloc[:, 0]

    n = len(cc)
    if n == 0:
        # Happens e.g. when a download returned no rows; every statistic below would be NaN.
        raise ValueError(f"Column '{col}' has no non-missing values; nothing to analyse.")

    results = {}

    # 1. Descriptive statistics
    stats_df = pd.DataFrame({
        'Statistic': ['Mean', 'Std', 'Skewness', 'Kurtosis', 'Min', '25%', 'Median', '75%', 'Max', 'N'],
        'Value': [cc.mean(), cc.std(), cc.skew(), cc.kurtosis(), cc.min(), cc.quantile(0.25),
                  cc.median(), cc.quantile(0.75), cc.max(), n]
    })
    results['stats_df'] = stats_df

    # 2. Sign statistics
    sign_masks = {'> 0': cc > 0, '< 0': cc < 0, '= 0': cc == 0}
    sign_freq = [int(mask.sum()) for mask in sign_masks.values()]
    sign_stats = pd.DataFrame({
        'Cond': list(sign_masks),
        'Freq': sign_freq,
        'Pct(%)': [round(freq / n * 100, 4) for freq in sign_freq],
        # An empty positive/negative group has no mean (NaN); an empty zero group reports 0.0.
        'Mean': [cc[sign_masks['> 0']].mean() if sign_freq[0] else np.nan,
                 cc[sign_masks['< 0']].mean() if sign_freq[1] else np.nan,
                 cc[sign_masks['= 0']].mean() if sign_freq[2] else 0.0]
    })
    sign_stats['freq_adj_returns'] = sign_stats['Mean'] * (sign_stats['Pct(%)'] / 100)
    results['sign_stats'] = sign_stats

    # 3. 1-3 Std Dev coverage.
    # Use the same sample mean/std (ddof=1) that stats_df reports and the
    # normal-fit overlay uses, so the band bounds agree with the printed Std.
    cc_vals = np.asarray(cc).ravel()
    mu, sigma = cc.mean(), cc.std()
    ks = [1, 2, 3]
    actual_counts = [int(np.sum((cc_vals >= mu - k*sigma) & (cc_vals <= mu + k*sigma))) for k in ks]
    actual_pct = [c / n * 100 for c in actual_counts]
    normal_pct = [(2*stats.norm.cdf(k)-1)*100 for k in ks]
    std_dev_df = pd.DataFrame({
        'Upper Bound': [f'{(mu + k*sigma)*100:.2f}%' for k in ks],
        'Lower Bound': [f'{(mu - k*sigma)*100:.2f}%' for k in ks],
        'Actual Count': actual_counts,
        'Actual % Count': [f'{p:.2f}%' for p in actual_pct],
        'Normal % Count': [f'{p:.2f}%' for p in normal_pct],
    }, index=['1 std', '2 std', '3 std'])
    results['std_dev_df'] = std_dev_df

    # 4. Bin distribution: one merged bin per tail, equal-width bins in between.
    span = merge_bound_right - merge_bound_left
    # Keep at least one middle bin when the bounds differ, otherwise the right bound would vanish.
    n_mid = max(1, int(round(span / bin_width))) if span > 0 else 0
    mid_edges = np.linspace(merge_bound_left, merge_bound_right, n_mid + 1)
    bins = np.concatenate([[-np.inf], mid_edges, [np.inf]])
    counts, _ = np.histogram(cc, bins=bins)
    last_bin = len(bins) - 2

    def bin_label(i):
        # First and last bins are the merged tails; the others are half-open intervals.
        if i == 0:
            return f"< {merge_bound_left}"
        if i == last_bin:
            return f"> {merge_bound_right}"
        return f"[{bins[i]:.3f}, {bins[i+1]:.3f})"

    freq_df = pd.DataFrame({
        'Bin': [bin_label(i) for i in range(len(bins)-1)],
        'Freq': counts, 'Pct(%)': (counts / n * 100).round(4),
        'probability': (counts / n).round(6),
        'cumulative_probability': (counts / n).cumsum().round(6)
    })
    # Cumulative values are computed over all bins first, then empty bins are hidden.
    freq_df = freq_df[freq_df['Freq'] > 0].reset_index(drop=True)
    results['freq_df'] = freq_df
    results['bins'] = bins
    results['returns'] = cc
    results['returns_type'] = col

    # 5. Normality test
    jb_stat, jb_pval = stats.jarque_bera(cc)
    results['jb_stat'], results['jb_pval'] = jb_stat, jb_pval

    # `display` only exists when running under IPython/Jupyter; fall back to print elsewhere.
    try:
        show = display
    except NameError:
        show = print

    # Print & display
    print("=" * 50, f"\n{col} Descriptive Statistics\n", "=" * 50)
    print(stats_df.to_string(index=False))
    print("\n" + "=" * 50, f"\n{col} Sign Statistics\n", "=" * 50)
    w1, w2, w3, w4, w5 = 6, 8, 12, 14, 18
    print(f"{'Cond':<{w1}}{'Freq':>{w2}}{'Pct(%)':>{w3}}{'Mean':>{w4}}{'freq_adj_returns':>{w5}}")
    for _, r in sign_stats.iterrows():
        avg = r['Mean'] if pd.notna(r['Mean']) else 0
        print(f"{r['Cond']:<{w1}}{int(r['Freq']):>{w2}}{r['Pct(%)']:>{w3}.4f}{avg:>{w4}.6f}{r['freq_adj_returns']:>{w5}.6f}")
    print("\n" + "=" * 70, "\n1-3 Std Dev: Actual vs Normal\n", "=" * 70)
    show(std_dev_df)
    print(f"\n" + "=" * 80 + f"\nBin Distribution (<{merge_bound_left} merged, >{merge_bound_right} merged, middle {bin_width}/bin)\n" + "=" * 80)
    show(freq_df)
    print("\n" + "=" * 50, "\nNormality Test\n", "=" * 50)
    print(f"Jarque-Bera: stat={jb_stat:.4f}, p-value={jb_pval:.4e}")
    print(f"  -> {'Reject normality' if jb_pval < 0.05 else 'Cannot reject normality'} (alpha=0.05)")

    if plot:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        # Replace the infinite outer edges with finite ones just beyond the data so the tails can be drawn.
        bins_plot = bins.copy()
        bins_plot[bins_plot == -np.inf] = min(cc.min(), merge_bound_left) - 0.01
        bins_plot[bins_plot == np.inf] = max(cc.max(), merge_bound_right) + 0.01
        axes[0].hist(cc, bins=bins_plot, density=True, alpha=0.6, color='steelblue', edgecolor='white', label=col)
        x = np.linspace(cc.min(), cc.max(), 200)
        axes[0].plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2, label='Normal fit')
        axes[0].set_xlabel(col); axes[0].set_ylabel('density'); axes[0].set_title(f'{col} Distribution')
        axes[0].legend(); axes[0].grid(True, alpha=0.3)
        stats.probplot(cc, dist="norm", plot=axes[1])
        axes[1].set_title('Q-Q Plot'); axes[1].grid(True, alpha=0.3)
        plt.tight_layout(); plt.show()

    return results

In [None]:
# Quick start: choose interval = 'daily' | 'weekly' | 'monthly' | '3month'
# prices = data_prep("SPY", interval="daily")
prices = data_prep("SPY", interval="weekly")   # weekly bars
# prices = data_prep("SPY", interval="monthly") # monthly bars
# prices = data_prep("SPY", interval="3month")  # three-month bars
prices.head()

In [None]:
# Return-distribution analysis: returns_type='cc'|'hl'|'oc' (C-C/H-L/O-C Returns)
results = distribution_of_return(prices=prices, returns_type="cc", plot=True)
# results = distribution_of_return(prices=prices, returns_type="hl", plot=True)  # H-L
# results = distribution_of_return(prices=prices, returns_type="oc", plot=True)  # O-C

# Or pass a ticker and an interval directly:
# results = distribution_of_return(ticker="AAPL", interval="weekly", start="2020-01-01", end="2024-12-31", plot=True)
# interval: 'daily', 'weekly', 'monthly' or '3month'

# The returned `results` dict holds: returns, returns_type, bins, stats_df, sign_stats, std_dev_df, freq_df, jb_stat, jb_pval


In [None]:
# Or pass a ticker and an interval directly
# (interval: 'daily', 'weekly', 'monthly' or '3month').
results = distribution_of_return(
    ticker="GOOG",
    start="2000-01-01",
    end="2025-12-31",
    interval="weekly",
    returns_type="cc",
    merge_bound_left=-0.1,
    merge_bound_right=0.1,
    bin_width=0.02,
    plot=True,
)

# The returned `results` dict holds: returns, returns_type, bins, stats_df, sign_stats, std_dev_df, freq_df, jb_stat, jb_pval

In [17]:
# Asset universe, written once per asset class as (ticker, note) pairs and
# flattened into one row per asset with the columns Asset, Class, Notes.
# A note of None means the asset carries no remark.
assets_df = pd.DataFrame(
    [
        (ticker, asset_class, note)
        for asset_class, members in {
            "Government Bonds": [
                ("TLT", "20+ yr"),
                ("IEF", "7-10 yr"),
                ("IEI", "3-7 yr"),
                ("SHY", "1-3 yr"),
            ],
            "Corporate Bonds": [
                ("LQD", "Investment Grade"),
                ("VCIT", "Investment Grade"),
                ("JNK", "Junk"),
                ("HYG", "Junk"),
            ],
            "FX Major": [
                (pair, None) for pair in ("EURUSD", "USDJPY", "GBPUSD", "AUDUSD")
            ],
            "FX EM": [
                (pair, None) for pair in ("USDZAR", "USDBRL", "USDTRY")
            ],
            "Equity Index": [
                ("SPY", "S&P 500"),
                ("QQQ", "Nasdaq 100"),
                ("DIA", "Dow Jones Industrial"),
                ("IWV", "Russell 3000"),
                ("IWM", "Russell 2000"),
            ],
            "Equity - Mega Cap": [
                (name, "FAANMG")
                for name in ("META", "AAPL", "AMZN", "NFLX", "MSFT", "GOOG")
            ],
            "Equity - Large Cap": (
                [(name, "Discretionary/Industrial")
                 for name in ("MAR", "LVS", "LEN", "BBY", "ODFL", "LUV", "PCAR", "JCI")]
                + [("WMT", "Discretionary/Industrial/Tech?")]
                + [(name, "F&B, Healthcare, Utilities")
                   for name in ("HSY", "CPB", "STZ", "MNST", "BIIB", "ALGN", "XEL", "PPL")]
                + [(name, "Semiconductor") for name in ("SNDK", "MU", "TSM")]
                + [("PLTR", "AI/Military")]
            ),
            "Equity - Mid Cap": [
                (name, None)
                for name in ("WING", "FRPT", "BILL", "PEGA", "ZNGA", "OLED", "XPO",
                             "FND", "TREX", "MOS", "OLLI", "DKS", "VIRT", "JBLU")
            ],
        }.items()
        for ticker, note in members
    ],
    columns=["Asset", "Class", "Notes"],
)

In [24]:
# For every asset, compute the std of C-C Returns and the mean of H-L Returns at
# daily / weekly / monthly / 3-month frequency (both in percent) and attach the
# eight resulting columns to assets_df.
try:
    from tqdm import tqdm
    iter_assets = tqdm(assets_df['Asset'], desc="Computing C-C std & H-L mean")
except ImportError:
    # tqdm is optional: without it, iterate silently.
    iter_assets = assets_df['Asset']

intervals = ['daily', 'weekly', 'monthly', '3month']
std_cols = ['Std_Daily(%)', 'Std_Weekly(%)', 'Std_Monthly(%)', 'Std_3Month(%)']
hl_mean_cols = ['MeanHL_Daily(%)', 'MeanHL_Weekly(%)', 'MeanHL_Monthly(%)', 'MeanHL_3Month(%)']
start, end = "2010-01-01", "2025-12-31"

# FX pairs are taken from the asset table (classes starting with "FX") instead of
# a second hard-coded list, so the two cannot drift apart.
FX_ASSETS = frozenset(assets_df.loc[assets_df['Class'].str.startswith('FX', na=False), 'Asset'])

def to_yf_ticker(ticker):
    """Return the symbol to request from yfinance: FX pairs get the '=X' suffix, others pass through."""
    if ticker in FX_ASSETS and not str(ticker).endswith('=X'):
        return f"{ticker}=X"
    return ticker

def _to_scalar(x):
    """Return the first element of `x` as a float, or NaN when `x` is empty or that element is not finite."""
    arr = np.asarray(x).ravel()
    return float(arr[0]) if len(arr) > 0 and np.isfinite(arr[0]) else np.nan

std_data, hl_mean_data = [], []
failures = []  # (asset, interval, reason) for every pair that ended up as NaN
for asset in iter_assets:
    std_row, hl_row = [], []
    for iv in intervals:
        # Compute both values first and append exactly once per interval, so an
        # error half-way through can never leave the two rows with different lengths.
        std_val = hl_val = np.nan
        try:
            asset_prices = data_prep(to_yf_ticker(asset), start=start, end=end, interval=iv)
            cc_returns = asset_prices['C-C Returns'].dropna()
            hl_returns = asset_prices['H-L Returns'].dropna()
            if isinstance(cc_returns, pd.DataFrame):
                cc_returns = cc_returns.iloc[:, 0]
            if isinstance(hl_returns, pd.DataFrame):
                hl_returns = hl_returns.iloc[:, 0]
            if len(cc_returns) > 0:
                std_val = _to_scalar(cc_returns.std()) * 100
            if len(hl_returns) > 0:
                hl_val = _to_scalar(hl_returns.mean()) * 100
            if len(cc_returns) == 0 or len(hl_returns) == 0:
                failures.append((asset, iv, "no usable rows returned"))
        except Exception as exc:
            # Best effort: one bad ticker must not abort the whole run, but keep a record of it.
            std_val = hl_val = np.nan
            failures.append((asset, iv, f"{type(exc).__name__}: {exc}"))
        std_row.append(std_val)
        hl_row.append(hl_val)
    std_data.append(std_row)
    hl_mean_data.append(hl_row)

if failures:
    print(f"\n{len(failures)} asset/interval pair(s) have no statistic (NaN):")
    for failed_asset, failed_iv, reason in failures:
        print(f"  {failed_asset} [{failed_iv}]: {reason}")

all_cols = std_cols + hl_mean_cols
# Dropping the current column names makes re-running this cell idempotent.
# NOTE(review): columns written by an older version of this cell under different
# names are not removed here; re-run the cell that defines assets_df to reset them.
base_df = assets_df.drop(columns=all_cols, errors='ignore')
# Reuse assets_df's index so concat aligns row by row even if the index is not 0..n-1.
assets_df = pd.concat([
    base_df,
    pd.DataFrame(std_data, columns=std_cols, index=assets_df.index),
    pd.DataFrame(hl_mean_data, columns=hl_mean_cols, index=assets_df.index)
], axis=1)
assets_df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Unnamed: 0,Asset,Class,Notes,Std_Daily,Std_Weekly,Std_Monthly,Std_3Month,MeanHL_Daily,MeanHL_Weekly,MeanHL_Monthly,MeanHL_3Month,Std_Daily(%),Std_Weekly(%),Std_Monthly(%),Std_3Month(%),MeanHL_Daily(%),MeanHL_Weekly(%),MeanHL_Monthly(%),MeanHL_3Month(%)
0,TLT,Government Bonds,20+ yr,0.009510,0.019484,0.039665,0.080873,0.009218,0.027376,0.061267,0.114944,0.951036,1.948402,3.966498,8.087341,0.921815,2.737608,6.126722,11.494446
1,IEF,Government Bonds,7-10 yr,0.004178,0.008776,0.018549,0.036521,0.003891,0.011931,0.027211,0.050216,0.417775,0.877549,1.854896,3.652147,0.389108,1.193050,2.721107,5.021565
2,IEI,Government Bonds,3-7 yr,0.002361,0.005049,0.010844,0.021045,0.002299,0.007267,0.017992,0.035901,0.236088,0.504859,1.084366,2.104530,0.229891,0.726741,1.799152,3.590057
3,SHY,Government Bonds,1-3 yr,0.000842,0.001892,0.004114,0.008915,0.000748,0.002151,0.004939,0.008938,0.084194,0.189151,0.411395,0.891498,0.074843,0.215141,0.493931,0.893784
4,LQD,Corporate Bonds,Investment Grade,0.004823,0.012331,0.021538,0.037416,0.004568,0.012995,0.029701,0.054904,0.482324,1.233149,2.153772,3.741557,0.456795,1.299458,2.970144,5.490404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,MOS,Equity - Mid Cap,,0.025842,0.056245,0.112918,0.206657,0.032224,0.083732,0.196827,0.379696,2.584157,5.624502,11.291771,20.665650,3.222367,8.373202,19.682736,37.969566
57,OLLI,Equity - Mid Cap,,0.027128,0.056990,0.117943,0.247002,0.037700,0.090863,0.212668,0.408712,2.712773,5.699036,11.794265,24.700245,3.770047,9.086275,21.266791,40.871153
58,DKS,Equity - Mid Cap,,0.025438,0.054482,0.104078,0.229547,0.030800,0.079337,0.182270,0.373035,2.543800,5.448168,10.407771,22.954663,3.080010,7.933744,18.226988,37.303491
59,VIRT,Equity - Mid Cap,,0.022166,0.051122,0.100652,0.207493,0.030235,0.073166,0.166749,0.317315,2.216636,5.112195,10.065248,20.749293,3.023502,7.316597,16.674940,31.731488
