In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from scipy.stats import uniform, beta, norm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import skew, kurtosis
from tqdm import tqdm

# Data Processing

In [3]:
def calculate_descriptive_statistics(data):
    """
    Calculate descriptive statistics for each column in the dataset.

    Missing values are ignored for every statistic. ``DataFrame.mean`` and
    ``DataFrame.std`` skip NaN by default, so skewness and kurtosis are
    computed on each column's non-missing values to stay consistent with them
    (SciPy would otherwise return NaN for any column containing a NaN).

    Parameters:
        data (pd.DataFrame): DataFrame with numerical data for each index.

    Returns:
        pd.DataFrame: One row per input column, with the columns Mean,
        Std. Dev., Skew, and Kurtosis (excess kurtosis).
    """
    stats = {
        "Mean": data.mean(),
        "Std. Dev.": data.std(),
        # Drop NaN per column so these agree with the NaN-skipping mean/std above.
        "Skew": data.apply(lambda x: skew(x.dropna())),
        # fisher=True gives excess kurtosis (a normal distribution scores 0).
        "Kurtosis": data.apply(lambda x: kurtosis(x.dropna(), fisher=True)),
    }

    return pd.DataFrame(stats)

In [4]:
# List of yfinance-compatible tickers
tickers = [
    "SPY",      # S&P 500 ETF (large-cap U.S. equities)
    "IWM",      # iShares Russell 2000 ETF (small-cap U.S. equities)
    "QQQ",      # Nasdaq 100 ETF (tech-heavy U.S. equities)
    "IEF",      # iShares 7-10 Year Treasury Bond ETF (intermediate bonds)
    "TLT",      # iShares 20+ Year Treasury Bond ETF (long-term bonds)
    "BND",      # Vanguard Total Bond Market ETF (broad bond market)
    "VNQ",      # Vanguard Real Estate ETF (U.S. REITs)
    "GLD",      # SPDR Gold Shares (gold commodity)
    "DBC",      # Invesco DB Commodity Index Tracking Fund (broad commodities)
    "VTI"       # Vanguard Total Stock Market ETF (overall U.S. equities)
]

# Sample window for the monthly price download: November 2011 to November 2024 (13 years)
start_date = "2011-11-01"
end_date = "2024-11-01"

# Fetch monthly data for each ticker
monthly_returns = {}
for ticker in tickers:
    # auto_adjust=False is passed explicitly so the 'Adj Close' column exists:
    # newer yfinance releases default to auto_adjust=True, which drops that
    # column and makes the lookup below raise KeyError.
    data = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        interval='1mo',
        progress=False,
        auto_adjust=False,
    )['Adj Close']
    # Newer yfinance releases can return MultiIndex columns even for a single
    # ticker, in which case the selection above is a one-column DataFrame.
    # Reduce it to a Series so the dict below yields one column per ticker.
    if isinstance(data, pd.DataFrame):
        data = data.iloc[:, 0]
    returns = data.pct_change().dropna() * 100  # Monthly returns in percent
    monthly_returns[ticker] = returns

# Combine all into a single DataFrame
monthly_returns_df = pd.DataFrame(monthly_returns)
monthly_returns_df.index.name = "Date"

# Abbreviation mapping for tickers
abbreviation_mapping = {
    "SPY": "USE",     # Large-cap U.S. equities
    "IWM": "USSC",    # Small-cap U.S. equities
    "QQQ": "UST",     # Technology-focused U.S. equities
    "IEF": "USB",     # Intermediate-term U.S. bonds
    "TLT": "LTB",     # Long-term U.S. bonds
    "BND": "BB",      # Broad U.S. bond market
    "VNQ": "USR",     # U.S. REITs
    "GLD": "GC",      # Gold commodity
    "DBC": "BC",      # Broad commodities
    "VTI": "TSE"      # Total U.S. equities
}

# Rename columns based on the abbreviation mapping (rebinding instead of
# mutating in place keeps the cell's data flow explicit)
monthly_returns_df = monthly_returns_df.rename(columns=abbreviation_mapping)
monthly_returns_df

Unnamed: 0_level_0,USE,USSC,UST,USB,LTB,BB,USR,GC,BC,TSE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-12-01,0.408023,0.027096,-0.993054,1.812884,3.124942,0.468271,3.701016,-10.662434,-2.894364,0.406013
2012-01-01,5.301066,7.668424,8.833274,1.239516,0.237954,1.602860,7.558314,11.395481,3.688533,5.667604
2012-02-01,4.340546,2.568979,6.410073,-1.277179,-2.829936,-0.202185,-1.150745,-2.964978,5.353931,4.218485
2012-03-01,2.766024,2.171502,4.875012,-1.573861,-4.224700,-0.497453,4.361405,-1.320834,-1.773532,2.627464
2012-04-01,-0.232274,-1.316607,-1.001761,2.499200,4.801546,1.003220,3.686084,-0.148032,-1.354165,-0.210355
...,...,...,...,...,...,...,...,...,...,...
2024-06-01,3.195099,-1.399620,6.301166,1.215949,1.825925,0.877788,0.624706,-0.134700,-0.171899,2.714648
2024-07-01,1.537427,10.643647,-1.521827,2.890107,3.654346,2.353127,9.281175,5.367196,-2.798104,2.252879
2024-08-01,2.336556,-1.688551,1.103867,1.349336,2.095658,1.453734,5.220651,2.092249,-2.081485,2.131556
2024-09-01,1.788252,0.368044,2.477588,1.386690,2.007476,1.317303,2.407239,5.088851,0.723654,1.717074


In [5]:
# Summarise every return column: mean, standard deviation, skew and kurtosis
descriptive_stats = calculate_descriptive_statistics(data=monthly_returns_df)
descriptive_stats

Unnamed: 0,Mean,Std. Dev.,Skew,Kurtosis
USE,1.223233,4.144436,-0.44468,0.978389
USSC,0.971101,5.501755,-0.333032,1.637718
UST,1.598465,5.009283,-0.273879,0.221405
USB,0.117529,1.868576,0.037398,0.112857
LTB,0.131238,3.904736,0.319343,0.167597
BB,0.15236,1.39252,-0.028801,1.141868
USR,0.800214,5.064621,-0.412086,1.282607
GC,0.351601,4.363482,0.188758,-0.083944
BC,0.039117,4.790997,-0.398904,0.449246
TSE,1.201381,4.253099,-0.459501,1.213145


# Model Construction

In [6]:
from CMSV import CMSV
from SMSV import SMSV
from SMSVEMA import SMSVEMA
from SMSVEMASO import SMSVEMASO

In [46]:
# The three AD tables start out empty and share the same dates:
# every row of the returns table from position 48 onward.
AD1_df, AD2_df, AD3_df = (
    pd.DataFrame(index=monthly_returns_df.index[48:]) for _ in range(3)
)

# Holds one expected-return DataFrame per model, keyed by name (filled in below).
return_dfs = dict()

## CMSV

In [52]:
# Run CMSV on the two bond return series
bond_tickers = ['USB','BB']
return_dfs['CMSV_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

for bond_ticker in bond_tickers:
    returns = monthly_returns_df[bond_ticker]
    observation, dates = returns.values, returns.index

    expected_returns, AD1, AD2, AD3 = CMSV(observation, L = 1_000_000)

    # Only the estimates from position 48 onward are kept, matching the frame's index.
    return_dfs['CMSV_df'][f'{bond_ticker}_CMSV'] = expected_returns[48:]

    # NOTE(review): the AD outputs are stored unsliced, so CMSV presumably
    # returns them already restricted to the same window; confirm.
    for ad_frame, ad_values, tag in (
        (AD1_df, AD1, 'AD1'),
        (AD2_df, AD2, 'AD2'),
        (AD3_df, AD3, 'AD3'),
    ):
        ad_frame[f'{bond_ticker}_CMSV_{tag}'] = ad_values

return_dfs['CMSV_df'].head()

100%|██████████| 154/154 [00:42<00:00,  3.58it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:43<00:00,  3.52it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USB_CMSV,BB_CMSV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-01,0.155972,0.166779
2016-01-01,0.128244,0.147715
2016-02-01,0.207994,0.188314
2016-03-01,0.229341,0.199527
2016-04-01,0.229766,0.215681


## SMSV

In [39]:
# Every column that is not a bond ticker is handled by the SMSV / SMSV-EMA models
other_tickers = [
    column_name
    for column_name in monthly_returns_df.columns
    if column_name not in bond_tickers
]

In [55]:
return_dfs['SMSV_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

# Run SMSV on each non-bond return series
for ticker in other_tickers:
    returns = monthly_returns_df[ticker]
    observation, dates = returns.values, returns.index

    expected_returns, AD1, AD2, AD3 = SMSV(observation, L = 10_000)

    # Only the estimates from position 48 onward are kept, matching the frame's index.
    return_dfs['SMSV_df'][f'{ticker}_SMSV'] = expected_returns[48:]

    # NOTE(review): the AD outputs are stored unsliced, so SMSV presumably
    # returns them already restricted to the same window; confirm.
    for ad_frame, ad_values, tag in (
        (AD1_df, AD1, 'AD1'),
        (AD2_df, AD2, 'AD2'),
        (AD3_df, AD3, 'AD3'),
    ):
        ad_frame[f'{ticker}_SMSV_{tag}'] = ad_values

return_dfs['SMSV_df'].head()

100%|██████████| 154/154 [00:00<00:00, 314.19it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 313.84it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 345.68it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 343.90it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 348.82it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 349.87it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 348.73it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 331.92it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USE_SMSV,USSC_SMSV,UST_SMSV,LTB_SMSV,USR_SMSV,GC_SMSV,BC_SMSV,TSE_SMSV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-12-01,1.342988,1.455152,1.807747,-0.037376,1.172989,-1.097444,-1.550323,1.329898
2016-01-01,1.112652,1.112331,1.652483,0.009696,1.090225,-0.74343,-1.932783,1.476314
2016-02-01,0.920565,0.802892,1.377988,0.759389,0.950087,-0.121442,-1.933316,1.365029
2016-03-01,1.030285,0.908846,1.220196,0.807402,0.917645,0.558472,-1.239547,1.137724
2016-04-01,1.486679,1.363866,1.720494,0.381276,0.992862,-0.313581,-0.253902,1.01908


## SMSV_EMASO

In [56]:
return_dfs['SMSV_EMASO_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

# Run SMSVEMASO on each non-bond return series
for ticker in other_tickers:
    returns = monthly_returns_df[ticker]
    observation, dates = returns.values, returns.index

    expected_returns, AD1, AD2, AD3 = SMSVEMASO(observation, L = 10_000)

    # Only the estimates from position 48 onward are kept, matching the frame's index.
    return_dfs['SMSV_EMASO_df'][f'{ticker}_SMSV_EMASO'] = expected_returns[48:]

    # NOTE(review): the AD outputs are stored unsliced, so SMSVEMASO presumably
    # returns them already restricted to the same window; confirm.
    for ad_frame, ad_values, tag in (
        (AD1_df, AD1, 'AD1'),
        (AD2_df, AD2, 'AD2'),
        (AD3_df, AD3, 'AD3'),
    ):
        ad_frame[f'{ticker}_SMSV_EMASO_{tag}'] = ad_values

return_dfs['SMSV_EMASO_df'].head()

100%|██████████| 154/154 [00:00<00:00, 368.87it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 353.41it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 372.02it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 372.01it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 366.24it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 368.89it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 375.23it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 375.80it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USE_SMSV_EMASO,USSC_SMSV_EMASO,UST_SMSV_EMASO,LTB_SMSV_EMASO,USR_SMSV_EMASO,GC_SMSV_EMASO,BC_SMSV_EMASO,TSE_SMSV_EMASO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-12-01,0.855183,0.770376,1.418594,-0.09883,1.019998,-1.552436,-3.391618,0.860052
2016-01-01,0.4134,-0.335459,0.809959,-0.206743,0.858642,-1.209505,-3.997484,0.21985
2016-02-01,-0.287157,-1.675161,-0.405909,1.377181,0.166744,-0.137094,-4.159615,-0.730317
2016-03-01,-0.242586,-1.281901,-0.599636,1.782986,0.12219,1.834067,-3.026626,-0.519112
2016-04-01,0.809507,0.325201,0.842284,1.181415,1.801256,0.953921,-1.104814,0.697973


## SMSV_EMA1~9

In [51]:
# Run SMSVEMA nine times, once per β in 0.1, 0.2, ..., 0.9, each into its own DataFrame
for i in range(1, 10):
    frame_key = f"SMSV_EMA{i}_df"
    # Start from an empty frame indexed by the dates from position 48 onward
    return_dfs[frame_key] = pd.DataFrame(index=monthly_returns_df.index[48:])

    # SMSV + EMA on each non-bond return series
    for ticker in other_tickers:
        returns = monthly_returns_df[ticker]
        observation, dates = returns.values, returns.index

        expected_returns, AD1, AD2, AD3 = SMSVEMA(observation, L = 1_000_000, β = i/10)

        return_dfs[frame_key][f'{ticker}_SMSV_EMA{i}'] = expected_returns[48:]

        # NOTE(review): the AD outputs are stored unsliced, so SMSVEMA presumably
        # returns them already restricted to the same window; confirm.
        for ad_frame, ad_values, tag in (
            (AD1_df, AD1, 'AD1'),
            (AD2_df, AD2, 'AD2'),
            (AD3_df, AD3, 'AD3'),
        ):
            ad_frame[f'{ticker}_SMSV_EMA{i}_{tag}'] = ad_values

return_dfs["SMSV_EMA1_df"].head()

100%|██████████| 154/154 [01:05<00:00,  2.37it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:04<00:00,  2.38it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:06<00:00,  2.33it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:06<00:00,  2.32it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:10<00:00,  2.19it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:06<00:00,  2.33it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:08<00:00,  2.25it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:09<00:00,  2.22it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:05<00:00,  2.34it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:05<00:00,  2.33it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [01:09<00:00,  2.22it/s]


Unnamed: 0_level_0,USE_SMSV_EMA1,USSC_SMSV_EMA1,UST_SMSV_EMA1,LTB_SMSV_EMA1,USR_SMSV_EMA1,GC_SMSV_EMA1,BC_SMSV_EMA1,TSE_SMSV_EMA1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-12-01,0.801975,0.546701,1.446897,0.301777,0.798931,-1.146545,-2.67661,0.730992
2016-01-01,0.496539,-0.055528,1.112248,0.215934,0.761259,-1.075786,-2.983136,0.390833
2016-02-01,0.000576,-0.868584,0.347799,0.800025,0.472714,-0.444111,-3.123269,-0.164211
2016-03-01,0.000658,-0.803977,0.145316,1.008257,0.387561,0.697679,-2.823348,-0.148983
2016-04-01,0.61871,0.040062,0.790514,0.899168,1.275447,0.543157,-2.128436,0.524666


In [57]:
# Inspect the AD1 table: one column per ticker/model combination fitted above.
AD1_df

Unnamed: 0_level_0,USB_CMSV_AD1,BB_CMSV_AD1,USE_SMSV_AD1,USSC_SMSV_AD1,UST_SMSV_AD1,LTB_SMSV_AD1,USR_SMSV_AD1,GC_SMSV_AD1,BC_SMSV_AD1,TSE_SMSV_AD1,...,BC_SMSV_EMA8_AD1,TSE_SMSV_EMA8_AD1,USE_SMSV_EMA9_AD1,USSC_SMSV_EMA9_AD1,UST_SMSV_EMA9_AD1,LTB_SMSV_EMA9_AD1,USR_SMSV_EMA9_AD1,GC_SMSV_EMA9_AD1,BC_SMSV_EMA9_AD1,TSE_SMSV_EMA9_AD1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2016-01-01,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-03-01,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2016-04-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-01,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-07-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2024-08-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2024-09-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [63]:
import os

# Define the folder path where files will be saved
output_folder = "Data"

# DataFrame.to_csv does not create missing directories, so make sure the
# folder exists before writing (no-op when it is already there).
os.makedirs(output_folder, exist_ok=True)

# Export each DataFrame in return_dfs to "<key>.csv", keeping the Date index
for key, df in return_dfs.items():
    file_path = os.path.join(output_folder, f"{key}.csv")
    df.to_csv(file_path, index=True)

# Export AD DataFrames
AD1_df.to_csv(os.path.join(output_folder, 'AD1_df.csv'))
AD2_df.to_csv(os.path.join(output_folder, 'AD2_df.csv'))
AD3_df.to_csv(os.path.join(output_folder, 'AD3_df.csv'))

In [61]:
# Record the name of every exported table (the return tables plus the three AD tables)
all_keys = [*return_dfs.keys(), 'AD1_df', 'AD2_df', 'AD3_df']
all_keys_df = pd.DataFrame({'key': all_keys})

# Write the list of names next to the exported tables
keys_csv_path = os.path.join(output_folder, 'all_keys_df.csv')
all_keys_df.to_csv(keys_csv_path)

# Run this cell to read the saved CSV files

In [64]:
# Retrieve the previously calculated expected returns and ADs from the CSV files
all_keys_list = pd.read_csv('Data/all_keys_df.csv')['key'].tolist()
all_dfs = {}

for key in all_keys_list:
    loaded = pd.read_csv(f"Data/{key}.csv")
    # Parse the Date column and use it as the index
    loaded['Date'] = pd.to_datetime(loaded['Date'])
    all_dfs[key] = loaded.set_index('Date')