In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from scipy.stats import uniform, beta, norm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import skew, kurtosis
from tqdm import tqdm

# Data Processing

In [2]:
def calculate_descriptive_statistics(data):
    """
    Calculate descriptive statistics for each column in the dataset.

    Missing values are ignored column by column for every statistic.
    pandas already skips NaNs for the mean and standard deviation, whereas
    scipy's skew/kurtosis propagate them by default, so NaNs are dropped
    explicitly before those two calls to keep all four columns consistent.

    Parameters:
        data (pd.DataFrame): DataFrame with numerical data for each index.

    Returns:
        pd.DataFrame: One row per input column, with the columns
            "Mean", "Std. Dev.", "Skew", and "Kurtosis" (excess kurtosis).
    """
    stats = {
        "Mean": data.mean(),
        "Std. Dev.": data.std(),
        # Drop NaNs first so a single missing observation does not turn the
        # whole column's statistic into NaN.
        "Skew": data.apply(lambda x: skew(x.dropna())),
        # fisher=True gives excess kurtosis (a normal distribution scores 0).
        "Kurtosis": data.apply(lambda x: kurtosis(x.dropna(), fisher=True)),
    }

    return pd.DataFrame(stats)

In [3]:
# yfinance-compatible tickers that make up the asset universe
tickers = [
    "SPY",      # S&P 500 ETF (large-cap U.S. equities)
    # "IWM",      # iShares Russell 2000 ETF (small-cap U.S. equities)
    "QQQ",      # Nasdaq 100 ETF (tech-heavy U.S. equities)
    "IEF",      # iShares 7-10 Year Treasury Bond ETF (intermediate bonds)
    # "TLT",      # iShares 20+ Year Treasury Bond ETF (long-term bonds)
    "BND",      # Vanguard Total Bond Market ETF (broad bond market)
    "VNQ",      # Vanguard Real Estate ETF (U.S. REITs)
    "GLD",      # SPDR Gold Shares (gold commodity)
    "DBC",      # Invesco DB Commodity Index Tracking Fund (broad commodities)
    "VTI"       # Vanguard Total Stock Market ETF (overall U.S. equities)
]

# Download window for the monthly price data (2011-11 to 2024-11, i.e. 13 years)
start_date = "2011-11-01"
end_date = "2024-11-01"

# Fetch monthly prices for each ticker and convert them to percentage returns
monthly_returns = {}
for ticker in tickers:
    # auto_adjust=False is passed explicitly so the 'Adj Close' column exists
    # regardless of the installed yfinance version's default.
    data = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        interval='1mo',
        auto_adjust=False,
        progress=False,
    )['Adj Close']
    # NOTE(review): recent yfinance releases can return a one-column DataFrame
    # (ticker-level column index) even for a single symbol; reduce it to a
    # Series so every dict value below has the same shape.
    if isinstance(data, pd.DataFrame):
        data = data.iloc[:, 0]
    returns = data.pct_change().dropna() * 100  # month-over-month return in percent
    monthly_returns[ticker] = returns

# Combine all into a single DataFrame (one column per ticker)
monthly_returns_df = pd.DataFrame(monthly_returns)
monthly_returns_df.index.name = "Date"

# Abbreviation mapping for tickers
abbreviation_mapping = {
    "SPY": "USE",     # Large-cap U.S. equities
    # "IWM": "USSC",    # Small-cap U.S. equities
    "QQQ": "UST",     # Technology-focused U.S. equities
    "IEF": "USB",     # Intermediate-term U.S. bonds
    # "TLT": "LTB",     # Long-term U.S. bonds
    "BND": "BB",      # Broad U.S. bond market
    "VNQ": "USR",     # U.S. REITs
    "GLD": "GC",      # Gold commodity
    "DBC": "BC",      # Broad commodities
    "VTI": "TSE"      # Total U.S. equities
}

# Rename columns based on the abbreviation mapping (reassignment instead of
# inplace=True so the step reads as producing a new frame)
monthly_returns_df = monthly_returns_df.rename(columns=abbreviation_mapping)
monthly_returns_df

Unnamed: 0_level_0,USE,UST,USB,BB,USR,GC,BC,TSE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-01,0.407999,-0.993100,1.812854,0.468225,3.701039,-10.662434,-2.894356,0.405990
2012-01-01,5.301098,8.833323,1.239497,1.602965,7.558290,11.395481,3.688533,5.667573
2012-02-01,4.340509,6.410051,-1.277225,-0.202256,-1.150766,-2.964978,5.353931,4.218520
2012-03-01,2.766039,4.875052,-1.573834,-0.497543,4.361406,-1.320834,-1.773532,2.627491
2012-04-01,-0.232240,-1.001805,2.499286,1.003246,3.686105,-0.148032,-1.354158,-0.210408
...,...,...,...,...,...,...,...,...
2024-06-01,3.195099,6.301166,1.215949,0.877777,0.624706,-0.134700,-0.171899,2.714648
2024-07-01,1.537427,-1.521827,2.890107,2.353127,9.281175,5.367196,-2.798104,2.252879
2024-08-01,2.336556,1.103867,1.349336,1.453744,5.220651,2.092249,-2.081485,2.131556
2024-09-01,1.788252,2.477588,1.386690,1.317303,2.407239,5.088851,0.723654,1.717074


In [4]:
# Summarise every asset's monthly returns (mean, std. dev., skew, kurtosis)
descriptive_stats = monthly_returns_df.pipe(calculate_descriptive_statistics)
descriptive_stats

Unnamed: 0,Mean,Std. Dev.,Skew,Kurtosis
USE,1.223233,4.144435,-0.444683,0.978387
UST,1.598465,5.009285,-0.273877,0.221408
USB,0.117528,1.868572,0.037398,0.112849
BB,0.152359,1.392522,-0.028788,1.141849
USR,0.800214,5.06462,-0.412084,1.2826
GC,0.351601,4.363482,0.188758,-0.083944
BC,0.039117,4.790996,-0.398905,0.449248
TSE,1.201381,4.2531,-0.459499,1.213137


# Model Construction

In [5]:
from CMSV import CMSV
from SMSV import SMSV
from SMSVEMA import SMSVEMA
from SMSVEMASO import SMSVEMASO

In [6]:
# Three empty tables (AD1, AD2, AD3) indexed by the dates from position 48
# onward; the model cells below add one column per ticker/model pair.
AD1_df, AD2_df, AD3_df = (
    pd.DataFrame(index=monthly_returns_df.index[48:]) for _ in range(3)
)
# Expected-return tables, keyed by "<model>_df"
return_dfs = dict()

## CMSV

In [7]:
# Fit CMSV to the two bond return series
bond_tickers = ['USB', 'BB']
return_dfs['CMSV_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

for bond_ticker in bond_tickers:
    returns = monthly_returns_df[bond_ticker]
    observation, dates = returns.values, returns.index

    # The model sees the full series; only the expected returns from
    # position 48 onward are stored.
    expected_returns, AD1, AD2, AD3 = CMSV(observation, L=100_000)

    return_dfs['CMSV_df'][f'{bond_ticker}_CMSV'] = expected_returns[48:]

    # One column per ticker in each of the three AD tables
    AD1_df[f'{bond_ticker}_CMSV_AD1'] = AD1
    AD2_df[f'{bond_ticker}_CMSV_AD2'] = AD2
    AD3_df[f'{bond_ticker}_CMSV_AD3'] = AD3

return_dfs['CMSV_df'].head()

100%|██████████| 154/154 [00:03<00:00, 39.77it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 40.75it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USB_CMSV,BB_CMSV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-12-01,0.164079,0.168106
2016-01-01,0.12921,0.151996
2016-02-01,0.211702,0.191068
2016-03-01,0.235114,0.195744
2016-04-01,0.21966,0.214544


## SMSV

In [8]:
# Every column that is not a bond series is handled by the SMSV-family models
other_tickers = [
    column for column in monthly_returns_df.columns if column not in bond_tickers
]

In [9]:
return_dfs['SMSV_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

# Plain SMSV on each non-bond series
for ticker in other_tickers:
    returns = monthly_returns_df[ticker]
    observation, dates = returns.values, returns.index

    # The model sees the full series; only the expected returns from
    # position 48 onward are stored.
    expected_returns, AD1, AD2, AD3 = SMSV(observation, L=10_000)

    return_dfs['SMSV_df'][f'{ticker}_SMSV'] = expected_returns[48:]

    # One column per ticker in each of the three AD tables
    AD1_df[f'{ticker}_SMSV_AD1'] = AD1
    AD2_df[f'{ticker}_SMSV_AD2'] = AD2
    AD3_df[f'{ticker}_SMSV_AD3'] = AD3

return_dfs['SMSV_df'].head()

100%|██████████| 154/154 [00:00<00:00, 295.60it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 331.64it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 330.15it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 331.26it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 337.97it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:00<00:00, 332.01it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USE_SMSV,UST_SMSV,USR_SMSV,GC_SMSV,BC_SMSV,TSE_SMSV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-12-01,1.434234,1.733166,1.27505,-0.93716,-1.597167,1.375487
2016-01-01,1.414003,1.485506,1.041005,-1.065335,-1.875417,1.049389
2016-02-01,1.226858,1.074246,0.987598,-0.83969,-1.890458,0.86694
2016-03-01,1.13061,1.333337,0.953949,0.085086,-1.240728,0.970811
2016-04-01,1.10253,1.92859,1.341621,-0.521699,-0.526596,1.431968


## SMSV_EMASO

In [10]:
return_dfs['SMSV_EMASO_df'] = pd.DataFrame(index=monthly_returns_df.index[48:])

# SMSV_EMASO on each non-bond series
for ticker in other_tickers:
    returns = monthly_returns_df[ticker]
    observation, dates = returns.values, returns.index

    # The model sees the full series; only the expected returns from
    # position 48 onward are stored.
    expected_returns, AD1, AD2, AD3 = SMSVEMASO(observation, L=100_000)

    return_dfs['SMSV_EMASO_df'][f'{ticker}_SMSV_EMASO'] = expected_returns[48:]

    # One column per ticker in each of the three AD tables
    AD1_df[f'{ticker}_SMSV_EMASO_AD1'] = AD1
    AD2_df[f'{ticker}_SMSV_EMASO_AD2'] = AD2
    AD3_df[f'{ticker}_SMSV_EMASO_AD3'] = AD3

return_dfs['SMSV_EMASO_df'].head()

100%|██████████| 154/154 [00:04<00:00, 33.51it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:04<00:00, 34.98it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:04<00:00, 35.85it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:04<00:00, 35.52it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:04<00:00, 36.01it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:04<00:00, 34.22it/s]
  log_likelihood = np.log(np.sum(α, axis=1))


Unnamed: 0_level_0,USE_SMSV_EMASO,UST_SMSV_EMASO,USR_SMSV_EMASO,GC_SMSV_EMASO,BC_SMSV_EMASO,TSE_SMSV_EMASO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-12-01,0.821341,1.535654,0.876453,-1.577238,-3.283176,0.754832
2016-01-01,0.303,0.948735,0.776141,-1.288891,-3.974937,0.158676
2016-02-01,-0.514239,-0.377834,0.199169,-0.007897,-4.030865,-0.756748
2016-03-01,-0.372035,-0.570653,0.094688,2.20393,-2.907382,-0.526168
2016-04-01,0.719396,0.819203,1.719615,1.223117,-1.049873,0.663436


## SMSV_EMA1~9

In [11]:
# Sweep β over 0.1, 0.2, ..., 0.9: one result table per setting (SMSV_EMA1 .. SMSV_EMA9)
for i in range(1, 10):
    # Empty table for this β, indexed by the dates from position 48 onward
    return_dfs[f"SMSV_EMA{i}_df"] = pd.DataFrame(index=monthly_returns_df.index[48:])

    # SMSVEMA on each non-bond series with β = i / 10
    for ticker in other_tickers:
        returns = monthly_returns_df[ticker]
        observation, dates = returns.values, returns.index

        # The model sees the full series; only the expected returns from
        # position 48 onward are stored.
        expected_returns, AD1, AD2, AD3 = SMSVEMA(observation, L=100_000, β=i / 10)

        return_dfs[f"SMSV_EMA{i}_df"][f'{ticker}_SMSV_EMA{i}'] = expected_returns[48:]

        # One column per ticker and β in each of the three AD tables
        AD1_df[f'{ticker}_SMSV_EMA{i}_AD1'] = AD1
        AD2_df[f'{ticker}_SMSV_EMA{i}_AD2'] = AD2
        AD3_df[f'{ticker}_SMSV_EMA{i}_AD3'] = AD3

return_dfs["SMSV_EMA1_df"].head()

100%|██████████| 154/154 [00:03<00:00, 45.27it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 46.85it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 46.99it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 47.09it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 47.04it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 44.62it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 44.43it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 45.42it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 44.79it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 45.65it/s]
  log_likelihood = np.log(np.sum(α, axis=1))
100%|██████████| 154/154 [00:03<00:00, 45.86it/s]


Unnamed: 0_level_0,USE_SMSV_EMA1,UST_SMSV_EMA1,USR_SMSV_EMA1,GC_SMSV_EMA1,BC_SMSV_EMA1,TSE_SMSV_EMA1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-12-01,0.80717,1.461945,0.790568,-1.13592,-2.671297,0.721954
2016-01-01,0.483095,1.119599,0.747103,-1.08597,-2.997303,0.380873
2016-02-01,-0.036164,0.346819,0.475336,-0.451233,-3.117448,-0.174505
2016-03-01,-0.007451,0.185306,0.40826,0.721522,-2.850307,-0.137451
2016-04-01,0.64412,0.78952,1.276363,0.535368,-2.116543,0.507535


In [13]:
# Display the combined AD1 table (one column per ticker/model pair) as a sanity check
AD1_df

Unnamed: 0_level_0,USB_CMSV_AD1,BB_CMSV_AD1,USE_SMSV_AD1,UST_SMSV_AD1,USR_SMSV_AD1,GC_SMSV_AD1,BC_SMSV_AD1,TSE_SMSV_AD1,USE_SMSV_EMASO_AD1,UST_SMSV_EMASO_AD1,...,USR_SMSV_EMA8_AD1,GC_SMSV_EMA8_AD1,BC_SMSV_EMA8_AD1,TSE_SMSV_EMA8_AD1,USE_SMSV_EMA9_AD1,UST_SMSV_EMA9_AD1,USR_SMSV_EMA9_AD1,GC_SMSV_EMA9_AD1,BC_SMSV_EMA9_AD1,TSE_SMSV_EMA9_AD1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2016-01-01,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-02-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-03-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-01,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-07-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-08-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-09-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
import os

# Folder that receives every exported CSV
output_folder = "Data"

# Create the folder up front: DataFrame.to_csv does not create missing
# directories, so the export would otherwise fail when "Data" is absent.
os.makedirs(output_folder, exist_ok=True)

# Export each DataFrame in return_dfs to "<key>.csv", keeping the Date index
for key, df in return_dfs.items():
    file_path = os.path.join(output_folder, f"{key}.csv")
    df.to_csv(file_path, index=True)

# Export the three AD tables next to the expected-return tables
AD1_df.to_csv(os.path.join(output_folder, 'AD1_df.csv'))
AD2_df.to_csv(os.path.join(output_folder, 'AD2_df.csv'))
AD3_df.to_csv(os.path.join(output_folder, 'AD3_df.csv'))

In [15]:
# Record the name of every exported table so the CSVs can be located again later
all_keys = [*return_dfs, 'AD1_df', 'AD2_df', 'AD3_df']
all_keys_df = pd.DataFrame({'key': all_keys})

# Write the list of names next to the exported tables
all_keys_df.to_csv(os.path.join(output_folder, 'all_keys_df.csv'))

# Run the cell below to reload the saved results from CSV

In [16]:
# Retrieve the saved expected returns and AD tables from disk
all_dfs = {}
all_keys_list = pd.read_csv('Data/all_keys_df.csv')['key'].values.tolist()

for key in all_keys_list:
    # Parse the Date column to datetimes and use it as the index
    all_dfs[key] = (
        pd.read_csv(f"Data/{key}.csv")
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
    )