## Modules

In [1]:
import os
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
 
from datetime import datetime, timedelta
from time import time
from itertools import product

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import yfinance as yf
from tdcalendar import *

## Historical return data collection

### Price data collection

In [2]:
def getYahooData(symbolList, adjust=True, startStr='1990-01-01', endStr='2046-12-31'):
    """Download daily data from Yahoo (via yfinance) and merge it into one wide dataframe.

    Parameters
    ----------
    symbolList : list of str
        Ticker symbols to download.
    adjust : bool, default True
        If True, scale open/high/low/close by (adjusted close / close), divide volume by the
        same factor, and round the result to 4 decimals. If False, keep the raw
        open/high/low/close/volume/dividend fields without rounding.
    startStr, endStr : str
        Date range passed straight to ``yf.download`` as ``start`` and ``end``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date``; one ``{symbol}_{field}`` column per symbol and field. Only dates
        present for every successfully prepared symbol are kept (inner join).

    Raises
    ------
    ValueError
        If no symbol at all could be prepared (e.g. the download failed).
    """
    symbolStr = ' '.join(symbolList)
    renameDict = {'Date': 'date', 'Open': 'op', 'High': 'hi', 'Low': 'lo', 'Close': 'cl',
                  'Volume': 'vol', 'Adj Close': 'adj_cl', 'Dividends': 'div', 'Stock Splits': 'split'}
    priceFields = ['op', 'hi', 'lo', 'cl']
    dataFields = priceFields + ['vol', 'div']

    try:
        dfData = yf.download(symbolStr, start=startStr, end=endStr, auto_adjust=False, actions=True,
                             group_by='Tickers', threads=16)
    except Exception as err:  # not a bare except: Ctrl-C / SystemExit must still propagate
        print(f'Failed downloading data: {err!r}')
        dfData = pd.DataFrame()

    dataDict = {}

    for symbol in symbolList:
        try:
            dfSymbol = dfData[(symbol,)].dropna()  # Raw data for the symbol
            # Filter bad data: keep rows that traded or that show a real high/low range
            isValid = (dfSymbol['Volume'] > 0) | (dfSymbol['High'] > dfSymbol['Low'])
            # Chained calls return new frames, so nothing is written onto a filtered slice
            dfSymbol = dfSymbol[isValid].reset_index().rename(columns=renameDict).set_index('date')

            if not adjust:
                dfSymbol = dfSymbol[dataFields].rename(
                    columns={field: f'{symbol}_{field}' for field in dataFields})
            else:
                adjfactor = dfSymbol['adj_cl'] / dfSymbol['cl']
                dfAdj = pd.DataFrame({f'{symbol}_{field}': dfSymbol[field] * adjfactor
                                      for field in priceFields})
                # Volume moves inversely to price under the adjustment
                dfAdj[f'{symbol}_vol'] = dfSymbol['vol'] / adjfactor
                dfSymbol = np.round(dfAdj, 4)

            dataDict[symbol] = dfSymbol
        except Exception:
            print(f'Failed preparing data for {symbol}.')

    if not dataDict:
        # pd.concat would raise the same exception type with a far less helpful message
        raise ValueError('No data could be prepared for any of the requested symbols.')

    dfAll = pd.concat(list(dataDict.values()), axis=1, join='inner')
    dfAll = dfAll.ffill()  # fillna(method='ffill') is deprecated in recent pandas

    return dfAll

In [3]:
# Universe of tickers analysed throughout the notebook
symbolList = ['SPY', 'QQQ', 'IWM', 'SOXX', 'AAPL', 'MSFT', 'NVDA', 'TSLA', 'BRK-B', 'BA']

# Download window; the data is requested with adjust=True
startStr, endStr = '2011-01-01', '2023-01-01'
dfAll = getYahooData(symbolList, adjust=True, startStr=startStr, endStr=endStr)

[*********************100%***********************]  10 of 10 completed


  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)


### Dictionaries of dataframes of open, close & log-change columns for the training and test periods

In [4]:
# Training window
startStr_train1 = '2019-07-01'
endStr_train1 = '2022-07-01'
startDate_train1 = datetime.strptime(startStr_train1, '%Y-%m-%d')
endDate_train1 = datetime.strptime(endStr_train1, '%Y-%m-%d')

# Test window; its start comes from the tdcalendar helper and is parsed below as 'YYYY-MM-DD'
startStr_test1 = getLatestTradingDay(holidayList_ny, endDate_train1, days=0, offset=0)
endStr_test1 = '2022-12-31'
startDate_test1 = datetime.strptime(startStr_test1, '%Y-%m-%d')
endDate_test1 = datetime.strptime(endStr_test1, '%Y-%m-%d')

dataDict_train1 = {}
dataDict_test1 = {}

for symbol in symbolList:
    # .copy() makes an independent frame, so adding the log-return column below
    # no longer triggers pandas' SettingWithCopyWarning
    dfSymbol = dfAll[[f'{symbol}_op', f'{symbol}_cl']].copy()
    dfSymbol[f'{symbol}_pct'] = np.log(dfSymbol[f'{symbol}_cl'] / dfSymbol[f'{symbol}_cl'].shift(1))
    # Label slices with .loc include both end points, so a date equal to both
    # endStr_train1 and startStr_test1 appears in both windows
    dataDict_train1[symbol] = dfSymbol.loc[startStr_train1:endStr_train1]
    dataDict_test1[symbol] = dfSymbol.loc[startStr_test1:endStr_test1]

print(dataDict_train1['NVDA'].tail(10), '\n')
print(dataDict_test1['NVDA'].tail(10))

             NVDA_op   NVDA_cl  NVDA_pct
date                                    
2022-06-17  156.3669  158.6853  0.017726
2022-06-21  164.6310  165.5403  0.042292
2022-06-22  162.1428  163.4818 -0.012513
2022-06-23  165.0707  162.1328 -0.008286
2022-06-24  164.8808  171.1363  0.054045
2022-06-27  172.9949  168.5681 -0.015120
2022-06-28  168.8779  159.7045 -0.054015
2022-06-29  158.0257  155.3077 -0.027917
2022-06-30  153.4990  151.4805 -0.024951
2022-07-01  148.8824  145.1251 -0.042861 

             NVDA_op   NVDA_cl  NVDA_pct
date                                    
2022-12-16  168.6114  165.6819 -0.022731
2022-12-19  165.6919  162.5124 -0.019315
2022-12-20  160.6127  160.8227 -0.010452
2022-12-21  161.1126  164.9820  0.025534
2022-12-22  160.9127  153.3640 -0.073022
2022-12-23  151.9342  152.0342 -0.008709
2022-12-27  150.7144  141.1860 -0.074027
2022-12-28  139.2464  140.3362 -0.006037
2022-12-29  143.9956  146.0052  0.039601
2022-12-30  143.3157  146.1152  0.000753


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfSymbol[f'{symbol}_pct'] = np.log(dfSymbol[f'{symbol}_cl'] / dfSymbol[f'{symbol}_cl'].shift(1))


### Descriptive Statistics of historical daily returns

In [5]:
# One column of summary statistics (count, mean, std, quantiles) per symbol,
# computed on that symbol's log-return column over the training window
dfStat_train = pd.DataFrame({
    symbol: df[f'{symbol}_pct'].describe()
    for symbol, df in dataDict_train1.items()
})

dfStat_train

Unnamed: 0,SPY,QQQ,IWM,SOXX,AAPL,MSFT,NVDA,TSLA,BRK-B,BA
count,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0
mean,0.000412,0.000568,0.000175,0.000739,0.001394,0.000912,0.001673,0.003595,0.000348,-0.001238
std,0.014985,0.017578,0.018954,0.023961,0.022159,0.020432,0.033142,0.044341,0.015375,0.03754
min,-0.115886,-0.127593,-0.142335,-0.165202,-0.137708,-0.159453,-0.203979,-0.236518,-0.100838,-0.272444
25%,-0.004945,-0.006911,-0.008644,-0.0108,-0.008537,-0.008609,-0.015402,-0.017704,-0.006363,-0.016361
50%,0.001302,0.001803,0.000999,0.001203,0.001478,0.001219,0.0028,0.002982,0.000624,-0.002076
75%,0.007544,0.009538,0.010466,0.01464,0.013506,0.011553,0.019618,0.024721,0.007286,0.013798
max,0.086731,0.081309,0.087545,0.1027,0.113158,0.132928,0.15834,0.181446,0.10984,0.217678


### Dummy dataframe of zero-return forward price time-series

In [6]:
# Trading days of the test window, as strings and as datetime objects
tradeStrList_test1 = getTradingDays(holidayList_ny, startDate_test1, endDate_test1)
tradeDateList_test1 = [datetime.strptime(dtStr, '%Y-%m-%d') for dtStr in tradeStrList_test1]

fwdDict = {}

for symbol, df in dataDict_train1.items():
    # Close on the first test date, read from the training frame
    latestPrice = df.loc[startStr_test1, f'{symbol}_cl']
    # Build the flat (zero-return) path in one shot instead of enlarging the frame cell by cell
    dfFwd = pd.DataFrame({'pct-0': 0.0, 'cl-0': latestPrice},
                         index=pd.DatetimeIndex(tradeDateList_test1))
    fwdDict[symbol] = dfFwd

fwdDict['NVDA'].head(10)

Unnamed: 0,pct-0,cl-0
2022-07-01,0.0,145.1251
2022-07-05,0.0,145.1251
2022-07-06,0.0,145.1251
2022-07-07,0.0,145.1251
2022-07-08,0.0,145.1251
2022-07-11,0.0,145.1251
2022-07-12,0.0,145.1251
2022-07-13,0.0,145.1251
2022-07-14,0.0,145.1251
2022-07-15,0.0,145.1251


## Monte-Carlo Resampling (MCR)

### Construction of 1x leverage forward time series by MCR

In [7]:
def getSim_mcr1x(dfData, dateList, startPrice=1.0, numSim=400, symbol=None, seed=None):
    """Simulate forward price paths by Monte-Carlo resampling (MCR) of historical log-returns.

    Parameters
    ----------
    dfData : pandas.DataFrame
        Historical data containing a ``{symbol}_pct`` column of log-returns.
    dateList : list
        Forward dates; they become the index of the result. The first date is the anchor
        (return 0, price ``startPrice``).
    startPrice : float, default 1.0
        Price at the first forward date, used for every path.
    numSim : int, default 400
        Number of simulated paths.
    symbol : str, optional
        Symbol prefix of the return column. If omitted, the single column ending in
        ``_pct`` is used.
    seed : int, optional
        If given, draws come from a dedicated ``np.random.RandomState(seed)``; otherwise
        NumPy's global random state is used (so ``np.random.seed`` still applies).

    Returns
    -------
    pandas.DataFrame
        Columns ``pct-0, cl-0`` (flat zero-return path at ``startPrice``) followed by
        ``pct-k, cl-k`` for k = 1..numSim.

    Raises
    ------
    ValueError
        If the return column cannot be identified, or there are no dates or no returns.
    """
    # Locate the return column from the arguments rather than from a notebook-level global
    if symbol is not None:
        pctCol = f'{symbol}_pct'
    else:
        pctCols = [col for col in dfData.columns if str(col).endswith('_pct')]
        if len(pctCols) != 1:
            raise ValueError(f'Expected exactly one *_pct column, found {pctCols}; pass symbol explicitly.')
        pctCol = pctCols[0]

    # Historical return array; NaNs (e.g. the first row of a shifted series) would poison a path
    pctArray = dfData[pctCol].dropna().to_numpy()
    numData = len(pctArray)
    if numData == 0 or len(dateList) == 0:
        raise ValueError('Need at least one historical return and one forward date.')

    rng = np.random if seed is None else np.random.RandomState(seed)
    index = pd.Index(dateList)
    lenPeriod = len(index) - 1

    # Dummy zero-return path, anchored at startPrice
    columns = {'pct-0': np.zeros(len(index)),
               'cl-0': np.full(len(index), float(startPrice))}

    # MCR simulations: resample historical returns with replacement
    for num in range(1, numSim + 1):
        ordinalArray = rng.randint(numData, size=lenPeriod)
        fwdPctArray = np.concatenate(([0.0], pctArray[ordinalArray]))
        columns[f'pct-{num}'] = fwdPctArray
        columns[f'cl-{num}'] = startPrice * np.exp(np.cumsum(fwdPctArray))

    # Building the frame once avoids fragmenting it with hundreds of single-column inserts
    return pd.DataFrame(columns, index=index)

def fitActualvsMCR(dfSim, dfData, dateList, symbol=None):
    """Write the actual price path into the ``cl-0`` / ``pct-0`` columns of an MCR dataframe.

    Parameters
    ----------
    dfSim : pandas.DataFrame
        MCR dataframe; it is modified in place and also returned.
    dfData : pandas.DataFrame
        Actual data containing ``{symbol}_cl`` and ``{symbol}_pct`` columns, indexed by date.
    dateList : list
        Dates to take from ``dfData``; every date must exist in its index.
    symbol : str, optional
        Symbol prefix of the columns. If omitted, the single column with each suffix is used.

    Returns
    -------
    pandas.DataFrame
        The same ``dfSim`` object with ``cl-0`` and ``pct-0`` overwritten.

    Raises
    ------
    ValueError
        If ``symbol`` is omitted and a suffix does not match exactly one column.
    """
    def _pickColumn(suffix):
        # Resolve the column from the arguments rather than from a notebook-level global
        if symbol is not None:
            return f'{symbol}_{suffix}'
        matches = [col for col in dfData.columns if str(col).endswith(f'_{suffix}')]
        if len(matches) != 1:
            raise ValueError(f'Expected exactly one *_{suffix} column, found {matches}; pass symbol explicitly.')
        return matches[0]

    dfSim['cl-0'] = dfData.loc[dateList, _pickColumn('cl')]
    dfSim['pct-0'] = dfData.loc[dateList, _pickColumn('pct')]

    return dfSim

In [8]:
# Seed NumPy's global random state so that Restart & Run All reproduces the same simulated paths
MCR_SEED = 42
np.random.seed(MCR_SEED)

for symbol in symbolList:
    dfData_train = dataDict_train1[symbol]
    dfData_test = dataDict_test1[symbol]
    # Simulated paths start from the last close of the training window
    fwdDict[symbol] = getSim_mcr1x(dfData_train, tradeDateList_test1, dfData_train[f'{symbol}_cl'].iloc[-1]) 
    # Replace the dummy path 0 with the realised test-period prices and returns
    fwdDict[symbol] = fitActualvsMCR(fwdDict[symbol], dfData_test, tradeDateList_test1)
    print(f'{symbol} MCR done.')

SPY MCR done.
QQQ MCR done.
IWM MCR done.
SOXX MCR done.
AAPL MCR done.
MSFT MCR done.
NVDA MCR done.
TSLA MCR done.
BRK-B MCR done.
BA MCR done.


In [9]:
fwdDict['NVDA'].iloc[-10:, :12]

Unnamed: 0,pct-0,cl-0,pct-1,cl-1,pct-2,cl-2,pct-3,cl-3,pct-4,cl-4,pct-5,cl-5
2022-12-16,-0.022731,165.6819,0.076561,224.399388,-0.01207,242.279568,0.023341,151.290491,-0.036598,280.961082,-0.06305,167.81331
2022-12-19,-0.019315,162.5124,0.030117,231.26049,-0.006593,240.687514,0.025903,155.260502,-0.011959,277.621112,-0.031908,162.543242
2022-12-20,-0.010452,160.8227,-0.000357,231.178037,-0.057274,227.289651,-0.019753,152.223739,-0.034292,268.262424,-0.015098,160.107623
2022-12-21,0.025534,164.982,-0.076099,214.238266,0.025635,233.191574,-0.012181,150.380681,0.003596,269.228797,0.043464,167.220001
2022-12-22,-0.073022,153.364,-0.01925,210.153599,0.04002,242.713056,-0.035939,145.072185,-0.01288,265.783351,-0.016404,164.499349
2022-12-23,-0.008709,152.0342,-0.045027,200.900837,-0.033783,234.650503,0.030117,149.507826,0.012807,269.209059,-0.007507,163.269084
2022-12-27,-0.074027,141.186,-0.005375,199.823798,0.012447,237.589544,-0.000402,149.447785,-0.073321,250.176618,-0.007417,162.062643
2022-12-28,-0.006037,140.3362,0.036662,207.285776,0.113706,266.200665,0.011348,151.153452,0.010419,252.796938,0.019402,165.237696
2022-12-29,0.039601,146.0052,0.010263,209.424036,-0.018708,261.266891,-0.003492,150.626503,-0.018829,248.081488,-0.057586,155.991154
2022-12-30,0.000753,146.1152,-0.005179,208.342183,-0.031969,253.046514,0.052383,158.727026,0.04141,258.570112,0.002315,156.352742


### Descriptive Statistics of the simulated log-return groups

In [12]:
# Number of simulated paths per symbol whose close columns are read below
numSim = 400
simCloseCols = [f'cl-{n}' for n in range(1, numSim + 1)]

statDict = {}
for symbol, df in fwdDict.items():
    # Log-return of every simulated close relative to the first value of path 0
    startClose = df['cl-0'].iloc[0]
    dfPrice_mcr = np.log(df[simCloseCols] / startClose)
    # Final-date log-returns, one row per simulation
    dfFinal_mcr = dfPrice_mcr.iloc[[-1]].transpose()
    statDict[symbol] = dfFinal_mcr.describe().iloc[:, 0]

dfStat_mcr = pd.DataFrame(statDict)

dfStat_mcr

Unnamed: 0,SPY,QQQ,IWM,SOXX,AAPL,MSFT,NVDA,TSLA,BRK-B,BA
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.041648,0.071015,0.029743,0.095804,0.20077,0.08869,0.245831,0.447462,0.050974,-0.178716
std,0.161728,0.18812,0.218802,0.287722,0.261366,0.225049,0.380032,0.514059,0.164504,0.433342
min,-0.384603,-0.556582,-0.659779,-0.929157,-0.688959,-0.486016,-0.783192,-1.107559,-0.425012,-1.353794
25%,-0.057486,-0.042044,-0.125763,-0.090179,-0.000354,-0.075097,-0.002345,0.115971,-0.06766,-0.440074
50%,0.045039,0.084343,0.026401,0.082236,0.200411,0.094486,0.256097,0.466678,0.053615,-0.191209
75%,0.150331,0.204305,0.190285,0.311184,0.383021,0.251819,0.491867,0.778295,0.164179,0.122194
max,0.491995,0.53944,0.594648,0.815753,0.897253,0.824128,1.306379,2.482673,0.45637,0.949012


### Visualize the simulated equity curves and the actual curve

In [None]:
def plotMCRcurves(dfSim):
    """"""