## Modules

In [1]:
import os
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
 
from datetime import datetime, timedelta
from time import time
from itertools import product

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import yfinance as yf
from tdcalendar import *

## Historical return data collection

### Price data collection

In [2]:
def getYahooData(symbolList, adjust=True, startStr='1990-01-01', endStr='2046-12-31'):
    """Scrape via yahoo API to obtain data for a symbolList."""
    symbolStr = ' '.join(symbolList)
    renameDict = {'Date': 'date', 'Open': 'op', 'High': 'hi', 'Low': 'lo', 'Close': 'cl',
                  'Volume': 'vol', 'Adj Close': 'adj_cl', 'Dividends': 'div', 'Stock Splits': 'split'}
    priceFields = ['op', 'hi', 'lo', 'cl']
    dataFields = priceFields + ['vol', 'div']

    dataDict = {}

    try:
        dfData = yf.download(symbolStr, start=startStr, end=endStr, auto_adjust=False, actions=True,
                             group_by='Tickers', threads=16)
    except:
        dfData = pd.DataFrame()

    for symbol in symbolList:
        try:
            dfSymbol = dfData[(symbol,)].dropna()  # Raw data for the symbol
            dfSymbol = dfSymbol[(dfSymbol['Volume'] > 0) | (dfSymbol['High'] > dfSymbol['Low'])]  # Filter bad data
            dfSymbol.reset_index(inplace=True)
            dfSymbol.rename(columns=renameDict, inplace=True)
            dfSymbol.set_index('date', inplace=True)

            if not adjust:
                dfSymbol = dfSymbol[dataFields]
                dfSymbol.rename(columns={field: f'{symbol}_{field}' for field in dataFields}, inplace=True)
            else:
                adjfactor = dfSymbol['adj_cl'] / dfSymbol['cl']
                for field in priceFields:
                    dfSymbol[f'adj_{field}'] = dfSymbol[field] * adjfactor
                dfSymbol['adj_vol'] = dfSymbol['vol'] / adjfactor
                dfSymbol = dfSymbol[[f'adj_{field}' for field in dataFields[:-1]]]
                dfSymbol.rename(columns={f'adj_{field}': f'{symbol}_{field}' for field in dataFields[:-1]}, inplace=True)
                dfSymbol = np.round(dfSymbol, 4)

            dataDict[symbol] = dfSymbol
        except:
            print(f'Failed preparing data for {symbol}.')

    dfAll = pd.concat(dataDict.values(), axis=1, join='inner')
    dfAll.fillna(method='ffill', inplace=True)

    return dfAll

In [3]:
symbolList = ['SPY', 'QQQ', 'IWM', 'SOXX', 'AAPL', 'MSFT', 'NVDA', 'TSLA', 'BRK-B', 'BA']

startStr = '2011-01-01'
endStr = '2023-01-01'
dfAll = getYahooData(symbolList, True, startStr, endStr)

[*********************100%***********************]  10 of 10 completed


  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)
  return runner(coro)


### Dictionary of dataframes of open, close & log-change columns in training period

In [4]:
startStr_train1 = '2019-07-01'
endStr_train1 = '2022-07-01'
startDate_train1 = datetime.strptime(startStr_train1, '%Y-%m-%d')
endDate_train1 = datetime.strptime(endStr_train1, '%Y-%m-%d')

startStr_test1 = getLatestTradingDay(holidayList_ny, endDate_train1, days=0, offset=0)
endStr_test1 = '2022-12-31'
startDate_test1 = datetime.strptime(startStr_test1, '%Y-%m-%d')
endDate_test1 = datetime.strptime(endStr_test1, '%Y-%m-%d')

dataDict_train1 = {}
dataDict_test1 = {}

for symbol in symbolList:
    dfSymbol = dfAll[[f'{symbol}_op', f'{symbol}_cl']]
    dfSymbol[f'{symbol}_pct'] = np.log(dfSymbol[f'{symbol}_cl'] / dfSymbol[f'{symbol}_cl'].shift(1))
    dataDict_train1[symbol] = dfSymbol.loc[startStr_train1:endStr_train1]
    dataDict_test1[symbol] = dfSymbol.loc[startStr_test1:endStr_test1]
    
print(dataDict_train1['NVDA'].tail(10), '\n')
print(dataDict_test1['NVDA'].tail(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfSymbol[f'{symbol}_pct'] = np.log(dfSymbol[f'{symbol}_cl'] / dfSymbol[f'{symbol}_cl'].shift(1))


             NVDA_op   NVDA_cl  NVDA_pct
date                                    
2022-06-17  156.3669  158.6853  0.017726
2022-06-21  164.6310  165.5403  0.042292
2022-06-22  162.1428  163.4818 -0.012513
2022-06-23  165.0706  162.1328 -0.008286
2022-06-24  164.8808  171.1363  0.054045
2022-06-27  172.9949  168.5681 -0.015120
2022-06-28  168.8779  159.7045 -0.054015
2022-06-29  158.0257  155.3077 -0.027917
2022-06-30  153.4990  151.4805 -0.024951
2022-07-01  148.8824  145.1251 -0.042861 

             NVDA_op   NVDA_cl  NVDA_pct
date                                    
2022-12-16  168.6114  165.6819 -0.022731
2022-12-19  165.6919  162.5124 -0.019315
2022-12-20  160.6127  160.8227 -0.010452
2022-12-21  161.1126  164.9820  0.025534
2022-12-22  160.9127  153.3640 -0.073022
2022-12-23  151.9342  152.0342 -0.008709
2022-12-27  150.7144  141.1860 -0.074027
2022-12-28  139.2464  140.3362 -0.006037
2022-12-29  143.9956  146.0052  0.039601
2022-12-30  143.3157  146.1152  0.000753


### Descriptive Statistics of historical daily returns

In [5]:
dfStat_train = pd.DataFrame()

for symbol, df in dataDict_train1.items():
    dfStat_train[symbol] = df.describe()[f'{symbol}_pct']
    
dfStat_train

Unnamed: 0,SPY,QQQ,IWM,SOXX,AAPL,MSFT,NVDA,TSLA,BRK-B,BA
count,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0,758.0
mean,0.000412,0.000568,0.000175,0.000739,0.001394,0.000912,0.001673,0.003595,0.000348,-0.001238
std,0.014985,0.017578,0.018954,0.023961,0.022159,0.020432,0.033142,0.044341,0.015375,0.03754
min,-0.115886,-0.127593,-0.142335,-0.165202,-0.137708,-0.159454,-0.203979,-0.236518,-0.100838,-0.272444
25%,-0.004945,-0.006912,-0.008644,-0.0108,-0.008537,-0.00861,-0.015402,-0.017704,-0.006363,-0.016361
50%,0.001302,0.001803,0.000999,0.001203,0.001478,0.001219,0.0028,0.002982,0.000624,-0.002076
75%,0.007545,0.009538,0.010466,0.01464,0.013506,0.011553,0.019618,0.024721,0.007286,0.013798
max,0.086731,0.081309,0.087545,0.102701,0.113158,0.132929,0.15834,0.181446,0.10984,0.217678


### Dummy dataframe of zero-return forward price time-series

In [6]:
tradeStrList_test1 = getTradingDays(holidayList_ny, startDate_test1, endDate_test1)
tradeDateList_test1 = [datetime.strptime(dtStr, '%Y-%m-%d') for dtStr in tradeStrList_test1]

fwdDict = {}

for symbol, df in dataDict_train1.items():
    dfFwd = pd.DataFrame()
    latestPrice = df.loc[startStr_test1, f'{symbol}_cl']
    for date in tradeDateList_test1:
        dfFwd.loc[date, f'pct-0'] = 0
        dfFwd.loc[date, f'cl-0'] = latestPrice
    fwdDict[symbol] = dfFwd
        
fwdDict['NVDA'].head(10)

Unnamed: 0,pct-0,cl-0
2022-07-01,0.0,145.1251
2022-07-05,0.0,145.1251
2022-07-06,0.0,145.1251
2022-07-07,0.0,145.1251
2022-07-08,0.0,145.1251
2022-07-11,0.0,145.1251
2022-07-12,0.0,145.1251
2022-07-13,0.0,145.1251
2022-07-14,0.0,145.1251
2022-07-15,0.0,145.1251


## Monte-Carlo Resampling (MCR)

### Construction of 1x leverage forward time series by MCR

In [7]:
numSim = 400
len_testPeriod1 = len(tradeDateList_test1) - 1

startTime = time()
for symbol, df in dataDict_train1.items():
    pctArray = np.array(df[f'{symbol}_pct'])
    numData = len(pctArray)
    latestPrice = df[f'{symbol}_cl'].iloc[-1]
    

    for num in range(numSim):
        ordinalArray = np.random.randint(numData, size=len_testPeriod1)
        fwdPctArray = [0] + [pctArray[n] for n in ordinalArray]
        fwdDict[symbol][f'pct-{num + 1}'] = fwdPctArray
        fwdDict[symbol][f'cl-{num + 1}'] = latestPrice * np.exp(fwdDict[symbol][f'pct-{num + 1}'].cumsum())
        
endTime = time()
print(f'Time elapsed for {numSim} simulations forward time series: {round(endTime - startTime, 4)}s')    

Time elapsed for 400 simulations forward time series: 6.9644s


In [8]:
fwdDict['NVDA'].iloc[-10:, :12]

Unnamed: 0,pct-0,cl-0,pct-1,cl-1,pct-2,cl-2,pct-3,cl-3,pct-4,cl-4,pct-5,cl-5
2022-12-16,0.0,145.1251,0.024076,202.081346,0.026553,292.09344,0.018852,297.665768,0.005166,216.46785,-0.012977,238.394743
2022-12-19,0.0,145.1251,0.001254,202.33499,-0.048553,278.250323,0.003347,298.663651,-0.045382,206.863729,-0.007886,236.522058
2022-12-20,0.0,145.1251,-0.009848,200.352197,0.034522,288.023928,0.003755,299.787221,0.00532,207.967131,0.044758,247.348916
2022-12-21,0.0,145.1251,-0.071562,186.515532,0.049498,302.639418,-0.019049,294.130504,0.038855,216.206812,-0.038987,237.890985
2022-12-22,0.0,145.1251,-0.066698,174.481162,0.011348,306.09348,0.000604,294.3081,0.037347,224.434181,-0.097365,215.820564
2022-12-23,0.0,145.1251,0.017954,177.64205,0.003526,307.174819,0.0011,294.631939,-0.006479,222.984862,0.00813,217.582327
2022-12-27,0.0,145.1251,-0.008701,176.103037,-0.130528,269.586517,0.05622,311.670612,0.011743,225.618857,-0.009127,215.605447
2022-12-28,0.0,145.1251,0.010419,177.947519,-0.011411,266.52786,0.05622,329.694637,-0.040235,216.721357,-0.130528,189.222286
2022-12-29,0.0,145.1251,-0.0062,176.847613,-0.036497,256.97575,-0.048553,314.069496,0.033503,224.105202,-0.014586,186.482273
2022-12-30,0.0,145.1251,-0.007955,175.446318,-0.039278,247.077979,-0.017686,308.563587,0.079272,242.593633,-0.024313,182.002962


### Descriptive Statistics of the simulated log-return groups

In [9]:
dfStat_mcr = pd.DataFrame()

for symbol, df in fwdDict.items():
    dfPrice_mcr = np.log(df[[f'cl-{n + 1}' for n in range(numSim)]] / df['cl-0'].iloc[0])
    dfFinal_mcr = dfPrice_mcr.iloc[[-1]].transpose()
    dfStat_mcr[symbol] = dfFinal_mcr.describe().iloc[:, 0]
    
dfStat_mcr

Unnamed: 0,SPY,QQQ,IWM,SOXX,AAPL,MSFT,NVDA,TSLA,BRK-B,BA
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.054586,0.085588,0.007055,0.096775,0.175576,0.108053,0.196977,0.45872,0.043482,-0.124899
std,0.155404,0.210483,0.22331,0.282117,0.260221,0.220243,0.378615,0.466207,0.177456,0.439606
min,-0.430297,-0.458084,-0.687077,-0.764198,-0.639436,-0.52012,-1.027893,-0.763785,-0.468571,-1.440007
25%,-0.041109,-0.057774,-0.141115,-0.096438,-0.015597,-0.03881,-0.036349,0.188627,-0.076237,-0.423566
50%,0.050855,0.07938,0.004715,0.104803,0.184716,0.113229,0.212813,0.471474,0.047711,-0.10996
75%,0.153482,0.228636,0.167655,0.293069,0.358116,0.242169,0.431104,0.747828,0.161263,0.176871
max,0.558392,0.693818,0.582805,0.840822,0.828078,0.742025,1.173594,1.99262,0.54633,1.060853


### Compare the actual return data in testing set

In [None]:
for symbol in symbolList:
    fwdDict[symbol]['cl-0'] = dataDict_test1[symbol].loc[startStr_test1:endStr_test1, f'{symbol}_cl']
    fwdDict[symbol]['pct-0'] = dataDict_test1[symbol].loc[startStr_test1:endStr_test1, f'{symbol}_pct']
    
fwdDict['NVDA'].iloc[-10:, :12]