In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../src')
from utils.get_prices import *

In [2]:
# Root data directory, given as a relative path.
DATA_PATH = Path('../data')

# Sub-directories for basket files, raw inputs and processed outputs.
DATA_PATH_BASKETS, DATA_PATH_RAW, DATA_PATH_PROCESSED = (
    DATA_PATH / subdir for subdir in ('baskets', 'raw', 'processed')
)

In [70]:
# Name of the basket; also reused when building the price-cache filename.
basket_name = 'scorecard_single_ticker'
basket_path = DATA_PATH_BASKETS / (basket_name + '.csv')

# The basket file is read as headerless and labelled with one 'Ticker' column.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the columns axis after the read is the replacement. Limiting
# the squeeze to 'columns' keeps the result a Series even for a one-row file.
tickers = pd.read_csv(basket_path, header=None, names=['Ticker']).squeeze('columns')


In [71]:
# Display the first rows of `tickers` as a quick check that the basket loaded.
tickers.head()

0       A
1     AAL
2     AAP
3    AAPL
4    ABBV
Name: Ticker, dtype: object

In [72]:
%%time

start = '1980-01-01'
end = '2020-12-23'

timestamp = start.replace('-', '') + '_' + end.replace('-', '')

price_filename = '_'.join(['prc', basket_name, timestamp]) + '.csv'
price_filepath = DATA_PATH_RAW / price_filename

if Path(price_filepath).exists():
    print("Found existing price file. Reading...")
    prices = pd.read_csv(price_filepath, header=[0, 1], index_col=0)
    print("Prices read from: ", price_filepath)
else:
    prices = get_prices(tickers, start, end,
                        types=['Adj Close', 'High'],
                        out_path=price_filepath,
                        sort_tks=True)

Downloading prices from Yahoo...
Gaps found and filled in  ('Adj Close', 'AAPL')  :
['19810810']
Gaps found and filled in  ('Adj Close', 'ARNC')  :
['20201001', '20201002']
Gaps found and filled in  ('Adj Close', 'LUMN')  :
['20200625', '20200626', '20200629', '20200630', '20200701', '20200702', '20200706', '20200707', '20200708', '20200709', '20200710', '20200713', '20200714', '20200715', '20200716', '20200717', '20200720', '20200721', '20200722', '20200723', '20200724', '20200727', '20200728', '20200729', '20200730', '20200731', '20200803', '20200804', '20200805', '20200806', '20200807', '20200810', '20200811', '20200812', '20200813', '20200814', '20200817', '20200818', '20200819', '20200820', '20200821', '20200824', '20200825', '20200826', '20200827', '20200828', '20200831', '20200901', '20200902', '20200903', '20200904', '20200908', '20200909', '20200910', '20200911', '20200914', '20200915', '20200916', '20200917']
Gaps found and filled in  ('Adj Close', 'SIVB')  :
['19871124']
Gap

In [73]:
# Display the first rows of the price frame to inspect its column layout.
prices.head()

Attributes,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,High,High,High,High,High,High,High,High,High,High
Symbols,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ACWI,...,YUM,Z,ZBH,ZBRA,ZEN,ZION,ZM,ZS,ZTS,ZUO
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1980-01-02,,,,,,,,,,,...,,,,,,,,,,
1980-01-03,,,,,,,,,,,...,,,,,,,,,,
1980-01-04,,,,,,,,,,,...,,,,,,,,,,
1980-01-07,,,,,,,,,,,...,,,,,,,,,,
1980-01-08,,,,,,,,,,,...,,,,,,,,,,


In [74]:
# Split the two-level price frame into one frame per price field:
# adjusted close and daily high.
prices_ad, prices_hi = (prices[field] for field in ('Adj Close', 'High'))

In [78]:
# Backtest parameters.
window_size = 22                # return horizon, in rows of the price frame
thld_rtn = 0.1                  # return threshold (0.1 == 10%)
backtest_start = '2020-07-01'   # first date included in the backtest

# 'YYYYMMDD_YYYYMMDD' tag for output file names; `end` is the price-history
# end date defined in the price-loading cell.
timestamp_backtest = '_'.join(
    day.replace('-', '') for day in (backtest_start, end)
)

In [80]:
# Percentage change of every adjusted-close column versus its value
# `window_size` rows earlier.
roll_rtns = prices_ad.pct_change(periods=window_size)

# Show the most recent rows.
roll_rtns.tail()

Symbols,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ACWI,...,YUM,Z,ZBH,ZBRA,ZEN,ZION,ZM,ZS,ZTS,ZUO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-17,0.078897,0.313526,0.06308,0.069825,0.066389,-0.094212,0.033877,-0.043019,0.07969,0.043985,...,0.043138,0.294189,0.007589,0.08512,0.144713,0.053333,0.006188,0.406231,-0.015669,0.23285
2020-12-18,0.085928,0.3,0.073264,0.060893,0.052817,-0.059872,0.086912,-0.036175,0.091994,0.041752,...,0.059915,0.284438,0.011322,0.094516,0.114138,0.037374,0.010906,0.433548,-0.010881,0.174677
2020-12-21,0.111656,0.263736,0.085342,0.086419,0.050547,-0.055781,0.11119,-0.015305,0.090477,0.043912,...,0.047503,0.334116,0.002767,0.113544,0.119634,0.089625,-0.014625,0.474516,-0.016269,0.176682
2020-12-22,0.07996,0.210321,0.092876,0.111598,0.031404,-0.050834,0.117217,-0.024505,0.0669,0.036859,...,0.024821,0.284684,0.029346,0.060186,0.128243,0.095077,-0.012769,0.492565,-0.026938,0.222707
2020-12-23,0.057805,0.268156,0.105364,0.116073,0.024197,-0.032053,0.133627,-0.031022,0.057509,0.044283,...,0.021718,0.265956,0.020831,0.06972,0.128171,0.136625,-0.126752,0.450738,-0.038207,0.193772


In [118]:
# Align every date with its forward return: after shifting up by `window_size`
# rows, row t holds the return from t to t + window_size. Then keep only the
# backtest period and drop incomplete rows (including the trailing rows that
# have no forward return yet).
# NOTE(review): dropna() removes a date when ANY ticker is NaN on it, so one
# ticker with missing history shrinks the sample for all tickers -- confirm
# that a common sample across tickers is intended.
roll_rtns_shifted = roll_rtns.shift(-window_size).loc[backtest_start:].dropna()

# Per ticker: number of backtest dates whose forward return reached the
# threshold. DataFrame.sum() replaces `.apply(sum, axis=0)`, which passed
# Python's builtin `sum` through apply; the resulting counts are the same.
count = (roll_rtns_shifted >= thld_rtn).sum()
prob_rtn_over_thld_over_window = count / len(roll_rtns_shifted)
prob_rtn_over_thld_over_window.name = 'Probability'

# save results to file
filename = '_'.join(
    ['prob_rtn_over', format(thld_rtn, '.2f'),
     'over', str(window_size)+'d', timestamp_backtest])
filepath = DATA_PATH_PROCESSED / (filename + '.csv')
# Create the output directory on first use so to_csv does not fail on a
# fresh checkout.
filepath.parent.mkdir(parents=True, exist_ok=True)
prob_rtn_over_thld_over_window.to_csv(filepath)