In [1]:
#
import os
import sys
import copy
from pathlib import Path
from typing import Dict

sys.path.append("./src/")
sys.path.append("./")
sys.path.append("../")
sys.path.append("../../")
sys.path.append("../../../")

import pandas as pd

from finrl import config_tickers
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split

#
from configuration.settings import ProjectDir, ExperimentDir
from rl.experiments._2_ta_orig.Env import StockPortfolioEnv
from rl.experiments._2_ta_orig.train import dataset_name, base_cols, data_cols, features_cols
from rl.experiments.common.classes import Program
from rl.experiments.common.utils import ignore_warnings, load_all_initial_symbol_data
from rl.data.CompanyInfo import CompanyInfo



In [2]:
# Project helper imported from rl.experiments.common.utils; presumably silences
# library warnings for the rest of the notebook — confirm against its definition.
ignore_warnings()

In [3]:
# Date boundaries (ISO "YYYY-MM-DD" strings) for the train / test windows.
# Only _TRAIN_DATA_START is referenced in the cells visible here.
_TRAIN_DATA_START = "2010-01-01"
# NOTE(review): named *_DATE_END while the other three constants use *_DATA_*;
# looks like a typo, left unchanged in case other code imports this name.
_TRAIN_DATE_END = "2021-12-31"
# NOTE(review): the test window (2021-01-01 .. 2021-12-31) lies entirely inside
# the train window above; confirm the overlap is intended before using these
# constants to split the data.
_TEST_DATA_START = "2021-01-01"
_TEST_DATA_END = "2021-12-31"

In [5]:
#
# Project / experiment directory configuration.
# The project root can be overridden with the AI_INVESTING_ROOT environment
# variable so the notebook also runs on machines other than the author's; when
# the variable is unset, the original absolute path is used unchanged.
_project_root = Path(
    os.environ.get("AI_INVESTING_ROOT", "/Users/zlapik/my-drive-zlapik/0-todo/ai-investing")
)
program = Program(
    prj_dir=ProjectDir(root=_project_root),
    # The experiment directory is the directory the notebook is started from.
    exp_dir=ExperimentDir(root=Path(os.getcwd())),
    DEBUG=False,
)
program.exp_dir.check_and_create_dirs()

# Ticker universe: the Dow 30 list without "DOW".
# Building a filtered copy leaves config_tickers.DOW_30_TICKER untouched and,
# unlike list.remove(), does not raise if "DOW" is missing from the list.
TICKERS = [tic for tic in config_tickers.DOW_30_TICKER if tic != "DOW"]  # TODO: I don't have all necessary data for "DOW"

In [6]:
# Echo the effective configuration, one value per line: ticker universe,
# project and experiment roots, then the three column groups.
for _value in (
    TICKERS,
    program.prj_dir.root,
    program.exp_dir.root,
    base_cols,
    data_cols,
    features_cols,
):
    print(_value)

['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO', 'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ', 'V', 'WBA', 'WMT', 'DIS']
/Users/zlapik/my-drive-zlapik/0-todo/ai-investing
/Users/zlapik/my-drive-zlapik/0-todo/ai-investing/src/rl/experiments/_2_ta_orig
['date', 'tic']
['open', 'high', 'low', 'close', 'volume']
['macd', 'boll_ub', 'boll_lb', 'rsi_30', 'cci_30', 'dx_30', 'close_30_sma', 'close_60_sma']


In [7]:
# Load the data for every symbol in TICKERS from program.prj_dir.data.tickers.
# Per the annotation the result maps a key (used as the ticker symbol below) to
# a CompanyInfo; the loader's on-disk format is not visible here.
tickers_data: Dict[str, CompanyInfo] = load_all_initial_symbol_data(TICKERS, program.prj_dir.data.tickers)

In [8]:
# Build one long-format price table (one row per date and ticker) from the
# per-ticker frames in tickers_data.

# Raw string: "\d" inside a plain string literal is an invalid escape sequence.
date_pattern = r"\d{4}-\d{2}-\d{2}"

frames = []
for k, v in tickers_data.items():
    # Prices: take an explicit copy so the "tic" column is added to our own
    # frame and never to (a view of) v.data_detailed.
    data = v.data_detailed[data_cols].copy()
    data.insert(0, "tic", k)

    # Fill before or forward (bfill first, then ffill for trailing gaps).
    # DataFrame.bfill()/ffill() replace the deprecated fillna(method=...).
    data = data.bfill().ffill()

    # Clean: keep only rows whose index label contains a YYYY-MM-DD date.
    # na=False treats a missing label as "not a date" instead of failing.
    is_date = data.index.str.contains(date_pattern, na=False)
    frames.append(data[is_date])

# Concatenate once instead of growing the frame inside the loop. The original
# code prepended every ticker, so reversing the list reproduces its row order.
df = pd.concat(frames[::-1]) if frames else pd.DataFrame()

# Expose the date (so far the index) as a column, order by it, and replace the
# index with an integer code per distinct date (0 for the earliest date).
df.insert(0, "date", df.index)
df = df.sort_values(by="date")
df.index = df["date"].factorize()[0]

In [9]:
# Keep only rows dated strictly after the training start (the comparison is
# done on the "date" column values as stored), then show the size and a sample.
_after_train_start = df["date"] > _TRAIN_DATA_START
df = df.loc[_after_train_start]
print(df.shape)
df.head()

(94627, 7)


Unnamed: 0,date,tic,open,high,low,close,volume
12084,2010-01-04,VZ,31.325493,31.372387,31.034746,31.212946,16176648.0
12084,2010-01-04,TRV,50.150002,50.439999,49.66,49.810001,3716000.0
12084,2010-01-04,NKE,16.514999,16.5175,16.280001,16.3375,11972400.0
12084,2010-01-04,JNJ,64.709999,64.989998,64.410004,64.68,9506200.0
12084,2010-01-04,BA,55.720001,56.389999,54.799999,56.18,6186700.0


In [10]:
# Run FinRL's FeatureEngineer over the price table with only the technical
# indicators switched on (no turbulence index, no user-defined features).
fe = FeatureEngineer(
    use_technical_indicator=True,
    use_turbulence=False,
    user_defined_feature=False,
)
df = fe.preprocess_data(df)

Successfully added technical indicators


In [11]:
# Inspect the frame returned by fe.preprocess_data: its shape and first rows.
print(df.shape)
df.head()

(94627, 15)


Unnamed: 0,date,tic,open,high,low,close,volume,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma
0,2010-01-04,AAPL,7.6225,7.660714,7.585,7.643214,493729600.0,0.0,7.66851,7.631133,100.0,66.666667,100.0,7.643214,7.643214
3263,2010-01-04,AMGN,56.630001,57.869999,56.560001,57.720001,5277400.0,0.0,7.66851,7.631133,100.0,66.666667,100.0,57.720001,57.720001
6526,2010-01-04,AXP,40.810001,41.099998,40.389999,40.919998,6894300.0,0.0,7.66851,7.631133,100.0,66.666667,100.0,40.919998,40.919998
9789,2010-01-04,BA,55.720001,56.389999,54.799999,56.18,6186700.0,0.0,7.66851,7.631133,100.0,66.666667,100.0,56.18,56.18
13052,2010-01-04,CAT,57.650002,59.189999,57.509998,58.549999,7325600.0,0.0,7.66851,7.631133,100.0,66.666667,100.0,58.549999,58.549999


In [12]:
# Can't be any NaN/np.inf values.
# isna() only detects missing values, so infinities are checked separately on
# the numeric columns.
assert not df.isna().any().any(), "df contains NaN values"
_numeric = df.select_dtypes(include="number")
assert not _numeric.isin([float("inf"), float("-inf")]).any().any(), "df contains +/-inf values"

In [13]:
# add covariance matrix as states
# For every date that has `lookback` earlier dates available, compute the
# daily returns over the window [date - lookback, date] (inclusive on both
# ends) and the covariance matrix of those returns between tickers; both are
# attached to every row of that date.
df = df.sort_values(["date", "tic"], ignore_index=True)
df.index = df.date.factorize()[0]

# look back is one year
lookback = 252

# Distinct dates in order of appearance; position i corresponds to index code i.
dates = df["date"].unique()

# Pivot the close prices once (date x ticker) instead of re-pivoting the same
# rows for every window inside the loop.
close_by_date = df.pivot_table(index="date", columns="tic", values="close")

cov_list = []
return_list = []
for i in range(lookback, len(dates)):
    # Label-based slice, inclusive on both ends: lookback + 1 dates of prices.
    price_lookback = close_by_date.loc[dates[i - lookback] : dates[i]]
    # pivot_table (default dropna=True) omits all-NaN columns; drop them per
    # window so the result matches a pivot built from the window's rows only.
    price_lookback = price_lookback.dropna(axis=1, how="all")

    # The first row has no previous price, so its return is NaN and is dropped.
    return_lookback = price_lookback.pct_change().dropna()
    return_list.append(return_lookback)

    covs = return_lookback.cov().values
    cov_list.append(covs)

# One row per date from index `lookback` onwards; the default inner merge drops
# the first `lookback` dates, which have no full window.
df_cov = pd.DataFrame({"date": dates[lookback:], "cov_list": cov_list, "return_list": return_list})
df = df.merge(df_cov, on="date")
df = df.sort_values(["date", "tic"]).reset_index(drop=True)

In [17]:
# Size of the final frame in bytes as reported by sys.getsizeof.
# NOTE(review): the object columns (cov_list / return_list) hold one array /
# DataFrame per row; the same object is presumably shared by all rows of a
# date, so this figure may overstate the real memory footprint — confirm.
_size = sys.getsizeof(df)

In [19]:
# Report the measured size in whole megabytes (1 MB = 10**6 bytes), followed by
# the frame's shape and first rows.
_size_mb = _size // 1_000_000
print(f"{_size_mb} MB")
print(df.shape)
df.head()

6612 MB
(87319, 17)


Unnamed: 0,date,tic,open,high,low,close,volume,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,cov_list,return_list
0,2011-01-03,AAPL,11.63,11.795,11.601429,11.770357,445138400.0,0.118248,11.709899,11.32678,62.862181,144.28656,25.488754,11.415357,11.212333,"[[0.000284961868415017, 9.196035563171802e-05,...",tic AAPL AMGN AXP ...
1,2011-01-03,AMGN,55.200001,56.279999,55.18,55.549999,5453300.0,0.26172,58.568809,52.495191,50.863117,34.401812,4.92728,54.949,55.565166,"[[0.000284961868415017, 9.196035563171802e-05,...",tic AAPL AMGN AXP ...
2,2011-01-03,AXP,43.299999,43.619999,43.110001,43.400002,7633300.0,-0.178628,47.098241,41.260759,51.443736,-26.4239,13.049417,43.883,42.473,"[[0.000284961868415017, 9.196035563171802e-05,...",tic AAPL AMGN AXP ...
3,2011-01-03,BA,66.150002,66.68,66.0,66.400002,8072900.0,-0.08158,66.520526,63.197475,51.659857,149.651679,11.92439,64.853001,66.948501,"[[0.000284961868415017, 9.196035563171802e-05,...",tic AAPL AMGN AXP ...
4,2011-01-03,CAT,94.379997,94.809998,94.110001,94.150002,5231500.0,2.266966,96.271975,88.886025,68.687626,82.33022,50.929602,90.157333,85.266833,"[[0.000284961868415017, 9.196035563171802e-05,...",tic AAPL AMGN AXP ...


In [None]:
# Write the final frame as JSON to <experiment out datasets dir>/<dataset_name>
# (dataset_name is imported from the experiment's train module).
# NOTE(review): cov_list / return_list hold an array / DataFrame in every row,
# so the serialized file is likely to be large — confirm this is acceptable.
df.to_json(program.exp_dir.out.datasets.joinpath(dataset_name))