In [1]:
#
import os
import copy
import sys
import warnings
import matplotlib
from pathlib import Path
from typing import Dict

#
sys.path.append("./src/")
sys.path.append("./")
sys.path.append("../")
sys.path.append("../../")
sys.path.append("../../../")

# FinRL
from finrl.config_tickers import DOW_30_TICKER

# FinRL-Meta
from meta.data_processors.yahoofinance import Yahoofinance

#
import yfinance as yf

In [2]:
def config():
    #
    warnings.filterwarnings("ignore", category=UserWarning)  # TODO: zipline problem
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    #
    matplotlib.use("Agg")


config()

In [3]:
#
import numpy as np
import pandas as pd
import tqdm

#
from common.utils import now_time
from configuration.settings import ProjectDir, ExperimentDir
from rl.data.CompanyInfo import CompanyInfo

In [4]:
#
prj_dir = ProjectDir(root=Path("/Users/zlapik/my-drive-zlapik/0-todo/ai-investing"))
exp_dir = ExperimentDir(root=Path(os.getcwd()))
exp_dir.check_and_create_dirs()

#
_TRAIN_DATA_START = "2000-01-01"
_TRAIN_DATA_END = "2015-12-31"
_TEST_DATA_START = "2016-01-01"
_TEST_DATA_END = "2021-12-31"

#
TICKERS = copy.deepcopy(DOW_30_TICKER)
TICKERS.remove("DOW")  # TODO: I don't have all necessary data

In [5]:
TICKERS

['AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CSCO',
 'CVX',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'KO',
 'JPM',
 'MCD',
 'MMM',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'TRV',
 'UNH',
 'CRM',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS']

In [6]:
print(prj_dir.root)
print(exp_dir.root)

/Users/zlapik/my-drive-zlapik/0-todo/ai-investing
/Users/zlapik/my-drive-zlapik/0-todo/ai-investing/src/rl/experiments/_1_same_bigger_data_fundamental


In [7]:
from rl.experiments._1_same_bigger_data_fundamental.train import base_cols, data_cols, ratios_cols

print(base_cols)
print(data_cols)
print(ratios_cols)

['date', 'tic']
['open', 'high', 'low', 'close', 'volume']
['operatingProfitMargin', 'netProfitMargin', 'returnOnAssets', 'returnOnEquity', 'currentRatio', 'quickRatio', 'cashRatio', 'inventoryTurnover', 'receivablesTurnover', 'payablesTurnover', 'debtRatio', 'debtEquityRatio', 'priceEarningsRatio', 'priceBookValueRatio', 'dividendYield']


In [8]:
# Load All Initial Tickers Data
tickers_data: Dict[str, CompanyInfo] = dict()
for tic in TICKERS:
    data = dict(symbol=tic)
    files = copy.deepcopy(CompanyInfo.Names.list())
    files.remove("symbol")
    for f in files:
        tic_file = prj_dir.dataset.tickers.joinpath(tic).joinpath(f + ".csv")
        if tic_file.exists():
            data[f] = pd.read_csv(tic_file, index_col=0)
        else:
            raise FileExistsError(f"File not exists: {tic_file}")
    tickers_data[tic] = CompanyInfo(**data)

In [9]:
print(tickers_data.keys())
# print(tickers_data.values())

dict_keys(['AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO', 'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM', 'VZ', 'V', 'WBA', 'WMT', 'DIS'])


In [10]:
dataset = pd.DataFrame()

# Merge tickers information into one pd.Dataframe
# for k, v in [("DIS", tickers_data["DIS"])]:
for k, v in tickers_data.items():
    # Prices
    data = v.data_detailed[data_cols]
    data.insert(0, "tic", k)

    # Fill before or forward
    data = data.fillna(method="bfill")
    data = data.fillna(method="ffill")

    # Ratios
    ratios = v.financial_ratios.loc[ratios_cols].transpose()

    # Fill 0, where Nan/np.inf
    ratios = ratios.fillna(0)
    ratios = ratios.replace(np.inf, 0)

    #
    merge = pd.merge(data, ratios, how="outer", left_index=True, right_index=True)
    filled = merge.fillna(method="bfill")
    filled = filled.fillna(method="ffill")
    clean = filled.drop(filled[~filled.index.str.contains("\d{4}-\d{2}-\d{2}")].index)
    dataset = pd.concat([clean, dataset])

dataset.insert(0, "date", dataset.index)

In [11]:
assert dataset.isna().any().any() == False  # Can't be any Nan/np.inf values

In [12]:
dataset

Unnamed: 0,date,tic,open,high,low,close,volume,operatingProfitMargin,netProfitMargin,returnOnAssets,...,quickRatio,cashRatio,inventoryTurnover,receivablesTurnover,payablesTurnover,debtRatio,debtEquityRatio,priceEarningsRatio,priceBookValueRatio,dividendYield
1962-01-02,1962-01-02,DIS,0.092908,0.096026,0.092908,0.092908,817400.0,0.2144077006675982,0.13538270454898307,0.025337420638956283,...,1.052858261550509,0.313469068128426,3.7693993862341078,1.3643295911883075,0.6732967893500391,0.5342069706963433,1.1468762671158106,22.104169110154817,4.809540679094226,0.0010570777653587061
1962-01-03,1962-01-03,DIS,0.092908,0.094467,0.092908,0.094155,778500.0,0.2144077006675982,0.13538270454898307,0.025337420638956283,...,1.052858261550509,0.313469068128426,3.7693993862341078,1.3643295911883075,0.6732967893500391,0.5342069706963433,1.1468762671158106,22.104169110154817,4.809540679094226,0.0010570777653587061
1962-01-04,1962-01-04,DIS,0.094155,0.094467,0.093532,0.094155,934200.0,0.2144077006675982,0.13538270454898307,0.025337420638956283,...,1.052858261550509,0.313469068128426,3.7693993862341078,1.3643295911883075,0.6732967893500391,0.5342069706963433,1.1468762671158106,22.104169110154817,4.809540679094226,0.0010570777653587061
1962-01-05,1962-01-05,DIS,0.094155,0.094779,0.093844,0.094467,934200.0,0.2144077006675982,0.13538270454898307,0.025337420638956283,...,1.052858261550509,0.313469068128426,3.7693993862341078,1.3643295911883075,0.6732967893500391,0.5342069706963433,1.1468762671158106,22.104169110154817,4.809540679094226,0.0010570777653587061
1962-01-08,1962-01-08,DIS,0.094467,0.095714,0.092285,0.094155,1245600.0,0.2144077006675982,0.13538270454898307,0.025337420638956283,...,1.052858261550509,0.313469068128426,3.7693993862341078,1.3643295911883075,0.6732967893500391,0.5342069706963433,1.1468762671158106,22.104169110154817,4.809540679094226,0.0010570777653587061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-12,2022-12-12,AXP,154.260000,157.760000,154.000000,157.310000,2773675.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-13,2022-12-13,AXP,160.620000,161.550000,156.160000,157.520000,3110000.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-14,2022-12-14,AXP,157.570000,158.620000,153.840000,154.110000,3554804.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-15,2022-12-15,AXP,151.650000,152.280000,146.470000,150.220000,4282628.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654


In [13]:
# Take only those dates where we have data for all stock in each day
max_size = dataset.groupby("date").size().unique().max()
_d = dataset.groupby("date").size()
binary = _d.values == 29

print(_d.size)
print(binary.size)

latest_date = _d[binary].index[0]
dataset = dataset[dataset["date"] > latest_date]

15347
15347


In [14]:
dataset

Unnamed: 0,date,tic,open,high,low,close,volume,operatingProfitMargin,netProfitMargin,returnOnAssets,...,quickRatio,cashRatio,inventoryTurnover,receivablesTurnover,payablesTurnover,debtRatio,debtEquityRatio,priceEarningsRatio,priceBookValueRatio,dividendYield
2008-03-20,2008-03-20,DIS,31.280001,31.980000,31.250000,31.900000,13802300.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
2008-03-24,2008-03-24,DIS,32.000000,32.160000,31.799999,32.040001,10536000.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
2008-03-25,2008-03-25,DIS,31.990000,32.240002,31.840000,32.080002,8547300.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
2008-03-26,2008-03-26,DIS,31.910000,32.040001,31.530001,31.760000,9219700.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
2008-03-27,2008-03-27,DIS,31.879999,31.959999,31.309999,31.379999,9569900.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-12,2022-12-12,AXP,154.260000,157.760000,154.000000,157.310000,2773675.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-13,2022-12-13,AXP,160.620000,161.550000,156.160000,157.520000,3110000.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-14,2022-12-14,AXP,157.570000,158.620000,153.840000,154.110000,3554804.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654
2022-12-15,2022-12-15,AXP,151.650000,152.280000,146.470000,150.220000,4282628.0,0.0,0.13909245688059813,0.008742991415210665,...,0,0,0.0,0,0.0,0.8886071237465974,7.977234753550543,13.426381421979777,4.215233198312448,0.004033189771966654


In [15]:
dataset = dataset.sort_values(by="date")
dataset.index = dataset["date"].factorize()[0]

In [16]:
assert dataset.groupby("date").size().unique().size == 1

In [75]:
dataset

Unnamed: 0,date,tic,open,high,low,close,volume,operatingProfitMargin,netProfitMargin,returnOnAssets,...,quickRatio,cashRatio,inventoryTurnover,receivablesTurnover,payablesTurnover,debtRatio,debtEquityRatio,priceEarningsRatio,priceBookValueRatio,dividendYield
0,2008-03-20,DIS,31.280001,31.980000,31.250000,31.900000,13802300.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
0,2008-03-20,UNH,35.729999,35.729999,34.840000,35.320000,12731500.0,0.03319850039463299,0.016623914759273876,0.006221385319745976,...,0.4222244458898284,0.263821483964577,0.0,8.626382978723404,0,0.6417811253876827,1.7915893630179345,23.67952522255193,1.645021645021645,0
0,2008-03-20,HON,52.606930,52.606930,51.005268,51.758431,12340441.0,0.10967541864792227,0.07473640686375853,0.019657956986323718,...,0.7566042365257217,0.18025680870205282,1.8330413016270337,1.372782744430254,1.7390168606031822,0.7210092715952038,2.5843485040444403,12.354489952725562,3.4820373202692063,0.005737606394476217
0,2008-03-20,TRV,46.470001,48.259998,46.259998,47.790001,6908900.0,0.0,0.14964257347100873,0.2121143886512047,...,0.23136199395324827,0.004043457955411351,0.0,0.48561289824886217,0,19.748254897545596,3.383173243837519,6.895854803091826,1.002336955524052,0.006927451635948628
0,2008-03-20,CVX,81.279999,84.070000,80.870003,83.209999,18373200.0,0.14176577618720554,0.07199749364373592,0.03664160523959624,...,0.9225880857674112,0.1925294796055264,9.670538377020875,2.7128567225654603,2.1070793433652533,0.49410668073050173,0.9793844508192736,8.505988366230666,2.4711081095068907,0.0065717914465864054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3713,2022-12-16,CAT,228.630000,233.220000,228.050000,232.720000,7610640.0,0.1617313592103508,0.13612111511271174,0.025226494617276628,...,0.775511555526124,0.2101185351963446,0.6140569395017793,0.8780744905130007,1.2533898305084745,0.8069141112635495,4.187351677249695,10.551445485668788,5.525110701366172,0.007348331743584158
3713,2022-12-16,CSCO,47.410000,48.000000,47.080000,47.810000,66126196.0,0.2596830985915493,0.19586267605633803,0.028693016957895414,...,1.1610700514138816,0.29289845758354754,1.9842342342342343,1.4944091208068406,2.2823834196891193,0.5672190341092269,1.3106376638855781,17.55516104868914,4.655570123162495,0.008320482809882516
3713,2022-12-16,WMT,144.480000,144.720000,142.570000,142.750000,13443780.0,0.017635934115552997,-0.011766014671526636,-0.007260070420260361,...,0.19530017355632692,0.1142612022720101,1.7867431150125181,18.59491360428328,2.018982589106404,0.6754247827631876,2.315101103068385,-53.65081130141824,5.3403545164878965,0.003936695297395082
3713,2022-12-16,JPM,129.150000,129.900000,128.425000,129.290000,15917668.0,0.0,0.29762195867465463,0.002580100501234272,...,0,0,0.0,0.2273444286161009,0,0.9236812790218247,12.102944954829212,7.945090890417993,1.0743960446916514,0.011026177800671784


In [26]:
print(dataset["date"].min())
print(dataset["date"].max())

date_split = dataset.iloc[dataset.index.size // 4 * 3]["date"]
dataset[dataset["date"] < date_split]

2008-03-20
2022-12-16


Unnamed: 0,date,tic,open,high,low,close,volume,operatingProfitMargin,netProfitMargin,returnOnAssets,...,quickRatio,cashRatio,inventoryTurnover,receivablesTurnover,payablesTurnover,debtRatio,debtEquityRatio,priceEarningsRatio,priceBookValueRatio,dividendYield
0,2008-03-20,DIS,31.280001,31.980000,31.250000,31.900000,13802300.0,0.21881766998700736,0.13902122130792552,0.020850248449222175,...,0.8225918916226715,0.2578942125709732,7.273185483870968,1.6292115011465866,1.3753335874952344,0.4476470397193985,0.8409957594801549,11.54205644470405,1.808474996186583,0
0,2008-03-20,MRK,41.593510,41.593510,40.076336,41.374046,17209522.0,0.2570805380217456,0.29219405796622494,0.03726070694832218,...,1.40292840532587,0.7557878706809623,0.6374965762804711,1.6593912805045243,2.630438877378037,0.5223916135489649,1.223904028436019,10.903727198135215,3.807476462176639,0.010656830663198981
0,2008-03-20,IBM,111.940727,113.269600,111.520073,113.126198,11943123.0,0.11490564596225838,0.1056230422492169,0.0228648451971421,...,0.8297339032741758,0.21542868652507666,5.221612349914237,0.9614367562802997,2.021112734032665,0.7662741465996296,3.278516841211435,13.99869448680199,5.47783615284567,0.004424334040641282
0,2008-03-20,MCD,53.950001,54.759998,53.700001,54.419998,13075600.0,0.2722828502296183,0.1959574012805952,0.03900554694590335,...,1.1150006611133148,0.7743289699854555,33.45351867940921,5.894915583155443,6.402560691719321,0.5209378368549177,1.087411774361219,13.32775290942461,4.340616577849209,0.00664284744150737
0,2008-03-20,AMGN,40.160000,40.220001,39.160000,40.110001,17844400.0,0.313230605738576,0.25,0.0260946729153379,...,1.9918953766807883,0.6247927795174065,0.24133083411433928,1.6147576147576148,0.6740837696335078,0.48606527827847257,0.9457724059785247,13.506503662167908,2.7431327785247936,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2784,2019-04-11,WBA,54.590000,54.799999,53.320000,53.439999,8194100.0,0.03477783238414617,0.029631985198462028,0.014944740909223457,...,0.3223463687150838,0.033479648842777335,2.7484302207818514,4.778422434037851,1.9205944798301486,0.6388329979879276,1.8155637508805371,10.949869800482928,1.8602936303808062,0.008976607459067159
2784,2019-04-11,V,159.110001,159.300003,157.440002,157.860001,5771400.0,0.6691780821917809,0.5309931506849315,0.04414297712422953,...,1.3294856404100102,0.7306307138239911,0.0,1.813664596273292,7.04,0.5018434426112828,1.0074010572938992,27.933608898806835,9.901085434513503,0.001630645821772223
2784,2019-04-11,HD,199.899994,201.500000,199.029999,201.479996,2465100.0,0.1363481293355066,0.09525795079792275,0.04878190818208289,...,0.21343973974482794,0.09566410816855589,1.120619554695063,11.385843763487268,1.6840267675298226,1.0415995341162767,-25.038730751283246,21.865562613897733,-102.56305898035464,0.00682006472740434
2784,2019-04-11,CVX,125.339996,126.260002,124.750000,125.989998,3248300.0,0.10194642513008287,0.11851994603969937,0.016824424139629042,...,0.7962121474903743,0.2952929341982032,3.6964311940079306,2.5254119446568866,1.7000337723741978,0.3846637850850796,0.6293487643466863,13.696622276987194,1.5080778516558682,0.009539725799802423


In [76]:
from rl.experiments._1_same_bigger_data_fundamental.train import dataset_name

dataset.to_csv(exp_dir.out.datasets.joinpath(dataset_name).with_suffix(".csv"))