In [1]:
from tqdm import tqdm
from pathlib import Path
import warnings
import sys
import logging
from pprint import pformat

import pandas as pd
import numpy as np
import matplotlib as mpl
import dynamic_yaml
import yaml

sys.path.append("/workspace/correlation-change-predict/ywt_library")
import data_generation
from data_generation import data_gen_cfg, gen_corr_dist_mat, gen_corr_mat_thru_t
from stl_decompn import stl_decompn
from corr_property import calc_corr_ser_property


# Read the shared data configuration. The file is parsed with dynamic_yaml and
# then round-tripped through dynamic_yaml.dump -> yaml.full_load, presumably so
# that `data_cfg` ends up as plain Python containers -- TODO confirm.
with open('../config/data_config.yaml') as f:
    data = dynamic_yaml.load(f)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data))

# Notebook-wide diagnostics: silence every warning, log at INFO and above with
# the emitting file name in the prefix, and restrict the "matplotlib" logger
# to errors only.
warnings.simplefilter("ignore")
logging.basicConfig(level=logging.INFO,
                    format='%(levelname)-8s [%(filename)s] \n%(message)s')
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.ERROR)

# Matplotlib text rendering: sans-serif font family and minus-sign handling.
mpl.rcParams.update({
    'font.sans-serif': ['simhei'],
    'axes.unicode_minus': False,
})

# Optional debugging helpers, left disabled:
#   * list every registered logger
#       logger_list = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
#       print(logger_list)
#   * enable in-notebook style checking
#       %load_ext pycodestyle_magic
#       %pycodestyle_on --ignore E501
#   * dump the loaded configurations
#       logging.debug(pformat(data_cfg, indent=1, width=100, compact=True))
#       logging.info(pformat(data_gen_cfg, indent=1, width=100, compact=True))

time: 909 ms (started: 2023-02-13 08:42:56 +00:00)


# Prepare data

## Dataset selection, output settings & test-set settings

In [2]:
# --- Dataset selection -------------------------------------------------------
# Key into data_cfg["DATASETS"]; list the available options with:
#     print(data_cfg["DATASETS"].keys())
data_implement = "SP500_20082017_CORR_SER_REG_CORR_MAT_HRCHY_11_CLUSTER"
# Data-split period; per the original note this only applies to the settings
# of the Korean paper.
data_split_setting = "-data_sp_test2"
# Which items are used for training: "-train_train" or "-train_all".
train_items_setting = "-train_train"

# --- Graph matrix composition ------------------------------------------------
#     - "sim"  : output a matrix with similarity data
#     - "dist" : output a matrix with distance data
graph_mat_compo = "sim"

# --- Output switches ---------------------------------------------------------
# Both flags control whether generated results are written to disk:
# correlation data and correlation graph arrays, respectively.
save_corr_data = save_corr_graph_arr = False

time: 566 µs (started: 2023-02-13 08:42:57 +00:00)


In [3]:
# Configuration entry of the dataset selected by `data_implement`.
dataset_cfg = data_cfg["DATASETS"][data_implement]

# Load the raw dataset and index it by its 'Date' column.
dataset_df = pd.read_csv(dataset_cfg['FILE_PATH']).set_index('Date')

# Item groups: every column is an item; the training items come from the config.
all_set = list(dataset_df.columns)
train_set = dataset_cfg['TRAIN_SET']
# Prefer a configured, non-empty TEST_SET; otherwise fall back to every item
# that is not a training item.
test_set = dataset_cfg.get('TEST_SET') or [item for item in all_set if item not in train_set]
logging.info(f"===== len(train_set): {len(train_set)}, len(all_set): {len(all_set)}, len(test_set): {len(test_set)} =====")

# Items actually used downstream: the training items for "-train_train",
# every item for any other setting.
if train_items_setting == "-train_train":
    items_implement = train_set
else:
    items_implement = all_set
target_df = dataset_df.loc[:, items_implement]
logging.info(f"===== len(train set): {len(items_implement)} =====")

# Common stem for output file names and picture titles.
output_file_name = dataset_cfg['OUTPUT_FILE_NAME_BASIS'] + train_items_setting
logging.info(f"===== file_name basis:{output_file_name} =====")
logging.info(f"\n{dataset_df}")

# Output folders (created on demand) for correlation data and graph data.
pipeline_data_dir = Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])
corr_data_dir = pipeline_data_dir / f"{output_file_name}-corr_data"
res_dir = pipeline_data_dir / f"{output_file_name}-graph_data"
for out_dir in (corr_data_dir, res_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

INFO     [2239372210.py] 
===== len(train_set): 66, len(all_set): 97, len(test_set): 31 =====
INFO     [2239372210.py] 
===== len(train set): 66 =====
INFO     [2239372210.py] 
===== file_name basis:sp500_20082017_corr_ser_reg_corr_mat_hrchy_11_cluster-train_train =====
INFO     [2239372210.py] 

                   FE        KEY         ROK        WDC         CLX  \
Date                                                                  
2008-01-02  44.431971  18.352796   52.709816  25.208257   47.021449   
2008-01-03  44.920234  18.248153   52.804675  25.014347   46.690260   
2008-01-04  45.933845  17.580047   50.662449  22.590476   46.690260   
2008-01-07  47.948704  17.893976   49.958914  22.281984   46.925772   
2008-01-08  46.582802  17.314414   48.172408  20.765963   46.447388   
...               ...        ...         ...        ...         ...   
2017-12-22  30.480000  20.440000  194.570000  80.680000  149.520000   
2017-12-26  30.320000  20.330000  194.530000  80.000000  149.69

time: 33.7 ms (started: 2023-02-13 08:42:57 +00:00)


## Load or Create Correlation Data

In [4]:
# Load the per-split correlation data from CSV when all four files exist;
# otherwise generate it from the raw dataset. The file names encode the
# correlation stride and window taken from data_gen_cfg.
# DEFAULT SETTING: data_gen_cfg["DATA_DIV_STRIDE"] == 20, data_gen_cfg["CORR_WINDOW"]==100, data_gen_cfg["CORR_STRIDE"]==100
s_l, w_l = data_gen_cfg["CORR_STRIDE"], data_gen_cfg["CORR_WINDOW"]
train_df_path = corr_data_dir/f"corr_s{s_l}_w{w_l}_train.csv"
dev_df_path = corr_data_dir/f"corr_s{s_l}_w{w_l}_dev.csv"
test1_df_path = corr_data_dir/f"corr_s{s_l}_w{w_l}_test1.csv"
test2_df_path = corr_data_dir/f"corr_s{s_l}_w{w_l}_test2.csv"
all_corr_df_paths = {
    "train_df": train_df_path,
    "dev_df": dev_df_path,
    "test1_df": test1_df_path,
    "test2_df": test2_df_path,
}
if all(df_path.exists() for df_path in all_corr_df_paths.values()):
    # Same order as all_corr_df_paths: train, dev, test1, test2.
    corr_datasets = [pd.read_csv(df_path, index_col=["items"]) for df_path in all_corr_df_paths.values()]
else:
    # NOTE(review): gen_train_data is assumed to return the splits in the same
    # train/dev/test1/test2 order as the paths it receives -- confirm.
    corr_datasets = data_generation.gen_train_data(items_implement, raw_data_df=dataset_df, corr_df_paths=all_corr_df_paths, save_file=save_corr_data)

# Pick the split used by the following cells. Fail fast on an unsupported
# setting: otherwise `corr_dataset` would stay undefined (NameError in the
# next cell on a fresh kernel) or silently keep a stale value from an earlier
# run of this cell.
if data_split_setting == "-data_sp_test2":
    corr_dataset = corr_datasets[3]
    logging.info(f"{corr_dataset.head()}")
else:
    raise ValueError(
        f"Unsupported data_split_setting {data_split_setting!r}: "
        "this cell only selects the correlation dataset for '-data_sp_test2'."
    )


2145it [00:19, 107.30it/s]
INFO     [1640925214.py] 
Date         2008-01-18  2008-01-22  2008-01-23  2008-01-24  2008-01-25  \
items                                                                     
ABT & ADI_0   -0.784505   -0.240349   -0.363866   -0.653930   -0.602313   
ABT & ADS_0   -0.358393    0.115923    0.243212    0.073157   -0.105883   
ABT & AFL_0    0.236135    0.506458    0.537641    0.562178    0.607391   
ABT & AMP_0    0.031137    0.570727    0.623278    0.619504    0.593820   
ABT & AMT_0    0.041613    0.729295    0.817976    0.656269    0.691261   

Date         2008-01-28  2008-01-29  2008-01-30  2008-01-31  2008-02-01  ...  \
items                                                                    ...   
ABT & ADI_0   -0.575414   -0.596802   -0.530457   -0.552757   -0.146133  ...   
ABT & ADS_0    0.198989    0.296020    0.351901    0.302613    0.432930  ...   
ABT & AFL_0    0.474709    0.335618    0.295664   -0.060492   -0.327804  ...   
ABT & AMP_0    0.4710

time: 20 s (started: 2023-02-13 08:42:57 +00:00)


## Generate correlation matrices across time

In [5]:
# Results are written to `res_dir` only when save_corr_graph_arr is set;
# otherwise False is passed as the save directory.
graph_save_dir = res_dir if save_corr_graph_arr else False
# NOTE(review): show_mat_info_inds presumably selects the time-step indices
# whose matrix details get logged -- confirm against data_generation.
gen_corr_mat_thru_t(
    corr_dataset,
    target_df,
    save_dir=graph_save_dir,
    show_mat_info_inds=[0, 1, 2, 12],
)

INFO     [data_generation.py] 
correlation graph of No.0 time-step
INFO     [data_generation.py] 
correlation graph.shape:(66, 66)
INFO     [data_generation.py] 
number of correlation graph:2497
INFO     [data_generation.py] 

Min of corr_mat:items
ABT   -0.863100
ADI   -0.784505
ADS   -0.668486
AFL   -0.710643
AMP   -0.503436
         ...   
WHR   -0.687515
WU    -0.155122
WYN   -0.304867
XEC   -0.678613
XRX   -0.689992
Length: 66, dtype: float32
INFO     [data_generation.py] 

(66, 66)
INFO     [data_generation.py] 

items       ABT       ADI       ADS       AFL       AMP       AMT      ANTM  \
items                                                                         
ABT    1.000000 -0.784505 -0.358393  0.236135  0.031137  0.041613 -0.440890   
ADI   -0.784505  1.000000  0.427657 -0.134256 -0.013307 -0.105937  0.476576   
ADS   -0.358393  0.427657  1.000000  0.597765  0.522863  0.332702  0.906885   
AFL    0.236135 -0.134256  0.597765  1.000000  0.664186  0.766788  0.643345   
A

time: 12.8 s (started: 2023-02-13 08:43:17 +00:00)
