In [None]:
from pathlib import Path
import os
import logging
from io import StringIO
import pprint
from pprint import pformat
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dynamic_yaml
import yaml
# import wfdb

sys.path.append("/workspace/correlation-change-predict/utils")
from utils import calc_corr_ser_property
from gen_corr_graph_data import gen_corr_dist_mat

# Load the project data configuration: the file is parsed with dynamic_yaml, dumped back to
# text and re-parsed with PyYAML's full loader; the result is kept in `data_cfg`.
# NOTE(review): presumably the dump/load round trip turns dynamic_yaml's templated values
# into plain Python containers -- confirm against the dynamic_yaml documentation.
with open('../config/data_config.yaml') as f:
    data = dynamic_yaml.load(f)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data))

# Notebook-wide text buffer; a later cell passes it to `DataFrame.info(buf=...)`.
output_buf = StringIO()
# INFO is requested for the root logger, so the `logging.debug(...)` calls in this notebook
# stay silent unless this level is changed to DEBUG.
logging.basicConfig(level=logging.INFO)

logging.debug(pformat(data_cfg, indent=1, width=100, compact=True))

# Preprocess settings

In [None]:
# Switch read by the "Data selection" cells below.
data_selection = True
# Raw input files are read from the "raw_data" folder inside the configured dataset directory.
raw_file_dir = Path(data_cfg['DIRS']['DATASET_DIR']) / "raw_data"

# Randomly pick items for trainset (not always necessary to run)

In [None]:
def gen_random_trainset(all_items: list, train_set_len: int = 100, verbose: int = 0, seed: int = 10):
    """Draw a reproducible random subset of ``all_items`` and return it sorted.

    Args:
        all_items: Candidate items to sample from (sampling is without replacement).
        train_set_len: Number of items to draw.
        verbose: When equal to 1, print the size and the content of the drawn subset.
        seed: Seed of the private random generator. The default of 10 reproduces the
            selection made by the earlier implementation, which called ``random.seed(10)``.

    Returns:
        The drawn items as a sorted list.

    Raises:
        ValueError: If ``train_set_len`` is negative or larger than ``len(all_items)``
            (raised by ``random.Random.sample``).
    """
    # A private generator keeps the draw reproducible without re-seeding (and thereby
    # clobbering) the module-level `random` state shared with every other cell.
    rng = random.Random(seed)
    train_set = sorted(rng.sample(all_items, train_set_len))

    if verbose == 1:
        print(f"len(train_set):{len(train_set)}")
        pp = pprint.PrettyPrinter(width=500, compact=True)
        pp.pprint(train_set)

    return train_set

# Tetuan City power consumption Dataset

In [None]:
# Name of the raw download and the snake_case name used for the preprocessed output.
raw_file_name = 'Tetuan City power consumption.csv'
file_name = 'tetuan_city_power_consumption.csv'
# Read the raw CSV, parse `Date` into timestamps and use it as the index.
raw_data = (
    pd.read_csv(raw_file_dir/raw_file_name)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
display(raw_data)

## Sort Data

In [None]:
# Order the observations by `Date` (the index name set when the data was loaded).
sort_pre_df = raw_data.sort_values('Date')
# `DataFrame.info()` prints its summary itself and returns None, so it is called directly;
# wrapping it in `display(...)` only appended a stray "None" to the cell output.
sort_pre_df.info()

## Missing Data

In [None]:
# Keep only the columns whose number of missing values is below 1% of the rows. The row
# count is taken from the frame being filtered, so the cell no longer depends on whatever
# `raw_data` currently holds in the kernel.
na_mask = sort_pre_df.isna().sum() < len(sort_pre_df)*0.01  # null values ratio==1%
na_pre_df = sort_pre_df.loc[:, na_mask]
# Forward-fill first, then back-fill so gaps at the very start are covered as well.
# `ffill()`/`bfill()` replace `fillna(method=...)`, which pandas deprecated in 2.1.
na_pre_df = na_pre_df.ffill().bfill()
remaining_na = int(na_pre_df.isna().sum().sum())
if remaining_na == 0:
    output_df = na_pre_df
else:
    # Show the columns that still contain nulls before aborting.
    print(na_pre_df.loc[:, na_pre_df.isna().any()])
    raise ValueError(f'Still has {remaining_na} null value')

## Data selection

In [None]:
# Optional down-sampling: keep one row out of every 18.
if data_selection:
    output_df = na_pre_df.iloc[::18]  # extract for every 3 hour
    display(output_df)

## Output Data

In [None]:
# Write "<stem>-pre<suffix>" into the parent folder of `raw_file_dir`.
output_df.to_csv(Path(raw_file_dir).parent / f"{Path(file_name).stem}-pre{Path(file_name).suffix}")

# Nvidia stock & Bitcoin

In [None]:
# Input files of the two price series and the name used for the merged output.
bitcoin_file_name = 'bitcoin_20102022.csv'
nvda_file_name = 'nvda_20102022.csv'
file_name = "bitcoin_nvda_20122022.csv"

# Load each series and parse its `Date` column into timestamps.
bitcoin_raw_data = pd.read_csv(raw_file_dir/bitcoin_file_name)
bitcoin_raw_data['Date'] = pd.to_datetime(bitcoin_raw_data['Date'])
nvda_raw_data = pd.read_csv(raw_file_dir/nvda_file_name)
nvda_raw_data['Date'] = pd.to_datetime(nvda_raw_data['Date'])

# Right join on `Date`: every NVDA row is kept and Bitcoin columns are NaN where no
# Bitcoin row shares the date.
raw_data = bitcoin_raw_data.merge(nvda_raw_data, on=["Date"], how="right").set_index("Date")

logging.debug(f"\n {raw_data}")

## Sort Data

In [None]:
# Order the merged rows by `Date` (the index name set when the data was loaded).
sort_pre_df = raw_data.sort_values('Date')
# Capture the `info()` summary in a fresh buffer. The notebook-wide `output_buf` is never
# cleared, so every re-run of this cell appended to it and the log repeated old summaries.
info_buf = StringIO()
sort_pre_df.info(buf=info_buf)
logging.debug(f"\n{sort_pre_df}\n" + "="*50)
logging.debug(f"\n{info_buf.getvalue()}\n" + "="*50)

## Missing Data

In [None]:
# test if na exists
logging.info(f"\n{sort_pre_df.isnull().any()}\n" + "="*50)
logging.info(f"\n{sort_pre_df.isnull().sum()}\n" + "="*50)
# Log only the rows and columns that hold at least one missing value. Both masks are built
# from `sort_pre_df` itself and reduced to one dimension: the former masks came from the
# unsorted `raw_data` (positions need not match the sorted frame) and the row mask was a
# 2-D array, which selects a row once per missing cell.
null_mask = sort_pre_df.isnull()
logging.info(f"\n{sort_pre_df.loc[null_mask.any(axis=1), null_mask.any(axis=0)]}\n" + "="*50)

In [None]:
# Keep only the columns whose number of missing values is below 1% of the rows. The row
# count is taken from the frame being filtered, so the cell no longer depends on whatever
# `raw_data` currently holds in the kernel.
na_mask = sort_pre_df.isna().sum() < len(sort_pre_df)*0.01  # null values ratio==1%
na_pre_df = sort_pre_df.loc[:, na_mask]
# Forward-fill first, then back-fill so gaps at the very start are covered as well.
# `ffill()`/`bfill()` replace `fillna(method=...)`, which pandas deprecated in 2.1.
na_pre_df = na_pre_df.ffill().bfill()
remaining_na = int(na_pre_df.isna().sum().sum())
if remaining_na == 0:
    output_df = na_pre_df
else:
    # Show the columns that still contain nulls before aborting.
    print(na_pre_df.loc[:, na_pre_df.isna().any()])
    raise ValueError(f'Still has {remaining_na} null value')

# test if na exists
logging.info(f"\n{na_pre_df.isnull().any()}\n" + "="*50)
logging.info(f"\n{na_pre_df.isnull().sum()}\n" + "="*50)

## Data selection

In [None]:
# Optional selection: keep the last 2519 rows.
if data_selection:
    # `tail` returns the whole frame when it has fewer than 2519 rows. The former
    # `iloc[len(na_pre_df)-2519:]` produced a negative start in that case, which wraps
    # around and silently keeps only the last `2519 - len` rows.
    output_df = na_pre_df.tail(2519)
    logging.info(f"\n{output_df}\n")

## Output Data

In [None]:
# Write "<stem>_pre<suffix>" into the parent folder of `raw_file_dir`.
output_df.to_csv(Path(raw_file_dir).parent / f"{Path(file_name).stem}_pre{Path(file_name).suffix}")

# Compute moving average of sp500

## load sp500_hold_20082017-adj_close-pre.csv

In [None]:
# Preprocessed price file, read from the parent folder of `raw_file_dir`.
sp500_20082017_pre_file_name = Path("sp500_hold_20082017-adj_close-pre.csv")
sp500_20082017_df = pd.read_csv(raw_file_dir.parent/sp500_20082017_pre_file_name)
sp500_20082017_df = sp500_20082017_df.set_index("Date")
# Every remaining column name is collected as a ticker.
sp500_20082017_tickers = set(sp500_20082017_df.columns)

In [None]:
display(sp500_20082017_df)
# 60-row moving average of every column. `rolling` works along the rows by default, so the
# `axis=0` argument (deprecated in pandas 2.1) is dropped. With a 60-row window the first
# complete average sits at position 59; the former `[60:]` slice discarded that row.
sp500_20082017_df.rolling(window=60).mean().iloc[59:]

# Split sp500 by random

In [None]:
# Preprocessed 2011-2015 price file, read from the parent folder of `raw_file_dir`.
sp500_pre_file_name = Path("sp500_hold_20112015-adj_close-pre.csv")
sp500_df = pd.read_csv(raw_file_dir.parent/sp500_pre_file_name)
sp500_df = sp500_df.set_index("Date")
sp500_df

In [None]:
# Draw 30 column names with `gen_random_trainset` and print them (verbose=1).
# NOTE(review): the candidates come from `sp500_20082017_df`, not from the `sp500_df`
# loaded in the previous cell -- confirm which frame is intended.
candidate_tickers = sp500_20082017_df.columns.tolist()
sample_tickers = gen_random_trainset(candidate_tickers, train_set_len=30, verbose=1)

# Split sp500 constituent by GICS sector|sub_industry

## Find number of tickers for each GICS_sector & GICS_sub_industry

In [None]:
file_name = "sp500_constituent_gics.csv"
sp500_constituent_industry_df = pd.read_csv(raw_file_dir/file_name).set_index('Unnamed: 0')
display(sp500_constituent_industry_df)
# Ticker counts per GICS sector, then per sub-industry, largest group first.
for gics_level in ("GICS_sector", "GICS_sub_industry"):
    display(sp500_constituent_industry_df.groupby(gics_level)["ticker"].count().sort_values(ascending=False))
# Sum of the per-sub-industry ticker counts.
display(sp500_constituent_industry_df.groupby("GICS_sub_industry")["ticker"].count().sum())
# display(sp500_constituent_industry_df.groupby("GICS_sector")["ticker"].value_counts())

## Filter the tickers in `sp500_hold_20082017-adj_close-pre.csv` by GICS_sector

In [None]:
sp500_20082017_pre_file_name = Path("sp500_hold_20082017-adj_close-pre.csv")
sp500_20082017_df = pd.read_csv(raw_file_dir.parent/sp500_20082017_pre_file_name).set_index("Date")
sp500_20082017_tickers = set(sp500_20082017_df.columns)


def _tickers_of_sector(sector_name: str) -> set:
    """Return the constituent tickers whose `GICS_sector` equals `sector_name`."""
    in_sector = sp500_constituent_industry_df["GICS_sector"] == sector_name
    return set(sp500_constituent_industry_df.loc[in_sector, "ticker"])


# Constituent tickers of the three sectors of interest.
consumer_discretionary_tickers = _tickers_of_sector("Consumer Discretionary")
information_technology_tickers = _tickers_of_sector("Information Technology")
financials_tickers = _tickers_of_sector("Financials")

# Keep only the tickers that are also columns of the preprocessed price data.
sp500_20082017_consumer_discretionary_tickers = sp500_20082017_tickers & consumer_discretionary_tickers
sp500_20082017_information_technology_tickers = sp500_20082017_tickers & information_technology_tickers
sp500_20082017_financials_tickers = sp500_20082017_tickers & financials_tickers

logging.info(f"consumer_discretionary_tickers in sp500: {len(consumer_discretionary_tickers)}, consumer_discretionary_tickers in sp500_pre {len(sp500_20082017_consumer_discretionary_tickers)}")
logging.info(f"information_technology_tickers in sp500: {len(information_technology_tickers)}, information_technology_tickers in sp500_pre {len(sp500_20082017_information_technology_tickers)}")
logging.info(f"financials_tickers in sp500: {len(financials_tickers)}, financials_tickers in sp500_pre {len(sp500_20082017_financials_tickers)}")

## Output Data

In [None]:
# Select the sector's columns in the order they appear in the source frame. Iterating the
# ticker *set* gave an order that depends on string hashing, so the column order of the
# written CSV could change from one kernel session to the next.
consumer_discretionary_columns = [
    ticker for ticker in sp500_20082017_df.columns
    if ticker in sp500_20082017_consumer_discretionary_tickers
]
sp500_20082017_consumer_discretionary_df = sp500_20082017_df.loc[:, consumer_discretionary_columns]
output_df = sp500_20082017_consumer_discretionary_df
file_name = sp500_20082017_pre_file_name
# Output name: the source stem without "-pre", followed by "-consumDiscretionary-pre" and
# the original suffix.
output_df.to_csv(Path(raw_file_dir).parent/(str(Path(file_name).stem).replace("-pre", "") + "-consumDiscretionary-pre" + str(Path(file_name).suffix)))

# Split sp500 constituent by hierarchy clustering

## Load cluster results and preprocessed data

In [None]:
# Watch the section "## Data implement & output setting & testset setting" of correlation_dat_etl.ipynb
cluster_results_dir_base = "sp500_20112015-train_all"
corr_type = "pearson"
# window_setting = "corr_s1_w50"
s_l = 1
w_l = 50
corr_ser_clac_method = "corr_ser_calc_regular"
corr_ser_reduction_method = "corr_ser_std"
corr_mat_compo = "sim"
filtered_distance_mat_method = "large_corr_ser_mean_filtered"
is_save_output = False

cluster_results_dir = Path(data_cfg["DIRS"]['PIPELINE_DATA_DIR'])/f"{cluster_results_dir_base}/{corr_type}/cluster/corr_s{s_l}_w{w_l}/{corr_ser_clac_method}/{corr_ser_reduction_method}/{corr_mat_compo}/{filtered_distance_mat_method}"
cluster_results_file_name = Path("corr_mat_hrchy_6_cluster.csv")  # change this file if there is any better cluster results
cluster_results_df = pd.read_csv(cluster_results_dir/cluster_results_file_name, index_col="Unnamed: 0")
cluster_label_col_name = cluster_results_file_name.stem+"_label"  # check label_col_name by cluster_results_df.columns

sp500_pre_file_name = Path("sp500_hold_20112015-adj_close-pre.csv")
sp500_df = pd.read_csv(raw_file_dir.parent/sp500_pre_file_name).set_index("Date")
sp500_tickers = set(sp500_df.columns)
display(sp500_df.head())

## Find tickers with cluster label that have many nodes

In [None]:
# Count the rows of every cluster label and order the labels by their "items" count,
# largest first.
cluster_labels_count_df = cluster_results_df.groupby([cluster_label_col_name]).count().sort_values("items", ascending=False)
num_clusters = cluster_labels_count_df.shape[0]
frequent_cluster_info = {}
for i, (cluster_label, num_nodes) in enumerate(cluster_labels_count_df.iterrows(), start=1):
    # Keys follow the rank ("1th", "2th", ...); the final row is stored under "last".
    info_key = f"frequent_cluster_{i}th" if i < num_clusters else "frequent_cluster_last"
    # `.iloc[0]` reads the first count column by position. The former `num_nodes[0]` relied on
    # the integer-key positional fallback of a label-indexed Series, deprecated in pandas 2.1.
    frequent_cluster_info[info_key] = {"cluster_label": cluster_label, "num_nodes": num_nodes.iloc[0]}

# "all_mix": one row sampled (fixed random_state) from every cluster label.
all_mix_cluster_df = cluster_results_df.groupby([cluster_label_col_name]).apply(lambda x: x.sample(n=1, random_state=0)).reset_index(drop=True)
frequent_cluster_info["frequent_cluster_all_mix"] = {"cluster_labels": all_mix_cluster_df.loc[::, cluster_label_col_name].tolist(), "num_nodes": len(all_mix_cluster_df)}

# "half_mix": 5 rows of the "last" cluster plus 5 of the all_mix rows that belong to other
# clusters. Both `sample(n=5)` calls raise if fewer than 5 rows are available.
mix_target_label = frequent_cluster_info['frequent_cluster_last']['cluster_label']
not_target_mask = all_mix_cluster_df.loc[::,cluster_label_col_name]!=mix_target_label
sameple_not_target_df = all_mix_cluster_df.loc[not_target_mask].sample(n=5, random_state=0)
sample_target_df = cluster_results_df.loc[cluster_results_df[cluster_label_col_name]==mix_target_label, ::].sample(n=5, random_state=0)
half_mix_cluster_df = pd.concat([sameple_not_target_df, sample_target_df])
frequent_cluster_info["frequent_cluster_half_mix"] = {"cluster_labels": half_mix_cluster_df.loc[::, cluster_label_col_name].unique().tolist(), "num_nodes": len(half_mix_cluster_df)}

logging.info(f"frequent_cluster_1th[cluster_label]: 【{frequent_cluster_info['frequent_cluster_1th']['cluster_label']}】, quantity of nodes with frequent_cluster_label_1th: {frequent_cluster_info['frequent_cluster_1th']['num_nodes']}")
logging.info(f"frequent_cluster_2th[cluster_label]: 【{frequent_cluster_info['frequent_cluster_2th']['cluster_label']}】, quantity of nodes with frequent_cluster_label_2th: {frequent_cluster_info['frequent_cluster_2th']['num_nodes']}")
logging.info(f"frequent_cluster_last[cluster_label]: 【{frequent_cluster_info['frequent_cluster_last']['cluster_label']}】, quantity of nodes with frequent_cluster_label_last: {frequent_cluster_info['frequent_cluster_last']['num_nodes']}")
logging.info(f"frequent_cluster_all_mix[cluster_labels]: 【{frequent_cluster_info['frequent_cluster_all_mix']['cluster_labels']}】, quantity of nodes with frequent_cluster_all_mix: {frequent_cluster_info['frequent_cluster_all_mix']['num_nodes']}")
logging.info(f"frequent_cluster_half_mix[cluster_labels]: 【{frequent_cluster_info['frequent_cluster_half_mix']['cluster_labels']}】, quantity of nodes with frequent_cluster_half_mix: {frequent_cluster_info['frequent_cluster_half_mix']['num_nodes']}")
display("cluster_results_df.head():")
display(cluster_results_df.head())
display("cluster_labels_count_df:")
display(cluster_labels_count_df)
display("all_mix_cluster_df:")
display(all_mix_cluster_df)
display("half_mix_cluster_df:")
display(half_mix_cluster_df)

## Filtering the tickers in `sp500_xxx-pre.csv` by tickers with cluster label that have many nodes and
## Output data

In [None]:
# Output folder "<source stem>-extension/..." under the parent of `raw_file_dir`,
# assembled from the clustering settings.
src_file_name = str(Path(sp500_pre_file_name).stem)
save_dir = (
    Path(raw_file_dir).parent
    / f"{(src_file_name+'-extension')}" / corr_type / f"corr_s{s_l}_w{w_l}"
    / corr_ser_clac_method / corr_ser_reduction_method / corr_mat_compo / filtered_distance_mat_method
)
save_dir.mkdir(parents=True, exist_ok=True)

for frequent_cluster, info in frequent_cluster_info.items():
    # The two "mix" entries use their pre-sampled frames; every other entry takes all rows
    # that carry its cluster label.
    if "all_mix" in frequent_cluster:
        all_tickers = all_mix_cluster_df.loc[::, "items"]
    elif "half_mix" in frequent_cluster:
        all_tickers = half_mix_cluster_df.loc[::, "items"]
    elif "mix" not in frequent_cluster:
        label_mask = cluster_results_df[cluster_label_col_name] == info['cluster_label']
        all_tickers = cluster_results_df.loc[label_mask, "items"]
    # Every selected ticker has to be a column of the price data.
    assert len(sp500_tickers.intersection(all_tickers)) == len(all_tickers), "The tickers of frequnt_cluster should contains in sp500 data, but it's not"
    info["tickers"] = set(all_tickers)
    output_df = sp500_df.loc[:, list(all_tickers)]
    freq_rank = frequent_cluster.replace("frequent_cluster_", "")
    file_name = f"{cluster_results_file_name.stem}_label_{freq_rank}-pre.csv"
    logging.info(f"cluster name: {frequent_cluster}, len of tickers: {len(all_tickers)}")
    # Files are written only when saving is switched on in the settings cell.
    if is_save_output:
        output_df.to_csv(save_dir/file_name)
        logging.info(f"{file_name} has been save to {save_dir}")

## Randomly pick items for trainset

In [None]:
selected_tickers = frequent_cluster_info["frequent_cluster_last"]["tickers"]
# `selected_tickers` is a set, whose iteration order depends on string hashing and can
# differ between kernel sessions. Sorting it first gives the seeded sampler the same
# candidate order every time, so the 5 picked tickers are reproducible.
sample_df = gen_random_trainset(sorted(selected_tickers), train_set_len=5, verbose=1)

## Plot info of trainset

In [None]:
# Plot the distance matrix of `sample_df`
distance_mat = pd.read_csv(cluster_results_dir/"distance_mat.csv", index_col=["items"])
# Observe other types of distance matrix
data_implement = "SP500_20112015"  # watch options by printing /config/data_config.yaml/["DATASETS"].keys()
etl_items_setting = "-train_all"  # -train_train|-train_all
# data split period setting, only suit for only settings of Korean paper
data_split_setting = "data_sp_test2"
dataset_df = pd.read_csv(data_cfg["DATASETS"][data_implement]['FILE_PATH'])
dataset_df = dataset_df.set_index('Date')
all_set = list(dataset_df.columns)  # all data
train_set = data_cfg["DATASETS"][data_implement]['TRAIN_SET']
# test items implement settings
items_implement = train_set if etl_items_setting == "-train_train" else all_set
output_file_name = data_cfg["DATASETS"][data_implement]['OUTPUT_FILE_NAME_BASIS'] + etl_items_setting
etl_res_dir= Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])/f"{output_file_name}/{corr_type}/corr_property/corr_s{s_l}_w{w_l}/{corr_ser_clac_method}"
if corr_ser_clac_method == "corr_ser_calc_regular":
    corr_property_df_path = etl_res_dir/f"{output_file_name}-{data_split_setting}-corr_series_property.csv"
elif corr_ser_clac_method == "corr_ser_calc_abs":
    # calculate corr_property_df with abs(corr_dataset)
    corr_property_df_path = etl_res_dir/f"{output_file_name}-{data_split_setting}-corr_series_abs_property.csv"
corr_property_df = pd.read_csv(corr_property_df_path).set_index("items")
corr_ser_std = corr_property_df.loc[::, "corr_ser_std"]
corr_ser_mean = corr_property_df.loc[::, "corr_ser_mean"]
selected_dataset_df = dataset_df.loc[::, items_implement]
obs_distance_mat_mean = gen_corr_dist_mat(corr_ser_mean, selected_dataset_df, out_mat_compo=corr_mat_compo).loc[sample_df, sample_df]
obs_distance_mat_std = gen_corr_dist_mat(corr_ser_std, selected_dataset_df, out_mat_compo=corr_mat_compo).loc[sample_df, sample_df]
obs_distance_mat_mean = obs_distance_mat_mean.style.set_caption("distance_mat_mean").set_table_styles([{'selector': 'caption',
                                                                                                        'props': [('color', 'red'),
                                                                                                                  ('font-size', '24px')]}])
obs_distance_mat_std = obs_distance_mat_std.style.set_caption("distance_mat_std").set_table_styles([{'selector': 'caption',
                                                                                                     'props': [('color', 'red'),
                                                                                                               ('font-size', '24px')]}])


# If items are constituent, observe their GICS sector|sub_industry
obs_items = sample_df
gics_df = pd.read_csv(Path(data_cfg["DIRS"]["DATASET_DIR"])/"raw_data/sp500_constituent_gics.csv")
mask = gics_df.loc[::, "ticker"].isin(obs_items)


display(distance_mat.loc[sample_df, sample_df])
display(obs_distance_mat_mean)
display(obs_distance_mat_std)
display(gics_df.iloc[mask.tolist(), ::])

# Split sp500 constituent by filter and max_clique

## load data

In [None]:
# Watch the section "## Data implement & output setting & testset setting" of correlation_dat_etl.ipynb
filter_clique_results_dir_base = "sp500_20112015-train_all"
corr_type = "pearson"
# window_setting = "corr_s1_w50"
s_l = 1
w_l = 50
corr_ser_clac_method = "corr_ser_calc_regular"
corr_ser_reduction_method = "corr_ser_std"
corr_mat_compo = "sim"
filtered_distance_mat_method = "negative_corr_ser_mean_filtered"
is_save_output = False

filter_clique_results_dir = Path(data_cfg["DIRS"]['PIPELINE_DATA_DIR'])/f"{filter_clique_results_dir_base}/{corr_type}/cluster/corr_s{s_l}_w{w_l}/{corr_ser_clac_method}/{corr_ser_reduction_method}/{corr_mat_compo}/{filtered_distance_mat_method}"
filter_clique_results_file_name = Path("no_cluster.csv")  # change this file if there is any better filter_clique results
filter_clique_results_df = pd.read_csv(filter_clique_results_dir/filter_clique_results_file_name, index_col="Unnamed: 0")
# filter_clique_label_col_name = filter_clique_results_file_name.stem+"_label"  # check label_col_name by filter_clique_results_df.columns

sp500_pre_file_name = Path("sp500_hold_20112015-adj_close-pre.csv")
sp500_df = pd.read_csv(raw_file_dir.parent/sp500_pre_file_name).set_index("Date")
sp500_tickers = set(sp500_df.columns)
display(sp500_df.head())

## Filtering the tickers in `sp500_xxx-pre.csv` by tickers with `filter_clique_results_df` and
## Output data

In [None]:
# Output folder "<source stem>-extension/..." under the parent of `raw_file_dir`,
# assembled from the filter/clique settings.
src_file_name = str(Path(sp500_pre_file_name).stem)
save_dir = (
    Path(raw_file_dir).parent
    / f"{(src_file_name+'-extension')}" / corr_type / f"corr_s{s_l}_w{w_l}"
    / corr_ser_clac_method / corr_ser_reduction_method / corr_mat_compo / filtered_distance_mat_method
)
save_dir.mkdir(parents=True, exist_ok=True)
# Keep the price columns of every item listed in the filter/clique result.
all_tickers = filter_clique_results_df["items"]
output_df = sp500_df.loc[:, list(all_tickers)]
file_name = f"{filter_clique_results_file_name.stem}-pre.csv"
logging.info(f"len of tickers: {len(all_tickers)}")
# The file is written only when saving is switched on in the settings cell.
if is_save_output:
    output_df.to_csv(save_dir/file_name)
    logging.info(f"{file_name} has been save to {save_dir}")

## Show items for trainset

In [None]:
# Despite its name, `sample_df` is a plain list of the selected item names.
sample_df = all_tickers.to_list()
print(sample_df)

## Plot info of trainset

In [None]:
# Plot the distance matrix of `sample_df`
distance_mat = pd.read_csv(filter_clique_results_dir/"distance_mat.csv", index_col=["items"])
# Observe other types of distance matrix
data_implement = "SP500_20112015"  # watch options by printing /config/data_config.yaml/["DATASETS"].keys()
etl_items_setting = "-train_all"  # -train_train|-train_all
# data split period setting; only suits the settings of the Korean paper
data_split_setting = "data_sp_test2"
dataset_df = pd.read_csv(data_cfg["DATASETS"][data_implement]['FILE_PATH'])
dataset_df = dataset_df.set_index('Date')
all_set = list(dataset_df.columns)  # all data
train_set = data_cfg["DATASETS"][data_implement]['TRAIN_SET']
# test items implement settings
items_implement = train_set if etl_items_setting == "-train_train" else all_set
output_file_name = data_cfg["DATASETS"][data_implement]['OUTPUT_FILE_NAME_BASIS'] + etl_items_setting
etl_res_dir= Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])/f"{output_file_name}/{corr_type}/corr_property/corr_s{s_l}_w{w_l}/{corr_ser_clac_method}"
if corr_ser_clac_method == "corr_ser_calc_regular":
    corr_property_df_path = etl_res_dir/f"{output_file_name}-{data_split_setting}-corr_series_property.csv"
elif corr_ser_clac_method == "corr_ser_calc_abs":
    # calculate corr_property_df with abs(corr_dataset)
    corr_property_df_path = etl_res_dir/f"{output_file_name}-{data_split_setting}-corr_series_abs_property.csv"
else:
    # Without this branch an unknown method either hit a NameError or, in a used kernel,
    # silently re-read the path left over from an earlier run.
    raise ValueError(f"Unsupported corr_ser_clac_method: {corr_ser_clac_method!r}")
corr_property_df = pd.read_csv(corr_property_df_path).set_index("items")
corr_ser_std = corr_property_df.loc[::, "corr_ser_std"]
corr_ser_mean = corr_property_df.loc[::, "corr_ser_mean"]
selected_dataset_df = dataset_df.loc[::, items_implement]
# Build one matrix from the mean and one from the std column with `gen_corr_dist_mat`,
# then keep only the rows/columns of the selected items.
obs_distance_mat_mean = gen_corr_dist_mat(corr_ser_mean, selected_dataset_df, out_mat_compo=corr_mat_compo).loc[sample_df, sample_df]
obs_distance_mat_std = gen_corr_dist_mat(corr_ser_std, selected_dataset_df, out_mat_compo=corr_mat_compo).loc[sample_df, sample_df]
# Wrap both matrices in a Styler with a large red caption for display.
obs_distance_mat_mean = obs_distance_mat_mean.style.set_caption("distance_mat_mean").set_table_styles([{'selector': 'caption',
                                                                                                        'props': [('color', 'red'),
                                                                                                                  ('font-size', '24px')]}])
obs_distance_mat_std = obs_distance_mat_std.style.set_caption("distance_mat_std").set_table_styles([{'selector': 'caption',
                                                                                                     'props': [('color', 'red'),
                                                                                                               ('font-size', '24px')]}])

# If items are constituent, observe their GICS sector|sub_industry
obs_items = sample_df
gics_df = pd.read_csv(Path(data_cfg["DIRS"]["DATASET_DIR"])/"raw_data/sp500_constituent_gics.csv")
mask = gics_df.loc[::, "ticker"].isin(obs_items)

display(distance_mat.loc[sample_df, sample_df])
display(obs_distance_mat_mean)
display(obs_distance_mat_std)
display(gics_df.loc[mask])

# Synthetic dataset

In [None]:
synthetic_set = "pw_constant"
dim = 70  # time_length(number of samples), number of variables(dimension)
n_bkps = 0  # number of change points
noise_std = 10  # noise standard deviation
# The file name encodes the generator settings; files are grouped in one folder per `dim`.
syn_file_name = Path(f"{synthetic_set}-bkps{n_bkps}-noise_std{noise_std}.csv")
syn_df = pd.read_csv(raw_file_dir.parent / f"synthetic/dim{dim}" / syn_file_name)
syn_df = syn_df.set_index("Date")

syn_df.head()

## Output Data

In [None]:
cluster_results_dir = Path(data_cfg["DIRS"]['PIPELINE_DATA_DIR'])/"sp500_20082017-train_all/cluster/corr_s1_w50/corr_ser_calc_regular"
cluster_results_file_name = Path("corr_mat_hrchy_10_cluster.csv")  # change this file if there is any better cluster results
cluster_results_df = pd.read_csv(cluster_results_dir/cluster_results_file_name, index_col="Unnamed: 0")
save_dir = Path(raw_file_dir).parent/f"{src_file_name}/corr_s{s_l}_w{w_l}/{corr_ser_clac_method}/{corr_mat_compo}/{corr_ser_reduction_method}"
save_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): `output_df`, `file_name` and `src_file_name` are not assigned anywhere in this
# section; they hold whatever an earlier section left in the kernel -- confirm that this is
# the data meant to be written here.
# The write now honours `is_save_output`, like the other clustering output cells, instead of
# always writing that leftover frame.
if is_save_output:
    output_df.to_csv(save_dir/file_name)
    logging.info(f"{file_name} has been save to {save_dir}")

## Randomly pick items for trainset

In [None]:
# Draw 60 column names, then order them by the integer that follows the first "_" in
# each name.
sample_train_set = gen_random_trainset(syn_df.columns.tolist(), train_set_len=60, verbose=1)
sample_train_set.sort(key=lambda col_name: int(col_name.split("_")[1]))
print(sample_train_set)

# Mimic Database

## Visualization of all patients' signals

In [None]:
# NOTE: this section calls `wfdb`, whose import is commented out in the first cell.
numerics_folder = raw_file_dir/"mimic-database-1.0.0"/"numerics"
# Record names are the distinct file stems in the folder, minus the bookkeeping entries.
numerics_files = [entry.stem for entry in numerics_folder.iterdir()]
ignored_files = {"HEADER", "ANNOTATORS", "RECORDS", ".htaccess", ".ipynb_checkpoints"}
records = sorted(set(numerics_files).difference(ignored_files))
# Read and plot every record, one figure per record.
for i, rec in enumerate(records):
    record = wfdb.rdrecord(numerics_folder/rec)
    wfdb.plot_wfdb(
        record=record,
        title=f'{record.record_name}',
        figsize=(20, 20),
        sharex=True,
    )

## Visualization of a specific signal, time period & annotations

In [None]:
# Read channels 1 and 2 of record "252n", restricted to samples 8000-9500.
record_252n_abp = wfdb.rdrecord(
    numerics_folder/"252n",
    channels=[1,2],
    sampfrom=8000,
    sampto=9500,
)
# display(record_252n_abp record_252n.__dict__)
# Plot the selected signals, labelled with the names and units stored in the record.
wfdb.plot_items(
    signal=record_252n_abp.p_signal,
    sig_name=record_252n_abp.sig_name,
    sig_units=record_252n_abp.units,
    title=f'{record_252n_abp.record_name}',
    figsize=(20, 8),
    sharex=True,
)

In [None]:
# Read the complete record "252n" together with its two annotation files and plot them.
# NOTE: this cell calls `wfdb`, whose import is commented out in the first cell.
numerics_folder = Path(raw_file_dir/"mimic-database-1.0.0"/"numerics")
record_252n = wfdb.rdrecord(numerics_folder/"252n")
ann_patient = wfdb.rdann(str(numerics_folder/"252n"), 'al', summarize_labels=True)  # al patient alarms
ann_monitor = wfdb.rdann(str(numerics_folder/"252n"), 'in', summarize_labels=True)  # in monitor status alarms
# Number of samples in the record.
display(len(record_252n.p_signal))

record_252n_fig = wfdb.plot_wfdb(record=record_252n, title=f'{record_252n.record_name}', figsize=(20, 20), sharex=True, return_fig=True)
# Tick every 5000 samples; this goes through pyplot's current axes, so it has to stay
# directly after the plot call above.
plt.xticks(range(0, 110001, 5000))
display(ann_patient.description)
# display(ann_patient.contained_labels)
# display(ann_patient.chan)
# Plot each annotation set on its own.
wfdb.plot_wfdb(annotation=ann_patient)
wfdb.plot_wfdb(annotation=ann_monitor)

# numpy.savetxt("252n.csv", record_252n.p_signal, delimiter=",")

## Observe correlation series of record_252n_abp

In [None]:
# wfdb_fig, wfdb_axes = wfdb.plot_items(signal=record_252n.p_signal, sig_name=record_252n.sig_name, sig_units=record_252n.units, figsize=(20, 8), sharex=True, return_fig_axes=True)  # return matplotlib axes
# Rolling 100-sample correlation of the two selected channels. `rolling(...).corr()` stacks
# one pairwise matrix per sample; with two columns, taking every second row of column 0
# keeps the correlation between the second and the first channel.
record_252n_abp_corr_series = pd.DataFrame(record_252n_abp.p_signal).rolling(window=100).corr().iloc[pd.IndexSlice[1::2],0].to_numpy()
fig1, ax = plt.subplots(3, 1, figsize=(20,12), sharex=True)
# One panel per channel plus one for the correlation series, each with the same red marker
# at sample 1433 (an alternative marker position used before was x = 1564).
panel_series = (record_252n_abp.p_signal[:,0], record_252n_abp.p_signal[:,1], record_252n_abp_corr_series)
for panel, series in zip(ax, panel_series):
    panel.plot(series)
    panel.axvline(x = 1433, color = 'r', label = 'correlation change')

# Export the two plotted channels. The frame is built from `record_252n_abp`: the full
# `record_252n` was read without a channel selection, so its signal matrix does not match
# the two column names and the constructor raised a shape-mismatch error.
pd.DataFrame(record_252n_abp.p_signal, columns=["ABPsys/mmHg", "ABBPdias/mmHg"]).to_csv("../../tmp/252n.csv")
pd.DataFrame(record_252n_abp_corr_series, columns=["Pearson corr"]).to_csv("../../tmp/252n_abp_corr.csv")