In [None]:
from pathlib import Path
import os
import logging
from io import StringIO
import pprint
from pprint import pformat
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dynamic_yaml
import yaml
# import wfdb

# Read the data configuration: parse it with dynamic_yaml, dump it again and
# re-read the dump with PyYAML to obtain ``data_cfg`` used by the later cells.
with open('../config/data_config.yaml') as f:
    data = dynamic_yaml.load(f)
    dumped_cfg = dynamic_yaml.dump(data)
    data_cfg = yaml.full_load(dumped_cfg)

# Root logger level is INFO; ``output_buf`` is a text buffer that a later cell
# hands to ``DataFrame.info(buf=...)``.
logging.basicConfig(level=logging.INFO)
output_buf = StringIO()

# DEBUG-level dump of the loaded configuration.
cfg_repr = pformat(data_cfg, indent=1, width=100, compact=True)
logging.debug(cfg_repr)

# preprocess setting

In [None]:
# Switch read by the "Data selection" cells below.
data_selection = True
# Raw input files are read from <DATASET_DIR>/raw_data.
dataset_dir = Path(data_cfg['DIRS']['DATASET_DIR'])
raw_file_dir = dataset_dir / "raw_data"

# Randomly pick items for the training set (optional; not always necessary to run)

In [None]:
def gen_random_trainset(all_items: list, train_set_len: int = 100, verbose: int = 0, seed: int = 10):
    """Draw a reproducible random subset of ``all_items`` and return it sorted.

    A private ``random.Random(seed)`` generator is used, so calling this
    function does not reseed (or otherwise disturb) the global ``random``
    state. With the default ``seed`` the drawn items are the same as with the
    previously hard-coded ``random.seed(10)``.

    Args:
        all_items: Population to sample from (without replacement).
        train_set_len: Number of items to draw.
        verbose: When equal to 1, print the subset size and the subset.
        seed: Seed for the private random generator.

    Returns:
        Sorted list containing ``train_set_len`` items of ``all_items``.

    Raises:
        ValueError: If ``train_set_len`` is negative or larger than
            ``len(all_items)``.
    """
    if not 0 <= train_set_len <= len(all_items):
        raise ValueError(
            f"train_set_len={train_set_len} must be between 0 and len(all_items)={len(all_items)}"
        )

    rng = random.Random(seed)
    train_set = sorted(rng.sample(all_items, train_set_len))

    if verbose == 1:
        print(f"len(train_set):{len(train_set)}")
        pp = pprint.PrettyPrinter(width=500, compact=True)
        pp.pprint(train_set)

    return train_set

# Tetuan City power consumption Dataset

In [None]:
# Input file (inside raw_file_dir) and the base name used for the output file.
raw_file_name = 'Tetuan City power consumption.csv'
file_name = 'tetuan_city_power_consumption.csv'

# Read the CSV, convert 'Date' to datetime and use it as the index.
raw_data = (
    pd.read_csv(raw_file_dir / raw_file_name)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
display(raw_data)

## Sort Data

In [None]:
# Order the rows chronologically ('Date' is the index name set when loading).
sort_pre_df = raw_data.sort_values('Date')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in display(), which rendered a stray "None".
sort_pre_df.info()

## Missing Data

In [None]:
# Keep only the columns whose number of missing values is below 1 % of the rows.
NA_RATIO_LIMIT = 0.01
na_mask = sort_pre_df.isna().sum() < len(sort_pre_df) * NA_RATIO_LIMIT
na_pre_df = sort_pre_df.loc[:, na_mask.to_numpy()]

# Forward-fill first, then backward-fill what is left at the start of a column.
# (DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the replacements.)
na_pre_df = na_pre_df.ffill().bfill()

remaining_na = na_pre_df.isna().sum()
if remaining_na.sum() == 0:
    output_df = na_pre_df
else:
    # Show the columns that still contain missing values before failing.
    print(na_pre_df.loc[:, (remaining_na > 0).to_numpy()])
    raise ValueError(f'Still has {remaining_na.sum()} null value')

## Data selection

In [None]:
# Keep every 18th row ("extract for every 3 hour" in the original comment).
# NOTE(review): this equals 3 hours only if rows are 10 minutes apart - confirm.
TETUAN_ROW_STEP = 18
if data_selection:
    output_df = na_pre_df.iloc[::TETUAN_ROW_STEP]
    display(output_df)

## Output Data

In [None]:
# Write "<stem>-pre<suffix>" into the parent directory of raw_file_dir.
tetuan_out_name = f"{Path(file_name).stem}-pre{Path(file_name).suffix}"
output_df.to_csv(Path(raw_file_dir).parent / tetuan_out_name)

# Nvidia stock & Bitcoin

In [None]:
# Input files (inside raw_file_dir) and the base name used for the output file.
bitcoin_file_name = 'bitcoin_20102022.csv'
nvda_file_name = 'nvda_20102022.csv'
file_name = "bitcoin_nvda_20122022.csv"

bitcoin_raw_data = pd.read_csv(raw_file_dir / bitcoin_file_name)
bitcoin_raw_data['Date'] = pd.to_datetime(bitcoin_raw_data['Date'])

nvda_raw_data = pd.read_csv(raw_file_dir / nvda_file_name)
nvda_raw_data['Date'] = pd.to_datetime(nvda_raw_data['Date'])

# Right join on 'Date': every NVDA row is kept, bitcoin columns are attached
# where the date matches.
raw_data = bitcoin_raw_data.merge(nvda_raw_data, on=["Date"], how="right").set_index("Date")

logging.debug(f"\n {raw_data}")

## Sort Data

In [None]:
# Order the rows by 'Date' and write the info() summary into the shared buffer.
sort_pre_df = raw_data.sort_values('Date')
sort_pre_df.info(buf=output_buf)

_rule = "="*50
logging.debug(f"\n{sort_pre_df}\n" + _rule)
logging.debug(f"\n{output_buf.getvalue()}\n" + _rule)

## Missing Data

In [None]:
# test if na exists
# test if na exists
na_cells = sort_pre_df.isnull()
logging.info(f"\n{na_cells.any()}\n" + "="*50)
logging.info(f"\n{na_cells.sum()}\n" + "="*50)
# Show only the rows and the columns that contain at least one missing value.
# The masks are built from sort_pre_df itself (one flag per row / per column),
# so they match the frame being indexed.
rows_with_na = na_cells.any(axis=1).to_numpy()
cols_with_na = na_cells.any(axis=0).to_numpy()
logging.info(f"\n{sort_pre_df.loc[rows_with_na, cols_with_na]}\n" + "="*50)

In [None]:
# Keep only the columns whose number of missing values is below 1 % of the rows.
NA_RATIO_LIMIT = 0.01
na_mask = sort_pre_df.isna().sum() < len(sort_pre_df) * NA_RATIO_LIMIT
na_pre_df = sort_pre_df.loc[:, na_mask.to_numpy()]

# Forward-fill first, then backward-fill what is left at the start of a column.
# (DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the replacements.)
na_pre_df = na_pre_df.ffill().bfill()

remaining_na = na_pre_df.isna().sum()
if remaining_na.sum() == 0:
    output_df = na_pre_df
else:
    # Show the columns that still contain missing values before failing.
    print(na_pre_df.loc[:, (remaining_na > 0).to_numpy()])
    raise ValueError(f'Still has {remaining_na.sum()} null value')

# test if na exists
logging.info(f"\n{na_pre_df.isnull().any()}\n" + "="*50)
logging.info(f"\n{na_pre_df.isnull().sum()}\n" + "="*50)

## Data selection

In [None]:
# Keep the last N_RECENT_ROWS rows. tail() returns the whole frame when it has
# fewer rows, whereas the previous `len(df) - 2519` start index became negative
# in that case and selected the wrong slice.
N_RECENT_ROWS = 2519
if data_selection:
    output_df = na_pre_df.tail(N_RECENT_ROWS)
    logging.info(f"\n{output_df}\n")

## Output Data

In [None]:
# Write "<stem>_pre<suffix>" into the parent directory of raw_file_dir.
bitcoin_nvda_out_name = f"{Path(file_name).stem}_pre{Path(file_name).suffix}"
output_df.to_csv(Path(raw_file_dir).parent / bitcoin_nvda_out_name)

# Compute moving average of sp500

## load sp500_hold_20082017-adj_close-pre.csv

In [None]:
# Pre-processed sp500 file, read from the parent directory of raw_file_dir.
sp500_20082017_pre_file_name = Path("sp500_hold_20082017-adj_close-pre.csv")
sp500_20082017_pre_path = raw_file_dir.parent / sp500_20082017_pre_file_name
sp500_20082017_df = pd.read_csv(sp500_20082017_pre_path).set_index("Date")
# Every remaining column name is collected as a ticker.
sp500_20082017_tickers = set(sp500_20082017_df.columns)

In [None]:
display(sp500_20082017_df)
# 60-row moving average over the rows (the default axis; the explicit `axis`
# argument of rolling() is deprecated). The first complete window ends at
# position MA_WINDOW - 1, so the slice starts there and keeps that first
# average instead of dropping it.
MA_WINDOW = 60
sp500_20082017_df.rolling(window=MA_WINDOW).mean().iloc[MA_WINDOW - 1:]

# Split sp500 constituent by GICS sector|sub_industry

## Find number of tickers for each GICS_sector & GICS_sub_industry

In [None]:
# Constituent table; its 'Unnamed: 0' column becomes the index.
file_name = "sp500_constituent_gics.csv"
sp500_constituent_industry_df = pd.read_csv(raw_file_dir / file_name).set_index('Unnamed: 0')
display(sp500_constituent_industry_df)

# Ticker counts per sector and per sub-industry, largest group first,
# followed by the total over all sub-industries.
tickers_per_sector = sp500_constituent_industry_df.groupby("GICS_sector")["ticker"].count()
tickers_per_sub_industry = sp500_constituent_industry_df.groupby("GICS_sub_industry")["ticker"].count()
display(tickers_per_sector.sort_values(ascending=False))
display(tickers_per_sub_industry.sort_values(ascending=False))
display(tickers_per_sub_industry.sum())
# display(sp500_constituent_industry_df.groupby("GICS_sector")["ticker"].value_counts())

## Filter the tickers in `sp500_hold_20082017-adj_close-pre.csv` by GICS_sector

In [None]:
sp500_20082017_pre_file_name = Path("sp500_hold_20082017-adj_close-pre.csv")
sp500_20082017_df = pd.read_csv(raw_file_dir.parent / sp500_20082017_pre_file_name).set_index("Date")
sp500_20082017_tickers = set(sp500_20082017_df.columns)


def _tickers_of_sector(sector: str) -> set:
    """Return the set of tickers whose GICS_sector equals ``sector``."""
    in_sector = sp500_constituent_industry_df["GICS_sector"] == sector
    return set(sp500_constituent_industry_df.loc[in_sector, "ticker"])


# Tickers of each sector according to the constituent table.
consumer_discretionary_tickers = _tickers_of_sector("Consumer Discretionary")
information_technology_tickers = _tickers_of_sector("Information Technology")
financials_tickers = _tickers_of_sector("Financials")

# The same sectors restricted to the columns present in the pre-processed file.
sp500_20082017_consumer_discretionary_tickers = sp500_20082017_tickers & consumer_discretionary_tickers
sp500_20082017_information_technology_tickers = sp500_20082017_tickers & information_technology_tickers
sp500_20082017_financials_tickers = sp500_20082017_tickers & financials_tickers

for _name, _sector_set, _pre_set in (
    ("consumer_discretionary_tickers", consumer_discretionary_tickers, sp500_20082017_consumer_discretionary_tickers),
    ("information_technology_tickers", information_technology_tickers, sp500_20082017_information_technology_tickers),
    ("financials_tickers", financials_tickers, sp500_20082017_financials_tickers),
):
    logging.info(f"{_name} in sp500: {len(_sector_set)}, {_name} in sp500_pre {len(_pre_set)}")

## Output Data

In [None]:
# Select the consumer-discretionary columns in sorted order: iterating a set of
# strings has no fixed order between kernel sessions, which made the column
# order of the written CSV non-reproducible.
consumer_discretionary_cols = sorted(sp500_20082017_consumer_discretionary_tickers)
sp500_20082017_consumer_discretionary_df = sp500_20082017_df.loc[:, consumer_discretionary_cols]
output_df = sp500_20082017_consumer_discretionary_df
file_name = sp500_20082017_pre_file_name
# Output name: source stem without "-pre", then "-consumDiscretionary-pre" and the suffix.
output_stem = str(Path(file_name).stem).replace("-pre", "") + "-consumDiscretionary-pre"
output_df.to_csv(Path(raw_file_dir).parent / (output_stem + str(Path(file_name).suffix)))

# Split sp500 constituents by hierarchical clustering

## Load cluster results and preprocessed data

In [None]:
# Clustering output produced elsewhere in the pipeline.
cluster_resluts_dir = (
    Path(data_cfg["DIRS"]['PIPELINE_DATA_DIR'])
    / "sp500_20082017-train_all/cluster/corr_s1_w50/corr_ser_calc_regular"
)
cluster_resluts_file_name = Path("corr_mat_hrchy_10_cluster.csv")  # change this file if there is any better cluster results
cluster_resluts_df = pd.read_csv(cluster_resluts_dir / cluster_resluts_file_name, index_col="Unnamed: 0")
# Label column is named "<file stem>_label"; check it against cluster_resluts_df.columns.
cluster_label_col_name = f"{cluster_resluts_file_name.stem}_label"

# Pre-processed sp500 data, indexed by 'Date'; its columns are the tickers.
sp500_20082017_pre_file_name = Path("sp500_hold_20082017-adj_close-pre.csv")
sp500_20082017_df = pd.read_csv(raw_file_dir.parent / sp500_20082017_pre_file_name).set_index("Date")
sp500_20082017_tickers = set(sp500_20082017_df.columns)

## Find tickers with cluster label that have many nodes

In [None]:
# Number of rows per cluster label, largest cluster first. Computed once and
# reused below (the same groupby/count/sort used to be repeated for every lookup).
cluster_size_df = cluster_resluts_df.groupby([cluster_label_col_name]).count().sort_values("items", ascending=False)

# Label (index) and size (first count column) of the clusters at selected ranks.
# NOTE: index[6] / iloc[6, 0] require at least 7 distinct cluster labels.
frequent_cluster_label_1st = cluster_size_df.index[0]
frequent_cluster_label_1st_nodes_num = cluster_size_df.iloc[0, 0]
frequent_cluster_label_2nd = cluster_size_df.index[1]
frequent_cluster_label_2nd_nodes_num = cluster_size_df.iloc[1, 0]
frequent_cluster_label_7th = cluster_size_df.index[6]
frequent_cluster_label_7th_nodes_num = cluster_size_df.iloc[6, 0]
frequent_cluster_label_last = cluster_size_df.index[-1]
frequent_cluster_label_last_nodes_num = cluster_size_df.iloc[-1, 0]

# "all mix": one sampled row per cluster label.
all_mix_cluster_df = cluster_resluts_df.groupby([cluster_label_col_name]).apply(lambda x: x.sample(n=1, random_state=0)).reset_index(drop=True)
# "half mix": 5 of the all-mix rows whose label is not the 7th-ranked one,
# plus 5 rows sampled from the 7th-ranked cluster itself.
not_target_mask = all_mix_cluster_df.loc[::, cluster_label_col_name] != frequent_cluster_label_7th
sameple_not_target_df = all_mix_cluster_df.loc[not_target_mask].sample(n=5, random_state=0)
sample_target_df = cluster_resluts_df.loc[cluster_resluts_df[cluster_label_col_name] == frequent_cluster_label_7th, ::].sample(n=5, random_state=0)
half_mix_cluster_df = pd.concat([sameple_not_target_df, sample_target_df])

frequent_cluster_label_all_mix = all_mix_cluster_df.loc[::, cluster_label_col_name].tolist()
frequent_cluster_label_all_mix_nodes_num = len(all_mix_cluster_df)
frequent_cluster_label_half_mix = half_mix_cluster_df.loc[::, cluster_label_col_name].unique().tolist()
frequent_cluster_label_half_mix_nodes_num = len(half_mix_cluster_df)

# One log line per selection: its label(s) and its number of nodes.
for _name, _label, _nodes_num in (
    ("frequent_cluster_label_1st", frequent_cluster_label_1st, frequent_cluster_label_1st_nodes_num),
    ("frequent_cluster_label_2nd", frequent_cluster_label_2nd, frequent_cluster_label_2nd_nodes_num),
    ("frequent_cluster_label_7th", frequent_cluster_label_7th, frequent_cluster_label_7th_nodes_num),
    ("frequent_cluster_label_last", frequent_cluster_label_last, frequent_cluster_label_last_nodes_num),
    ("frequent_cluster_label_all_mix", frequent_cluster_label_all_mix, frequent_cluster_label_all_mix_nodes_num),
    ("frequent_cluster_label_half_mix", frequent_cluster_label_half_mix, frequent_cluster_label_half_mix_nodes_num),
):
    logging.info(f"{_name}: 【{_label}】, quantity of nodes with {_name}: {_nodes_num}")

display(cluster_resluts_df.head())
display(cluster_size_df)
display("all_mix_cluster_df:")
display(all_mix_cluster_df)
display("half_mix_cluster_df:")
display(half_mix_cluster_df)

## Filter the tickers in `sp500_hold_20082017-adj_close-pre.csv` by the cluster labels selected above

In [None]:
def _items_with_label(label) -> set:
    """Return the set of 'items' whose cluster label equals ``label``."""
    has_label = cluster_resluts_df[cluster_label_col_name] == label
    return set(cluster_resluts_df.loc[has_label, "items"])


# Tickers ('items') belonging to each selected cluster / mix.
frequent_cluster_label_1st_tickers = _items_with_label(frequent_cluster_label_1st)
frequent_cluster_label_2nd_tickers = _items_with_label(frequent_cluster_label_2nd)
frequent_cluster_label_7th_tickers = _items_with_label(frequent_cluster_label_7th)
frequent_cluster_label_last_tickers = _items_with_label(frequent_cluster_label_last)
frequent_cluster_label_all_mix_tickers = set(all_mix_cluster_df["items"])
frequent_cluster_label_half_mix_tickers = set(half_mix_cluster_df["items"])

# The same selections restricted to the columns of the pre-processed sp500 file.
sp500_20082017_frequent_cluster_label_1st_tickers = sp500_20082017_tickers & frequent_cluster_label_1st_tickers
sp500_20082017_frequent_cluster_label_2nd_tickers = sp500_20082017_tickers & frequent_cluster_label_2nd_tickers
sp500_20082017_frequent_cluster_label_7th_tickers = sp500_20082017_tickers & frequent_cluster_label_7th_tickers
sp500_20082017_frequent_cluster_label_last_tickers = sp500_20082017_tickers & frequent_cluster_label_last_tickers
sp500_20082017_frequent_cluster_label_all_mix_tickers = sp500_20082017_tickers & frequent_cluster_label_all_mix_tickers
sp500_20082017_frequent_cluster_label_half_mix_tickers = sp500_20082017_tickers & frequent_cluster_label_half_mix_tickers

# One log line per selection: size before and after the restriction.
for _name, _all_set, _pre_set in (
    ("frequent_cluster_label_1st_tickers", frequent_cluster_label_1st_tickers, sp500_20082017_frequent_cluster_label_1st_tickers),
    ("frequent_cluster_label_2nd_tickers", frequent_cluster_label_2nd_tickers, sp500_20082017_frequent_cluster_label_2nd_tickers),
    ("frequent_cluster_label_7th_tickers", frequent_cluster_label_7th_tickers, sp500_20082017_frequent_cluster_label_7th_tickers),
    ("frequent_cluster_label_last_tickers", frequent_cluster_label_last_tickers, sp500_20082017_frequent_cluster_label_last_tickers),
    ("frequent_cluster_label_all_mix_tickers", frequent_cluster_label_all_mix_tickers, sp500_20082017_frequent_cluster_label_all_mix_tickers),
    ("frequent_cluster_label_half_mix_tickers", frequent_cluster_label_half_mix_tickers, sp500_20082017_frequent_cluster_label_half_mix_tickers),
):
    logging.info(f"{_name} in sp500: {len(_all_set)}, {_name} in sp500_pre: {len(_pre_set)}")

## Output Data

In [None]:
# Build one frame per selection from the tickers that actually exist as columns
# of sp500_20082017_df (the intersected sets computed in the filtering cell), in
# sorted order. Selecting with the un-intersected sets raises KeyError for any
# ticker missing from the frame, and iterating a set gives no stable column order.
sp500_20082017_frequent_cluster_label_1st_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_1st_tickers)]
sp500_20082017_frequent_cluster_label_2nd_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_2nd_tickers)]
sp500_20082017_frequent_cluster_label_7th_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_7th_tickers)]
sp500_20082017_frequent_cluster_label_last_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_last_tickers)]
sp500_20082017_frequent_cluster_label_all_mix_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_all_mix_tickers)]
sp500_20082017_frequent_cluster_label_half_mix_df = sp500_20082017_df.loc[:, sorted(sp500_20082017_frequent_cluster_label_half_mix_tickers)]

# Components of the output directory path.
src_file_name = str(Path(sp500_20082017_pre_file_name).stem)
window_setting = "corr_s1_w50"
corr_ser_clac_method = "corr_ser_calc_reg"
corr_mat_compo = "sim"
corr_ser_reduction_method = "corr_ser_std"

# Which selection to write, and the file-name suffix that goes with it.
freq_label_setup = "label_half_mix"
label_outputs = {
    "label_1st": (sp500_20082017_frequent_cluster_label_1st_df, "-pre.csv"),
    "label_2nd": (sp500_20082017_frequent_cluster_label_2nd_df, "_label_2-pre.csv"),
    "label_7th": (sp500_20082017_frequent_cluster_label_7th_df, "_label_7-pre.csv"),
    "label_last": (sp500_20082017_frequent_cluster_label_last_df, "_label_last-pre.csv"),
    "label_all_mix": (sp500_20082017_frequent_cluster_label_all_mix_df, "_label_all_mix-pre.csv"),
    "label_half_mix": (sp500_20082017_frequent_cluster_label_half_mix_df, "_label_half_mix-pre.csv"),
}
if freq_label_setup not in label_outputs:
    # Fail loudly instead of silently writing a stale output_df / file_name
    # left over from an earlier cell.
    raise ValueError(f"Unknown freq_label_setup: {freq_label_setup!r}; expected one of {sorted(label_outputs)}")
output_df, file_name_suffix = label_outputs[freq_label_setup]
file_name = cluster_resluts_file_name.stem + file_name_suffix

save_dir = Path(raw_file_dir).parent/src_file_name/window_setting/corr_ser_clac_method/corr_mat_compo/corr_ser_reduction_method
save_dir.mkdir(parents=True, exist_ok=True)
output_df.to_csv(save_dir/file_name)
logging.info(f"{file_name} has been save to {save_dir}")

## Randomly pick items for the training set

In [None]:
# Sample from a sorted list: the iteration order of a set of strings is not
# stable between kernel sessions, so sampling from list(set) is not reproducible
# even with a fixed seed. The requested size is capped at the population size so
# the call cannot fail when fewer than 10 distinct tickers are available.
half_mix_population = sorted(frequent_cluster_label_half_mix_tickers)
sample_df = gen_random_trainset(half_mix_population, train_set_len=min(10, len(half_mix_population)), verbose=1)

# Synthetic dataset

In [None]:
# Parameters that make up the synthetic file's location and name.
synthetic_set = "pw_constant"
dim = 70  # used in the directory name "dim<dim>"
n_bkps = 0  # number of change points
noise_std = 10  # noise standard deviation
syn_file_name = Path(f"{synthetic_set}-bkps{n_bkps}-noise_std{noise_std}.csv")

# Read <parent of raw_file_dir>/synthetic/dim<dim>/<syn_file_name>, indexed by 'Date'.
syn_file_path = raw_file_dir.parent / f"synthetic/dim{dim}" / syn_file_name
syn_df = pd.read_csv(syn_file_path).set_index("Date")

syn_df.head()

## Output Data

In [None]:
# NOTE(review): this cell does not reference syn_df. It reloads the cluster
# results and writes whatever output_df / file_name / src_file_name (and the
# other path components) currently hold from earlier cells - confirm that
# re-saving that frame is intended here.
cluster_resluts_dir = (
    Path(data_cfg["DIRS"]['PIPELINE_DATA_DIR'])
    / "sp500_20082017-train_all/cluster/corr_s1_w50/corr_ser_calc_regular"
)
cluster_resluts_file_name = Path("corr_mat_hrchy_10_cluster.csv")  # change this file if there is any better cluster results
cluster_resluts_df = pd.read_csv(cluster_resluts_dir / cluster_resluts_file_name, index_col="Unnamed: 0")

save_dir = Path(raw_file_dir).parent / src_file_name / window_setting / corr_ser_clac_method / corr_mat_compo / corr_ser_reduction_method
save_dir.mkdir(parents=True, exist_ok=True)
output_df.to_csv(save_dir / file_name)
logging.info(f"{file_name} has been save to {save_dir}")

## Randomly pick items for the training set

In [None]:
def _column_number(col_name: str) -> int:
    """Return the integer found after the first underscore of ``col_name``."""
    return int(col_name.split("_")[1])


# Draw 60 column names, then order them by their numeric part.
_picked_columns = gen_random_trainset(syn_df.columns.tolist(), train_set_len=60, verbose=1)
sample_train_set = sorted(_picked_columns, key=_column_number)
print(sample_train_set)

# Mimic Database

## Visualization of all patients' signals

In [None]:
# The notebook-level `import wfdb` is commented out, so wfdb is imported here,
# where it is first used; without it this cell fails with NameError.
import wfdb

numerics_folder = Path(raw_file_dir/"mimic-database-1.0.0"/"numerics")
# File stems found in the folder; a set removes the repeats when several files
# share a stem.
numerics_files = [f.stem for f in numerics_folder.iterdir()]
# Entries of the folder that are skipped rather than read as records.
ignored_files = {"HEADER", "ANNOTATORS", "RECORDS", ".htaccess", ".ipynb_checkpoints"}
records = sorted(set(numerics_files) - ignored_files)
# Read and plot every record, one figure per record.
for rec in records:
    record = wfdb.rdrecord(str(numerics_folder/rec))
    wfdb.plot_wfdb(record=record, title=f'{record.record_name}', figsize=(20, 20), sharex=True)

## Visualization of a specific signal, time period & annotations

In [None]:
# NOTE(review): needs `wfdb` in the kernel namespace; the notebook-level import is commented out.
# Read channels 1 and 2 of record 252n, samples 8000 to 9500.
record_252n_abp = wfdb.rdrecord(
    numerics_folder/"252n",
    channels=[1, 2],
    sampfrom=8000,
    sampto=9500,
)
# display(record_252n_abp record_252n.__dict__)
wfdb.plot_items(
    signal=record_252n_abp.p_signal,
    sig_name=record_252n_abp.sig_name,
    sig_units=record_252n_abp.units,
    title=f'{record_252n_abp.record_name}',
    figsize=(20, 8),
    sharex=True,
)

In [None]:
# NOTE(review): needs `wfdb` in the kernel namespace; the notebook-level import is commented out.
numerics_folder = Path(raw_file_dir/"mimic-database-1.0.0"/"numerics")
record_252n_path = numerics_folder/"252n"

# Full record plus its two annotation files.
record_252n = wfdb.rdrecord(record_252n_path)
ann_patient = wfdb.rdann(str(record_252n_path), 'al', summarize_labels=True)  # al patient alarms
ann_monitor = wfdb.rdann(str(record_252n_path), 'in', summarize_labels=True)  # in monitor status alarms
display(len(record_252n.p_signal))

record_252n_fig = wfdb.plot_wfdb(
    record=record_252n,
    title=f'{record_252n.record_name}',
    figsize=(20, 20),
    sharex=True,
    return_fig=True,
)
# Tick marks every 5000 samples, from 0 to 110000.
plt.xticks(range(0, 110001, 5000))
display(ann_patient.description)
# display(ann_patient.contained_labels)
# display(ann_patient.chan)
wfdb.plot_wfdb(annotation=ann_patient)
wfdb.plot_wfdb(annotation=ann_monitor)

# numpy.savetxt("252n.csv", record_252n.p_signal, delimiter=",")

## Observe the correlation series of record_252n_abp

In [None]:
# wfdb_fig, wfdb_axes = wfdb.plot_items(signal=record_252n.p_signal, sig_name=record_252n.sig_name, sig_units=record_252n.units, figsize=(20, 8), sharex=True, return_fig_axes=True)  # return matplotlib axes

# Rolling (window=100) correlation between the two channels of record_252n_abp:
# rolling().corr() yields one 2x2 block per sample, and taking every second row
# of column 0 picks the cross-correlation entry of each block.
record_252n_abp_corr_series = pd.DataFrame(record_252n_abp.p_signal).rolling(window=100).corr().iloc[pd.IndexSlice[1::2],0].to_numpy()

# Three stacked panels sharing the x axis: channel 0, channel 1, rolling correlation.
fig1, ax = plt.subplots(3, 1, figsize=(20,12), sharex=True)
ax[0].plot(record_252n_abp.p_signal[:,0])
ax[1].plot(record_252n_abp.p_signal[:,1])
ax[2].plot(record_252n_abp_corr_series)
# Mark the same sample position on every panel
# (a commented-out alternative in the earlier version used x = 1564).
for panel in ax:
    panel.axvline(x = 1433, color = 'r', label = 'correlation change')

# Export the two-channel ABP window. The full record `record_252n` was used here
# before, but two column names only fit the two-channel `record_252n_abp`
# (read with channels=[1, 2]); the full record has more channels, so building
# the frame raised a shape-mismatch error.
# NOTE(review): "ABBPdias/mmHg" looks like a typo for "ABPdias/mmHg"; left
# unchanged in case downstream readers depend on the header.
pd.DataFrame(record_252n_abp.p_signal, columns=["ABPsys/mmHg", "ABBPdias/mmHg"]).to_csv("../../tmp/252n.csv")
pd.DataFrame(record_252n_abp_corr_series, columns=["Pearson corr"]).to_csv("../../tmp/252n_abp_corr.csv")