In [None]:
import sys
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dynamic_yaml
import yaml

sys.path.append("/workspace/correlation-change-predict/utils")
from utils import convert_str_bins_list, split_and_norm_data

# Read the project data config. The file is parsed with dynamic_yaml, then the
# dumped text is re-parsed with plain PyYAML to obtain `data_cfg`
# (presumably so templated values end up as ordinary Python objects -- confirm).
data_config_path = Path("../config/data_config.yaml")
with data_config_path.open() as cfg_stream:
    data_cfg_yaml = dynamic_yaml.load(cfg_stream)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data_cfg_yaml))

In [None]:
# --- Notebook parameters -----------------------------------------------------
# Everything a reader may want to tune lives in this cell; the cells below only
# read these names.

batch_size = 64  # forwarded to split_and_norm_data(batch_size=...)
corr_type = "pearson"  # sub-directory of the dataset directory holding the adjacency matrices
# s_l / w_l are embedded in the data file names ("..._s{s_l}_w{w_l}_...npy");
# presumably stride and window length -- TODO confirm.
s_l = 1
w_l = 50

# Graph adjacency-matrix mode. At most ONE of filt_mode / quan_discrete_bins /
# custom_discrete_bins may be set; leaving all of them None selects the plain
# "graph_adj_mat" directory.
filt_mode = None
# Paired with filt_mode: the "filtered_graph_adj_mat/<filt_mode>-quan<filt_quan>"
# directory name is built from both, so it must be set whenever filt_mode is set.
filt_quan = None
quan_discrete_bins = None
custom_discrete_bins = None

graph_nodes_v_mode = None  # None -> node features default to an all-ones array
target_mats_path = "pearson/custom_discretize_graph_adj_mat/bins_-10_-025_025_10"
output_file_name = "sp500_20112015_corr_ser_reg_std_corr_mat_negative_filtered-train_train"

# filt_mode and filt_quan only make sense together.
assert bool(filt_mode) == (filt_quan is not None), "filt_mode and filt_quan must be both not input or both input"
# The directory selection further down is an if/elif chain, so a second mode
# that is also set would be silently ignored -- reject that combination here.
assert sum(map(bool, (filt_mode, quan_discrete_bins, custom_discrete_bins))) <= 1, "at most one of filt_mode, quan_discrete_bins and custom_discrete_bins can be input"

In [None]:
# Choose the adjacency-matrix sub-directory that matches the selected edge mode.
# Dots are stripped from numeric values so they can be used in directory names.
if filt_mode:
    filt_quan_tag = str(filt_quan).replace('.', '')
    graph_adj_mode_dir = f"filtered_graph_adj_mat/{filt_mode}-quan{filt_quan_tag}"
elif quan_discrete_bins:
    graph_adj_mode_dir = f"quan_discretize_graph_adj_mat/bins{quan_discrete_bins}"
elif custom_discrete_bins:
    custom_bins_tag = '_'.join(str(f) for f in custom_discrete_bins).replace('.', '')
    graph_adj_mode_dir = f"custom_discretize_graph_adj_mat/bins_{custom_bins_tag}"
else:
    graph_adj_mode_dir = "graph_adj_mat"

# All inputs live below <PIPELINE_DATA_DIR>/<output_file_name>/.
pipeline_data_dir = Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])
graph_adj_mat_dir = pipeline_data_dir/f"{output_file_name}/{corr_type}/{graph_adj_mode_dir}"
graph_node_mat_dir = pipeline_data_dir/f"{output_file_name}/graph_node_mat"
target_mat_dir = pipeline_data_dir/f"{output_file_name}/{target_mats_path}"

# Edge matrices and (optional) target matrices share the same file name.
adj_mat_file_name = f"corr_s{s_l}_w{w_l}_adj_mat.npy"
gra_edges_data_mats = np.load(graph_adj_mat_dir/adj_mat_file_name)

if graph_nodes_v_mode:
    gra_nodes_data_mats = np.load(graph_node_mat_dir/f"{graph_nodes_v_mode}_s{s_l}_w{w_l}_nodes_mat.npy")
else:
    # No node-feature mode selected: fall back to an all-ones array whose first
    # and last dimensions follow axes 0 and 2 of the edge matrices.
    ones_shape = (gra_edges_data_mats.shape[0], 1, gra_edges_data_mats.shape[2])
    gra_nodes_data_mats = np.ones(ones_shape)

if target_mats_path:
    target_mats = np.load(target_mat_dir/adj_mat_file_name)
else:
    target_mats = None

# Split into train/val/test datasets; the helper also returns the scaler it used.
norm_train_dataset, norm_val_dataset, norm_test_dataset, scaler = split_and_norm_data(
    edges_mats=gra_edges_data_mats,
    nodes_mats=gra_nodes_data_mats,
    target_mats=target_mats,
    batch_size=batch_size,
)

In [None]:
# Tally how often each distinct target value occurs in the train and validation splits.
train_targets = norm_train_dataset['target']
val_targets = norm_val_dataset['target']
tr_labels, tr_labels_freq_counts = np.unique(train_targets, return_counts=True)
val_labels, val_labels_freq_counts = np.unique(val_targets, return_counts=True)

print(f"implement dataset:{output_file_name}")
print(f"norm_train_dataset[target].shape: {train_targets.shape}")
print(f"norm_val_dataset[target].shape: {val_targets.shape}")

# np.unique returns labels and counts as aligned arrays, so they can be paired directly.
for label, freq in zip(tr_labels, tr_labels_freq_counts):
    print(f"train label :{label}, frequency: {freq}")
for label, freq in zip(val_labels, val_labels_freq_counts):
    print(f"val label :{label}, frequency: {freq}")


In [None]:
# Side-by-side pie charts of the target-label distribution in the train and
# validation splits.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,9))
colors_labels_map = {"-1.0": "lime", "0.0": "darkorange", "1.0": "dodgerblue"}

pie_panels = (
    (axes[0], "Train", tr_labels, tr_labels_freq_counts),
    (axes[1], "Validation", val_labels, val_labels_freq_counts),
)
for pie_ax, split_title, split_labels, split_counts in pie_panels:
    # Normalise through float() before the lookup so integer-typed labels
    # (str(1) == "1") resolve to the same colour as float ones ("1.0").
    pie_colors = [colors_labels_map[str(float(label))] for label in split_labels]
    pie_ax.pie(
        split_counts,
        labels=split_labels,
        autopct='%1.1f%%',
        textprops={'fontsize': 24},
        colors=pie_colors,
    )
    pie_ax.set_title(split_title, fontsize=32)
fig.suptitle('Negative filtered', fontsize=40)

plt.show()
# Release the figure explicitly once it has been rendered.
plt.close(fig)