In [None]:
from pathlib import Path
from pprint import pformat, pprint
import logging
import json
import re
import sys
from math import ceil
from itertools import repeat, chain, product
import traceback

import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import dynamic_yaml
import yaml

logging.basicConfig(format='%(levelname)-8s [%(filename)s] %(message)s',
                    level=logging.DEBUG)
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.ERROR)
mpl.rcParams[u'font.sans-serif'] = ['simhei']
mpl.rcParams['axes.unicode_minus'] = False
%load_ext pycodestyle_magic

# Draw the training process

In [None]:
def _parse_mts_corr_ad_model_struct(model_struct_str):
    """Pull the layer hyperparameters out of a logged model-structure string.

    Args:
        model_struct_str: text stored under ``'model_structure'`` in a train
            log, or a falsy value when the log has none.

    Returns:
        dict with the keys ``drop_p`` (str), ``gra_enc_l``, ``gra_enc_h``,
        ``gru_l`` and ``gru_h`` (ints).  Every value is ``None`` when
        ``model_struct_str`` is falsy.  Otherwise a value whose pattern is not
        found is ``None``, except ``gru_l`` (falls back to 1) and
        ``gra_enc_l`` (a match count, so it is 0).
    """
    if not model_struct_str:
        return dict.fromkeys(("drop_p", "gra_enc_l", "gra_enc_h", "gru_l", "gru_h"))

    def search_group(pattern, group):
        found = re.search(pattern, model_struct_str)
        return found.group(group) if found else None

    gra_enc_h = search_group(r"(\(\d\)\:\s.*Conv.*\n.*)(out_features\=)(\d*)", 3)
    # Read the whole captured number; taking only the last character of the
    # match would turn e.g. ``num_layers=12`` into 2.
    gru_l = search_group(r"(\(gru1\)\:.*)(num_layers\=)(\d*)", 3)
    gru_h = search_group(r"(\(gru1\)\:\sGRU\(\d*\,)\s(\d*)", 2)
    return {"drop_p": search_group(r"\(dropout\): Dropout\(p=(?P<drop_p>\d*\.\d+|\d+), inplace=False\)", "drop_p"),
            "gra_enc_l": len(re.findall(r"\(\d\)\:\s.*Conv", model_struct_str)),
            "gra_enc_h": int(gra_enc_h) if gra_enc_h else None,
            "gru_l": int(gru_l) if gru_l else 1,
            "gru_h": int(gru_h) if gru_h else None}


def mts_corr_ad_tr_proc_est(log_path_list: list, condition_dict: dict, plot_pic: bool = True):
    """Summarise (and optionally plot) the training logs that match a filter.

    Each JSON log is loaded, its hyperparameters and minimum losses are
    extracted, and logs whose values equal every entry of ``condition_dict``
    are collected into one table that is sorted and shown with ``display``
    (the name must exist in the running environment, as it does in a notebook).

    Args:
        log_path_list: iterable of paths (``Path`` or ``str``) to JSON train logs.
        condition_dict: ``{name: wanted_value}``.  ``name`` must be one of the
            per-log local variables of this function (``corr_info``,
            ``tr_batch``, ``gra_enc_l``, ``gru_h`` ...), so those local names
            are part of the contract and must not be renamed.
        plot_pic: when true, ``plot_mts_corr_ad_tr_process`` is called for
            every matching log that carries usable embeds history.

    Returns:
        None.  Any exception is logged with ``logging.error`` and swallowed;
        nothing is raised to the caller.
    """
    summary_cols = ["corr_info", "tr_batch", "filt_mode", "filt_quan", "loss_fns", "discr_loss_r", "discr_loss_disp_r", "drop_pos", "drop_p", "graph_enc", "gra_enc_l", "gra_enc_h", "gru_l", "gru_h", "min_tr_loss", "min_tr_l2_loss", "min_tr_discr_loss", "min_val_loss"]
    sort_cols = ["tr_batch", "gra_enc_l", "gra_enc_h", "gru_l", "gru_h", "filt_mode", "filt_quan", "graph_enc", "loss_fns", "drop_pos", "discr_loss_r", "discr_loss_disp_r", "drop_p"]
    records = []  # one dict per matching log; the table is built once after the loop
    log_path = None  # stays None until a log is being processed, so the error handler can tell
    try:
        for raw_log_path in log_path_list:
            log_path = Path(raw_log_path)
            with open(log_path, "r") as source:
                log_dict = json.load(source)

            # First path component starting with "corr"; None when there is none.
            corr_info = next((part for part in log_path.parts if part.startswith("corr")), None)
            filt_mode = log_dict.get('filt_mode')
            filt_quan = log_dict.get('filt_quan')
            loss_fns = str(log_dict.get('loss_fns'))
            discr_loss_r = log_dict.get('discr_loss_r')
            discr_loss_disp_r = log_dict.get('discr_loss_disp_r')
            drop_pos = str(log_dict.get('drop_pos'))
            graph_enc = log_dict.get('graph_enc')
            best_epoch_n = log_dict.get('best_val_epoch', 500)
            tr_batch = log_dict.get('train_batch')
            batchs_per_epoch_n = log_dict.get('batchs_per_epoch', 0)
            # ``or [0]`` also covers histories stored as null or as an empty list, which min() cannot handle.
            loss_his_dict = {"tr_loss": log_dict.get('train_loss_history') or [0],
                             "tr_l2_loss": log_dict.get('tr_l2_loss_history') or [0],
                             "tr_discr_loss": log_dict.get('tr_discr_loss_history') or [0],
                             "val_loss": log_dict.get('val_loss_history') or [0]}
            min_tr_loss, min_val_loss = min(loss_his_dict["tr_loss"]), min(loss_his_dict["val_loss"])
            min_tr_l2_loss, min_tr_discr_loss = min(loss_his_dict["tr_l2_loss"]), min(loss_his_dict["tr_discr_loss"])

            # Every hyperparameter is (re)assigned for every log, so a log without
            # 'model_structure' can neither raise NameError nor inherit the previous log's values.
            model_struct_str = log_dict.get('model_structure')
            model_hparams = _parse_mts_corr_ad_model_struct(model_struct_str)
            drop_p = model_hparams["drop_p"]
            gra_enc_l, gra_enc_h = model_hparams["gra_enc_l"], model_hparams["gra_enc_h"]
            gru_l, gru_h = model_hparams["gru_l"], model_hparams["gru_h"]

            # Same reasoning for the embeds: reset per log, and only build them when the
            # width of the NaN separator rows (gra_enc_l * gra_enc_h) is known.
            embeds_his_dict = None
            embeds_history = log_dict.get("graph_embeds_history")
            if embeds_history and gra_enc_l is not None and gra_enc_h is not None:
                pred_embeds_history = embeds_history.get('graph_embeds_pred', [0])
                y_embeds_history = embeds_history.get('y_graph_embeds', [0])
                embeds_disp = embeds_history.get('graph_embeds_disparity', {})
                edge_len = batchs_per_epoch_n * 2  # batches kept from each end of the history
                nan_gap = [[np.nan] * (gra_enc_l * gra_enc_h) for _ in range(20)]  # 20 NaN rows between the two ends
                embeds_his_dict = {"pred_embeds": np.array(pred_embeds_history[:edge_len] + nan_gap + pred_embeds_history[-edge_len:]),
                                   "y_embeds": np.array(y_embeds_history[:edge_len] + nan_gap + y_embeds_history[-edge_len:]),
                                   "last_y_embeds": y_embeds_history[-batchs_per_epoch_n * 5:],
                                   "tr_gra_enc_embeds_disp": embeds_disp.get("train_gra_enc", []),
                                   "val_gra_enc_embeds_disp": embeds_disp.get("val_gra_enc", []),
                                   "tr_pred_embeds_disp": embeds_disp.get("train_pred", []),
                                   "val_pred_embeds_disp": embeds_disp.get("val_pred", [])}

            # Snapshot of the per-log locals that condition_dict is matched against.  The previous
            # snapshot is left out so snapshots never nest and keep old logs alive.
            est_values_dict = {name: value for name, value in locals().items() if name != "est_values_dict"}
            assert not (set(condition_dict.keys()) - set(est_values_dict.keys())), "one of condition_dict.keys() doesn't match the local variables if mts_corr_ad_est()"
            if not all(est_values_dict[name] == wanted for name, wanted in condition_dict.items()):
                continue

            main_title_str = f'{corr_info} with filt:{filt_mode}-{filt_quan} and tr_batch({tr_batch}) input to {graph_enc} with gra_enc_l{gra_enc_l}-gra_enc_h{gra_enc_h}-gru_l{gru_l}-gru_h{gru_h}\nwith drop:{drop_pos}-{drop_p} and loss_fn:{loss_fns}\n min val-loss:{min_val_loss:8f}'
            logging.info(f"file_name:{log_path.parts[-1]}")
            logging.info(f"file_path:{log_path.parts[2:-2]}")
            logging.info(main_title_str)
            records.append({col: est_values_dict[col] for col in summary_cols})
            if plot_pic:
                if embeds_his_dict is None:
                    logging.warning(f"no usable graph_embeds_history in {log_path.name}, skip plotting")
                else:
                    plot_mts_corr_ad_tr_process(main_title=main_title_str, model_struct=model_struct_str,
                                                loss_history=loss_his_dict, embeds_history=embeds_his_dict,
                                                best_epoch=best_epoch_n, batchs_per_epoch=batchs_per_epoch_n)

        log_path = None  # errors past this point do not belong to any single log file
        df = pd.DataFrame(records).reindex(summary_cols, axis=1)
        df = df.sort_values(sort_cols, ascending=False).reset_index(drop=True)
        df = df.style.set_caption('Info of MTSCorrAD model with different hyperparameters')
        display(df)
    except Exception as e:
        error_class = e.__class__.__name__  # exception type name
        detail = e.args[0] if e.args else ""  # first exception argument, when there is one
        tb = e.__traceback__  # traceback (call stack) of the exception
        last_call_stack = traceback.extract_tb(tb)[-1]  # innermost frame of the call stack
        file_name = last_call_stack.filename  # file in which the error was raised
        line_num = last_call_stack.lineno  # line number at which the error was raised
        func_name = last_call_stack.name  # function in which the error was raised
        err_msg = "File \"{}\", line {}, in {}: [{}] {}".format(file_name, line_num, func_name, error_class, detail)
        if log_path is not None:
            logging.error(f"file:{log_path.name}, path:{log_path}")
        logging.error(f"===\n{err_msg}")
        logging.error(f"===\n{traceback.extract_tb(tb)}")


def plot_mts_corr_ad_tr_process(main_title: str, model_struct: str, loss_history: dict, embeds_history: dict, best_epoch: int, batchs_per_epoch: int):
    """Draw one multi-panel figure describing a single training run.

    Panels, in drawing order: the full train and validation loss curves, the
    same two curves in a window around ``best_epoch``, a combined-loss panel
    (currently given no data), four embeds-disparity panels, ``pred_embeds``,
    ``y_embeds``, ``last_y_embeds`` and finally the model structure as text.

    Args:
        main_title: super-title of the figure.
        model_struct: model description; passed through ``str()`` and drawn as text.
        loss_history: must contain the keys ``'tr_loss'`` and ``'val_loss'``.
        embeds_history: must contain ``'pred_embeds'`` and ``'y_embeds'`` (both
            are read through ``.shape[1]``), ``'last_y_embeds'`` and the four
            ``'*_embeds_disp'`` keys read below.
        best_epoch: epoch the zoomed loss panels are centred on.
        batchs_per_epoch: number of batches per epoch, used for tick labels and
            for the vertical marker lines.

    Raises:
        Any exception raised while a panel is drawn is logged together with the
        panel title, the figure is closed, and the exception is re-raised.
    """
    pred_embeds, y_embeds, last_y_embeds = embeds_history['pred_embeds'], embeds_history['y_embeds'], embeds_history['last_y_embeds']
    tr_gra_enc_embeds_disp_dict, val_gra_enc_embeds_disp_dict = embeds_history["tr_gra_enc_embeds_disp"], embeds_history["val_gra_enc_embeds_disp"]
    tr_pred_embeds_disp_dict, val_pred_embeds_disp_dict = embeds_history["tr_pred_embeds_disp"], embeds_history["val_pred_embeds_disp"]
    max_batch = batchs_per_epoch * len(loss_history['tr_loss'])  # epochs == len(loss_history['tr_loss'])
    # The intervals are used as range() steps, which must not be 0 (happens when len(y_embeds) < 10).
    xticks_intv = {"loss": 20,
                   "fr_ls_embeds": max(1, int(len(y_embeds)/10))}
    win_start, win_stop = max(0, best_epoch-100), max(201, best_epoch+101)  # slice bounds of the zoomed loss panels
    win_label = (win_start, win_stop - 1)  # inclusive bounds shown in the panel titles
    loss_xticks = {"label": list(range(win_start, win_stop, xticks_intv["loss"])), "intv": xticks_intv['loss']}
    fr_ls_embeds_xticks_label = list(range(0, batchs_per_epoch*2, xticks_intv["fr_ls_embeds"])) + [" "] + list(range(max_batch-xticks_intv["fr_ls_embeds"]*4, max_batch+1, xticks_intv["fr_ls_embeds"]))
    fr_ls_embeds_xticks = {"label": fr_ls_embeds_xticks_label, "intv": xticks_intv["fr_ls_embeds"]}
    # batchs_per_epoch is the range() step here, so the ticks are skipped when it is 0.
    last_y_xticks = ({"label": list(range(max_batch - batchs_per_epoch * 5, max_batch + 1, batchs_per_epoch)), "intv": batchs_per_epoch}
                     if batchs_per_epoch > 0 else None)
    panels = [{"sub_title": 'train_loss_history',
               "data": loss_history['tr_loss'],
               "xticks": None,
               "xlabel": "epochs"},
              {"sub_title": 'val_loss_history',
               "data": loss_history['val_loss'],
               "xticks": None,
               "xlabel": "epochs"},
              {"sub_title": f"train_loss_history-epoch{win_label}",
               "data": loss_history['tr_loss'][win_start:win_stop],
               "xticks": loss_xticks,
               "xlabel": "epochs"},
              {"sub_title": f"val_loss_history-epoch{win_label}",
               "data": loss_history['val_loss'][win_start:win_stop],
               "xticks": loss_xticks,
               "xlabel": "epochs"},
              # NOTE(review): this panel is titled as a combined loss plot but receives no data,
              # so it is drawn empty -- confirm whether ``loss_history`` was meant to be passed.
              {"sub_title": "train, train_l2, train_discrimination loss & valdation loss",
               "data": []},
              {"sub_title": "First train graph encoder embeds disparity",
               "data": tr_gra_enc_embeds_disp_dict,
               "xlabel": "epochs"},
              {"sub_title": "First train predition embeds disparity",
               "data": tr_pred_embeds_disp_dict,
               "xlabel": "epochs"},
              {"sub_title": "First val graph encoder embeds disparity",
               "data": val_gra_enc_embeds_disp_dict,
               "xlabel": "epochs"},
              {"sub_title": "First val predition embeds disparity",
               "data": val_pred_embeds_disp_dict,
               "xlabel": "epochs"},
              {"sub_title": f'pred_embeds, embeds size:[{pred_embeds.shape[1]}]',
               "data": pred_embeds,
               "xticks": fr_ls_embeds_xticks,
               "xlabel": "batchs",
               "axvline": (batchs_per_epoch, batchs_per_epoch*3+20)},
              {"sub_title": f'y_embeds, embeds size:[{y_embeds.shape[1]}]',
               "data": y_embeds,
               "xticks": fr_ls_embeds_xticks,
               "xlabel": "batchs",
               "axvline": (batchs_per_epoch, batchs_per_epoch*3+20)},
              {"sub_title": f"y_embeds in last five epochs; embeds size:{y_embeds.shape[1]}",
               "data": last_y_embeds,
               "xticks": last_y_xticks,
               "xlabel": "batchs",
               "axvline": [i*batchs_per_epoch for i in range(1, 5)]},
              {"sub_title": "model structure",
               "data": str(model_struct)}]

    # figure settings
    line_style = {"linewidth": 2, "alpha": 0.5}
    axvline_style = {"color": 'k', "linewidth": 5, "linestyle": '--', "alpha": 0.3}
    # Layout: 5 rows of two side-by-side panels, 2 full-width rows, then one
    # full-width panel spanning the last 3 rows (the model-structure text).
    n_pair_rows, n_wide_rows, n_text_rows, ncols = 5, 2, 3, 2
    nrows = n_pair_rows + n_wide_rows + n_text_rows
    gs = gridspec.GridSpec(nrows, ncols)
    fig = plt.figure(figsize=(25, 10 * nrows))  # 10 units of height per grid row
    fig.suptitle(main_title, fontsize=30)
    axes = [fig.add_subplot(gs[nrow, ncol]) for nrow, ncol in product(range(n_pair_rows), range(ncols))]
    axes += [fig.add_subplot(gs[nrow, :]) for nrow in range(n_pair_rows, n_pair_rows + n_wide_rows)]
    axes.append(fig.add_subplot(gs[n_pair_rows + n_wide_rows:, :]))
    if len(axes) != len(panels):
        # zip() below would silently drop the surplus panels (or axes).
        plt.close(fig)
        raise ValueError(f"layout provides {len(axes)} axes for {len(panels)} panels")

    current_title = None
    try:
        for ax, panel in zip(axes, panels):
            current_title = panel["sub_title"]
            data = panel["data"]
            ax.set_title(current_title, fontsize=30)
            ax.yaxis.offsetText.set_fontsize(18)
            ax.tick_params(axis='both', which='major', labelsize=24)
            if isinstance(data, dict):
                # one labelled line per dict entry
                for key in data:
                    ax.plot(data[key], label=key)
                ax.legend(fontsize=18)
            elif isinstance(data, str):
                ax.annotate(text=f"{data}",
                            xy=(0.15, 0.5), bbox={'facecolor': 'green', 'alpha': 0.4, 'pad': 5},
                            fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
            else:
                ax.plot(data, **line_style)
            for x_pos in panel.get("axvline") or ():
                ax.axvline(x=x_pos, **axvline_style)
            if xlabel := panel.get("xlabel"):
                ax.set_xlabel(xlabel, fontsize=24)
            if t := panel.get("xticks"):
                # ticks sit every ``intv`` data points and are relabelled with the real epoch/batch numbers
                ax.set_xticks(ticks=range(0, len(t["label"])*t["intv"], t["intv"]), labels=t["label"], rotation=45)
    except Exception:
        logging.error(f"Encounter error when draw figure of {current_title}")
        plt.close(fig)  # do not leave a half-drawn figure open
        raise

    fig.tight_layout(rect=(0, 0, 1, 0.97))
    plt.show()
    plt.close(fig)

In [None]:
mts_corr_model_log_dir = Path("./save_models/sp500_20082017_corr_ser_reg_corr_mat_hrchy_11_cluster-train_train/")


def _find_train_logs(log_root: Path, pattern: str, skip_markers: tuple) -> list:
    """Return the sorted JSON train logs under ``log_root`` that match ``pattern``.

    A path is dropped when any of its components below ``log_root`` contains
    one of ``skip_markers``.  Glob brackets such as ``[!deprecated]`` are
    character classes and cannot exclude a directory by name, so the exclusion
    is done here explicitly.  A list (not a generator) is returned so the same
    result can be passed to several calls.
    """
    return sorted(path for path in log_root.glob(pattern)
                  if not any(marker in part
                             for part in path.relative_to(log_root).parts
                             for marker in skip_markers))


# Logs directly under a first-level directory, skipping deprecated / archive / checkpoint directories.
log_path_list1 = _find_train_logs(mts_corr_model_log_dir, "*/train_logs/*.json", ("deprecated", "archive", ".ipynb_checkpoints"))
# Logs anywhere below a first-level directory whose name contains "archive".
log_path_list2 = _find_train_logs(mts_corr_model_log_dir, "*archive*/**/train_logs/*.json", ("deprecated", ".ipynb_checkpoints"))
# Every train log below the log directory, apart from notebook checkpoints.
log_path_list3 = _find_train_logs(mts_corr_model_log_dir, "**/train_logs/*.json", (".ipynb_checkpoints",))

# Alternative filters (uncomment to use):
# mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "tr_batch": 32, "gra_enc_l": 1, "gra_enc_h": 4, "gru_l": 1, "gru_h": 8})
# mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "gra_enc_l": 5, "gru_l": 1, "gru_h": 8})
# mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "gra_enc_l": 5, "gra_enc_h": 16, "filt_mode": "keep_strong", "graph_enc":"GineEncoder"}, plot_pic=False)
mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10"}, plot_pic=False)
# mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10"})

In [None]:
# Plot the first and last two epochs of y_embeds of one specific training log.
specific_log_p = mts_corr_model_log_dir/"corr_s1_w10/train_logs/epoch_251-20230305215527.json"

with open(specific_log_p, "r") as source:
    log_dict = json.load(source)

# Layer hyperparameters are recovered from the text of the logged model structure.
model_structure = log_dict.get('model_structure')
if model_structure:
    # The layer pattern needs the ": " between "(n)" and "GINConv", like the gin_h pattern
    # on the next line; without it nothing matches and gin_l is always 0.
    gin_l = len(re.findall(r"\(\d\)\:\sGINConv", model_structure))
    gin_h = int(re.search(r"(\(\d\)\:\sGINConv.*\n.*)(out_features\=)(\d*)", model_structure).group(3))
    # Read the whole captured number; the last character of the match alone
    # would turn e.g. num_layers=12 into 2.  No "num_layers=" in the text -> 1.
    gru_l_match = re.search(r"(\(gru1\)\:.*)(num_layers\=)(\d*)", model_structure)
    gru_l = int(gru_l_match.group(3)) if gru_l_match else 1
    gru_h = int(re.search(r"(\(gru1\)\:\sGRU\(\d*\,)\s(\d*)", model_structure).group(2))
else:
    gin_l = gin_h = gru_l = gru_h = None
corr_info = [p for p in specific_log_p.parts if p.startswith("corr")][0]
best_epoch = log_dict.get('best_val_epoch') or 500
min_val_loss = min(log_dict['val_loss_history'])
tr_batch = log_dict.get('train_batch') or None
batchs_per_epoch = log_dict.get('batchs_per_epoch')
tr_loss = log_dict.get('train_loss_history')
val_loss = log_dict.get('val_loss_history')

# Keep the first and the last two epochs of each embeds history and put
# 20 all-NaN rows between them, which leaves a visible gap in the plot.
graph_embeds_history = log_dict.get('graph_embeds_history')
pred_embeds_history = graph_embeds_history.get('graph_embeds_pred')
y_embeds_history = graph_embeds_history.get('y_graph_embeds')
nan_gap = [[np.nan]*(gin_l*gin_h) for _ in range(20)]
pred_embeds = np.array(pred_embeds_history[:batchs_per_epoch*2] + nan_gap + pred_embeds_history[-batchs_per_epoch*2:])
y_embeds = np.array(y_embeds_history[:batchs_per_epoch*2] + nan_gap + y_embeds_history[-batchs_per_epoch*2:])

plt.figure(figsize=(14.5, 8))
plt.plot(y_embeds, linewidth=5, alpha=0.3)
# axvline's ymin/ymax are fractions of the axes height (0..1), not data values,
# so they are left at their defaults and the markers span the whole plot.
plt.axvline(x=batchs_per_epoch, color='k', linewidth=5, linestyle='--', alpha=0.3)
plt.axvline(x=batchs_per_epoch*3+20, color='k', linewidth=5, linestyle='--', alpha=0.3)
# NOTE(review): the annotation texts and tick labels below are hard-coded,
# presumably batch indices of this particular log -- confirm before reusing with another file.
plt.annotate(text="188", xy=(0.19, 0.5),
             bbox={'facecolor': 'gray', 'alpha': 0.4, 'pad': 5},
             fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
plt.annotate(text="187811", xy=(0.77, 0.5),
             bbox={'facecolor': 'gray', 'alpha': 0.4, 'pad': 5},
             fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
plt.title(f'y_embeds-[{y_embeds.shape[1]}]', fontsize=30)
xticks_label = list(range(0, 301, 100)) + list(range(187600, 188001, 100))
plt.xticks(ticks=list(range(0, 801, 100)), labels=xticks_label, fontsize=18)
plt.yticks(fontsize=24)
plt.show()
plt.close()

# Find the graphs that differ the most

In [None]:
# NOTE(review): hard-coded absolute path; nothing in this cell imports from it -- confirm it is still needed.
sys.path.append("/workspace/correlation-change-predict/ywt_library")
# __file__ is normally undefined in a notebook kernel, so fall back to the working directory.
try:
    current_dir = Path(__file__).parent
except NameError:
    current_dir = Path.cwd()
data_config_path = current_dir/"../config/data_config.yaml"
# The config is loaded with dynamic_yaml, dumped back to text and re-parsed with plain yaml.
with open(data_config_path) as f:
    data = dynamic_yaml.load(f)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data))

# ## Data implement & output setting & testset setting
# data implement setting
data_implement = "SP500_20082017_CORR_SER_REG_CORR_MAT_HRCHY_11_CLUSTER"  # watch options by operate: logging.info(data_cfg["DATASETS"].keys())
# train set setting
train_items_setting = "-train_train"  # -train_train|-train_all
# setting of name of output files and pictures title
output_file_name = data_cfg["DATASETS"][data_implement]['OUTPUT_FILE_NAME_BASIS'] + train_items_setting
# setting of output files
logging.info(f"===== file_name basis:{output_file_name} =====")
graph_data_dir = Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])/f"{output_file_name}-graph_data"
graph_arr = np.load(graph_data_dir/"corr_s1_w10_graph.npy")  # each graph consist of 66 node & 66^2 edges

stride = 12
# Split along the first axis: first 90% train, next 5% validation, last 5% test.
train_end = int(len(graph_arr)*0.9)
val_end = int(len(graph_arr)*0.95)
train_arr = graph_arr[:train_end]
val_arr = graph_arr[train_end:val_end]
test_arr = graph_arr[val_end:]
# Element-wise change between each training graph and the one `stride` positions later.
train_diff_arr = train_arr[stride:] - train_arr[:-stride]  # this is what I want
# NOTE(review): the differences are summed with their sign, so increases and decreases cancel;
# if "most different" means largest absolute change, np.abs(train_diff_arr) should be summed instead -- confirm.
max_diff_ind = np.argmax(train_diff_arr.sum(axis=1).sum(axis=1))
logging.info(f"train_arr.shape: {train_arr.shape}")
logging.info(f"train_diff_arr.shape: {train_diff_arr.shape}")
logging.info(f"train_arr[0][0][:5]: \n{train_arr[0][0][:5]}")
logging.info(f"max_difference index of train_arr: {max_diff_ind}")
logging.info(f"train_diff_arr[{max_diff_ind}][0]: \n{train_diff_arr[max_diff_ind][0]}")
logging.info(f"train_arr[{max_diff_ind}][0]: \n{train_arr[max_diff_ind][0]}")
logging.info(f"train_arr[{max_diff_ind+stride}][0]: \n{train_arr[max_diff_ind+stride][0]}")