In [None]:
from pathlib import Path
from pprint import pformat, pprint
import logging
import json
import re
import sys
from math import ceil
from itertools import repeat, chain, product
import traceback

import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import dynamic_yaml
import yaml

logging.basicConfig(format='%(levelname)-8s [%(filename)s] %(message)s',
                    level=logging.DEBUG)
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.ERROR)
mpl.rcParams[u'font.sans-serif'] = ['simhei']
mpl.rcParams['axes.unicode_minus'] = False
%load_ext pycodestyle_magic

# Draw the training process

In [None]:
def _parse_model_structure(model_struct_str, field_patterns: dict) -> dict:
    """Return ``{field: captured text}`` for every pattern found in the model description string."""
    parsed = {}
    if not model_struct_str:
        return parsed
    for field, pattern in field_patterns.items():
        match = re.search(pattern, model_struct_str)
        if match:
            parsed[field] = match.group(field)
        else:
            logging.info(f"Can't detect {field}")
    return parsed


def _meets_conditions(est_values: dict, parsed_fields: dict, condition_dict: dict) -> bool:
    """Tell whether every ``condition_dict`` entry equals the corresponding value of one log."""
    for key, expected in condition_dict.items():
        actual = est_values[key]
        if actual == expected:
            continue
        # Values captured by the regexes are always text, so a numeric condition
        # such as {"gru_l": 1} is compared against its string form as well.
        if key in parsed_fields and actual == str(expected):
            continue
        return False
    return True


def _format_loss(value) -> str:
    """Format a loss for the figure title; non-numeric values are shown as-is instead of raising."""
    if isinstance(value, (int, float)):
        return f"{value:8f}"
    return str(value)


def baseline_gru_tr_proc_est(log_path_list: list, condition_dict: dict, plot_pic: bool = True):
    """Summarise baseline-GRU training logs and optionally plot each selected run.

    Every JSON log is loaded and extended with three kinds of derived values:
    ``corr_info`` (the first path component starting with "corr"), ``min_tr_loss``
    (minimum of ``tr_loss_history``) and ``drop_p`` / ``gru_l`` / ``gru_h`` (text
    captured from the ``model_structure`` string, ``None`` when not detectable).
    Logs whose values satisfy every entry of ``condition_dict`` are kept.

    The values of each log live in their own dictionary, so nothing is written
    through ``locals()`` and nothing leaks from one log into the next.

    Args:
        log_path_list: Iterable of paths (``Path`` or ``str``) to JSON training logs.
        condition_dict: ``{field: required value}`` filter; ``{}`` keeps every log.
            Each key must be a key of the log or one of the derived fields.
        plot_pic: When true, ``plot_mts_corr_ad_tr_process`` is called for each kept log.

    Returns:
        A DataFrame with one row per kept log, restricted to the summary columns and
        sorted in descending order by batch_size, seq_len, gru_l, gru_h and drop_p.
        If an error occurs the error is logged (not raised) and the rows collected so
        far are returned, unformatted if the summary table had not been built yet.

    Side effects:
        Sets ``pd.options.display.float_format`` and shows the table with ``display``
        (the IPython built-in available inside a notebook kernel).
    """
    # Raw strings: the patterns contain regex escapes such as "\(" and "\d".
    model_struct_info_fields = {"drop_p": r"\(dropout\): Dropout\(p=(?P<drop_p>\d*\.\d+|\d+), inplace=False\)",
                                "gru_l": r"\(gru\): GRU\(\d+, \d+, num_layers=(?P<gru_l>\d+)\)",
                                "gru_h": r"\(gru\): GRU\(\d+, (?P<gru_h>\d+), num_layers=\d+\)"}
    summary_columns = ["corr_info", "epochs", "gra_nodes_v_mode", "filt_mode", "filt_quan", "loss_fn", "drop_p",
                       "batch_size", "seq_len", "gru_l", "gru_h", "min_tr_loss", "min_val_loss", "min_val_loss_edge_acc"]
    sort_columns = ["batch_size", "seq_len", "gru_l", "gru_h", "drop_p"]

    records = []     # one dict of values per kept log
    df = None        # stays None until the summary table has been built
    log_path = None  # last log that was opened; used by the error report
    try:
        for raw_path in log_path_list:
            log_path = Path(raw_path)
            with open(log_path, "r") as source:
                log_dict = json.load(source)

            # Values of this log only; the derived fields override same-named log keys.
            est_values = dict(log_dict)
            est_values["corr_info"] = str(next(filter(lambda p: p.startswith("corr"), log_path.parts)))
            est_values["min_tr_loss"] = min(log_dict["tr_loss_history"])
            model_struct_str = log_dict.get('model_structure')
            parsed_fields = _parse_model_structure(model_struct_str, model_struct_info_fields)
            est_values.update(parsed_fields)
            for field in model_struct_info_fields:
                # An undetected field is recorded as None unless the log itself provides it.
                est_values.setdefault(field, None)

            unknown_keys = set(condition_dict.keys()) - set(est_values.keys())
            if unknown_keys:
                raise AssertionError(f"condition_dict keys {sorted(map(str, unknown_keys))} are not among "
                                     f"the values available in baseline_gru_tr_proc_est()")
            if not _meets_conditions(est_values, parsed_fields, condition_dict):
                continue

            main_title_str = (f"{est_values.get('corr_info')} with filt:{est_values.get('filt_mode')}-{est_values.get('filt_quan')} "
                              f"and batch_size({est_values.get('batch_size')}) "
                              f"input to GRU with gru_l{est_values.get('gru_l')}-gru_h{est_values.get('gru_h')}\n"
                              f"with drop: {est_values.get('drop_p')} and loss_fn:{est_values.get('loss_fn')}\n"
                              f"min val-loss:{_format_loss(est_values.get('min_val_loss'))} "
                              f"min tr-loss:{_format_loss(est_values.get('min_tr_loss'))}")
            logging.info(f"file_name:{log_path.parts[-1]}")
            logging.info(f"file_path:{log_path.parts[2:-2]}")
            logging.info(f"main_title_str:\n{main_title_str}")
            records.append(est_values)
            if plot_pic:
                plot_mts_corr_ad_tr_process(main_title=main_title_str, model_struct=model_struct_str,
                                            loss_history={k: v for k, v in log_dict.items() if "history" in k},
                                            best_epoch=est_values['best_val_epoch'],
                                            batches_per_epoch=est_values['batches_per_epoch'])

        # Build the table once from all records instead of concatenating inside the loop.
        df = (pd.DataFrame(records)
              .reindex(summary_columns, axis=1)
              .sort_values(sort_columns, ascending=False)
              .reset_index(drop=True))
        pd.options.display.float_format = '{:.6f}'.format
        display(df)
    except Exception as e:
        error_class = e.__class__.__name__  # error type
        detail = e.args[0] if e.args else ""  # error detail; some exceptions carry no args
        tb = e.__traceback__
        call_stack = traceback.extract_tb(tb)
        last_call_stack = call_stack[-1]  # innermost frame, where the error was raised
        err_msg = "File \"{}\", line {}, in {}: [{}] {}".format(last_call_stack.filename, last_call_stack.lineno,
                                                                 last_call_stack.name, error_class, detail)
        if log_path is not None:
            logging.error(f"file:{log_path.parts[-1]}, path:{log_path}")
        logging.error(f"===\n{err_msg}")
        logging.error(f"===\n{call_stack}")

    if df is None:
        # The run failed before the summary table existed: hand back what was collected.
        df = pd.DataFrame(records)
    return df


def plot_mts_corr_ad_tr_process(main_title: str, model_struct: str, loss_history: dict, best_epoch: int, batches_per_epoch: int):
    """Draw a four-panel overview of one training run and show it.

    Panels (mosaic "ab / cc / dd"):
        a: ``tr_loss_history`` with ``tr_edge_acc_history`` on a second y-axis.
        b: ``val_loss_history`` with ``val_edge_acc_history`` on a second y-axis.
        c: ``gradient_history``.
        d: the model structure, rendered as text.

    Args:
        main_title: Figure title.
        model_struct: Model description; converted with ``str()`` and shown in panel d.
        loss_history: Must contain the keys 'tr_loss_history', 'tr_edge_acc_history',
            'val_loss_history', 'val_edge_acc_history' and 'gradient_history'.
        best_epoch: Accepted for interface compatibility; not used by the drawing code.
        batches_per_epoch: Accepted for interface compatibility; not used by the drawing code.

    Raises:
        KeyError: If ``loss_history`` lacks one of the required keys.
        Exception: Any error raised while drawing a panel is logged with the panel's
            sub-title and re-raised. The figure is closed in every case.
    """
    data_info_dict = [{"sub_title": 'train loss_history & edge_acc_history',
                       "data": {'tr_loss_history': loss_history['tr_loss_history'],
                                'tr_edge_acc_history': loss_history['tr_edge_acc_history']},
                       "xticks": None,
                       "xlabel": "epochs",
                       "double_y": True},
                      {"sub_title": 'val  loss_history & edge_acc_history',
                       "data": {'val_loss_history': loss_history['val_loss_history'],
                                'val_edge_acc_history': loss_history['val_edge_acc_history']},
                       "xticks": None,
                       "xlabel": "epochs",
                       "double_y": True},
                      {"sub_title": 'train gradient_history',
                       "data": loss_history['gradient_history'],
                       "xticks": None,
                       "xlabel": "epochs"},
                      {"sub_title": "model structure",
                       "data": str(model_struct)}]

    # figure settings
    line_style = {"linewidth": 2, "alpha": 0.5}
    axvline_style = {"color": 'k', "linewidth": 5, "linestyle": '--', "alpha": 0.3}
    fig, axs = plt.subplot_mosaic("ab;cc;dd",
                                  figsize=(30, 40), gridspec_kw={'hspace': 0.5, 'wspace': 0.3})
    try:
        fig.suptitle(main_title, fontsize=30)
        data_plot = None
        try:
            for ax, data_plot in zip(axs.values(), data_info_dict):
                ax.set_title(data_plot["sub_title"], fontsize=30)
                ax.yaxis.offsetText.set_fontsize(18)
                ax.tick_params(axis='both', which='major', labelsize=24)
                if isinstance(data_plot["data"], dict) and data_plot.get("double_y"):
                    # First series goes on the left axis, every further series on its own
                    # right-hand twin axis. All handles are gathered into ONE legend on
                    # the top-most axis so the legends cannot be drawn over each other.
                    handles, labels = [], []
                    top_ax = ax
                    for i, key in enumerate(data_plot["data"]):
                        if i == 0:
                            ax.plot(data_plot["data"][key], label=key, **line_style)
                            ax.set_ylabel(key, fontsize=24)
                            series_ax = ax
                        else:
                            series_ax = ax.twinx()
                            series_ax.plot(data_plot["data"][key], label=key, color='r')
                            series_ax.set_ylabel(key, color='r', fontsize=24)
                            series_ax.tick_params(axis='both', colors='r', which='major', labelsize=24)
                            top_ax = series_ax
                        series_handles, series_labels = series_ax.get_legend_handles_labels()
                        handles.extend(series_handles)
                        labels.extend(series_labels)
                    if handles:
                        top_ax.legend(handles, labels, fontsize=18)
                elif isinstance(data_plot["data"], dict):
                    for key in data_plot["data"]:
                        ax.plot(data_plot["data"][key], label=key, **line_style)
                    ax.legend(fontsize=18)
                elif isinstance(data_plot["data"], str):
                    ax.annotate(text=f"{data_plot['data']}",
                                xy=(0.15, 0.5), bbox={'facecolor': 'green', 'alpha': 0.4, 'pad': 5},
                                fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
                else:
                    ax.plot(data_plot["data"], **line_style)
                # Optional per-panel extras; a panel opts in by carrying the matching key.
                if pos_tuple := data_plot.get("axvline"):
                    for x_pos in pos_tuple:
                        ax.axvline(x=x_pos, **axvline_style)
                if xlabel := data_plot.get("xlabel"):
                    ax.set_xlabel(xlabel, fontsize=24)
                if t := data_plot.get("xticks"):
                    ax.set_xticks(ticks=range(0, len(t["label"])*t["intv"], t["intv"]), labels=t["label"], rotation=45)
        except Exception:
            failed_panel = data_plot['sub_title'] if data_plot else None
            logging.error(f"Encounter error when draw figure of {failed_panel}")
            raise

        fig.tight_layout(rect=(0, 0, 1, 0.97))
        plt.show()
    finally:
        # Always release the figure, including when drawing failed; this function is
        # called once per log, so leaked figures would pile up in memory.
        plt.close(fig)

In [None]:
baseline_gru_log_dir = Path("./save_models/baseline_gru/sp500_20082017_corr_ser_reg_corr_mat_hrchy_11_cluster-train_train/")
# The glob results are materialised as sorted lists: a bare generator can be consumed
# only once (re-running a cell that uses it would silently see no files) and its order
# depends on the file system.
# NOTE(review): in glob syntax "[!deprecated]" is a character class (ONE character that is
# none of d,e,p,r,c,a,t), not "a name other than 'deprecated'", and "[.json]" matches a
# single character out of ".json". The patterns are kept unchanged so the same files are
# selected as before - confirm that this selection is the intended one.
log_path_list1 = sorted(baseline_gru_log_dir.glob("./*[!deprecated][!archive][!.ipynb_checkpoints]*/train_logs/*[!.ipynb_checkpoints]*[.json]"))
log_path_list2 = sorted(baseline_gru_log_dir.glob("./*[archive][!deprecated][!.ipynb_checkpoints]*/**/train_logs/*[!.ipynb_checkpoints]*[.json]"))
log_path_list3 = sorted(baseline_gru_log_dir.glob("./**/train_logs/*[!.ipynb_checkpoints]*[.json]"))

# Earlier example calls, kept for reference; they target `mts_corr_ad_tr_proc_est`, which is not defined in this notebook.
# model_tr_summary_df = mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "tr_batch": 32, "gra_enc_l": 1, "gra_enc_h": 4, "gru_l": 1, "gru_h": 8})
# model_tr_summary_df = mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "gra_enc_l": 5, "gru_l": 1, "gru_h": 8})
# model_tr_summary_df = mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "gra_enc_l": 5, "gra_enc_h": 16, "filt_mode": "keep_strong", "graph_enc":"GineEncoder"}, plot_pic=True)
# model_tr_summary_df = mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10", "loss_fn": str(['MSELoss()', 'discr_loss'])}, plot_pic=True)
# An empty condition dict keeps every log that was found.
model_tr_summary_df = baseline_gru_tr_proc_est(log_path_list1, {}, plot_pic=True)
# model_tr_summary_df = mts_corr_ad_tr_proc_est(log_path_list1, {"corr_info": "corr_s1_w10"}, plot_pic=False)

In [None]:
# Narrow view of the summary table. The frame built by baseline_gru_tr_proc_est has
# "gru_l" / "gru_h" columns and no "gra_enc_l" / "gra_enc_h", so asking `.loc` for the
# latter raises KeyError; the GRU columns are selected instead.
filt_model_tr_summary_df = model_tr_summary_df.loc[:, ["filt_mode", "gra_nodes_v_mode", "gru_l", "gru_h", "min_tr_loss", "min_val_loss"]]
filt_model_tr_summary_df

# Temp plotting

In [None]:
# Load one specific training log so its contents can be inspected and plotted.
specific_log_dir = Path("save_models/baseline_gru/sp500_20082017_corr_ser_reg_corr_mat_hrchy_11_cluster-train_train/corr_s1_w10/train_logs/")
specific_log_p = specific_log_dir.joinpath("epoch_16-20230522143450.json")
with specific_log_p.open("r") as source:
    log_dict = json.load(source)

# Show which fields this log provides.
print(log_dict.keys())

In [None]:
def _plot_train_val_curves(history: dict, train_key: str, val_key: str, title: str) -> None:
    """Plot the train and validation series of `history` in one titled figure and show it."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history[train_key], label="train")
    ax.plot(history[val_key], label="val")
    ax.legend(fontsize=16)
    ax.set_title(title, fontsize=20)
    plt.show()
    plt.close(fig)


# One figure per metric; the two figures differ only in the keys read and the title.
_plot_train_val_curves(log_dict, 'train_loss_history', 'val_loss_history', "MSELoss")
_plot_train_val_curves(log_dict, 'train_edge_acc_history', 'val_edge_acc_history', "Edge accuracy")

In [None]:
def _head_gap_tail(rows: list, keep: int, gap: int = 20) -> np.ndarray:
    """Stack the first `keep` rows, `gap` all-NaN rows and the last `keep` rows of `rows`.

    The NaN rows get the width of the first data row, so the result is rectangular
    provided all rows have the same length.
    """
    nan_rows = [[np.nan] * len(rows[0]) for _ in range(gap)]
    return np.array(rows[:keep] + nan_rows + rows[-keep:])


model_structure_str = log_dict.get('model_structure')
if model_structure_str:
    # Capture the whole layer count; default to 1 when "num_layers=<digits>" is absent.
    gru_l_match = re.search(r"\(gru1\):.*num_layers=(\d+)", model_structure_str)
    gru_l = int(gru_l_match.group(1)) if gru_l_match else 1
    # Second number of "GRU(<a>, <b>"; None when the pattern is not found.
    gru_h_match = re.search(r"\(gru1\):\sGRU\(\d*,\s(\d+)", model_structure_str)
    gru_h = int(gru_h_match.group(1)) if gru_h_match else None
else:
    gru_l = None
    gru_h = None
corr_info = [p for p in specific_log_p.parts if p.startswith("corr")][0]
best_epoch = log_dict['best_val_epoch'] if log_dict.get('best_val_epoch') else 500
min_val_loss = min(log_dict['val_loss_history'])
tr_batch = log_dict.get('train_batch') if log_dict.get('train_batch') else None
batches_per_epoch = log_dict.get('batches_per_epoch')
tr_loss = log_dict.get('train_loss_history')
val_loss = log_dict.get('val_loss_history')

# Keep the first and the last two epochs' worth of batches, separated by 20 NaN rows
# that show up as a gap in the plot.
# assumes every entry of the embeds histories is a list of equal length - TODO confirm
graph_embeds_history = log_dict.get('graph_embeds_history')
pred_embeds = _head_gap_tail(graph_embeds_history.get('graph_embeds_pred'), batches_per_epoch*2)
y_embeds = _head_gap_tail(graph_embeds_history.get('y_graph_embeds'), batches_per_epoch*2)

plt.figure(figsize=(14.5, 8))
plt.plot(y_embeds, linewidth=5, alpha=0.3)
# axvline's ymin/ymax are fractions of the axes height (0..1), not data values,
# so they are left at their defaults and the lines span the full height.
plt.axvline(x=batches_per_epoch, color='k', linewidth=5, linestyle='--', alpha=0.3)
plt.axvline(x=batches_per_epoch*3+20, color='k', linewidth=5, linestyle='--', alpha=0.3)
# NOTE(review): the annotation texts and tick labels below are hard-coded numbers - confirm they fit this log.
plt.annotate(text="188", xy=(0.19, 0.5),
             bbox={'facecolor': 'gray', 'alpha': 0.4, 'pad': 5},
             fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
plt.annotate(text="187811", xy=(0.77, 0.5),
             bbox={'facecolor': 'gray', 'alpha': 0.4, 'pad': 5},
             fontsize=20, fontfamily='monospace', xycoords='axes fraction', va='center')
plt.title(f'y_embeds-[{y_embeds.shape[1]}]', fontsize=30)
xticks_label = list(range(0, 301, 100)) + list(range(187600, 188001, 100))
plt.xticks(ticks=list(range(0, 801, 100)), labels=xticks_label, fontsize=18)
plt.yticks(fontsize=24)
plt.show()
plt.close()

In [None]:
# Make the project library importable; guarded so re-running the cell does not
# append the same entry again.
ywt_library_dir = "/workspace/correlation-change-predict/ywt_library"
if ywt_library_dir not in sys.path:
    sys.path.append(ywt_library_dir)
# `__file__` is not defined inside a notebook kernel; fall back to the working directory there.
current_dir = Path(globals()["__file__"]).parent if "__file__" in globals() else Path.cwd()
data_config_path = current_dir/"../config/data_config.yaml"
with open(data_config_path) as f:
    data = dynamic_yaml.load(f)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data))



# ## Data implement & output setting & testset setting
# data implement setting
data_implement = "SP500_20082017_CORR_SER_REG_CORR_MAT_HRCHY_11_CLUSTER"  # watch options by operate: logging.info(data_cfg["DATASETS"].keys())
# train set setting
train_items_setting = "-train_train"  # -train_train|-train_all
# setting of name of output files and pictures title
output_file_name = data_cfg["DATASETS"][data_implement]['OUTPUT_FILE_NAME_BASIS'] + train_items_setting
# setting of output files
logging.info(f"===== file_name basis:{output_file_name} =====")
graph_data_dir = Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"])/f"{output_file_name}-graph_data"
graph_arr = np.load(graph_data_dir/"corr_s1_w10_graph.npy")  # each graph consist of 66 node & 66^2 edges

stride = 12
# Split along the first axis: first 90% train, next 5% validation, last 5% test.
train_arr = graph_arr[:int(len(graph_arr)*0.9)]
val_arr = graph_arr[int(len(graph_arr)*0.9):int(len(graph_arr)*0.95)]
test_arr = graph_arr[int(len(graph_arr)*0.95):]
# Element-wise change between each graph and the one `stride` steps later.
train_diff_arr = train_arr[stride:] - train_arr[:-stride] # this is what I want
# Index whose difference, summed over axes 1 and 2, is the largest.
max_diff_ind = np.argmax(train_diff_arr.sum(axis=1).sum(axis=1))
logging.info(f"train_arr.shape: {train_arr.shape}")
logging.info(f"train_diff_arr.shape: {train_diff_arr.shape}")
logging.info(f"train_arr[0][0][:5]: \n{train_arr[0][0][:5]}")
logging.info(f"max_difference index of train_arr: {max_diff_ind}")
logging.info(f"train_diff_arr[{max_diff_ind}][0]: \n{train_diff_arr[max_diff_ind][0]}")
logging.info(f"train_arr[{max_diff_ind}][0]: \n{train_arr[max_diff_ind][0]}")
logging.info(f"train_arr[{max_diff_ind+stride}][0]: \n{train_arr[max_diff_ind+stride][0]}")