In [None]:
import re
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Make the project's `utils` package importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path every time.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable project root instead.
PROJECT_ROOT = "/workspace/multivariate-correlation-anomaly-detection/"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from utils.assorted_utils import split_and_norm_data, load_multiple_data, load_data_cfg, find_cross_items_pairs
from utils.etl_utils import calc_tr_val_corr_and_labels_distribution
from utils.log_utils import Log

# Notebook-wide logger; the cells below use it for status and shape messages.
# NOTE(review): df_max_rows=50 presumably caps how many DataFrame rows the
# logger renders — confirm against utils.log_utils.Log.
JPY_LOGGER = Log(df_max_rows=50).init_logger(logger_name="ywt_jupyter")

# Prepare data

## Load Data

In [None]:
# Dataset to analyse. The available options are the keys of
# /config/data_config.yaml -> ["DATASETS"].
data_implement = "SYNTHETIC_COLL2_LAG19_WAVY"

# ETL set setting: "-train_train" or "-train_all".
retrieve_items_setting = "-train_train"

# Correlation type: "pearson" or "cross_corr".
corr_type = "pearson"

# Bins used for target_df.
target_df_bins = "bins_-10_-03_03_10"

# CORR_WINDOW length (w_l) and CORR_STRIDE length (s_l).
w_l, s_l = 50, 1

# How corr_ser is calculated: "corr_ser_calc_regular" or "corr_ser_calc_abs".
corr_ser_clac_method = "corr_ser_calc_regular"

In [None]:
# Load the four tables for the dataset and settings chosen in the config cell,
# then render each of them for inspection.
dataset_df, corr_df, target_df, corr_property_df = load_multiple_data(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    target_df_bins=target_df_bins,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)
display(dataset_df, corr_df, target_df, corr_property_df)

## ETL setting

In [None]:
# Switch: when True, only the item-pair indices listed in the data config for
# this dataset are used; otherwise custom_idxs stays None.
can_use_custom_idxs = False
DATA_CFG = load_data_cfg()
if can_use_custom_idxs:
    custom_idxs = DATA_CFG["DATASETS"][data_implement]['CORSS_ITEM_PAIRS_IDXS']
else:
    custom_idxs = None

# Log the chosen indices between two separator lines.
for log_line in ("="*100, f"custom_idxs: {custom_idxs}", "="*100):
    JPY_LOGGER.info(log_line)

In [None]:
def _describe_shapes(name, split):
    """Format '<name>[model_input].shape:..., <name>[target].shape:...' for one split dict."""
    return ", ".join(f"{name}[{key}].shape:{split[key].shape}" for key in ("model_input", "target"))


# Split the correlation data into train / validation / test sets.
tr_data, val_data, test_data = split_and_norm_data(model_input_df=corr_df, target_df=target_df, batch_size=64)
_full_splits = (("tr_data", tr_data), ("val_data", val_data), ("test_data", test_data))

if custom_idxs:
    # Keep only the rows at custom_idxs in both the model input and the target.
    selected_tr_data, selected_val_data, selected_test_data = (
        {key: split[key][custom_idxs, :] for key in ("model_input", "target")}
        for _, split in _full_splits
    )
    selected_item_pairs = corr_df.index[custom_idxs]
else:
    # No custom selection: the selected_* names refer to the full splits.
    selected_tr_data, selected_val_data, selected_test_data = tr_data, val_data, test_data
    selected_item_pairs = corr_df.index

# Log the shapes of the full splits on one line, then one line per selected split.
JPY_LOGGER.info("="*80)
JPY_LOGGER.info(", ".join(_describe_shapes(name, split) for name, split in _full_splits))
for name, split in (
    ("selected_tr_data", selected_tr_data),
    ("selected_val_data", selected_val_data),
    ("selected_test_data", selected_test_data),
):
    JPY_LOGGER.info(_describe_shapes(name, split))
JPY_LOGGER.info("="*80)

# Observe train_data and val_data class distribution

In [None]:
# Sanity check before plotting: each split must contain fewer than 10 distinct labels.
assert (len(np.unique(selected_tr_data['target'])) < 10) and (len(np.unique(selected_val_data['target'])) < 10), "number of classes should lower than 10"

# Distinct labels and how often each occurs, per split.
tr_labels, tr_labels_freq_counts = np.unique(selected_tr_data['target'], return_counts=True)
val_labels, val_labels_freq_counts = np.unique(selected_val_data['target'], return_counts=True)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,9))
# Fixed colour per label, looked up by the label's string form.
colors_labels_map = {"-1.0": "lime", "0.0": "darkorange", "1.0": "dodgerblue"}

# One pie per split: train on the left, validation on the right.
pie_panels = (
    (axes[0], "Train", tr_labels, tr_labels_freq_counts),
    (axes[1], "Validation", val_labels, val_labels_freq_counts),
)
for pie_ax, panel_title, panel_labels, panel_counts in pie_panels:
    pie_ax.pie(
        panel_counts,
        labels=panel_labels,
        autopct='%1.1f%%',
        textprops={'fontsize': 24},
        colors=[colors_labels_map[str(label)] for label in panel_labels],
    )
    pie_ax.set_title(panel_title, fontsize=32)

# Figure title: the dataset name with any "SP500_<digits>_" part removed,
# plus a "_cross_term" suffix when custom indices are in use.
fig_title = re.sub(r"SP500_\d*_", "", data_implement)
if custom_idxs:
    fig_title = f'{fig_title}_cross_term'
fig.suptitle(fig_title, fontsize=40)
plt.show()
plt.close()

# Observe properties of the correlation series

## Display `corr_property_df` of `selected_item_pairs`

In [None]:
# Boolean row mask: True where the corr_property_df index entry is one of the
# selected item pairs.
selected_mask = corr_property_df.index.isin(selected_item_pairs)
# Keep the matching rows and turn the index back into ordinary column(s).
display_df = corr_property_df.loc[selected_mask].reset_index()
display(display_df)

## Plot the distribution of all correlations of all item_pairs

In [None]:
# Two stacked panels: histogram on top, box plot underneath.
fig, axes = plt.subplots(nrows=2, ncols=1)
fig.set_size_inches(6, 10)
hist_ax, box_ax = axes

# Concatenate every row of corr_df into one flat array of correlation values.
all_item_pair_corrs = np.hstack(corr_df.values)

hist_ax.hist(all_item_pair_corrs, bins=20)
hist_ax.tick_params(axis="x", labelsize=18)

box_ax.boxplot(all_item_pair_corrs)
box_ax.tick_params(axis="y", labelsize=18)

plt.show()
plt.close()

# Find cross_items_pairs

In [None]:
# The two item sets whose cross pairs are looked up.
items_1_data_implement = "SP500_20112015_PCA_CLUSTER_1"
items_2_data_implement = "SP500_20112015_PCA_CLUSTER_2"

# The lookup is done against the current dataset's TRAIN_SET entry in the data
# config and its corr_df; the third return value is discarded.
cross_items_pairs, cross_items_pairs_idx, _ = find_cross_items_pairs(
    items_1_data_implement=items_1_data_implement,
    items_2_data_implement=items_2_data_implement,
    integrate_two_items=DATA_CFG["DATASETS"][data_implement]['TRAIN_SET'],
    integrate_two_items_corr_df=corr_df,
)

# Log the pairs and their indices between two separator lines.
for log_payload in ("!"*100, cross_items_pairs, cross_items_pairs_idx, "!"*100):
    JPY_LOGGER.info(log_payload)

# Observe the distributions of several datasets

In [None]:
# Sweep the TEMP_OBS synthetic datasets (LAG2 .. LAG199) and collect, for each
# one, the frame returned by calc_tr_val_corr_and_labels_distribution.
data_implement_list = [f"TEMP_OBS_SYNTHETIC_COLL2_LAG{i}_WAVY_OBS_DIST" for i in range(2, 200)]

# A dataset is shown in the second table only when the largest value across its
# "tr_class_-1.0" .. "tr_class_1.0" columns is below this threshold.
# NOTE(review): those columns are presumably per-class proportions of the
# training labels — confirm against calc_tr_val_corr_and_labels_distribution.
TR_CLASS_MAX_THRESHOLD = 0.43

# Every per-dataset name below is prefixed with `obs_` so that this sweep does
# not rebind the notebook-level data_implement / corr_df / target_df / tr_data /
# val_data that earlier cells read; re-running those cells after the sweep
# therefore still uses the dataset chosen in the config cell.
obs_dist_frames = []
for obs_data_implement in data_implement_list:
    # Load data for this dataset with the same settings as the main analysis.
    _, obs_corr_df, obs_target_df, _ = load_multiple_data(
        data_implement=obs_data_implement,
        retrieve_items_setting=retrieve_items_setting,
        corr_type=corr_type,
        target_df_bins=target_df_bins,
        w_l=w_l,
        s_l=s_l,
        corr_ser_clac_method=corr_ser_clac_method,
    )
    obs_tr_data, obs_val_data, _ = split_and_norm_data(model_input_df=obs_corr_df, target_df=obs_target_df, batch_size=64)
    obs_dist_frames.append(
        calc_tr_val_corr_and_labels_distribution(
            tr_data=obs_tr_data,
            val_data=obs_val_data,
            corr_df=obs_corr_df,
            data_implement=obs_data_implement,
            custom_idxs=custom_idxs,
            plot_distribution=False,
        )
    )

# Concatenate once instead of growing a DataFrame inside the loop (which copies
# all accumulated rows on every iteration). pd.concat raises on an empty list,
# so fall back to an empty frame in that case.
data_implement_list_dist_df = pd.concat(obs_dist_frames) if obs_dist_frames else pd.DataFrame()

display(data_implement_list_dist_df.iloc[:20, :])
mask = data_implement_list_dist_df.loc[:, "tr_class_-1.0":"tr_class_1.0"].max(axis=1) < TR_CLASS_MAX_THRESHOLD
# Positional boolean selection, so duplicate index labels from concat are harmless.
display(data_implement_list_dist_df.iloc[mask.tolist(), :])

# ...