In [None]:
from datetime import datetime
import sys

import pandas as pd

sys.path.append("/workspace/multivariate-correlation-anomaly-detection/")
from utils.assorted_utils import load_multiple_data, load_dirs, load_data_cfg
from utils.select_items_utils import gen_random_items, gen_corr_prop_filtered_items, gen_pca_cluster_filtered_items_each_cluster, gen_pca_cluster_filtered_pairs_each_cluster
from utils.cluster_utils import (calc_pca, obs_various_n_clusters_hrchy_cluster)
from utils.log_utils import Log

# Notebook-wide logger built through the project's Log helper; the cells below
# report their settings and results through JPY_LOGGER.info(...).
# NOTE(review): df_max_rows presumably limits how many DataFrame rows are rendered — confirm in utils.log_utils.
JPY_LOGGER = Log(df_max_rows=100).init_logger(logger_name="ywt_jupyter")

# Prepare data

## Load Data

In [None]:
# --- Experiment configuration: every value a reader may want to tune ---------

# Dataset key. The available options are the keys of ["DATASETS"] in
# /config/data_config.yaml.
data_implement = "SP500_20112015"

# ETL item-retrieval setting: "-train_train" or "-train_all".
retrieve_items_setting = "-train_all"

# Correlation type: "pearson" or "cross_corr".
corr_type = "pearson"

# Name of the binning scheme applied to target_df.
target_df_bins = "bins_-10_-03_03_10"

# Correlation window length (w_l) and correlation stride length (s_l).
w_l, s_l = 50, 1

# How corr_ser is calculated: "corr_ser_calc_regular" or "corr_ser_calc_abs".
corr_ser_clac_method = "corr_ser_calc_regular"

In [None]:
# load_dirs and load_multiple_data are called with exactly the same keyword
# settings, so collect them once and unpack them into both calls.
_data_selection_kwargs = dict(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    target_df_bins=target_df_bins,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)

# Directory handles for the configured experiment; cliques_dir and cluster_dir
# are used by the filtering cells further down.
(
    pipeline_corr_data_dir,
    corr_dir,
    target_dir,
    corr_property_dir,
    cliques_dir,
    cluster_dir,
) = load_dirs(**_data_selection_kwargs)

# The four frames for the same configuration.
dataset_df, corr_df, target_df, corr_property_df = load_multiple_data(**_data_selection_kwargs)

# Render each frame, in the same order as they were loaded.
display(dataset_df, corr_df, target_df, corr_property_df)

# Randomly picked trainset

In [None]:
# Number of items to pick, and the key of the configured random seed to use
# (None means no seed is looked up).
num_items = 10
selected_seed = None

# The data config is only consulted when a seed key was actually chosen;
# otherwise rand_seed stays None.
if selected_seed is None:
    rand_seed = None
else:
    rand_seed = load_data_cfg()["RANDOM_SEEDS"][selected_seed]

# Record the settings between two separator lines.
JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"num_items: {num_items}")
JPY_LOGGER.info(f"rand_seed: {rand_seed}")
JPY_LOGGER.info("=" * 80)

In [None]:
# Ask gen_random_items for `num_items` entries out of the dataset's column
# labels, using the seed resolved in the previous cell.
final_pick_items = gen_random_items(
    all_items=dataset_df.columns.tolist(),
    ret_items_len=num_items,
    verbose=0,
    rand_seed=rand_seed,
)

# Log the picked items between two banner lines.
JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
JPY_LOGGER.info(f"\n{final_pick_items}")
JPY_LOGGER.info("!" * 100)

# Trainset filtered by correlation-series property

In [None]:
# Scalar settings for the correlation-property filter.
selected_corr_prop = "corr_mean"  # "corr_mean" or "corr_std"
selected_corr_prop_cond = "negative_corr_prop"
fill_diag_val = 1
ret_items_len = 4
can_check_filtering_proc = True

# Sub-directory of cliques_dir for this property / condition combination.
tmp_clique_dir = cliques_dir/f"{selected_corr_prop}/{selected_corr_prop_cond}"

# Pull both property columns out of corr_property_df (all rows), then keep
# the one named by selected_corr_prop; any value other than "corr_mean"
# selects the std series.
corr_ser_std = corr_property_df.loc[:, "corr_ser_std"]
corr_ser_mean = corr_property_df.loc[:, "corr_ser_mean"]
if selected_corr_prop == "corr_mean":
    selected_corr_prop_ser = corr_ser_mean
else:
    selected_corr_prop_ser = corr_ser_std

# Item names are the dataset's column labels, frozen into a tuple.
item_names = tuple(dataset_df.columns)

In [None]:
# Run the correlation-property filter with the settings from the previous cell.
final_pick_items = gen_corr_prop_filtered_items(
    item_pairs_ser=selected_corr_prop_ser,
    corr_prop_cond=selected_corr_prop_cond,
    item_names=item_names,
    fill_diag_val=fill_diag_val,
    ret_items_len=ret_items_len,
    cliques_dir=tmp_clique_dir,
    can_check_filtering_proc=can_check_filtering_proc,
)

# Log the picked items between two banner lines.
JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
JPY_LOGGER.info(f"{final_pick_items}")
JPY_LOGGER.info("!" * 100)

# PCA and Clustering filtering on items | PCA and Clustering filtering on item_pairs

## Observe hyper parameters of PCA & Cluster model

### Observe a wide range of hyperparameters for the PCA & Cluster model

In [None]:
# Exactly one of the two filtering modes may be switched on; the XOR assert
# rejects "both" and "neither".
filtering_on_items = True
filtering_on_item_pairs = False
assert filtering_on_items ^ filtering_on_item_pairs

# Choose the PCA input and the figure directory for the active mode.
# NOTE(review): "cluxter" looks like a misspelling of "cluster"; it is kept
# verbatim because it is part of an on-disk path.
if filtering_on_items:
    # Items mode: the dataset frame is transposed before going into PCA.
    pca_input_data = dataset_df.T
    save_fig_dir = cluster_dir / "pca_hrchy/cluxter_with_items"
elif filtering_on_item_pairs:
    # Item-pairs mode: corr_df goes into PCA as loaded.
    pca_input_data = corr_df
    save_fig_dir = cluster_dir / "pca_hrchy/cluxter_with_pairs"
save_fig_dir.mkdir(parents=True, exist_ok=True)

# Row labels are treated as samples, column labels as features.
pca_input_data_samples = pca_input_data.index
pca_input_data_featues = pca_input_data.columns

# Observation grid: PCA explained-variance thresholds, candidate cluster
# counts (2 through 49), and the hierarchical-clustering settings.
obs_grid_pca_explanation_variance_thres = [1e-1]
obs_grid_n_clusters = range(2, 50)
obs_linkage = "complete"
obs_cluster_metric = "euclidean"

In [None]:
# For every PCA threshold in the grid: reduce the data, run the hierarchical
# clustering observation over all candidate cluster counts, and log the setup.
for obs_pca_explanation_variance_thres in obs_grid_pca_explanation_variance_thres:
    # Figure path: timestamp prefix plus the threshold with its "." stripped.
    save_fig_path = save_fig_dir/f"{datetime.now().strftime('%Y%m%d%H%M%S')}_pca_thres({str(obs_pca_explanation_variance_thres).replace('.', '')})_silhouette_avg_vs_n_clusters"

    obs_reducted_data_df, obs_pri_components = calc_pca(
        data=pca_input_data,
        n_samples=len(pca_input_data_samples),
        variance_thres=obs_pca_explanation_variance_thres,
        verbose=0,
    )

    # n_features is the number of entries calc_pca returned as components.
    obs_cluster_conditions = {
        "n_samples": len(pca_input_data_samples),
        "n_features": len(obs_pri_components),
        "n_clusters_list": obs_grid_n_clusters,
        "linkage": obs_linkage,
        "cluster_metric": obs_cluster_metric,
    }
    obs_various_n_clusters_hrchy_cluster(
        data=obs_reducted_data_df,
        cluster_conds=obs_cluster_conditions,
        can_plot_each_cluster_info=False,
        save_fig_path=save_fig_path,
    )

    # Record what was just observed.
    JPY_LOGGER.info("=" * 100)
    JPY_LOGGER.info(f"pca_explanation_variance_thres:{obs_pca_explanation_variance_thres}")
    JPY_LOGGER.info(f"pca_input_data.shape:{pca_input_data.shape}, len(pca_input_data_samples):{len(pca_input_data_samples)}, len(pca_input_data_featues):{len(pca_input_data_featues)}")
    JPY_LOGGER.info(f"pca_input_data_samples[:3]:{pca_input_data_samples[:3]}")
    JPY_LOGGER.info(f"pca_input_data_featues[:3]:{pca_input_data_featues[:3]}")
    JPY_LOGGER.info(f"cluster_conditions: {obs_cluster_conditions}")
    JPY_LOGGER.info("=" * 100)

### Narrow the range of PCA and clustering hyperparameters, and observe again

In [None]:
# Narrowed-down settings for the second observation pass:
# a single PCA explained-variance threshold ...
obs_selected_pca_explanation_variance_thres = 1e-1
# ... and the short list of cluster counts to inspect in detail.
obs_selected_n_clusters_list = [12, 35]

In [None]:
# Second observation pass: one PCA threshold, a short list of cluster counts,
# and per-cluster plots switched on.
obs_reducted_data_df, obs_pri_components = calc_pca(
    data=pca_input_data,
    n_samples=len(pca_input_data_samples),
    variance_thres=obs_selected_pca_explanation_variance_thres,
    verbose=0,
)

# n_features is the number of entries calc_pca returned as components.
obs_cluster_conditions = {
    "n_samples": len(pca_input_data_samples),
    "n_features": len(obs_pri_components),
    "n_clusters_list": obs_selected_n_clusters_list,
    "linkage": obs_linkage,
    "cluster_metric": obs_cluster_metric,
}
obs_various_n_clusters_hrchy_cluster(
    data=obs_reducted_data_df,
    cluster_conds=obs_cluster_conditions,
    can_plot_each_cluster_info=True,
)

# Record what was just observed.
JPY_LOGGER.info("=" * 100)
JPY_LOGGER.info(f"pca_explanation_variance_thres:{obs_selected_pca_explanation_variance_thres}")
JPY_LOGGER.info(f"pca_input_data.shape:{pca_input_data.shape}, len(pca_input_data_samples):{len(pca_input_data_samples)}, len(pca_input_data_featues):{len(pca_input_data_featues)}")
JPY_LOGGER.info(f"pca_input_data_samples[:3]:{pca_input_data_samples[:3]}")
JPY_LOGGER.info(f"pca_input_data_featues[:3]:{pca_input_data_featues[:3]}")
JPY_LOGGER.info(f"cluster_conditions: {obs_cluster_conditions}")
JPY_LOGGER.info("=" * 100)

## Select items

In [None]:
# Final hyperparameters used for the actual selection.
selected_n_clusters = 35
linkage = "complete"
cluster_metric = "euclidean"

# Keyword bundles handed to the selection helpers in the next cell.
pca_kwargs = {
    "n_samples": len(pca_input_data_samples),
    "pca_explanation_variance_thres": 1e-1,
}
cluster_kwargs = {
    "n_clusters": selected_n_clusters,
    "linkage": linkage,
    "cluster_metric": cluster_metric,
}

# Record the PCA input and both keyword bundles.
JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"pca_input_data.shape:{pca_input_data.shape}, len(pca_input_data_samples):{len(pca_input_data_samples)}, len(pca_input_data_featues):{len(pca_input_data_featues)}")
JPY_LOGGER.info(f"pca_input_data_samples[:3]:{pca_input_data_samples[:3]}")
JPY_LOGGER.info(f"pca_input_data_featues[:3]:{pca_input_data_featues[:3]}")
JPY_LOGGER.info(f"pca_kwargs:{pca_kwargs}")
JPY_LOGGER.info(f"cluster_kwargs:{cluster_kwargs}")
JPY_LOGGER.info("=" * 80)

In [None]:
# Run the PCA + clustering selection for whichever filtering mode is active
# and log the outcome between two banner lines.
if filtering_on_items:
    # Items mode: the helper's return value is stored as final_pick_items.
    final_pick_items = gen_pca_cluster_filtered_items_each_cluster(
        pca_input_data=pca_input_data,
        pca_kwargs=pca_kwargs,
        cluster_kwargs=cluster_kwargs,
    )
    JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    JPY_LOGGER.info(f"{final_pick_items}")
    JPY_LOGGER.info("!" * 100)
elif filtering_on_item_pairs:
    # Item-pairs mode: the helper returns three values, all of which are logged.
    (
        final_pick_pairs,
        pairs_idx_each_cluster,
        ret_items,
    ) = gen_pca_cluster_filtered_pairs_each_cluster(
        pca_input_data=pca_input_data,
        pca_kwargs=pca_kwargs,
        cluster_kwargs=cluster_kwargs,
    )
    JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_pairs !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    JPY_LOGGER.info(f"final_pick_pairs: {final_pick_pairs}")
    JPY_LOGGER.info(f"pairs_idx_each_cluster: {pairs_idx_each_cluster}")
    JPY_LOGGER.info(f"ret_items: {ret_items}")
    JPY_LOGGER.info("!" * 100)

# ...