In [None]:
import logging
import random
import sys
from itertools import combinations, product
from pprint import pformat
from pathlib import Path

import pandas as pd
import numpy as np
import dynamic_yaml
import yaml

sys.path.append("/workspace/multivariate-correlation-anomaly-detection/")
from utils.assorted_utils import load_multiple_data, load_dirs
from utils.cluster_utils import (convert_pairs_data_to_proximity_mat, filter_proximity_mat, calc_pca, obs_various_n_clusters_hrchy_cluster,
                                 calc_hrchy_cluster_given_n_clusters, filtered_small_n_samples_and_silhouette_min_cluster, select_cluster_labels_with_max_dist)
from utils.log_utils import Log

# Notebook-wide logger. The `edit_config` override describes the "console" handler:
# a StreamHandler at level INFO, "simple" formatter, writing to sys.stdout.
JPY_LOGGER = Log().init_logger(
    logger_name="ywt_jupyter",
    edit_config={
        "handlers": {
            "console": {
                "class": "logging.StreamHandler",
                "level": "INFO",
                "formatter": "simple",
                "stream": "ext://sys.stdout",
            }
        }
    },
)

# Prepare data

## Load Data

In [None]:
# Dataset to use; watch options by printing /config/data_config.yaml/["DATASETS"].keys()
data_implement = "SP500_20112015"
# ETL item-set setting: -train_train|-train_all
retrieve_items_setting = "-train_all"
# Correlation type: "pearson" | "cross_corr"
corr_type = "pearson"
# Bins setting of target_df
target_df_bins = "bins_-10_-03_03_10"
# CORR_WINDOW length and CORR_STRIDE length
w_l = 50
s_l = 1
# How corr_ser is calculated: corr_ser_calc_regular|corr_ser_calc_abs
corr_ser_clac_method = "corr_ser_calc_regular"

In [None]:
# Resolve the directories and load the dataframes for the experiment settings chosen above.
# Both helpers receive the same set of settings.
pipeline_corr_data_dir, corr_dir, target_dir, corr_property_dir, cliques_dir = load_dirs(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    target_df_bins=target_df_bins,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)
dataset_df, corr_df, target_df, corr_property_df = load_multiple_data(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    target_df_bins=target_df_bins,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)
# Rich-display every loaded frame, in loading order.
display(dataset_df, corr_df, target_df, corr_property_df)

# Randomly pick trainset

In [None]:
# Number of items to sample for the random trainset (passed to gen_random_trainset as train_set_len).
num_items = 100

In [None]:
def gen_random_trainset(all_items: list, train_set_len: int = 100, verbose: int = 0):
    """
    Randomly pick items for the trainset (not always necessary to run).

    The global `random` generator is re-seeded with 10 on every call, so the same
    `all_items` and `train_set_len` always yield the same selection.

    Args:
        all_items: Candidate items to sample from.
        train_set_len: Number of items to pick.
        verbose: When 1, log the size of the picked set and pretty-print the set to stdout.

    Returns:
        The picked items as a sorted list.

    Raises:
        ValueError: Propagated from `random.sample` when `train_set_len` is negative
            or larger than `len(all_items)`.
    """
    random.seed(10)
    train_set = sorted(random.sample(all_items, train_set_len))

    if verbose == 1:
        # Local import: the notebook only does `from pprint import pformat`, so the
        # bare module name `pprint` is not defined at module level.
        from pprint import PrettyPrinter

        logging.info(f"len(train_set):{len(train_set)}")
        PrettyPrinter(width=500, compact=True).pprint(train_set)

    return train_set

# Sample `num_items` column labels of dataset_df and log the selection between banner lines.
final_pick_items = gen_random_trainset(all_items=dataset_df.columns.tolist(), train_set_len=num_items)
JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
JPY_LOGGER.info("\n" + str(final_pick_items))
JPY_LOGGER.info("!" * 100)

# Correlation Series Property filtered trainset

In [None]:
# Which correlation-series property drives the filtering: corr_mean|corr_std
selected_corr_prop = "corr_mean"
# Sign condition on that property: positive_corr_prop|negative_corr_prop
selected_corr_prop_cond = "negative_corr_prop"
# Value used to fill the diagonal of the proximity matrix
fill_diag_val = 1
# When True, intermediate results of the filtering are displayed/logged
can_check_filtering_proc = True
# Sub-directory of cliques_dir dedicated to this property/condition combination
tmp_clique_dir = cliques_dir / selected_corr_prop / selected_corr_prop_cond

In [None]:
def gen_corr_prop_filtered_trainset(item_pairs_ser: pd.Series, corr_prop_cond: str, item_names: tuple, fill_diag_val: int,
                                    cliques_dir: Path, can_check_filtering_proc: bool):
    """
    Pick trainset items by filtering the item-pair proximity matrix on the sign of a correlation property.

    Args:
        item_pairs_ser: Correlation-property value per item pair. The debug check looks pairs up
            in its index using labels of the form "<item_a> & <item_b>".
        corr_prop_cond: "positive_corr_prop" builds the filter mask from entries > 0,
            "negative_corr_prop" from entries < 0.
        item_names: Item names forwarded to `convert_pairs_data_to_proximity_mat`.
        fill_diag_val: Diagonal fill value forwarded to `convert_pairs_data_to_proximity_mat`.
        cliques_dir: Directory forwarded to `filter_proximity_mat` as `tmp_clique_dir`.
        can_check_filtering_proc: When True, display/log the intermediate results at DEBUG level.

    Returns:
        The column labels of the filtered proximity matrix, as a list.

    Raises:
        KeyError: If `corr_prop_cond` is not one of the two supported conditions.
    """
    corr_prop_proximity_df = convert_pairs_data_to_proximity_mat(item_pairs_ser=item_pairs_ser, item_names=item_names, fill_diag_val=fill_diag_val)
    corr_prop_mask_settings = {"positive_corr_prop": (corr_prop_proximity_df > 0),
                               "negative_corr_prop": (corr_prop_proximity_df < 0)}
    corr_prop_mask = corr_prop_mask_settings[corr_prop_cond]
    # Use the `cliques_dir` argument rather than a notebook-level global, so the function
    # only depends on what the caller passes in.
    corr_prop_filtered_proximity_df, _ = filter_proximity_mat(proximity_mat=corr_prop_proximity_df.copy(), filter_mask=corr_prop_mask,
                                                              tmp_clique_dir=cliques_dir)
    train_set = corr_prop_filtered_proximity_df.columns.tolist()
    if can_check_filtering_proc:
        JPY_LOGGER.setLevel(logging.DEBUG)
        try:
            JPY_LOGGER.debug("####################### item_pairs_ser #######################")
            display(item_pairs_ser)
            JPY_LOGGER.debug("####################### corr_prop_proximity_df #######################")
            display(corr_prop_proximity_df)
            JPY_LOGGER.debug("####################### corr_prop_filtered_proximity_df #######################")
            display(corr_prop_filtered_proximity_df)
            # Rebuild the pair labels of the picked items to show their property values.
            train_set_pairs = [" & ".join(pair) for pair in combinations(train_set, 2)]
            train_set_pairs_mask = item_pairs_ser.index.isin(train_set_pairs)
            JPY_LOGGER.debug("####################### item_pairs_ser[filtered_item_pairs_mask] #######################")
            JPY_LOGGER.debug(f"\n{item_pairs_ser[train_set_pairs_mask]}")
        finally:
            # Restore the INFO level even if one of the displays above fails.
            JPY_LOGGER.setLevel(logging.INFO)
    return train_set


corr_ser_std = corr_property_df.loc[::, "corr_ser_std"]
corr_ser_mean = corr_property_df.loc[::, "corr_ser_mean"]
selected_corr_prop_ser = corr_ser_mean if selected_corr_prop == "corr_mean" else corr_ser_std
item_names = tuple(dataset_df.columns)
# Pass the property/condition-specific directory explicitly; the function forwards it to filter_proximity_mat.
corr_prop_filtered_items = gen_corr_prop_filtered_trainset(item_pairs_ser=selected_corr_prop_ser, corr_prop_cond=selected_corr_prop_cond,
                                                           item_names=item_names, fill_diag_val=fill_diag_val, cliques_dir=tmp_clique_dir,
                                                           can_check_filtering_proc=can_check_filtering_proc)
# NOTE(review): more than 10 filtered items are sampled down to 4; both thresholds are hard-coded — confirm they are intended.
if len(corr_prop_filtered_items) > 10:
    final_pick_items = gen_random_trainset(corr_prop_filtered_items, 4)
else:
    final_pick_items = corr_prop_filtered_items
JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
JPY_LOGGER.info(f"{final_pick_items}")
JPY_LOGGER.info("!"*100)

# PCA_items and Clustering_items filtered trainset

## First, Running PCA

In [None]:
pca_explanation_variance_thres = 1e-1

# PCA runs on the transposed dataset: rows of dataset_df.T are the samples, its columns the features.
pca_input_data = dataset_df.T
pca_input_data_samples, pca_input_data_featues = pca_input_data.index, pca_input_data.columns
reducted_data_df, pri_components = calc_pca(
    data=pca_input_data,
    n_samples=len(pca_input_data_samples),
    variance_thres=pca_explanation_variance_thres,
)

# Summarise the PCA input and output between separator lines.
JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"pca_explanation_variance_thres:{pca_explanation_variance_thres}")
JPY_LOGGER.info(
    f"pca_input_data.shape:{pca_input_data.shape}, len(pca_input_data_samples):{len(pca_input_data_samples)}, len(pca_input_data_featues):{len(pca_input_data_featues)}"
)
JPY_LOGGER.info(f"pca_input_data_samples[:3]:{pca_input_data_samples[:3]}")
JPY_LOGGER.info(f"pca_input_data_featues[:3]:{pca_input_data_featues[:3]}")
JPY_LOGGER.info(f"reducted_data_df.shape: {reducted_data_df.shape}")
JPY_LOGGER.info("=" * 80)
display(reducted_data_df)

## Second, Observe clustering hyper-parameters

### Overview of a large range of clustering hyper-parameters

In [None]:
# Wide sweep: every cluster count from 2 to 50, without per-cluster plots.
grid_n_clusters = range(2, 51)
linkage = "complete"
cluster_metric = "euclidean"
cluster_conditions = {
    "n_samples": len(pca_input_data_samples),
    "n_features": len(pri_components),
    "n_clusters_list": grid_n_clusters,
    "linkage": linkage,
    "cluster_metric": cluster_metric,
}
obs_various_n_clusters_hrchy_cluster(
    data=reducted_data_df,
    cluster_conds=cluster_conditions,
    can_plot_each_cluster_info=False,
)

JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"cluster_conditions:{cluster_conditions}")
JPY_LOGGER.info("=" * 80)

### Reduce range of hyper params of Clustering, and observe again

In [None]:
# Narrowed sweep: only the candidate cluster counts, this time with per-cluster plots.
selected_n_clusters_list = [12, 35]
linkage = "complete"
cluster_metric = "euclidean"
cluster_conditions = {
    "n_samples": len(pca_input_data_samples),
    "n_features": len(pri_components),
    "n_clusters_list": selected_n_clusters_list,
    "linkage": linkage,
    "cluster_metric": cluster_metric,
}

obs_various_n_clusters_hrchy_cluster(
    data=reducted_data_df,
    cluster_conds=cluster_conditions,
    can_plot_each_cluster_info=True,
)

## Third, Running clustering with specific hyper-parameters

In [None]:
# Final clustering settings.
linkage = "complete"
cluster_metric = "euclidean"
selected_n_clusters = 35
# NOTE(review): "n_clusters_list" holds a single int here, whereas the two observation cells
# above pass a range/list — confirm the clustering helper accepts that.
cluster_conditions = {
    "n_samples": len(pca_input_data_samples),
    "n_features": len(pri_components),
    "n_clusters_list": selected_n_clusters,
    "linkage": linkage,
    "cluster_metric": cluster_metric,
}
# Show every column and allow wider cell content when displaying dataframes from here on.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 80)

# Summarise the PCA inputs/outputs and the chosen clustering settings between separator lines.
JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"pca_explanation_variance_thres:{pca_explanation_variance_thres}")
JPY_LOGGER.info(
    f"pca_input_data.shape:{pca_input_data.shape}, len(pca_input_data_samples):{len(pca_input_data_samples)}, len(pca_input_data_featues):{len(pca_input_data_featues)}"
)
JPY_LOGGER.info(f"pca_input_data_samples[:3]:{pca_input_data_samples[:3]}")
JPY_LOGGER.info(f"pca_input_data_featues[:3]:{pca_input_data_featues[:3]}")
JPY_LOGGER.info(f"reducted_data_df.shape: {reducted_data_df.shape}")
JPY_LOGGER.info(f"cluster_conditions:{cluster_conditions}")
JPY_LOGGER.info("=" * 80)

In [None]:
# Cluster the PCA-reduced data with the chosen number of clusters.
(each_sample_cluster_labels,
 cluster_centers,
 silhouette_avg,
 sample_silhouette_values,
 clusters_info_df) = calc_hrchy_cluster_given_n_clusters(
    n_clusters=selected_n_clusters,
    data=reducted_data_df,
    cluster_conds=cluster_conditions,
)
# First filter: applied with min_cluster_n_samples=3 and min_cluster_silhouette=0.
filtered_1_clusters_info_df = filtered_small_n_samples_and_silhouette_min_cluster(
    clusters_info_df=clusters_info_df,
    min_cluster_n_samples=3,
    min_cluster_silhouette=0,
)
# Second step: select cluster labels from the remaining clusters.
selected_cluter_labels, max_cluster_dist, filtered_2_clusters_info_df = select_cluster_labels_with_max_dist(
    filtered_1_clusters_info_df
)

JPY_LOGGER.info("=" * 80)
JPY_LOGGER.info(f"max_cluster_dist: {max_cluster_dist}")
JPY_LOGGER.info(f"selected_cluter_labels: {selected_cluter_labels}")
JPY_LOGGER.info("=" * 80)
display(filtered_2_clusters_info_df)

## Select items

In [None]:
# Map each selected cluster label to the PCA input samples assigned to that cluster.
final_pick_items = {}
all_input_items = pca_input_data_samples
for cluster_label in selected_cluter_labels:
    items_in_cluster = all_input_items[each_sample_cluster_labels == cluster_label]
    final_pick_items[f"cluster_label_{cluster_label}"] = items_in_cluster.tolist()
JPY_LOGGER.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
JPY_LOGGER.info(str(final_pick_items))
JPY_LOGGER.info("!" * 100)

# ...