In [None]:
import logging
import random
import sys
from itertools import combinations
from pprint import pformat
from pathlib import Path

import pandas as pd
import dynamic_yaml
import yaml

sys.path.append("/workspace/correlation-change-predict/utils")
from utils import load_multiple_data, load_dirs
from cluster_utils import convert_pairs_data_to_proximity_mat, filter_proximity_mat

class UtilsFilter(logging.Filter):
    """Logging filter that rejects records coming from the helper-module loggers.

    A record is dropped when its logger name is exactly ``utils`` or
    ``cluster_utils``; every other record is let through.
    """

    def filter(self, record):
        """Return False for records named 'utils' or 'cluster_utils', True otherwise."""
        muted_logger_names = ('utils', 'cluster_utils')
        return record.name not in muted_logger_names

# Console logger used by the notebook cells below.
logger_console = logging.StreamHandler()
logger_formatter = logging.Formatter('%(levelname)-8s [%(name)s.%(funcName)s] %(message)s')
logger_console.setFormatter(logger_formatter)
jpy_logger = logging.getLogger("ywt_jupyter")
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack one more handler/filter each time and every
# message would be printed several times. Start from a clean logger instead.
for _stale_handler in list(jpy_logger.handlers):
    jpy_logger.removeHandler(_stale_handler)
for _stale_filter in list(jpy_logger.filters):
    jpy_logger.removeFilter(_stale_filter)
# NOTE(review): records logged through this logger are named "ywt_jupyter", while
# UtilsFilter only rejects the names 'utils' / 'cluster_utils' - confirm whether the
# filter was meant to be attached somewhere that sees those records.
jpy_logger.addFilter(UtilsFilter())
jpy_logger.addHandler(logger_console)
jpy_logger.setLevel(logging.INFO)

# Prepare data

## Load Data

In [None]:
# Dataset to use; watch the options by printing /config/data_config.yaml/["DATASETS"].keys()
data_implement = "SP500_20112015"
# ETL item-set setting: -train_train|-train_all
retrieve_items_setting = "-train_all"
# Correlation type: "pearson" | "cross_corr"
corr_type = "pearson"
# Bins setting of target_df
target_df_bins = "bins_-10_-03_03_10"
# CORR_WINDOW and CORR_STRIDE lengths
w_l, s_l = 50, 1
# How corr_ser is calculated: corr_ser_calc_regular|corr_ser_calc_abs
corr_ser_clac_method = "corr_ser_calc_regular"

In [None]:
# Both loaders take exactly the same settings; build them once so the
# directories and the loaded data always refer to the same configuration.
data_load_kwargs = dict(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    target_df_bins=target_df_bins,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)
(pipeline_corr_data_dir,
 corr_dir,
 target_dir,
 corr_property_dir,
 cliques_dir) = load_dirs(**data_load_kwargs)
dataset_df, corr_df, target_df, corr_property_df = load_multiple_data(**data_load_kwargs)
# Show every loaded frame, in the same order as they were returned.
for loaded_df in (dataset_df, corr_df, target_df, corr_property_df):
    display(loaded_df)

# Randomly pick trainset

In [None]:
# Number of items to randomly pick for the trainset (passed to gen_random_trainset as train_set_len)
num_items = 100

In [None]:
def gen_random_trainset(all_items: list, train_set_len: int = 100, verbose: int = 0):
    """
    Randomly pick items for the trainset (not always necessary to run).

    The pick is reproducible: a dedicated generator seeded with 10 is used, so
    the same ``all_items`` and ``train_set_len`` always give the same result,
    and the state of the global ``random`` module is left untouched.

    Args:
        all_items: candidate items to sample from.
        train_set_len: number of items to pick.
        verbose: when 1, log the size of the picked set and print its items.

    Returns:
        Sorted list of the picked items.

    Raises:
        ValueError: raised by the sampler when ``train_set_len`` is negative
            or larger than ``len(all_items)``.
    """
    # A private generator gives the same draw as `random.seed(10)` followed by
    # `random.sample(...)`, without reseeding the RNG shared with other code.
    rng = random.Random(10)
    train_set = sorted(rng.sample(all_items, train_set_len))

    if verbose == 1:
        # Use the notebook logger by name: calling logging.info() on the root
        # logger would implicitly install a root handler and duplicate output.
        logging.getLogger("ywt_jupyter").info(f"len(train_set):{len(train_set)}")
        # `pformat` is what this notebook imports from pprint; the bare
        # `pprint` module name is not in scope.
        print(pformat(train_set, width=500, compact=True))

    return train_set

# Randomly pick `num_items` items out of all dataset columns and log the pick between banner lines.
all_dataset_items = dataset_df.columns.tolist()
final_pick_items = gen_random_trainset(all_items=all_dataset_items, train_set_len=num_items)
jpy_logger.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
jpy_logger.info("\n%s", final_pick_items)
jpy_logger.info("!"*100)

# Correlation Series Property filtered trainset

In [None]:
# Correlation-series property used for filtering: corr_mean|corr_std
selected_corr_prop = "corr_mean"
# Key of the mask condition applied to the selected property
selected_corr_prop_cond = "negative_corr_prop"
# Sub-directory of cliques_dir for this property / condition pair
tmp_clique_dir = cliques_dir / selected_corr_prop / selected_corr_prop_cond
# Passed to convert_pairs_data_to_proximity_mat as fill_diag_val
fill_diag_val = 1
# Whether to show the intermediate results of the filtering process
can_check_filtering_proc = True

In [None]:
def gen_corr_prop_filtered_trainset(item_pairs_ser: pd.Series, corr_prop_cond: str, item_names: tuple, fill_diag_val: int,
                                    cliques_dir: Path, can_check_filtering_proc: bool):
    """
    Build a trainset from the items kept after filtering on a correlation-series property.

    The pair-wise series is converted to a proximity matrix, a boolean mask is
    derived from ``corr_prop_cond`` and the matrix is filtered with
    ``filter_proximity_mat``; the columns of the filtered matrix are the trainset.

    Args:
        item_pairs_ser: property values of the item pairs.
        corr_prop_cond: "positive_corr_prop" (mask is ``> 0``) or
            "negative_corr_prop" (mask is ``< 0``).
        item_names: item names forwarded to ``convert_pairs_data_to_proximity_mat``.
        fill_diag_val: forwarded to ``convert_pairs_data_to_proximity_mat``.
        cliques_dir: currently unused, see the review note in the body.
        can_check_filtering_proc: when True, display/log the intermediate
            results at DEBUG level.

    Returns:
        List of the column labels of the filtered proximity matrix.

    Raises:
        KeyError: if ``corr_prop_cond`` is not one of the two supported keys.
    """
    corr_prop_proximity_df = convert_pairs_data_to_proximity_mat(item_pairs_ser=item_pairs_ser, item_names=item_names, fill_diag_val=fill_diag_val)
    corr_prop_mask_settings = {"positive_corr_prop": (corr_prop_proximity_df > 0),
                               "negative_corr_prop": (corr_prop_proximity_df < 0)}
    corr_prop_mask = corr_prop_mask_settings[corr_prop_cond]
    # NOTE(review): the `cliques_dir` argument is ignored here; the notebook-level
    # `tmp_clique_dir` is used instead, so that cell must have been run first.
    # Confirm which directory is intended before changing this.
    corr_prop_filtered_proximity_df, _ = filter_proximity_mat(proximity_mat=corr_prop_proximity_df.copy(), filter_mask=corr_prop_mask, tmp_clique_dir=tmp_clique_dir)
    train_set = corr_prop_filtered_proximity_df.columns.tolist()
    if can_check_filtering_proc:
        # Temporarily lower the level to DEBUG; remember the current level so it is
        # restored afterwards (also when a display call fails) instead of
        # assuming it was INFO.
        previous_level = jpy_logger.level
        jpy_logger.setLevel(logging.DEBUG)
        try:
            jpy_logger.debug("####################### item_pairs_ser #######################")
            display(item_pairs_ser)
            jpy_logger.debug("####################### corr_prop_proximity_df #######################")
            display(corr_prop_proximity_df)
            jpy_logger.debug("####################### corr_prop_filtered_proximity_df #######################")
            display(corr_prop_filtered_proximity_df)
            # Pair labels are built as "<item> & <item>" to be matched against the series index.
            train_set_pairs = [" & ".join(pair) for pair in combinations(train_set, 2)]
            train_set_pairs_mask = item_pairs_ser.index.isin(train_set_pairs)
            jpy_logger.debug("####################### item_pairs_ser[filtered_item_pairs_mask] #######################")
            jpy_logger.debug(f"\n{item_pairs_ser[train_set_pairs_mask]}")
        finally:
            jpy_logger.setLevel(previous_level)
    return train_set


# Choose the property series that matches `selected_corr_prop`.
corr_ser_std = corr_property_df.loc[:, "corr_ser_std"]
corr_ser_mean = corr_property_df.loc[:, "corr_ser_mean"]
if selected_corr_prop == "corr_mean":
    selected_corr_prop_ser = corr_ser_mean
else:
    selected_corr_prop_ser = corr_ser_std
item_names = tuple(dataset_df.columns)
corr_prop_filtered_items = gen_corr_prop_filtered_trainset(
    item_pairs_ser=selected_corr_prop_ser,
    corr_prop_cond=selected_corr_prop_cond,
    item_names=item_names,
    fill_diag_val=fill_diag_val,
    cliques_dir=cliques_dir,
    can_check_filtering_proc=can_check_filtering_proc,
)
# With more than 10 filtered items, randomly keep 4 of them; otherwise keep them all.
final_pick_items = (
    gen_random_trainset(corr_prop_filtered_items, 4)
    if len(corr_prop_filtered_items) > 10
    else corr_prop_filtered_items
)
jpy_logger.info("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! final_pick_items !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
jpy_logger.info("%s", final_pick_items)
jpy_logger.info("!"*100)

# PCA_items and Clustering_items filtered trainset

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler


# Toy example: min-max scale the data, then project it onto 2 principal components.
# NOTE(review): the original cell began with an unfinished `def` (a syntax error);
# wrap this in a function once its name and signature are decided.
data = [[-1, 2, 3], [-0.5, -25, 6], [0, 20, -5], [1, 18, 10], [6, -2, 3]]
scaler = MinMaxScaler()
print(scaler.fit(data))
# The scaled array was referenced below but never created.
data_array_scaled = scaler.transform(data)

pca = PCA(n_components=2)
# fit_transform needs the data; fit on the scaled array and project it in one step.
data_array_pca = pca.fit_transform(data_array_scaled)

# ...