In [None]:
import logging
import random
from pprint import pformat
from pathlib import Path
import sys
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Extend the import search path; the `utils` import on the next line presumably relies on it.
# NOTE(review): this is a hard-coded absolute path, so the notebook likely only runs on a
# machine with this exact directory layout — confirm before sharing.
sys.path.append("/workspace/correlation-change-predict/utils")
from utils import split_and_norm_data, load_multiple_data

# Configure the root logger at INFO level so the logging.info(...) calls in later cells are emitted.
logging.basicConfig(level=logging.INFO)

# Prepare data

## Load Data

In [None]:
# Dataset key; list the options by printing /config/data_config.yaml/["DATASETS"].keys()
data_implement = "SP500_20112015_CORR_MEAN_NEGATIVE_KEEP"
# ETL set setting: -train_train|-train_all
retrieve_items_setting = "-train_train"
# Correlation type: "pearson" | "cross_corr"
corr_type = "pearson"
# CORR_WINDOW and CORR_STRIDE lengths
w_l, s_l = 50, 1
# How corr_ser is calculated: corr_ser_calc_regular|corr_ser_calc_abs
corr_ser_clac_method = "corr_ser_calc_regular"

# Load the four frames used by the rest of the notebook.
dataset_df, corr_df, target_df, corr_property_df = load_multiple_data(
    data_implement=data_implement,
    retrieve_items_setting=retrieve_items_setting,
    corr_type=corr_type,
    w_l=w_l,
    s_l=s_l,
    corr_ser_clac_method=corr_ser_clac_method,
)

# Render each frame, in the same order they were returned.
display(dataset_df, corr_df, target_df, corr_property_df)

## ETL setting

In [None]:
# Optional subset of corr_data to retrieve: None | [int1, int2, ...].
# The next cell uses the value to index the split arrays and corr_df.index;
# None (or any falsy value) keeps everything.
# Example subset: [1, 2, 4, 6, 7, 9, 12, 14, 15, 17, 18, 20]
custom_idxs = None

In [None]:
tr_data, val_data, test_data = split_and_norm_data(model_input_df=corr_df, target_df=target_df, batch_size=64)

if custom_idxs:
    # Restrict every split to the requested entries (first axis) of both arrays.
    selected_tr_data, selected_val_data, selected_test_data = (
        {field: split[field][custom_idxs, ::] for field in ("model_input", "target")}
        for split in (tr_data, val_data, test_data)
    )
    selected_item_pairs = corr_df.index[custom_idxs]
else:
    # No custom selection: the selected_* names simply alias the full splits.
    selected_tr_data, selected_val_data, selected_test_data = tr_data, val_data, test_data
    selected_item_pairs = corr_df.index

# Log the shapes of the full splits, then of the (possibly reduced) selected splits.
logging.info(
    f"tr_data[model_input].shape:{tr_data['model_input'].shape}, "
    f"tr_data[target].shape:{tr_data['target'].shape}, "
    f"val_data[model_input].shape:{val_data['model_input'].shape}, "
    f"val_data[target].shape:{val_data['target'].shape}, "
    f"test_data[model_input].shape:{test_data['model_input'].shape}, "
    f"test_data[target].shape:{test_data['target'].shape}"
)
logging.info(
    f"selected_tr_data[model_input].shape:{selected_tr_data['model_input'].shape}, "
    f"selected_tr_data[target].shape:{selected_tr_data['target'].shape}, "
    f"selected_val_data[model_input].shape:{selected_val_data['model_input'].shape}, "
    f"selected_val_data[target].shape:{selected_val_data['target'].shape}, "
    f"selected_test_data[model_input].shape:{selected_test_data['model_input'].shape}, "
    f"selected_test_data[target].shape:{selected_test_data['target'].shape}"
)

# Show the selected item pairs between two marker log lines.
logging.info("===selected_item_pairs:===")
display(pd.DataFrame(selected_item_pairs))
logging.info("===selected_item_pairs===")

# Observe train_data and val_data class distribution

In [None]:
# Sanity check: both splits must contain fewer than 10 distinct target values.
assert (np.unique(selected_tr_data['target']).shape[0] < 10) and (np.unique(selected_val_data['target']).shape[0] < 10), "number of classes should lower than 10"
tr_labels, tr_labels_freq_counts = np.unique(selected_tr_data['target'], return_counts=True)
val_labels, val_labels_freq_counts = np.unique(selected_val_data['target'], return_counts=True)

# Fixed colours for the three known classes; keys are the float-formatted label text.
colors_labels_map = {"-1.0": "lime", "0.0": "darkorange", "1.0": "dodgerblue"}
# Used for any class without an entry above, so an unexpected label no longer raises KeyError.
fallback_pie_color = "lightgray"

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
for pie_ax, pie_title, pie_labels, pie_counts in (
    (axes[0], "Train", tr_labels, tr_labels_freq_counts),
    (axes[1], "Validation", val_labels, val_labels_freq_counts),
):
    # Normalise through float() so integer-typed labels (1 -> "1.0") hit the same keys as float ones.
    pie_colors = [colors_labels_map.get(str(float(label)), fallback_pie_color) for label in pie_labels]
    pie_ax.pie(pie_counts, labels=pie_labels, autopct='%1.1f%%', textprops={'fontsize': 24}, colors=pie_colors)
    pie_ax.set_title(pie_title, fontsize=32)

# Use the dataset name without its "SP500_<digits>_CORR_MEAN_" prefix as the figure title.
fig_title = re.sub(r"SP500_\d*_CORR_MEAN_", "", data_implement)
fig.suptitle(fig_title, fontsize=40)
plt.show()
plt.close(fig)

# Observe properties of correlation series

## Display `corr_property_df` of `selected_item_pairs`

In [None]:
# Boolean mask over corr_property_df rows whose index value is one of the selected item pairs.
selected_mask = corr_property_df.index.isin(selected_item_pairs)
selected_corr_property_df = corr_property_df.loc[selected_mask]
display(selected_corr_property_df)

## Plot the distribution of all correlation values across all item_pairs

In [None]:
# Two stacked panels: histogram on top, boxplot below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(6, 10))
hist_ax, box_ax = axes

# Concatenate the rows of corr_df into a single sequence of values.
all_item_pair_corrs = np.hstack(corr_df.values)

hist_ax.hist(all_item_pair_corrs, bins=20)
hist_ax.tick_params(axis="x", labelsize=18)

box_ax.boxplot(all_item_pair_corrs)
box_ax.tick_params(axis="y", labelsize=18)

plt.show()
plt.close()

# ...