In [1]:
from tqdm import tqdm
from pathlib import Path
import warnings
import sys
import logging
from pprint import pformat
from itertools import product

import pandas as pd
import numpy as np
import matplotlib as mpl
import torch
from torch.nn import Linear, GRU, Sequential, BatchNorm1d, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, GINConv
from torch_geometric.nn import global_mean_pool, global_add_pool
import dynamic_yaml
import yaml

sys.path.append("/workspace/correlation-change-predict/ywt_library")
import data_generation
from data_generation import data_gen_cfg, gen_corr_dist_mat
from stl_decompn import stl_decompn
from corr_property import calc_corr_ser_property


# Read the project data config. The file is parsed with dynamic_yaml, dumped
# back to YAML text and re-read with PyYAML so that `data_cfg` is built from
# plain PyYAML objects.
# NOTE(review): presumably this round-trip resolves dynamic_yaml's template
# references into concrete values -- confirm against the dynamic_yaml docs.
with open('../config/data_config.yaml') as cfg_file:
    data = dynamic_yaml.load(cfg_file)
    data_cfg = yaml.full_load(dynamic_yaml.dump(data))

# Silence every warning for the rest of the notebook.
warnings.simplefilter("ignore")

# Root logger reports INFO and above; matplotlib's own logger only ERROR and above.
logging.basicConfig(level=logging.INFO)
matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.ERROR)

# Plot fonts: use 'simhei' as the sans-serif family and turn off
# `axes.unicode_minus`.
mpl.rcParams.update({
    'font.sans-serif': ['simhei'],
    'axes.unicode_minus': False,
})

# Optional style checking while editing (disabled):
#   %load_ext pycodestyle_magic
#   %pycodestyle_on --ignore E501

# Dump both configurations; the project config is only emitted at DEBUG level.
logging.debug(pformat(data_cfg, indent=1, width=100, compact=True))
logging.info(pformat(data_gen_cfg, indent=1, width=100, compact=True))

INFO:root:{'CORR_STRIDE': 5, 'CORR_WINDOW': 5, 'DATA_DIV_STRIDE': 20, 'MAX_DATA_DIV_START_ADD': 0}


time: 1.79 s (started: 2023-01-29 18:23:59 +00:00)


## Data implementation, output, and train-set settings

In [2]:
# Which dataset entry of data_cfg["DATASETS"] this run uses
# (list the available keys with: print(data_cfg["DATASETS"].keys())).
data_implement = "SP500_20082017_CORR_SER_REG_CORR_MAT_HRCHY_11_CLUSTER"

# Suffix selecting the train-item setting; one of: -train_train | -train_all
train_items_setting = "-train_train"

# Common basis for the names of output files and picture titles:
# the dataset's configured basis followed by the train-item suffix.
output_file_name = "".join(
    (data_cfg["DATASETS"][data_implement]['OUTPUT_FILE_NAME_BASIS'], train_items_setting)
)
logging.info(f"===== file_name basis:{output_file_name} =====")

# Directory holding the graph data of this run, inside the pipeline data dir.
graph_data_dir = Path(data_cfg["DIRS"]["PIPELINE_DATA_DIR"]).joinpath(f"{output_file_name}-graph_data")

INFO:root:===== file_name basis:sp500_20082017_corr_ser_reg_corr_mat_hrchy_11_cluster-train_train =====


time: 1.08 ms (started: 2023-01-29 18:24:01 +00:00)


In [3]:
# Hyper-parameters of the MTSCorrAD experiment.
# Loader batch sizes are counted in graphs: each graph contains 5 days of
# correlation, so 4 graphs means a month and 12 graphs means a quarter.
mts_corr_ad_cfg = dict(
    tr_loader_batch=12,  # training loader batch size (a quarter)
    va_loader_batch=4,   # validation loader batch size (a month)
    tt_loader_batch=4,   # test loader batch size (a month)
    gin_dim_h=32,        # hidden size of the GINConv layers
    gru_layers=1,        # number of recurrent GRU layers
    gru_dim_out=32,      # output / hidden-state size of the GRU
)

time: 464 µs (started: 2023-01-29 18:24:01 +00:00)


## Load Graph Data

In [4]:
# Load the stacked correlation graphs. The array is indexed as
# graph_arr[time_step, node_i, node_j]; its shape is logged just below.
graph_arr = np.load(graph_data_dir/"corr_calc_reg-corr_graph.npy")  # each graph consist of 66 node & 66^2 edges
logging.info(f"graph_arr.shape:{graph_arr.shape}")
graph_time_step = graph_arr.shape[0] - 1  # the graph of last "t" can't be used as train data
if graph_time_step < 1:
    # Every sample pairs graph t (input) with graph t+1 (target), so at least
    # two time steps are required. Fail here with a clear message instead of
    # later on an unrelated attribute lookup.
    raise ValueError(f"need at least 2 graphs to build (t, t+1) samples, got {graph_arr.shape[0]}")

num_nodes = graph_arr.shape[1]
node_attr = torch.zeros((num_nodes, 1), dtype=torch.float32)  # each node has only one attribute
# Every ordered node pair (self-pairs included), one pair per row: shape [num_nodes^2, 2].
edge_index = torch.tensor(list(product(range(num_nodes), repeat=2)))
# Transposed to [2, num_edges] once, outside the loop, because it is the same
# for every time step. It is shared by all Data objects, like node_attr.
edge_index_coo = edge_index.t().contiguous()

# One sample per time step t: edge attributes are graph t flattened row by row,
# the target y is graph t+1 flattened the same way.
dataset = []
for g_t in range(graph_time_step):
    edge_attr = torch.tensor(graph_arr[g_t].reshape(-1, 1), dtype=torch.float32)
    # NOTE(review): y keeps graph_arr's own dtype (it is not cast to float32
    # like edge_attr) -- confirm this is intended.
    edge_attr_next_t = torch.tensor(graph_arr[g_t + 1].reshape(-1))
    dataset.append(Data(x=node_attr, y=edge_attr_next_t, edge_index=edge_index_coo, edge_attr=edge_attr))

# Derive the model dimensions from one built sample (the last one) and log its layout.
sample_graph = dataset[-1]
mts_corr_ad_cfg["num_node_features"] = sample_graph.num_node_features
mts_corr_ad_cfg["dim_out"] = sample_graph.y.shape[0]
logging.info(f"data.num_node_features: {sample_graph.num_node_features}; data.num_edges: {sample_graph.num_edges}; data.num_edge_features: {sample_graph.num_edge_features}; data.is_undirected: {sample_graph.is_undirected()}; ")
logging.info(f"data.x.shape: {sample_graph.x.shape}; data.y.shape: {sample_graph.y.shape}; data.edge_index.shape: {sample_graph.edge_index.shape}; data.edge_attr.shape: {sample_graph.edge_attr.shape}")

# Create training, validation, and test sets: a chronological 90% / 5% / 5% split.
train_end = int(len(dataset)*0.9)
val_end = int(len(dataset)*0.95)
train_dataset = dataset[:train_end]
val_dataset   = dataset[train_end:val_end]
test_dataset  = dataset[val_end:]

# Create mini-batches; shuffle=False keeps the graphs in time order inside each batch.
train_loader = DataLoader(train_dataset, batch_size=mts_corr_ad_cfg["tr_loader_batch"], shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=mts_corr_ad_cfg["va_loader_batch"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=mts_corr_ad_cfg["tt_loader_batch"], shuffle=False)

# show info
logging.info(f'Training set   = {len(train_dataset)} graphs')
logging.info(f'Validation set = {len(val_dataset)} graphs')
logging.info(f'Test set       = {len(test_dataset)} graphs')

# The per-batch dump is DEBUG-only; skip building every batch when those
# messages would be discarded anyway.
if logging.getLogger().isEnabledFor(logging.DEBUG):
    logging.debug('\nTrain loader:')
    for i, subgraph in enumerate(train_loader):
        logging.debug(f' - Subgraph {i}: {subgraph} ; Subgraph {i}.num_graphs:{subgraph.num_graphs}')

    logging.debug('\nValidation loader:')
    for i, subgraph in enumerate(val_loader):
        logging.debug(f' - Subgraph {i}: {subgraph} ; Subgraph{i}.num_graphs:{subgraph.num_graphs}')

    logging.debug('\nTest loader:')
    for i, subgraph in enumerate(test_loader):
        logging.debug(f' - Subgraph {i}: {subgraph} ; Subgraph{i}.num_graphs:{subgraph.num_graphs}')

INFO:root:graph_arr.shape:(499, 66, 66)
INFO:root:data.num_node_features: 1; data.num_edges: 4356; data.num_edge_features: 1; data.is_undirected: True; 
INFO:root:data.x.shape: torch.Size([66, 1]); data.y.shape: torch.Size([4356]); data.edge_index.shape: torch.Size([2, 4356]); data.edge_attr.shape: torch.Size([4356, 1])
INFO:root:Training set   = 448 graphs
INFO:root:Validation set = 25 graphs
INFO:root:Test set       = 25 graphs


time: 157 ms (started: 2023-01-29 18:24:01 +00:00)


## Multi-Dimension Time-Series Correlation Anomaly Detection Model

In [5]:
class MTSCorrAD(torch.nn.Module):
    """
    Three stacked GINConv layers whose per-graph sum-pooled outputs are
    concatenated, fed through a GRU, and projected by a linear layer.

    Args:
        num_node_features: number of features per node in the graph; every node has the same number of features
        gin_dim_h: output size of the hidden layers of each GINConv
        gru_layers: number of recurrent layers of the GRU
        gru_dim_out: output size of the GRU, which is also the size of its hidden state h
        dim_out: output size of the MTSCorrAD model
        **kwargs: accepted and ignored, so a config dict holding additional keys can be unpacked into the constructor
    """

    @staticmethod
    def _gin_mlp(dim_in: int, dim_hidden: int) -> Sequential:
        """Build the MLP wrapped by one GINConv: Linear -> BatchNorm1d -> ReLU -> Linear -> ReLU."""
        return Sequential(
            Linear(dim_in, dim_hidden),
            BatchNorm1d(dim_hidden),
            ReLU(),
            Linear(dim_hidden, dim_hidden),
            ReLU(),
        )

    def __init__(self, num_node_features:int, gin_dim_h:int, gru_layers:int, gru_dim_out:int, dim_out:int, **kwargs):
        super().__init__()
        # Only the first GIN layer sees the raw node features; the other two
        # map gin_dim_h -> gin_dim_h.
        self.ginconv1 = GINConv(self._gin_mlp(num_node_features, gin_dim_h))
        self.ginconv2 = GINConv(self._gin_mlp(gin_dim_h, gin_dim_h))
        self.ginconv3 = GINConv(self._gin_mlp(gin_dim_h, gin_dim_h))
        # The GRU consumes the concatenated readouts of all GIN layers, so its
        # input size is (number of GINConv layers) * gin_dim_h.
        num_gin_layers = 3
        self.gru1 = GRU(gin_dim_h * num_gin_layers, gru_dim_out, gru_layers)
        self.lin1 = Linear(gru_dim_out, dim_out)

    def forward(self, x, edge_index, batch_node_id):
        """
        Args:
            x: node feature matrix of the batched graphs
            edge_index: edge index of the batched graphs
            batch_node_id: for every node, the index of the graph it belongs to (used by the pooling)

        Returns:
            (graph_embeds, fc_output): the concatenated per-graph embeddings and
            the linear projection of the GRU output at the last time-step.

        Only x, edge_index and batch_node_id are inputs; edge attributes are
        not passed to this model.
        """
        # Inter-series modeling: run the GIN layers in sequence and keep each
        # layer's node embeddings.
        node_embeds = []
        hidden = x
        for gin_layer in (self.ginconv1, self.ginconv2, self.ginconv3):
            hidden = gin_layer(hidden, edge_index)
            node_embeds.append(hidden)

        # Graph-level readout (sum over the nodes of each graph), then
        # concatenate the layers' readouts.
        # the shape of graph_embeds: [batch_size, num_layers*gin_dim_h]
        graph_embeds = torch.cat(
            [global_add_pool(embeds, batch_node_id) for embeds in node_embeds],
            dim=1,
        )

        # Temporal modeling: the 2-D graph_embeds is given to the GRU as an
        # "unbatched" input, i.e. batch_size is regarded as the sequence length.
        gru_output, _ = self.gru1(graph_embeds)
        last_step = gru_output[-1]  # only the last time-step is projected
        fc_output = self.lin1(last_step)

        return graph_embeds, fc_output

time: 1.27 ms (started: 2023-01-29 18:24:01 +00:00)


In [6]:
# Train on the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MTSCorrAD(**mts_corr_ad_cfg).to(device)
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=0.01,
                             weight_decay=0.01)
epochs = 100
num_train_batches = len(train_loader)

model.train()
# NOTE(review): range(epochs+1) performs epochs + 1 passes over the training
# data (101 for epochs = 100) -- confirm this is intended.
for epoch in range(epochs+1):
    # Per-epoch accumulators. Only total_loss is updated in this cell; the
    # others are reset each epoch but not updated anywhere here.
    total_loss = 0
    acc = 0
    val_loss = 0
    val_acc = 0

    # Train on batches
    for data in tqdm(train_loader):
        data = data.to(device)
        # The target is the last `dim_out` entries of the batch's y.
        # NOTE(review): presumably these are the y values of the last graph in
        # the batch, matching the model's use of the last GRU time-step -- confirm.
        x, y = data.x, data.y[-mts_corr_ad_cfg["dim_out"]:]
        optimizer.zero_grad()
        graph_embeds, out = model(x, data.edge_index, data.batch)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        # Accumulate a detached copy: adding the attached `loss` would keep
        # every batch's autograd graph referenced through total_loss.
        total_loss += loss.detach() / num_train_batches

100% 38/38 [00:00<00:00, 95.85it/s] 
100% 38/38 [00:00<00:00, 160.48it/s]

time: 2.63 s (started: 2023-01-29 18:24:01 +00:00)



