In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import matplotlib
from matplotlib.colors import ListedColormap

import os
import gc
import argparse
import torch
import optuna
import joblib
import pickle

from optuna.samplers import TPESampler
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from tqdm import tqdm

import sensors.nn.models as models
import sensors.utils.utils as utils
import sensors.utils.fault_detection as fd

from sensors.utils.utils import roc_params, compute_auc

from importlib import reload
# Re-execute sensors.utils.utils so that edits made on disk are picked up
# without restarting the kernel.
utils = reload(utils)
# reload() does not rebind names that were copied out of the module with
# `from ... import ...`; refresh them explicitly so get_score does not keep
# calling the pre-reload versions of these helpers.
roc_params, compute_auc = utils.roc_params, utils.compute_auc

from pyprojroot import here
# Project root as resolved by pyprojroot; used below to locate files under outputs/.
root_dir = str(here())

# Directory prefix of the input parquet files read by the experiment cell.
data_dir = '~/data/interim/'

# Global matplotlib font settings applied to every figure in this notebook.
matplotlib.rcParams.update({'font.size': 20})
matplotlib.rcParams.update({'font.family': 'DejaVu Serif'})

In [None]:
# Alternative input: concatenate the trials of two separately saved studies.
# study = joblib.load(root_dir+'/outputs/log_20parts_10level_1anomaly.pkl')
# study_b = joblib.load(root_dir+'/outputs/log_20parts_10level_1anomaly_b.pkl')
# df_study = pd.concat([study.trials_dataframe(), study_b.trials_dataframe()])

# Load the saved study object and flatten its trials into a DataFrame, dropping
# the timing columns that are not needed for the comparison below.
# NOTE: joblib.load is pickle-based and can execute arbitrary code; only load
# study files produced by this project.
study = joblib.load(root_dir+'/outputs/log_20parts_10.0level_1anomaly_nokmeans.pkl')
df_study = study.trials_dataframe().drop(columns=['datetime_start', 'datetime_complete', 'duration'])

# Names of the tuned hyperparameters (keys of the best trial's parameters).
params = list(study.best_params.keys())

# Strip the prefixes added by trials_dataframe() so columns carry the bare names.
df_study.columns = df_study.columns.str.replace("user_attrs_", "")
df_study.columns = df_study.columns.str.replace("params_", "")

# Build a NEW list: the previous `columns_to_show = params` followed by
# `.extend(...)` aliased the list and silently appended the metric columns to
# `params` as well.
columns_to_show = params + ['number', 'state',
                            'value_cscore', 'std_cscore', 'min_cscore', 'mean_auc', 'std_auc',
                            # 'cscore_list', 'auc_list'
                            ]

df_study = df_study[columns_to_show]
# Last expression of the cell: display the trials ordered by clustering score.
df_study.sort_values('value_cscore')

In [None]:
def train_cluster(N_epochs, model, X, G, device, weight_loss=0.25, lr=1e-3):
    """Train ``model`` from scratch on one sample and return its cluster assignments.

    The model parameters are re-initialised with ``model.reset_parameters()``
    and then optimised with Adam for ``N_epochs`` full-batch steps on the loss
    ``loss_mc + weight_loss * loss_o``, where both terms are returned by the
    model's forward pass.

    Parameters
    ----------
    N_epochs : int
        Number of optimisation steps; must be at least 1.
    model : torch.nn.Module
        Called as ``model(X, A, None)`` and expected to return
        ``(S, loss_mc, loss_o)``. Must provide ``reset_parameters()``.
    X : array-like
        Input data for one sample; converted with ``torch.tensor``.
    G : graph object
        Must expose ``G.W`` with a ``toarray()`` method; the dense result is
        used as a float adjacency/weight matrix.
    device : torch.device
        Device on which the tensors are placed.
    weight_loss : float, optional
        Weight of ``loss_o`` in the total loss.
    lr : float, optional
        Adam learning rate.

    Returns
    -------
    torch.Tensor
        ``S`` from the last forward pass (i.e. computed before the final
        optimiser step), detached from the autograd graph.

    Raises
    ------
    ValueError
        If ``N_epochs`` is smaller than 1 (no forward pass would be run, so
        there would be nothing to return).
    """
    if N_epochs < 1:
        raise ValueError(f"N_epochs must be >= 1, got {N_epochs}")

    X = torch.tensor(X).to(device)
    # Dense float copy of the graph weight matrix.
    A = torch.tensor(G.W.toarray()).float().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # K-means coordinate features are disabled in this notebook; the model
    # receives None in their place. To re-enable:
    # C = torch.tensor(G.coords)
    # num_clusters_per_feature = [5, 6]
    # kmeans_feats = models.kmeans_features(C, num_clusters_per_feature).to(device).float()
    kmeans_feats = None

    model.train()
    model.reset_parameters()
    for _ in tqdm(range(N_epochs)):
        optimizer.zero_grad()
        S, loss_mc, loss_o = model(X, A, kmeans_feats)
        loss = loss_mc + weight_loss * loss_o
        loss.backward()
        optimizer.step()

    # Detach so callers that keep S around do not also keep the whole
    # autograd graph of the last forward pass alive.
    return S.detach()

def evaluate_model(model, weight_loss, N_epochs, data, labels, data_dfs, G, nodes_orig, device):
    """Train and score the model independently on every sample in ``data``.

    For each index ``i`` along the first axis of ``data`` the model is
    retrained with ``train_cluster`` on ``data[i, :, :]`` and the resulting
    assignments are scored with ``get_score`` against ``data_dfs[i]``.

    Parameters
    ----------
    model : torch.nn.Module
        Model handed to ``train_cluster`` for every sample.
    weight_loss : float
        Loss weight forwarded to ``train_cluster``.
    N_epochs : int
        Number of training steps per sample.
    data : array-like
        Indexed as ``data[i, :, :]``; ``data.shape[0]`` is the number of samples.
    labels : array-like
        Accepted for interface compatibility; not used by this function.
    data_dfs : sequence
        ``data_dfs[i]`` is the anomaly DataFrame passed to ``get_score``.
    G : graph object
        Forwarded to ``train_cluster``.
    nodes_orig : pandas.DataFrame
        Node table; a fresh copy is annotated for every sample.
    device : torch.device
        Forwarded to ``train_cluster``.

    Returns
    -------
    tuple of lists
        ``(cluster_score_list, auc_list, S_list, nodes_list)``, one entry per
        sample. Tensors in ``S_list`` are detached from the autograd graph.
    """
    cluster_score_list = []
    auc_list = []
    S_list = []
    nodes_list = []
    for i in range(data.shape[0]):
        X = data[i, :, :]
        df_anomaly = data_dfs[i]

        # get_score adds columns in place, so never hand it the caller's frame.
        nodes = nodes_orig.copy()

        # Detach before storing: otherwise every kept assignment matrix would
        # also keep the autograd graph of its last forward pass in memory.
        S = train_cluster(N_epochs, model, X, G, device, weight_loss).detach()
        S_list.append(S)

        cluster_score, auc, nodes = get_score(nodes, df_anomaly, S)
        nodes_list.append(nodes)

        cluster_score_list.append(cluster_score)
        auc_list.append(auc)

    return cluster_score_list, auc_list, S_list, nodes_list

def get_score(nodes, df_anomaly, S):
    """Annotate ``nodes`` with predictions and score them against the anomaly labels.

    Columns written to ``nodes`` (in place): ``pred`` (arg-max cluster of
    ``S``), ``score`` (largest soft-max value of each row of ``S``),
    ``anomaly`` (per-``pid`` maximum of ``df_anomaly.anomaly``) and
    ``new_pred`` (cluster ids translated into anomaly labels).

    Returns ``(cluster_score, auc, nodes)`` where ``cluster_score`` is the F1
    score of ``new_pred`` versus ``anomaly`` and ``auc`` is computed from
    ``score`` versus ``anomaly > 0``.
    """
    # --- per-node quantities derived from the assignment matrix ------------
    hard_cluster = S.argmax(dim=1).cpu().numpy()
    nodes['pred'] = hard_cluster

    confidence = S.softmax(dim=-1).detach().cpu().numpy().max(axis=1)
    nodes['score'] = confidence

    # Assigned positionally: assumes the rows of `nodes` follow the sorted
    # `pid` order produced by the groupby — TODO confirm against caller.
    anomaly_by_pid = df_anomaly[['pid', 'anomaly']].groupby('pid').anomaly.max()
    nodes['anomaly'] = anomaly_by_pid.values

    # --- which cluster is the most frequent one for each anomaly label -----
    def _first_mode(cluster_ids):
        return cluster_ids.mode()[0]

    anomalous_nodes = nodes[nodes['anomaly'] != 0]
    dominant_clusters = anomalous_nodes.groupby('anomaly')['pred'].apply(_first_mode)

    # --- translate cluster ids into anomaly labels -------------------------
    nodes['new_pred'] = nodes['pred']
    in_dominant_cluster = nodes['pred'].isin(dominant_clusters.values)
    # Clusters that are not dominant for any anomaly are parked at -1 first ...
    nodes.loc[~in_dominant_cluster, 'new_pred'] = -1

    # ... the remaining clusters take the largest anomaly label they contain ...
    group_max_anomaly = nodes.groupby('new_pred')['anomaly'].transform('max')
    nodes.loc[nodes['new_pred'] != -1, 'new_pred'] = group_max_anomaly
    # ... and the parked nodes finally become label 0.
    nodes.loc[nodes['new_pred'] == -1, 'new_pred'] = 0

    # --- metrics ------------------------------------------------------------
    if df_anomaly.anomaly.nunique() == 2:
        average = 'binary'
    else:
        average = 'weighted'
    cluster_score = f1_score(y_true=nodes.anomaly, y_pred=nodes.new_pred, average=average)

    tpr, fpr, _ = roc_params(metric=nodes.score, label=(nodes.anomaly>0), interp=True)
    auc = compute_auc(tpr, fpr)

    return cluster_score, auc, nodes

In [None]:
# Experiment: retrain and score the clustering model for several training
# lengths, collecting per-sample scores, assignment matrices and annotated
# node tables for each entry of `epochs_list`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Full sweep used previously (kept for reference):
# epochs_list = [100, 250, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500, 10000]
epochs_list = [500, 1000]
# One entry per element of epochs_list; each entry is the list returned by
# evaluate_model for that training length.
cscore_results = []
auc_results = []
S_results = []
nodes_results = []

for N_epochs in epochs_list:

    print(N_epochs)

    # Re-seed at the start of every iteration so each training length starts
    # from the same RNG state for data generation and model initialisation.
    MANUAL_SEED = 0
    torch.manual_seed(MANUAL_SEED)  # seed the CPU random number generator
    torch.cuda.manual_seed(MANUAL_SEED)  # seed the GPU (CUDA) random number generator
    np.random.seed(MANUAL_SEED)


    # OBTAINING DATA
    # NOTE(review): loading and graph construction do not depend on N_epochs
    # but are repeated every iteration. Hoisting them out of the loop is only
    # safe if the fd/utils helpers neither mutate their inputs nor consume
    # random numbers — confirm before changing.
    dataset = 'df_StOlavs_D1L2B'
    df_orig = pd.read_parquet(data_dir + f'{dataset}.parq')

    df, nodes = fd.treat_nodes(df_orig)
    # First pass: only the per-node subgraph label is kept (graph itself discarded).
    _, nodes['subgraph'] = fd.NNGraph(nodes, radius=15, subgraphs=True)

    # value_counts() sorts by frequency, so index[0] is the subgraph label
    # shared by the most nodes; restrict nodes, graph and data to it.
    main_graph = nodes.subgraph.value_counts().index[0]
    nodes = nodes.query('subgraph==@main_graph').copy()
    G = fd.NNGraph(nodes, radius=15)
    df = df[df.pid.isin(nodes.pid.unique())].copy()

    # `labels` is forwarded to evaluate_model; scoring there is based on data_dfs.
    data, labels, data_dfs = utils.generate_cluster_anomaly(df, nodes, G, data_size=25)

    # Size of the third axis of `data`, passed to the model constructor.
    n_timestamps = data.shape[2]

    # Possible hyperparameters
    n_clusters = 10
    n_extra_feats = 0  # presumably 0 because k-means features are disabled (train_cluster passes None) — TODO confirm
    conv1d_n_feats = 3
    conv1d_kernel_size = 60
    conv1d_stride = 30
    graphconv_n_feats = 30

    # weight_loss multiplies loss_o inside train_cluster; weight_coords is a
    # constructor argument of the model.
    weight_loss = 1
    weight_coords = 0.25

    model = models.ClusterTS(conv1d_n_feats, conv1d_kernel_size, conv1d_stride, graphconv_n_feats,
                    n_timestamps, n_clusters, n_extra_feats, weight_coords)
    model = model.to(device)

    # np.random.seed(32)
    # The same model object is reused for every sample; train_cluster resets
    # its parameters before each training run.
    cluster_score_list, auc_list, S_list, nodes_list = evaluate_model(model, weight_loss, N_epochs, data, labels, data_dfs, G, nodes, device)

    cscore_results.append(cluster_score_list)
    auc_results.append(auc_list)
    S_results.append(S_list)
    nodes_results.append(nodes_list)


In [None]:
# epochs_list = [100, 250, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500, 10000]

outfile = 'testing_epochs_nokmeans.pkl'
pickle_dir = '../../outputs/pickles/'
pickle_path = pickle_dir + outfile

# Create the output directory if needed so the dump cannot fail after a long
# training run just because the folder is missing.
os.makedirs(pickle_dir, exist_ok=True)

# Store detached CPU copies of the assignment matrices: tensors pickled on a
# CUDA device cannot be unpickled on a machine without a GPU, and attached
# tensors would come back with requires_grad set.
S_results_cpu = [[assignment.detach().cpu() for assignment in per_epoch]
                 for per_epoch in S_results]

# Save variables
with open(pickle_path, 'wb') as f:
    pickle.dump([epochs_list, cscore_results, auc_results, S_results_cpu, nodes_results], f)

# Load variables
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(pickle_path, 'rb') as f:
    epochs_list, cscore_results, auc_results, S_results, nodes_results = pickle.load(f)

In [None]:
# Mean score for each training length, averaged over all evaluated samples.
mean_cscore = [np.mean(scores) for scores in cscore_results]
mean_auc = [np.mean(scores) for scores in auc_results]

# Label the axes and title each figure so it can be read on its own
# (array inputs are otherwise shown with the default names "x" and "y").
px.line(x=epochs_list, y=mean_cscore, width=700,
        labels={'x': 'Training epochs', 'y': 'Mean clustering score'},
        title='Clustering score vs. training epochs').show()
px.line(x=epochs_list, y=mean_auc, width=700,
        labels={'x': 'Training epochs', 'y': 'Mean AUC'},
        title='AUC vs. training epochs').show()

In [None]:
# Select one stored run: `index_epoch` picks the entry of epochs_list,
# `index` picks the sample within that run.
index_epoch = 1
index = 3

S_list, nodes_list = S_results[index_epoch], nodes_results[index_epoch]
S, nodes = S_list[index], nodes_list[index]
# Boolean per-node ground truth: True where the node carries any anomaly label.
label = (nodes.anomaly>0).values

print(f'Epochs: {epochs_list[index_epoch]}, case: {index}')
print(f'Clustering score: {cscore_results[index_epoch][index]}')
print(f'AUC: {auc_results[index_epoch][index]}')

# Discrete viridis palette with one colour per distinct anomaly label.
n_labels = nodes.anomaly.nunique()
label_cmap = ListedColormap(plt.cm.viridis(np.linspace(0, 1, n_labels)))

plotting_params = {'edge_color':'darkgray', 'edge_width':1.5,'vertex_color':'black', 'vertex_size':50}


def _draw_signal(signal, axis, title, cmap):
    """Plot `signal` on graph G in `axis`, recolour it with `cmap`, hide the axes."""
    # NOTE: G is the graph left over from the experiment cell; it is not part
    # of the pickled results.
    G.plot_signal(signal, ax=axis, plot_name=title)
    axis.collections[0].set_cmap(cmap)  # Modify the colormap of the plotted data
    axis.axis('off')


# Figure 1: ground-truth label next to the raw arg-max clustering.
fig, ax = plt.subplots(ncols=2, figsize=(16,5))
G.plotting.update(plotting_params)
_draw_signal(label, ax[0], 'Label', label_cmap)
_draw_signal(np.array(S.argmax(dim=1).cpu()), ax[1], 'Clustering', 'viridis')

plt.show()

# Figure 2: clustering mapped to anomaly labels next to the per-node score.
fig, ax = plt.subplots(ncols=2, figsize=(16,5))
G.plotting.update(plotting_params)
_draw_signal(nodes.new_pred.values, ax[0], 'Anomaly-adjusted Clustering', label_cmap)
_draw_signal(S.softmax(dim=-1).detach().cpu().numpy().max(axis=1), ax[1], 'Anomaly Score', 'viridis')

plt.show()

In [None]:
def _show_map(color, title, hover):
    """Draw the node map coloured by column `color`, with `hover` as hover data."""
    return utils.visualize_map(nodes, color=color, size=np.ones(nodes.pid.nunique()), size_max=10, title=title,
                               hover_data=[hover], zoom=15, figsize=(600,600), colormap='viridis')


# Same map four times, coloured by: ground truth, raw clustering,
# anomaly-adjusted clustering and per-node score.
_show_map('anomaly', 'Label', 'cluster')

_show_map('pred', 'Clustering', 'cluster')

_show_map('new_pred', 'Anomaly-adjusted Clustering', 'anomaly')

# Kept as the final bare expression so its return value is still the cell output.
_show_map('score', 'Anomaly score', 'cluster')