In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import contextily as cx
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt

import os
import matplotlib
import subprocess
import torch
import joblib
import glob
import copy

from IPython.display import display_html
from shapely.geometry import MultiPoint
from sklearn.cluster import KMeans
from tsmoothie import LowessSmoother, ExponentialSmoother
from pyprojroot import here
from scipy.spatial import ConvexHull

import source.nn.models as models
import source.utils.utils as utils
import source.utils.fault_detection as fd

from source.utils.utils import roc_params, compute_auc, get_auc, best_mcc, best_f1score, otsuThresholding
from source.utils.utils import synthetic_timeseries
from source.utils.utils import plotly_signal

from importlib import reload
# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel.
models, utils, fd = reload(models), reload(utils), reload(fd)

from pyprojroot import here
root_dir = str(here())

# Locations of raw, interim and prepared data.
insar_dir = os.path.expanduser('~/data/raw/')
data_path = root_dir + '/data/interim/'
dataset_path = root_dir + "/data/datasets/"

# Global matplotlib look: large serif fonts.
matplotlib.rcParams.update({
    'font.size': 20,
    'font.family': 'DejaVu Serif',
})

# Let pandas show wide/long frames without truncation.
for option, value in (('display.max_rows', 500), ('display.max_columns', None)):
    pd.set_option(option, value)

# NOTE(review): hard-coded GPU index; adjust to the machine in use.
device = 'cuda:3'

# Model classes whose forward pass takes edge_index / edge_weight.
graph_models = (
    models.GCN2MLP,
    models.GConv2MLP,
    models.GCNAE,
    models.GConvAE,
    models.GUNet,
)
model_names = ['AE', 'GCN2MLP', 'GCNAE', 'GConv2MLP', 'GConvAE', 'GUNet', 'RAE_GRU', 'RAE_LSTM']


In [None]:
def get_model_from_name(model_name, n_nodes, input_dim, params, device='cuda'):
    """Build an untrained model for ``model_name`` from a hyperparameter mapping.

    Parameters
    ----------
    model_name : str
        Names containing ``'RAE'`` build ``models.RAE`` (GRU cell when the
        name also contains ``'GRU'``, LSTM otherwise). Names containing
        ``'GUNet'`` build ``models.GUNet``. Any other name is looked up as an
        attribute of ``models`` and constructed from ``layer_dims``.
    n_nodes : int
        Passed to ``models.RAE`` as ``n_features``; unused by the other models.
    input_dim : int
        Input width; first entry of ``layer_dims`` and the in/out channel
        count of the GUNet.
    params : mapping
        Hyperparameters. Required keys depend on the branch:
        ``latent_dim`` (RAE); ``hidden_channels``, ``depth``, ``pool_ratios``
        (GUNet); ``n_layers`` and ``layer_dim_0 .. layer_dim_{n_layers-1}``
        (all other models).
    device : str
        Device the model is moved to (and, for RAE, also passed to the
        constructor).

    Returns
    -------
    The instantiated model, moved to ``device``.

    Raises
    ------
    KeyError
        If a required hyperparameter is missing from ``params``.
    AttributeError
        If ``model_name`` does not match any class in ``models``.
    """

    def _whole(value):
        # Hyperparameters read back from a pandas row can arrive as floats
        # (e.g. 3.0 instead of 3); turn integral floats back into ints so
        # they are usable as sizes and loop bounds.
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    if 'RAE' in model_name:
        model_class = models.RAE
        model_params = {
            'n_features': n_nodes,
            'latent_dim': _whole(params['latent_dim']),
            'rnn_type': 'GRU' if 'GRU' in model_name else 'LSTM',
            'rnn_act': 'relu',
            'device': device,
        }
    elif 'GUNet' in model_name:
        model_class = models.GUNet
        model_params = {
            'in_channels': input_dim,
            'out_channels': input_dim,
            'hidden_channels': _whole(params['hidden_channels']),
            'depth': _whole(params['depth']),
            'pool_ratios': params['pool_ratios'],
        }
    else:  # Covers AE, GCN, and GConv models
        n_layers = _whole(params['n_layers'])
        layer_dims = [input_dim]
        layer_dims.extend(int(params[f'layer_dim_{i}']) for i in range(n_layers))
        model_class = getattr(models, model_name)
        model_params = {'layer_dims': layer_dims}

    return model_class(**model_params).to(device)

def compute_num_parameters(study_file, n_nodes, input_dim, device='cpu'):
    """Return the study's trials table with a ``num_parameters`` column.

    For every trial of the study stored in ``study_file`` the corresponding
    model is instantiated and its trainable parameters are counted.

    The model name is taken from the file name by dropping the first three
    and the last four characters (presumably a ``TR_`` prefix and a ``.pkl``
    extension, e.g. ``TR_GCNAE.pkl`` -> ``GCNAE``).
    """
    study = joblib.load(study_file)
    model_name = os.path.basename(study_file)[3:-4]

    def count_trainable(trial):
        # Instantiate the trial's architecture and count trainable weights.
        candidate = get_model_from_name(model_name, n_nodes, input_dim, trial.params, device=device)
        return sum(p.numel() for p in candidate.parameters() if p.requires_grad)

    parameter_counts = [count_trainable(trial) for trial in study.trials]

    df = study.trials_dataframe()
    df['num_parameters'] = parameter_counts
    return df


# Select, among the near-best trials of a study, the smallest architecture.
def get_model_with_fewest_params(file, n_nodes, input_dim, top_factor=0.995, device='cuda'):
    """Pick the smallest model among the trials with near-best AUC.

    Parameters
    ----------
    file : str
        Path of the pickled study; the model name is derived from the file
        name by dropping its first three and last four characters.
    n_nodes, input_dim : int
        Forwarded to the model constructor helper.
    top_factor : float
        A trial is kept when ``value_auc > top_factor * max(value_auc)``.
    device : str
        Device on which the candidate models are instantiated.

    Returns
    -------
    (model_name, model_info) where ``model_info`` has the keys
    ``'model_example'`` (a freshly built model), ``'trial_params'`` (the
    selected trial's hyperparameters, without the ``params_`` column prefix)
    and ``'num_parameters'``.

    Raises
    ------
    ValueError
        If no trial passes the AUC filter.
    """
    # Study table with one extra column: number of trainable parameters.
    df_study = compute_num_parameters(file, n_nodes, input_dim, device=device)

    # Keep only the trials whose AUC is within `top_factor` of the best one.
    top_trials = df_study.query('value_auc > @top_factor * value_auc.max()')
    if top_trials.empty:
        raise ValueError(
            f"No trial in {file} satisfies value_auc > {top_factor} * max(value_auc)"
        )

    # Among those, take the trial with the fewest parameters.
    best_trial = top_trials.loc[top_trials['num_parameters'].idxmin()]

    # Hyperparameter columns are named 'params_<name>'. Strip the prefix only
    # (str.replace would also remove later occurrences inside the name).
    # Values that were not sampled in this trial may be NaN.
    prefix = 'params_'
    trial_params = {
        column[len(prefix):]: value
        for column, value in best_trial.items()
        if column.startswith(prefix)
    }

    model_name = os.path.basename(file)[3:-4]
    model = get_model_from_name(model_name, n_nodes, input_dim, trial_params, device=device)

    # AUC values of the selected trial are intentionally not stored here.
    model_info = {
        'model_example': model,
        'trial_params': trial_params,
        'num_parameters': best_trial['num_parameters'],
    }
    return model_name, model_info


def train_model(model, X, lr, n_epochs, edge_index=None, edge_weight=None, batch_size=None):
    """Train ``model`` to reconstruct ``X`` and return the last forward output.

    The random generators are seeded with 0 and the model parameters are
    reset before training. Training runs full-batch Adam on an MSE
    reconstruction loss for ``n_epochs`` steps. ``batch_size`` is accepted
    but not used.

    Returns the output of the final forward pass (computed before the last
    optimizer step).
    """
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    model.train()
    model.reset_parameters()

    def forward():
        # RAE models get X transposed with a leading batch axis; the result
        # is brought back to the layout of X.
        if isinstance(model, models.RAE):
            return model(X.T.unsqueeze(0)).squeeze(0).T
        # Graph models additionally receive the graph structure.
        if isinstance(model, graph_models):
            return model(X, edge_index=edge_index, edge_weight=edge_weight)
        return model(X)

    for _ in range(n_epochs):
        optimizer.zero_grad()
        output = forward()
        loss = criterion(output, X)
        loss.backward()
        optimizer.step()

    return output

def pixel_mse(output,X):
    """Mean squared error between ``output`` and ``X``, averaged over axis 1.

    Returns one value per row (first axis) of the inputs.
    """
    squared_error = torch.nn.functional.mse_loss(output, X, reduction='none')
    return squared_error.mean(dim=1)



In [None]:
dataset_path = root_dir + "/data/datasets/"

# Create the output folder up front so that a missing directory cannot make
# torch.save fail after the model search has already run.
output_dir = os.path.join(root_dir, 'outputs/Optuna_analysis')
os.makedirs(output_dir, exist_ok=True)

for dataset_name in ['Geological_anomaly', 'EGMS_anomaly']:
    print(f"\nProcessing dataset {dataset_name}")

    datafile = f'{dataset_name}/Training/dataset.pt'
    datasets = torch.load(dataset_path + datafile)

    # Sorted so that the order of entries in model_dict is reproducible
    # (glob returns files in arbitrary order).
    optuna_files = sorted(glob.glob(root_dir + f'/outputs/HP_training/{dataset_name}/*.pkl'))

    # Models are sized for the average number of rows across the datasets and
    # for the number of columns of the first one.
    n_nodes = int(np.mean([dataset['data'].shape[0] for dataset in datasets]))
    input_dim = datasets[0]['data'].shape[1]

    print('Creating best model dictionary')
    model_dict = {}
    for file in optuna_files:
        model_name, model_info = get_model_with_fewest_params(file, n_nodes, input_dim, device=device)
        model_dict[model_name] = model_info

    # Save the model_dict for the current dataset
    output_path = os.path.join(output_dir, f'model_dict_{dataset_name}.pkl')
    torch.save(model_dict, output_path)
    print(f"\nSaved model_dict for {datafile} to {output_path}")


In [None]:
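# Prerequisite: run Epoch_analysis.py before the cells below (they load the Epochs_<dataset>.pkl files, presumably written by that script).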

In [None]:
def summarize_epoch_data(dataset_name):
    """Load the per-epoch results of ``dataset_name`` and add summary curves.

    For each model, the AUC and loss evolutions of every (dataset, seed) run
    are pooled and reduced per epoch. The original notes mention 25 seeds
    and 108 datasets.

    Two keys are added to each model entry:
    ``'mean_auc'``  - mean AUC per epoch across all runs;
    ``'mean_loss'`` - despite its name, the *median* loss per epoch across
    all runs (the key is kept because the plotting cells read it).
    """
    epoch_data = torch.load(root_dir + f'/outputs/Optuna_analysis/Epochs_{dataset_name}.pkl')

    for model_data in epoch_data.values():
        # Flatten [dataset][seed] -> one list of runs.
        auc_runs = [run for per_dataset in model_data['auc_evolution'] for run in per_dataset]
        loss_runs = [run for per_dataset in model_data['loss_evolution'] for run in per_dataset]

        model_data['mean_auc'] = np.mean(auc_runs, axis=0)
        model_data['mean_loss'] = np.median(loss_runs, axis=0)

    return epoch_data


epoch_data_geological = summarize_epoch_data('Geological_anomaly')
epoch_data_egms = summarize_epoch_data('EGMS_anomaly')

# Leave the same "current" names behind as before: the last dataset processed.
dataset_name = 'EGMS_anomaly'
epoch_data = epoch_data_egms

In [None]:
# Grid of per-model loss curves for the Geological dataset (2 rows x 4 columns).
epoch_data = epoch_data_geological
n_cols = 4
n_rows = 2
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5))  # Create subplots

# The 'mean_loss' entry is plotted (computed upstream with np.median).
variable = 'mean_loss'
(xmin, xmax) = (0, 150)  # X-axis limits
(ymin, ymax) = (10, 100)  # Y-axis limits, identical for every model

# Flatten axes for easier iteration
axes_flat = axes.flatten()

# Panels with an index below this value are not in the bottom row.
first_bottom_idx = len(model_names) - n_cols

for idx, model_name in enumerate(model_names):
    ax = axes_flat[idx]
    median_loss = epoch_data[model_name][variable]
    ax.plot(median_loss, linewidth=2)

    # Only show x labels and tick labels for the bottom row
    if idx < first_bottom_idx:
        ax.set_xticklabels([])
        ax.set_xlabel('')
    else:
        ax.set_xlabel("Epoch", fontsize=12)

    # Only show the y label for the leftmost column
    if idx % n_cols == 0:
        ax.set_ylabel("Loss", fontsize=12)

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    ax.grid(True, linestyle='--', alpha=0.25)
    ax.set_title(f"{model_name.replace('_','')}", fontsize=14)
    ax.tick_params(axis='both', labelsize=10)

# Adjust layout, save and show the figure.
# NOTE(review): the file is named "AUC_evo" although the panels show loss;
# the name is kept because other material may reference it.
plt.tight_layout()
fig_path = os.path.join(root_dir, "outputs/figs/AUC_evo_Geological.pdf")
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, format="pdf")
plt.show()


In [None]:
# Grid of per-model loss curves for the EGMS dataset (2 rows x 4 columns).
epoch_data = epoch_data_egms

n_cols = 4
n_rows = 2
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5))

# The 'mean_loss' entry is plotted on a common epoch range.
variable = 'mean_loss'
(xmin, xmax) = (0, 150)

axes_flat = axes.flatten()
n_models = len(model_names)

for idx, model_name in enumerate(model_names):
    panel = axes_flat[idx]
    median_loss = epoch_data[model_name][variable]
    panel.plot(median_loss, linewidth=2)

    # The AE panel uses a wider y-range than the other models.
    (ymin, ymax) = (10, 80) if model_name in ['AE'] else (70, 90)

    in_bottom_row = not (idx < n_models - n_cols)
    if in_bottom_row:
        panel.set_xlabel("Epoch", fontsize=12)
    else:
        # Upper rows: hide the x tick labels and the axis label.
        panel.set_xticklabels([])
        panel.set_xlabel('')

    # y label on the leftmost column only
    if idx % n_cols == 0:
        panel.set_ylabel("Loss", fontsize=12)

    panel.set_xlim(xmin, xmax)
    panel.set_ylim(ymin, ymax)
    panel.grid(True, linestyle='--', alpha=0.25)
    panel.set_title(f"{model_name.replace('_','')}", fontsize=14)
    panel.tick_params(axis='both', labelsize=10)

# Finalise, save and display.
plt.tight_layout()
plt.savefig(os.path.join(root_dir, "outputs/figs/AUC_evo_EGMS.pdf"), format="pdf")
plt.show()


In [None]:
# Single panel: loss curves of all models in `epoch_data`, each labelled by an
# arrow annotation anchored at epoch `x_pos`.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# One marker style per curve
markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p']

variable = 'mean_loss'
(xmin, xmax) = (0, 150)  # X-axis limits
(ymin, ymax) = (20, 50)  # Y-axis limits
x_pos = 60      # epoch at which the annotation arrows attach to the curves
x_offset = 20   # horizontal distance between arrow tip and label

# Spread the labels vertically: rank the curves by their value at x_pos and
# turn the ranks into offsets centred on zero (2 units apart).
y_positions = [curves[variable][x_pos] for curves in epoch_data.values()]
ranking = 2 * np.argsort(np.argsort(y_positions)[::-1])
offsets = ranking - np.mean(ranking)

for (model_name, curves), marker, offset in zip(epoch_data.items(), markers, offsets):
    median_loss = curves[variable]
    (line,) = ax.plot(median_loss, label=model_name, linewidth=2,
                      marker=marker, markevery=5, markersize=4)
    color = line.get_color()  # reuse the curve colour for arrow and text

    y_pos = median_loss[x_pos]

    # Keep the label inside the visible y-range. The three checks are applied
    # in this order, each one seeing the result of the previous one.
    if (y_pos - offset) < ymin:
        offset = y_pos - ymin
    if offset > y_pos:
        offset = 0
    if (y_pos - offset) > ymax:
        offset = y_pos - ymax + (ymax - ymin) / 25

    ax.annotate(model_name.replace("_", ""),
                xy=(x_pos, y_pos),
                xytext=(x_pos + x_offset, y_pos - offset),
                arrowprops=dict(arrowstyle='->', color=color),
                fontsize=14,
                color=color)

ax.set_xlabel("Epoch", fontsize=16)
ax.set_ylabel("Loss", fontsize=16)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.grid(True, linestyle='--', alpha=0.25)
ax.set_title("Median Loss Evolution", fontsize=16)
ax.tick_params(axis='both', labelsize=14)

plt.tight_layout()
plt.show()


In [None]:
# Two panels for the models in `epoch_data`: loss curves (left) and AUC curves
# (right), each curve labelled with an annotation anchored at `x_pos`.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_loss, ax_auc = axes

# ---- Left panel: 'mean_loss' ----
variable = 'mean_loss'
(xmin, xmax) = (0, 150)   # X-axis limits
(ymin, ymax) = (70, 100)  # Y-axis limits
x_pos = 40                # epoch at which the labels attach to the curves
x_offset = 10

# Rank the curves at x_pos and convert the ranks to zero-centred vertical
# offsets so that the labels do not overlap.
y_positions = [curves[variable][x_pos] for curves in epoch_data.values()]
ranking = 1 * np.argsort(np.argsort(y_positions)[::-1])
offsets = ranking - np.mean(ranking)

for (model_name, curves), offset in zip(epoch_data.items(), offsets):
    median_loss = curves[variable]
    ax_loss.plot(median_loss, label=model_name, linewidth=2)

    y_pos = median_loss[x_pos]
    # Drop the offset when it would push the label below the axis range or
    # when it exceeds the curve value itself.
    if (y_pos - offset) < ymin or offset > y_pos:
        offset = 0

    ax_loss.annotate(model_name.replace("_", ""), xy=(x_pos, y_pos),
                     xytext=(x_pos + x_offset, y_pos - offset),
                     arrowprops=dict(facecolor='black', arrowstyle='-'),
                     fontsize=12)

ax_loss.set_xlabel("Epoch", fontsize=16)
ax_loss.set_ylabel("Loss", fontsize=16)
ax_loss.set_xlim(xmin, xmax)
ax_loss.set_ylim(ymin, ymax)
ax_loss.grid(True, linestyle='--', alpha=0.25)
ax_loss.set_title("Median Loss Evolution", fontsize=16)
ax_loss.tick_params(axis='both', labelsize=14)

# ---- Right panel: 'mean_auc' ----
variable = 'mean_auc'
(xmin, xmax) = (0, 25)     # X-axis limits
(ymin, ymax) = (0.7, 1.0)  # Y-axis limits
x_pos = 1                  # epoch at which the labels attach to the curves
x_offset = 4

y_positions = [curves[variable][x_pos] for curves in epoch_data.values()]
ranking = 0.01 * np.argsort(np.argsort(y_positions)[::-1])
offsets = ranking - np.mean(ranking)

for (model_name, curves), offset in zip(epoch_data.items(), offsets):
    mean_auc = curves[variable]
    ax_auc.plot(mean_auc, label=model_name, linewidth=2)

    y_pos = mean_auc[x_pos]
    if (y_pos - offset) < ymin:
        offset = 0

    ax_auc.annotate(model_name.replace("_", ""), xy=(x_pos, y_pos),
                    xytext=(x_pos + x_offset, y_pos - offset),
                    arrowprops=dict(facecolor='black', arrowstyle='-'),
                    fontsize=12)

ax_auc.set_xlabel("Epoch", fontsize=16)
ax_auc.set_ylabel("AUC", fontsize=16)
ax_auc.set_xlim(xmin, xmax)
ax_auc.set_ylim(ymin, ymax)
ax_auc.grid(True, linestyle='--', alpha=0.25)
ax_auc.set_title("Mean AUC Evolution", fontsize=16)
ax_auc.set_xticks([0, 5, 10, 15])
ax_auc.tick_params(axis='both', labelsize=14)

# Adjust layout and show the figure
plt.tight_layout()
# plt.savefig(os.path.join(root_dir, "outputs/figs/training_Geological.pdf"), format="pdf")
plt.show()


In [None]:
# Load the Geological datasets and the stored model dictionary, then reshape
# the per-dataset AUC / loss evolutions into a long-format DataFrame with one
# row per (model, dataset, epoch).
dataset_name = 'Geological_anomaly'
datasets = torch.load(dataset_path + f'{dataset_name}/Training/dataset.pt')
model_dict = torch.load(root_dir + f'/outputs/Optuna_analysis/model_dict_{dataset_name}.pkl')

data = []
for model_name, model_info in model_dict.items():
    auc_evolutions = model_info['auc_evolution']
    loss_evolutions = model_info['loss_evolution']

    for dataset_idx, auc_seq in enumerate(auc_evolutions):
        loss_seq = loss_evolutions[dataset_idx]
        # One record per epoch of this dataset's run.
        data.extend(
            {
                'model': model_name,
                'dataset_idx': dataset_idx,
                'epoch': epoch,
                'auc': auc_seq[epoch],
                'loss': loss_seq[epoch],
            }
            for epoch in range(len(auc_seq))
        )

df = pd.DataFrame(data)

In [None]:
def moving_average(sequence, window_size=50):
    """Smooth ``sequence`` with a uniform window of ``window_size`` samples.

    Only fully overlapping window positions are returned (``mode='valid'``),
    so the result is shorter than the input.
    """
    kernel = np.full(window_size, 1.0 / window_size)
    return np.convolve(sequence, kernel, mode='valid')

# For every model: summarise the loss/AUC evolutions across datasets and
# measure how strongly the smoothed AUC and loss curves are correlated.
for model_name, model_info in model_dict.items():
    print(f"\nProcessing model {model_name}")

    auc_evolutions = model_info['auc_evolution']
    loss_evolutions = model_info['loss_evolution']

    # Per-epoch summaries across datasets. The median curve is computed once
    # and reused for the start/end loss (average of its first/last 5 epochs).
    median_loss = np.median(loss_evolutions, axis=0)
    model_info['median_loss'] = median_loss
    model_info['mean_loss'] = np.mean(loss_evolutions, axis=0)
    model_info['start_loss'] = np.mean(median_loss[:5])
    model_info['end_loss'] = np.mean(median_loss[-5:])
    model_info['mean_auc'] = np.mean(auc_evolutions, axis=0)

    # A run counts as valid when its final loss is at most 95% of its
    # initial loss, i.e. training actually reduced the loss.
    valid_indices = [
        i for i, loss_seq in enumerate(loss_evolutions)
        if loss_seq[-1] <= 0.95 * loss_seq[0]
    ]

    print(f"Valid sequences: {len(valid_indices)}")
    if not valid_indices:  # Skip if no valid sequences
        model_info['mean_correlations'] = 0
        model_info['correlation_mean'] = 0
        model_info['valids'] = 0
        continue

    # Smooth the valid runs before correlating them
    filtered_auc = [moving_average(auc_evolutions[i]) for i in valid_indices]
    filtered_loss = [moving_average(loss_evolutions[i]) for i in valid_indices]

    # Mean of the per-run AUC/loss correlations (NaN correlations ignored)
    correlations_seq = [
        np.corrcoef(auc_curve, loss_curve)[0, 1]
        for auc_curve, loss_curve in zip(filtered_auc, filtered_loss)
    ]
    model_info['mean_correlations'] = np.nanmean(correlations_seq)

    # Correlation between the run-averaged AUC and loss curves
    avg_auc = np.mean(filtered_auc, axis=0)
    avg_loss = np.mean(filtered_loss, axis=0)
    model_info['correlation_mean'] = np.corrcoef(avg_auc, avg_loss)[0, 1]

    # Number of valid datasets
    model_info['valids'] = len(valid_indices)

print('')

# Summary
# NOTE(review): the 'auc' entry is not written by this cell; it must already be
# present in the loaded model_dict.
for model_name in model_dict.keys():
    print(f"{model_name:<10}: "
          f"AUC: {model_dict[model_name]['auc']:.3f} - "
          f"Valid Datasets: {model_dict[model_name]['valids']:<3} - "
          f"Mean Correlation: {model_dict[model_name]['mean_correlations']:<6.3f} - "
          f"Start Loss: {model_dict[model_name]['start_loss']:<10.3f} - "
          f"End Loss: {model_dict[model_name]['end_loss']:<6.3f}")

In [None]:
import matplotlib.pyplot as plt

# Two panels for the models in `model_dict`: median loss (left) and mean AUC
# (right); every curve gets a text label anchored at `x_pos`.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
loss_ax = axes[0]
auc_ax = axes[1]

# ---------------- median loss ----------------
variable = 'median_loss'
(xmin, xmax) = (0, 300)  # X-axis limits
(ymin, ymax) = (0, 80)   # Y-axis limits
x_pos = 50               # epoch at which the labels attach to the curves
x_offset = 60

# Vertical label offsets: rank of each curve at x_pos, spaced 8 units apart
# and centred on zero.
y_positions = [model_dict[name][variable][x_pos] for name in model_dict]
ranking = 8 * np.argsort(np.argsort(y_positions)[::-1])
offsets = ranking - np.mean(ranking)

for it, model_name in enumerate(model_dict.keys()):
    median_loss = model_dict[model_name][variable]
    loss_ax.plot(median_loss, label=model_name, linewidth=2)

    offset = offsets[it]
    y_pos = median_loss[x_pos]

    # No offset if the label would leave the plot at the bottom or if the
    # offset is larger than the curve value.
    if (y_pos - offset) < ymin or offset > y_pos:
        offset = 0

    loss_ax.annotate(model_name.replace("_", ""), xy=(x_pos, y_pos),
                     xytext=(x_pos + x_offset, y_pos - offset),
                     arrowprops=dict(facecolor='black', arrowstyle='-'),
                     fontsize=12)

loss_ax.set_xlabel("Epoch", fontsize=16)
loss_ax.set_ylabel("Loss", fontsize=16)
loss_ax.set_xlim(xmin, xmax)
loss_ax.set_ylim(ymin, ymax)
loss_ax.grid(True, linestyle='--', alpha=0.25)
loss_ax.set_title("Median Loss Evolution", fontsize=16)
loss_ax.tick_params(axis='both', labelsize=14)

# ---------------- mean AUC ----------------
variable = 'mean_auc'
(xmin, xmax) = (0, 15)      # X-axis limits
(ymin, ymax) = (0.85, 1.0)  # Y-axis limits
x_pos = 2                   # epoch at which the labels attach to the curves
x_offset = 4

y_positions = [model_dict[name][variable][x_pos] for name in model_dict]
ranking = 0.01 * np.argsort(np.argsort(y_positions)[::-1])
offsets = ranking - np.mean(ranking)

for it, model_name in enumerate(model_dict.keys()):
    mean_auc = model_dict[model_name][variable]
    auc_ax.plot(mean_auc, label=model_name, linewidth=2)

    offset = offsets[it]
    y_pos = mean_auc[x_pos]

    if (y_pos - offset) < ymin:
        offset = 0

    auc_ax.annotate(model_name.replace("_", ""), xy=(x_pos, y_pos),
                    xytext=(x_pos + x_offset, y_pos - offset),
                    arrowprops=dict(facecolor='black', arrowstyle='-'),
                    fontsize=12)

auc_ax.set_xlabel("Epoch", fontsize=16)
auc_ax.set_ylabel("AUC", fontsize=16)
auc_ax.set_xlim(xmin, xmax)
auc_ax.set_ylim(ymin, ymax)
auc_ax.grid(True, linestyle='--', alpha=0.25)
auc_ax.set_title("Mean AUC Evolution", fontsize=16)
auc_ax.set_xticks([0, 5, 10, 15])
auc_ax.tick_params(axis='both', labelsize=14)

# Finalise, save and display the figure
plt.tight_layout()
plt.savefig(os.path.join(root_dir, "outputs/figs/training_Geological.pdf"), format="pdf")
plt.show()


In [None]:
import plotly.graph_objects as go

# Interactive view of the mean AUC curve of every model in `model_names`.
auc_traces = [
    go.Scatter(
        y=model_dict[model_name]['mean_auc'],
        mode='lines',
        name=model_name,
        line=dict(width=2),
    )
    for model_name in model_names
]

fig = go.Figure()
fig.add_traces(auc_traces)

# Titles, theme, size and the visible epoch range (first 50 epochs).
layout_options = dict(
    title="Mean AUC Evolution for All Models",
    xaxis_title="Epoch",
    yaxis_title="Mean AUC",
    template="plotly_white",
    legend_title="Model Names",
    width=1000,
    height=600,
    xaxis_range=[0,50],
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Inspect the hyperparameters stored under 'trial_params' for the AE model.
model_dict['AE']['trial_params']

In [None]:
# Inspect the per-epoch median loss curve stored for GCN2MLP
# (the 'median_loss' entry is added by the correlation-analysis cell).
model_dict['GCN2MLP']['median_loss']

In [None]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Animated figure: smoothed loss (left axis) and smoothed AUC (right axis)
# versus epoch for one model, with a slider to step through the datasets.

LOSS_COLOR = '#1f77b4'
AUC_COLOR = '#ff7f0e'

# Rows of the model under inspection.
# NOTE(review): the name `df_ae` is historical; the rows selected are GCN2MLP's.
df_ae = df.query('model == "GCN2MLP"')
dataset_indices = sorted(df_ae['dataset_idx'].unique())

fig = go.Figure()

# Initial traces: dataset 0. Trace order (loss, AUC) must match the order
# used inside every frame below.
# NOTE(review): moving_average() uses mode='valid', so y has fewer points than
# x; confirm that the resulting epoch alignment is the intended one.
initial_df = df_ae.query('dataset_idx == 0')

fig.add_trace(go.Scatter(
    x=initial_df['epoch'],
    y=moving_average(initial_df['loss']),
    name='Loss',
    yaxis='y1'
))

fig.add_trace(go.Scatter(
    x=initial_df['epoch'],
    y=moving_average(initial_df['auc']),
    name='AUC',
    yaxis='y2'
))

# One animation frame per dataset_idx
frames = []
for dataset_idx in dataset_indices:
    frame_df = df_ae.query(f'dataset_idx == {dataset_idx}')
    frames.append(go.Frame(
        name=str(dataset_idx),
        data=[
            go.Scatter(x=frame_df['epoch'], y=moving_average(frame_df['loss'])),
            go.Scatter(x=frame_df['epoch'], y=moving_average(frame_df['auc']))
        ]
    ))

fig.frames = frames

# Slider steps: jump directly to the frame of the selected dataset.
slider_steps = [{
    "args": [[str(dataset_idx)],
             {"frame": {"duration": 1, "redraw": True},
              "mode": "immediate"}],
    "label": f"Dataset {dataset_idx}",
    "method": "animate",
} for dataset_idx in dataset_indices]

# Axes, title, legend and animation controls in a single layout update.
# Axis titles use `title=dict(text=..., font=...)`; the older `titlefont`
# attribute is deprecated and not accepted by recent plotly releases.
fig.update_layout(
    xaxis=dict(title='Epoch'),
    yaxis=dict(
        title=dict(text='Loss', font=dict(color=LOSS_COLOR)),
        tickfont=dict(color=LOSS_COLOR),
        range=[0, 100]   # Range of the Loss axis (left y-axis)
    ),
    yaxis2=dict(
        title=dict(text='AUC', font=dict(color=AUC_COLOR)),
        tickfont=dict(color=AUC_COLOR),
        anchor='x',
        overlaying='y',
        side='right',
        range=[0.5, 1.1]  # Range of the AUC axis (right y-axis)
    ),
    title='Loss and AUC vs Epoch (Animated by Dataset)',
    legend=dict(orientation='v'),
    # Button container without buttons (no Play button is defined).
    updatemenus=[dict(
        type="buttons",
        showactive=False,
    )],
    sliders=[{
        "steps": slider_steps,
        "transition": {"duration": 0},
        "x": 0,
        "y": -0.2,
        "currentvalue": {"prefix": "Dataset: "}
    }],
)

fig.show()


# px.line(df.query('model=="AE"'), x='epoch', y=['loss','auc'], animation_frame='dataset_idx',)

In [None]:
import matplotlib.pyplot as plt

# Overlay the per-dataset evolution curves of one model with low opacity.
model_name = 'GCN2MLP'

# Which evolution to plot. The labels below follow this choice, so the title
# and y-axis always describe what is actually drawn (previously the AUC
# curves were plotted under "Loss" labels).
metric_key = 'auc_evolution'
metric_label = 'AUC' if metric_key == 'auc_evolution' else 'Loss'

evolutions = model_dict[model_name][metric_key]
loss_evolutions_ae = evolutions  # legacy name, kept for any later use

fig, ax = plt.subplots(figsize=(10, 6))
for sequence in evolutions:
    ax.plot(sequence, alpha=0.2, color='blue')

ax.set_title(f'{metric_label} Evolutions for Model {model_name}')
ax.set_xlabel('Epoch')
ax.set_ylabel(metric_label)
ax.set_xlim(0, 300)
# ax.set_ylim(0, 10)
ax.grid(False)
fig.tight_layout()
plt.show()

--------------------------

----------------------

In [None]:
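# Prerequisite: run Initialization_analysis.py before the cells below (they load the initialization_<dataset>.pkl files, presumably written by that script).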

In [None]:
# Table of AUC / F1 / MCC (mean ± std) per model for the Geological dataset.
dataset_name = 'Geological_anomaly'
initialization_metrics = torch.load(root_dir + f'/outputs/Optuna_analysis/initialization_{dataset_name}.pkl')

# For each metric: average over the first axis of the stored values, then
# take mean and standard deviation of what remains (rounded to 3 decimals).
optuna_metrics = {}
for model in model_names:
    model_summary = {}
    for metric in ('auc', 'f1', 'mcc'):
        averaged = np.mean(initialization_metrics[model][metric], axis=0)
        model_summary[f'mean_{metric}'] = np.mean(averaged).round(3)
        model_summary[f'std_{metric}'] = np.std(averaged).round(3)
    optuna_metrics[model] = model_summary

# One row per model, one "mean ± std" string per metric.
column_keys = [('AUC', 'auc'), ('F1', 'f1'), ('MCC', 'mcc')]
table_rows = [
    [f"{stats[f'mean_{key}']:.3f} ± {stats[f'std_{key}']:.3f}" for _, key in column_keys]
    for stats in optuna_metrics.values()
]
metrics_df = pd.DataFrame(
    table_rows,
    index=list(optuna_metrics.keys()),
    columns=[label for label, _ in column_keys],
)

print(metrics_df.to_string())

In [None]:
# Table of AUC / F1 / MCC (mean ± std) per model for the EGMS dataset.
dataset_name = 'EGMS_anomaly'
initialization_metrics = torch.load(root_dir + f'/outputs/Optuna_analysis/initialization_{dataset_name}.pkl')


def _mean_and_std(values):
    # Average over the first axis, then summarise what remains.
    averaged = np.mean(values, axis=0)
    return np.mean(averaged).round(3), np.std(averaged).round(3)


optuna_metrics = {}
for model in model_names:
    stored = initialization_metrics[model]
    mean_auc, std_auc = _mean_and_std(stored['auc'])
    mean_f1, std_f1 = _mean_and_std(stored['f1'])
    mean_mcc, std_mcc = _mean_and_std(stored['mcc'])
    optuna_metrics[model] = {
        'mean_auc': mean_auc,
        'std_auc': std_auc,
        'mean_f1': mean_f1,
        'std_f1': std_f1,
        'mean_mcc': mean_mcc,
        'std_mcc': std_mcc,
    }

# One row per model, one "mean ± std" string per metric.
table_rows = []
for stats in optuna_metrics.values():
    table_rows.append([
        f"{stats['mean_auc']:.3f} ± {stats['std_auc']:.3f}",
        f"{stats['mean_f1']:.3f} ± {stats['std_f1']:.3f}",
        f"{stats['mean_mcc']:.3f} ± {stats['std_mcc']:.3f}",
    ])
metrics_df = pd.DataFrame(table_rows, index=list(optuna_metrics.keys()), columns=['AUC', 'F1', 'MCC'])

print(metrics_df.to_string())

In [None]:
# Mean and standard deviation of the AUC per model (Geological dataset).
dataset_name = 'Geological_anomaly'
initialization_metrics = torch.load(root_dir + f'/outputs/Optuna_analysis/initialization_{dataset_name}.pkl')
# The loop below reads `auc_dataset`; bind it to the object loaded here so the
# cell does not depend on a variable left over from another cell.
auc_dataset = initialization_metrics

model_names = ['AE', 'GCN2MLP', 'GCNAE', 'GConv2MLP', 'GConvAE', 'GUNet', 'RAE_GRU', 'RAE_LSTM']

for model in model_names:
    if model in auc_dataset.keys():
        entry = auc_dataset[model]
        # Entries may be a dict of metrics (with an 'auc' key) or the AUC
        # values directly; support both layouts.
        auc_values = entry['auc'] if isinstance(entry, dict) else entry
        # Average over the first axis, then report mean and std of the rest
        # (presumably one value per initialization seed - TODO confirm).
        averaged = np.mean(auc_values, axis=0)
        print(f"{model:<10}: {np.mean(averaged):.3f}  - {np.std(averaged):.3f} ")

In [None]:
# Mean and standard deviation of the AUC per model (EGMS dataset).
dataset_name = 'EGMS_anomaly'
auc_dataset = torch.load(root_dir + f'/outputs/Optuna_analysis/initialization_{dataset_name}.pkl')

model_names = ['AE', 'GCN2MLP', 'GCNAE', 'GConv2MLP', 'GConvAE', 'GUNet', 'RAE_GRU', 'RAE_LSTM']

for model in model_names:
    if model in auc_dataset.keys():
        entry = auc_dataset[model]
        # Entries may be a dict of metrics (with an 'auc' key) or the AUC
        # values directly; support both layouts.
        auc_values = entry['auc'] if isinstance(entry, dict) else entry
        # Average over the first axis, then report mean and std of the rest
        # (presumably one value per initialization seed - TODO confirm).
        averaged = np.mean(auc_values, axis=0)
        print(f"{model:<10}: {np.mean(averaged):.3f}  - {np.std(averaged):.3f} ")

In [None]:

def pixel_mse(output,X):
    """Row-wise reconstruction error: squared error averaged over axis 1.

    NOTE(review): a function with the same name and body is defined earlier
    in this notebook; this definition shadows it.
    """
    squared_error = torch.nn.functional.mse_loss(output, X, reduction='none')
    return squared_error.mean(dim=1)

def evaluate_epochs(model, X, lr, n_epochs, label, edge_index=None, edge_weight=None, batch_size=None, rng_seed=0):
    """Train ``model`` to reconstruct ``X`` and return per-row anomaly scores.

    The random generators are seeded with ``rng_seed`` and the model
    parameters are reset, then the model is trained full-batch with Adam on
    an MSE reconstruction loss for ``n_epochs`` steps.

    Parameters
    ----------
    model
        Model exposing ``reset_parameters()``. ``models.RAE`` instances are
        fed ``X`` transposed with a leading batch axis; instances of
        ``graph_models`` additionally receive ``edge_index``/``edge_weight``.
    X : torch.Tensor
        Data to reconstruct.
    lr : float
        Adam learning rate.
    n_epochs : int
        Number of optimisation steps; must be at least 1.
    label, batch_size
        Accepted for call compatibility; not used by this function.
    edge_index, edge_weight
        Graph structure, only used for graph models.
    rng_seed : int
        Seed for torch (CPU and CUDA) and numpy.

    Returns
    -------
    numpy.ndarray
        ``pixel_mse`` of the output of the last forward pass (computed before
        the final optimizer step), detached and moved to the CPU.

    Raises
    ------
    ValueError
        If ``n_epochs`` is smaller than 1 (no forward pass would be run).
    """
    if n_epochs < 1:
        raise ValueError("n_epochs must be at least 1 to produce anomaly scores")

    torch.manual_seed(rng_seed)
    torch.cuda.manual_seed(rng_seed)
    np.random.seed(rng_seed)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    model.train()
    model.reset_parameters()

    # The model type does not change during training; decide once.
    is_recurrent = isinstance(model, models.RAE)
    is_graph = isinstance(model, graph_models)

    for _ in range(n_epochs):
        optimizer.zero_grad()
        if is_recurrent:
            output = model(X.T.unsqueeze(0)).squeeze(0).T
        elif is_graph:
            output = model(X, edge_index=edge_index, edge_weight=edge_weight)
        else:
            output = model(X)
        loss = criterion(output, X)
        loss.backward()
        optimizer.step()

    # Only the scores of the last epoch are returned, so they are computed
    # once here instead of being converted to numpy on every epoch.
    return pixel_mse(output, X).detach().cpu().numpy()


# Re-train every selected model on every EGMS dataset with several random
# seeds and collect the resulting AUC values.
# NOTE(review): hard-coded GPU index; adjust to the machine in use.
device = 'cuda:1'

graph_models = tuple([models.GCN2MLP, models.GConv2MLP, models.GCNAE, models.GConvAE, models.GUNet])

dataset_name = 'EGMS_anomaly'
datasets = torch.load(dataset_path + f'{dataset_name}/Training/dataset.pt')
model_dict = torch.load(root_dir + f'/outputs/Optuna_analysis/model_dict_{dataset_name}.pkl')

model_names = ['AE', 'GCN2MLP', 'GCNAE', 'GConv2MLP', 'GConvAE', 'GUNet', 'RAE_GRU', 'RAE_LSTM']

n_seeds = 10  # number of random initialisations per dataset

# auc_results[model_name][dataset][seed] -> AUC
auc_results = {}

for model_name in model_names:
    model_entry = model_dict[model_name]
    # The model-selection cell of this notebook stores the template model
    # under 'model_example'; accept that key when 'model' is absent.
    template_model = model_entry['model'] if 'model' in model_entry else model_entry['model_example']
    trial_params = model_entry['trial_params']
    lr = trial_params['lr']
    # Values read back from a DataFrame row may be floats; range() needs an int.
    n_epochs = int(trial_params['n_epochs'])
    uses_graph = isinstance(template_model, graph_models)

    auc_dataset = []
    for idx, dataset in enumerate(datasets, start=1):
        print(f"\rProcessing dataset {idx}/{len(datasets)} for model {model_name}", end="", flush=True)

        data = dataset['data']
        # A row is labelled anomalous if any entry along axis 1 is flagged.
        label = dataset['label'].any(axis=1)

        X = torch.tensor(data).float().to(device)

        # Graph structure is the same for every seed; move it once per dataset.
        edge_index, edge_weight = (None, None)
        if uses_graph:
            edge_index = dataset['edge_index'].to(device)
            edge_weight = dataset['edge_weight'].to(device)

        auc_seed = []
        for seed in range(n_seeds):

            model = copy.deepcopy(template_model).to(device)

            # RAE models are rebuilt with n_features equal to the number of
            # rows of this dataset (unless n_features is 1).
            if isinstance(model, models.RAE) and (model.n_features != 1):
                relevant_params = ['n_features', 'latent_dim', 'rnn_type', 'rnn_act', 'device']
                new_model_params = {key: getattr(model, key) for key in relevant_params}
                new_model_params['n_features'] = X.shape[0]
                new_model_params['device'] = device
                model = models.RAE(**new_model_params)
                model.to(new_model_params['device'])

            scores = evaluate_epochs(model, X,
                                     lr=lr,
                                     n_epochs=n_epochs,
                                     label=label,
                                     edge_index=edge_index,
                                     edge_weight=edge_weight,
                                     rng_seed=seed)

            auc = get_auc(scores, label, resolution=101).round(3)
            auc_seed.append(auc)

        auc_dataset.append(auc_seed)

    auc_results[model_name] = auc_dataset


In [None]:
# Inspect `auc_dataset`. After the evaluation loop it holds the per-dataset
# lists of per-seed AUC values of the last model processed.
auc_dataset

In [None]:
# Standard deviation of the AUC after averaging `auc_dataset` over its first axis.
np.std(np.mean(auc_dataset,axis=0))

In [None]:
# Mean of `auc_dataset` over its first axis.
# NOTE(review): this expression is repeated verbatim in several consecutive cells.
np.mean(auc_dataset, axis=0)

In [None]:
# Mean of `auc_dataset` over its first axis.
# NOTE(review): this expression is repeated verbatim in several consecutive cells.
np.mean(auc_dataset, axis=0)

In [None]:
# Mean of `auc_dataset` over its first axis.
# NOTE(review): this expression is repeated verbatim in several consecutive cells.
np.mean(auc_dataset, axis=0)

In [None]:
# Mean of `auc_dataset` over its first axis.
# NOTE(review): this expression is repeated verbatim in several consecutive cells.
np.mean(auc_dataset, axis=0)

In [None]:
# Calculate and display standard deviation of AUC lists for all models
# Print, for each model, the standard deviation of its stored 'auc_list'.
for model_name, model_info in model_dict.items():
    auc_std = np.std(model_info['auc_list'])
    print(f"{model_name:<10}: {auc_std:.3f}")

In [None]:
# Load the pickled hyperparameter study of the GCNAE model on the EGMS dataset.
# NOTE(review): joblib.load unpickles the file; only load studies you produced yourself.
study = joblib.load(root_dir + f'/outputs/HP_training/EGMS_anomaly/TR_GCNAE.pkl')

In [None]:
# Show up to 40 trials whose value_auc exceeds 99.5% of the best value_auc, best first.
study.trials_dataframe().query('value_auc > 0.995 * value_auc.max()').sort_values('value_auc', ascending=False).head(40)