In [1]:
import MinkowskiEngine as ME
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
import torchvision.transforms.v2 as transforms
import torchvision.transforms.v2.functional as F
from torch import nn

## Jupyter magic
%matplotlib inline
mpl.rcParams['figure.figsize'] = [8, 6]
mpl.rcParams['font.size'] = 16

import torch
import numpy as np

## Run on the GPU when one is visible to this kernel, otherwise fall back to CPU.
## (The former bare `torch.device(device)` statement built a device object and
## threw it away, so it has been dropped; `device` itself is what later cells use.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Fix the global NumPy and PyTorch seeds so repeated runs start from the same RNG state.
SEED = 12345
np.random.seed(SEED)
## torch.manual_seed returns a Generator; bind it so the cell shows no output.
_ = torch.manual_seed(SEED)



In [2]:
## Includes from my libraries for this project                                                                                                                                           
from ME_dataset_libs import CenterCrop, MaxRegionCrop, ConstantCharge, RandomCrop, RandomPixelNoise2D, FirstRegionCrop
from ME_dataset_libs import SingleModuleImage2D_solo_ME, solo_ME_collate_fn, solo_ME_collate_fn_with_meta
from ME_dataset_libs import make_dense, make_dense_from_tensor, Label

In [3]:
import numpy as np
import FSD_training_analysis

## Reload first, THEN pull names out of the module. A `from ... import name`
## executed before importlib.reload() keeps pointing at the pre-reload object,
## so get_models_from_checkpoint used to lag behind edits to the module while
## image_loop and reorder_clusters were up to date.
importlib.reload(FSD_training_analysis)
from FSD_training_analysis import get_models_from_checkpoint, image_loop, reorder_clusters

def process_images_from_file(input_file,
                             file_dir="/pscratch/sd/c/cwilk",
                             data_dir="/pscratch/sd/c/cwilk/FSD/DATA",
                             sim_dir="/pscratch/sd/c/cwilk/FSD/SIMULATIONv2",
                             max_data_events=100000,
                             max_sim_events=100000,
                             batch_size=1024,
                             num_workers=4):
    """Run one checkpoint over the data and simulation samples.

    Loads the three networks stored in the checkpoint, puts them in eval mode
    on the notebook-level `device`, builds un-shuffled loaders for both
    samples, and passes each loader through `image_loop`.

    Args:
        input_file: Checkpoint file name, looked up inside `file_dir`.
        file_dir: Directory holding the checkpoint files.
        data_dir: Directory handed to SingleModuleImage2D_solo_ME for the data sample.
        sim_dir: Directory handed to SingleModuleImage2D_solo_ME for the simulation sample.
        max_data_events: `max_events` cap for the data dataset.
        max_sim_events: `max_events` cap for the simulation dataset.
        batch_size: Batch size used by both loaders.
        num_workers: Worker-process count used by both loaders.

    Returns:
        Tuple `(data_processed, sim_processed)`: the `image_loop` results for
        the data and simulation loaders, after `reorder_clusters` has been
        called on the pair.
    """

    ## Load in the pre-calculated model weights.
    ## The fourth returned value (training args) is not needed here.
    encoder, proj_head, clust_head, _args = get_models_from_checkpoint(file_dir+"/"+input_file)

    ## Inference only: eval mode, and move every network to the active device
    networks = (encoder, proj_head, clust_head)
    for net in networks:
        net.eval()
    for net in networks:
        net.to(device)

    ## Same crop for both samples
    ## NOTE(review): meaning of the two size tuples is defined by FirstRegionCrop - confirm there
    nom_transform = FirstRegionCrop((800, 256), (768, 256))

    single_sim_dataset = SingleModuleImage2D_solo_ME(sim_dir, transform=nom_transform,
                                                     max_events=max_sim_events, return_metadata=True)
    single_data_dataset = SingleModuleImage2D_solo_ME(data_dir, transform=nom_transform,
                                                      max_events=max_data_events, return_metadata=True)

    ## Fixed-order batching (shuffle=False), identical settings for both samples
    loader_kwargs = dict(collate_fn=solo_ME_collate_fn_with_meta,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=num_workers)
    data_loader = torch.utils.data.DataLoader(single_data_dataset, **loader_kwargs)
    sim_loader = torch.utils.data.DataLoader(single_sim_dataset, **loader_kwargs)

    ## Get the processed vectors of interest from the datasets
    data_processed = image_loop(encoder, proj_head, clust_head, data_loader)
    sim_processed = image_loop(encoder, proj_head, clust_head, sim_loader)

    ## Re-order the clusters for presentation purposes. The return value is not
    ## used, so this presumably updates both results in place - TODO confirm.
    reorder_clusters(data_processed, sim_processed)

    return data_processed, sim_processed

In [4]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

def calc_metrics(data_processed):
    """Print and return three clustering-quality scores for one processed sample.

    Args:
        data_processed: Mapping with a 'clust' entry (passed to scikit-learn as
            the sample matrix X) and a 'clust_index' entry (passed as the
            per-sample cluster labels).

    Returns:
        Tuple `(silhouette_eucl, calinski_harabasz, davies_bouldin)`.
    """
    features = data_processed['clust']
    assignments = data_processed['clust_index']

    silhouette_eucl = silhouette_score(features, assignments, metric="euclidean")
    print("Silhouette (euclidean) =", silhouette_eucl)

    calinski_harabasz = calinski_harabasz_score(features, assignments)
    print("Calinski-Harabasz =", calinski_harabasz)

    davies_bouldin = davies_bouldin_score(features, assignments)
    print("Davies-Bouldin =", davies_bouldin)

    return silhouette_eucl, calinski_harabasz, davies_bouldin

In [8]:
## Latent-size scan: lat24 ... lat256 checkpoints at clust25.
## Not the scan processed below; kept under its own name instead of being
## assigned to file_list and immediately overwritten.
lat_scan_file_list = ["state_lat24_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                      "state_lat32_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                      "state_lat48_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                      "state_lat64_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                      "state_lat128_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                      "state_lat256_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth"]
## Latent sizes matching lat_scan_file_list entry-for-entry.
## These do NOT line up with the three cluster-scan files looped over below.
lat_list = [24, 32, 48, 64, 128, 256]

## Cluster-count scan: clust20 / clust25 / clust30 checkpoints at lat128.
## NOTE(review): the clust20 name spells the entropy tag "ent0.1" while the
## others use "ent1E-1" - left exactly as given; confirm it matches the file on disk.
clust_scan_file_list = ["state_lat128_clust20_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent0.1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                        "state_lat128_clust25_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth",
                        "state_lat128_clust30_nchan64_1E-5_1024_PROJ0.5_CLUST0.5two_ent1E-1_soft1.0_arch12x4_poolmax_flat1_grow1_kern7_sep1_onecycle50_bigaugbilin_1M_DATA1_FSDCCFIX.pth"]

## The scan that is actually processed in this cell
file_list = clust_scan_file_list

## One entry per checkpoint in file_list, in the same order
sil_list = []
ch_list = []
db_list = []

## Loop over file
for f in file_list:
    data_processed, sim_processed = process_images_from_file(f)
    silhouette_eucl, calinski_harabasz, davies_bouldin = calc_metrics(data_processed)
    sil_list.append(silhouette_eucl)
    ch_list.append(calinski_harabasz)
    db_list.append(davies_bouldin)
    print("DATA:", silhouette_eucl, calinski_harabasz, davies_bouldin)
    ## Same metrics on the simulation sample (disabled)
    #silhouette_eucl, calinski_harabasz, davies_bouldin = calc_metrics(sim_processed)
    #print("SIM:", silhouette_eucl, calinski_harabasz, davies_bouldin)

Silhouette (euclidean) = 0.65610313
Calinski-Harabasz = 44174.70585142474
Davies-Bouldin = 0.45744225931794746
DATA: 0.65610313 44174.70585142474 0.45744225931794746
Silhouette (euclidean) = 0.6243667
Calinski-Harabasz = 31275.265284895668
Davies-Bouldin = 0.4932641503041913
DATA: 0.6243667 31275.265284895668 0.4932641503041913
Silhouette (euclidean) = 0.58060044
Calinski-Harabasz = 22173.724078301817
Davies-Bouldin = 0.5515642606397176
DATA: 0.58060044 22173.724078301817 0.5515642606397176


In [None]:
## Reload the local plotting helpers so edits to ME_analysis_libs are picked up
## without restarting the kernel; the names are imported after the reload.
import importlib
import ME_analysis_libs
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import (
    parse_binning,
    plot_metric_sim_data,
    plot_metric_by_label,
    plot_metric_by_cluster,
    plot_metric_data_vs_sim,
)

## NOTE: data_processed / sim_processed are whatever the last iteration of the
## checkpoint loop left behind, i.e. they belong to the final entry of file_list.
plot_metric_data_vs_sim(
    data_processed['clust_index'],
    sim_processed['clust_index'],
    sim_processed['labels'],
    xtitle="Max. cluster index",
)