In [None]:
import MinkowskiEngine as ME
import matplotlib.pyplot as plt
import matplotlib as mpl
import importlib
import torchvision.transforms.v2 as transforms
import torchvision.transforms.v2.functional as F
from torch import nn

## Jupyter magic
%matplotlib inline
mpl.rcParams['figure.figsize'] = [8, 6]
mpl.rcParams['font.size'] = 16

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.device(device)
import numpy as np
SEED=12345
_=np.random.seed(SEED)
_=torch.manual_seed(SEED)

In [None]:
## Includes from my libraries for this project                                                                                                                                           
from ME_dataset_libs import CenterCrop, MaxRegionCrop, ConstantCharge, RandomCrop, RandomPixelNoise2D, FirstRegionCrop
from ME_dataset_libs import SingleModuleImage2D_solo_ME, solo_ME_collate_fn, solo_ME_collate_fn_with_meta
from ME_dataset_libs import make_dense, make_dense_from_tensor, Label

In [None]:
from FSD_training_analysis import get_models_from_checkpoint

## Training configuration. Every value below is baked into the checkpoint
## file name, so it must match the settings of the run being loaded.
nlatent=128
nclusters=20
lr="5E-6"
batch_size=1024
nevts="5M"
nsteps=50
data_frac=1
nhidden="_hid256"

aug_type="newbaseaug"
aug_prob="1"
nchan=64
clust_arch="two"
proj_arch="two"
enc_arch="d4silu"
enc_arch_pool="max"
enc_arch_flatten=0
enc_arch_slow_growth=1
enc_arch_first_kernel=7
enc_arch_sep_heads=0
softmax_temp=1.0
clust_temp=0.5
proj_temp=0.5
ent="_ent1E-1"
data_string=f"DATA{data_frac}"

## Load in the pre-calculated model weights
file_dir = "/pscratch/sd/c/cwilk"
chk_file = (
    f"state_lat{nlatent}{nhidden}_clust{nclusters}_nchan{nchan}_{lr}_{batch_size}"
    f"_PROJ{proj_temp}{proj_arch}_CLUST{clust_temp}{clust_arch}{ent}_soft{softmax_temp}"
    f"_arch{enc_arch}_pool{enc_arch_pool}_flat{enc_arch_flatten}_grow{enc_arch_slow_growth}"
    f"_kern{enc_arch_first_kernel}_sep{enc_arch_sep_heads}"
    f"_onecycle{nsteps}_{aug_type}{aug_prob}_DROP0_WEIGHT_DECAY0_{nevts}_{data_string}_FSDCCFIX.pth"
)

encoder, heads, args = get_models_from_checkpoint(f"{file_dir}/{chk_file}")

## Put the encoder and every head into eval mode, then move them to the chosen device
encoder.eval()
for h in heads.values():
    h.eval()

encoder.to(device)
for h in heads.values():
    h.to(device)

print("Loaded:", chk_file)

In [None]:
## Build the data / simulation datasets and their dataloaders
from ME_dataset_libs import FirstRegionCrop
from torch.utils.data import ConcatDataset
import time
start = time.time()

## Nominal transform applied to every image
nom_transform = transforms.Compose([
    FirstRegionCrop((800, 256), (768, 256)),
    # ConstantCharge(),
])

data_dir = "/pscratch/sd/c/cwilk/FSD/DATA" #_MIN200_v3"
# sim_dir = "/pscratch/sd/c/cwilk/FSD/DATAv6"
sim_dir = "/pscratch/sd/c/cwilk/FSD/SIMULATIONv2" #_MIN200_v2"
max_data_events=50000
max_sim_events=50000
single_sim_dataset = SingleModuleImage2D_solo_ME(
    sim_dir, transform=nom_transform, max_events=max_sim_events, return_metadata=True)
single_data_dataset = SingleModuleImage2D_solo_ME(
    data_dir, transform=nom_transform, max_events=max_data_events, return_metadata=True)
single_mixed_dataset = ConcatDataset([single_data_dataset, single_sim_dataset])

print("Time taken to load", len(single_data_dataset),"data and", len(single_sim_dataset), "images:", time.time() - start)

## Sequential (unshuffled) batching, so the order of the processed outputs
## follows the order of the underlying dataset
_loader_options = dict(collate_fn=solo_ME_collate_fn_with_meta,
                       batch_size=1024,
                       shuffle=False,
                       num_workers=4)
single_loader = torch.utils.data.DataLoader(single_mixed_dataset, **_loader_options)
data_loader   = torch.utils.data.DataLoader(single_data_dataset, **_loader_options)
sim_loader    = torch.utils.data.DataLoader(single_sim_dataset, **_loader_options)

In [None]:
import numpy as np
import FSD_training_analysis
# Reload first so that edits made to the module outside the notebook are picked up
importlib.reload(FSD_training_analysis)
from FSD_training_analysis import image_loop, reorder_clusters
import time
start = time.time()
## Get the processed vectors of interest from the datasets
## (later cells index the results by string key, e.g. 'clust', 'clust_index',
## 'clust_max', 'labels', 'latent', 'nhits')
data_processed = image_loop(encoder, heads, data_loader, False)
sim_processed = image_loop(encoder, heads, sim_loader, False)
print("Time to process events:", time.time() - start)

start = time.time()
## Do some magic to re-order the clusters for presentation purposes
## NOTE(review): the return value is discarded, so this presumably modifies
## both inputs in place -- confirm in FSD_training_analysis
reorder_clusters(data_processed, sim_processed)
print("Time to reorder events:", time.time() - start)


In [None]:
# Force reload so I can play with changes outside jupyter...
import importlib
import ME_analysis_libs
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import compute_cluster_overlap, plot_overlap_matrix

# Compute and draw the cluster overlap matrix for the data sample.
# NOTE(review): the meaning of the second argument (3) and of max_val is
# defined in ME_analysis_libs -- check there before changing them.
overlap_matrix = compute_cluster_overlap(data_processed['clust'], 3)
plot_overlap_matrix(overlap_matrix, max_val=0.5)

In [None]:
## Oh ROOT, how I miss thee
## Compare the assigned cluster index between data and simulation; the
## simulation labels are passed alongside the simulation values.
import importlib
import ME_analysis_libs
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import parse_binning, plot_metric_pass_fail, plot_metric_by_label, plot_metric_by_cluster, plot_metric_data_vs_sim, plot_metric_by_confidence

plot_metric_data_vs_sim(data_processed['clust_index'],
                        sim_processed['clust_index'], 
                        sim_processed['labels'],
                        xtitle="Max. cluster index")

## Alternative views, kept disabled:
#plot_metric_by_confidence(data_processed['clust_index'], 
#                          data_processed['clust_max'],
#                          xtitle="Max. cluster index")

#min_hits=250
#data_mask = (data_processed['nhits']>min_hits)
#plot_metric_pass_fail(data_processed['clust_index'], 
#                      data_mask, xtitle="Max. cluster index")

#sim_mask = (sim_processed['nhits']>min_hits)
#plot_metric_pass_fail(sim_processed['clust_index'], 
#                      sim_mask, xtitle="Max. cluster index")

# plot_metric_data_vs_sim(data_processed['xrange'], sim_processed['xrange'], sim_processed['labels'], nbinsx=50, xtitle="x-range")



In [None]:
## Look at the distribution of the top cluster value ('clust_max'): first data
## vs simulation, then the data split by assigned cluster index
plot_metric_data_vs_sim(data_processed['clust_max'], sim_processed['clust_max'], sim_processed['labels'], nbinsx=50, xtitle="Top prob.")
plot_metric_by_cluster(data_processed['clust_max'], data_processed['clust_index'], nbinsx=50, xtitle="Top prob.")

In [None]:
## Compare per-image hit and charge metrics between data and simulation:
## number of hits, maximum charge and summed charge
plot_metric_data_vs_sim(data_processed['nhits'], sim_processed['nhits'], sim_processed['labels'], nbinsx=70, x_max=1400, xtitle="N. hits")
plot_metric_data_vs_sim(data_processed['maxQ'], sim_processed['maxQ'], sim_processed['labels'], nbinsx=100, x_min=1.5, x_max=2.5, xtitle="Max. Q")
plot_metric_data_vs_sim(data_processed['sumQ'], sim_processed['sumQ'], sim_processed['labels'], nbinsx=70, x_max=1400, xtitle="Sum Q")


In [None]:
from scipy.stats import gaussian_kde
import matplotlib.colors as mcolors
from cuml.manifold import TSNE as cuML_TSNE
import cupy as cp
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize
from cuml.preprocessing import StandardScaler as cuMLScaler
def run_tsne_cuml_ALT(input_vect=None, zvect=None, alpha_vect=None, perp=30, exag=6, lr=2000.0, n_neighbors=50, tsne_results=None, ztitle="Cluster ID", save_name=None, norm=True):
    """Embed vectors in 2D with cuML t-SNE and scatter-plot them coloured by label.

    Either ``input_vect`` (to compute a new embedding) or ``tsne_results``
    (to re-plot an existing embedding with different labels) must be given.

    Parameters
    ----------
    input_vect : array-like, optional
        Vectors to embed, one row per point. Only used when ``tsne_results``
        is None.
    zvect : array-like
        One integer label per point. The colour of a point is
        ``label % n_clusters`` where ``n_clusters`` is the number of distinct
        labels, so labels that are not 0..n_clusters-1 can share a colour.
    alpha_vect : array-like, optional
        Per-point value that is cubed and used as the point opacity; it must
        lie in [0, 1]. When None every point is drawn fully opaque.
    perp, exag, lr : t-SNE perplexity, early exaggeration and learning rate.
    n_neighbors : int
        Currently ignored: the value handed to cuML is always
        ``min(3 * perp, 1024)`` (behaviour kept from the original code).
    tsne_results : array-like, optional
        Precomputed 2D embedding; when given, t-SNE is not run at all.
    ztitle : str
        Colour-bar label.
    save_name : str, optional
        If set, the figure is also written to this path.
    norm : bool
        L2-normalise each row of ``input_vect`` before embedding.

    Returns
    -------
    The 2D embedding that was plotted (a numpy array when it was computed here).

    Raises
    ------
    ValueError
        If ``zvect`` is missing, or neither ``input_vect`` nor
        ``tsne_results`` is given.
    """
    if zvect is None:
        raise ValueError("zvect (one label per point) is required to colour the plot")

    if tsne_results is None:
        if input_vect is None:
            raise ValueError("Provide either input_vect or precomputed tsne_results")

        print("Running cuML t-SNE with: perplexity =", perp, "early exaggeration =", exag)

        # All GPU work lives in this branch so that re-plotting an existing
        # embedding never touches input_vect (which is None in that use case).
        input_vect = cp.asarray(input_vect, dtype=cp.float32)

        if norm:
            norms = cp.linalg.norm(input_vect, axis=1, keepdims=True)
            # the small offset avoids dividing by zero for all-zero rows
            input_vect = input_vect / (norms + 1e-10)

        # The n_neighbors argument is overridden here (see docstring).
        n_neighbors = min(3*perp, 1024)

        ## I haven't played with most of cuml's t-SNE parameters
        #tsne = cuML_TSNE(n_components=2, perplexity=perp, n_iter=5000, \
        #                 early_exaggeration=exag, learning_rate=lr, exaggeration_iter=250, \
        #                 learning_rate_method=None, square_distances=False, init='random', late_exaggeration=1, \
        #                 metric='cosine', method='fft', verbose=True, n_neighbors=n_neighbors)
        tsne = cuML_TSNE(n_components=2, perplexity=perp, n_iter=1000, \
                         learning_rate_method=None, early_exaggeration=exag, learning_rate=lr, method='exact', #barnes_hut',\
                         metric='cosine', square_distances=False, init='random', n_neighbors=n_neighbors, verbose=True)
        #tsne = cuML_TSNE(n_components=2, perplexity=perp, n_iter=5000, \
        #                 early_exaggeration=exag, learning_rate=lr, method='barnes_hut',\
        #                 metric='cosine', square_distances=False, verbose=True)

        tsne_results = tsne.fit_transform(input_vect)
        # Standardise each embedding axis on the GPU, then copy to the host
        scaler = cuMLScaler()
        tsne_results = scaler.fit_transform(tsne_results)
        tsne_results = cp.asnumpy(tsne_results)

    # Cast to int: a float label would be read by the colormap as a 0-1
    # fraction rather than as a colour index.
    labels = np.asarray(zvect).astype(int)
    n_clusters = len(np.unique(labels))

    # Use a qualitative colormap with enough colors
    all_colors = (
        plt.cm.tab20.colors +
        plt.cm.tab20b.colors +
        plt.cm.tab20c.colors +
        plt.cm.tab10.colors
    )
    cmap = mcolors.ListedColormap(all_colors[:n_clusters])

    # Per-point opacity: cubing suppresses low values; default is opaque
    if alpha_vect is None:
        alpha = np.ones(len(labels))
    else:
        alpha = np.asarray(alpha_vect, dtype=float)**3

    # RGB from the label, alpha from the per-point weight
    rgb_colors = cmap(labels % n_clusters)[:, :3]
    rgba_colors = np.concatenate([rgb_colors, alpha[:, None]], axis=1)

    # Separate name so the boolean `norm` argument is not rebound
    color_norm = mcolors.BoundaryNorm(boundaries=np.arange(n_clusters + 1), ncolors=n_clusters)

    fig, ax = plt.subplots()
    ax.scatter(tsne_results[:, 0], tsne_results[:, 1], s=0.5, c=rgba_colors)

    # The scatter uses explicit RGBA colours, so the colour bar is built from
    # a stand-alone mappable with the same colormap
    cbar = fig.colorbar(
        plt.cm.ScalarMappable(norm=color_norm, cmap=cmap), ax=ax
    )
    cbar.set_label(ztitle)
    ax.set_xlabel('t-SNE #0')
    ax.set_ylabel('t-SNE #1')
    if save_name: fig.savefig(save_name, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    return tsne_results

In [None]:
import importlib
import ME_analysis_libs
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import run_tsne_cuml, run_tsne_skl
import time

## Actually run tsne (not always that useful)
## NOTE: `lr` is rebound to a float here; the checkpoint-loading cell defines
## `lr` as a string and uses it to build the checkpoint file name.
perp=150
exag=16
lr=4000.0

## Hit-count masks; in this cell they are only referenced from commented-out code
data_mask = (data_processed['nhits']>250)
sim_mask = (sim_processed['nhits']>250)

## Take the first `ntsne` data events as the t-SNE input; the later t-SNE
## cells reuse these *_subset variables
ntsne=5000
data_latent_subset = data_processed['latent'][:ntsne] #[data_mask]
data_index_subset = data_processed['clust_index'][:ntsne] #[data_mask]
data_label_subset = data_processed['labels'][:ntsne]#[data_mask]
data_alpha_subset = data_processed['clust_max'][:ntsne]#[data_mask]
## The t-SNE call between the two timer reads is commented out, so the time
## printed below only covers an empty interval
start = time.process_time() 
#data_tsne_results = run_tsne_skl(data_latent_subset, data_index_subset, data_alpha_subset, perp, exag, lr)
print("Time:", time.process_time() - start)
#print("t-SNE output min/max:", data_tsne_results.min(), data_tsne_results.max())
#print("t-SNE output std per dim:", data_tsne_results.std(axis=0))

#sim_latent_subset = sim_latent_vect[:50000].copy()
#sim_index_subset = sim_clust_index[:50000].copy()
#sim_label_subset = sim_label_vect[:50000].copy()
#sim_alpha_subset = sim_clust_max[:50000].copy()
#sim_tsne_results = run_tsne_cuml(sim_latent_subset, sim_index_subset, perp, exag, lr,alpha_vect=sim_alpha_subset)
#run_tsne_results = run_tsne_cuml(tsne_results=sim_tsne_results, zvect=sim_label_subset, ztitle="True label", norm=False)

## Merged
#mixed_latent_subset = np.concatenate((data_latent_subset, sim_latent_subset), axis=0)
#mixed_index_subset = np.concatenate((data_index_subset, sim_index_subset), axis=0)
#mixed_label_subset = np.concatenate((data_label_subset, sim_label_subset), axis=0)
#mixed_alpha_subset = np.concatenate((data_alpha_subset, sim_alpha_subset), axis=0)
#mixed_tsne_results = run_tsne_cuml(mixed_latent_subset, mixed_index_subset, perp, exag, lr, mixed_alpha_subset)
#run_tsne_results = run_tsne_cuml(tsne_results=mixed_tsne_results, zvect=mixed_label_subset, ztitle="True label", norm=False)



In [None]:
## Single t-SNE run on the data subset with hand-picked settings.
## Requires the *_subset variables and run_tsne_skl from the subset-selection cell.
perp=150
exag=12
lr=500.0
data_tsne_results = run_tsne_skl(data_latent_subset, data_index_subset, data_alpha_subset, perp, exag, lr)

In [None]:
## Actually run tsne (not always that useful)
## Grid scan over perplexity and early exaggeration at a fixed learning rate.
## The perp/exag values assigned here are immediately replaced by the loop
## variables; only lr is used as written.
perp=30
exag=24
lr=200.0

for perp in [20, 30, 40, 50, 100]:
    for exag in [8, 12, 16, 20]:
        print(perp, exag)
        # data_tsne_results is overwritten on every iteration; only the last embedding survives
        data_tsne_results = run_tsne_skl(data_latent_subset, data_index_subset, data_alpha_subset, perp, exag, lr)

In [None]:
## Scan the t-SNE learning rate at fixed perplexity, exaggeration and iteration count
perp=100
exag=20
n_iter=2000
for lr in [50, 10000]: #200, 400, 600, 800, 1000, 2000, 5000]:
    # Report the value that is actually being scanned (lr), not just the constant n_iter
    print("Trying lr =", lr, "with n_iter =", n_iter)
    data_tsne_results = run_tsne_skl(data_latent_subset, data_index_subset, data_alpha_subset, perp, exag, lr, n_iter=n_iter)

In [None]:
!pip install umap-learn

In [None]:
import umap
import matplotlib.colors as mcolors

def run_umap_cpu(input_vect=None, zvect=None, n_neighbors=100, min_distance=0.1, n_epochs=1000, alpha_vect=0.5, ztitle="Cluster ID", save_name=None, norm=True):
    """Embed vectors in 2D with UMAP (cosine metric) and scatter-plot them coloured by label.

    Parameters
    ----------
    input_vect : array
        Vectors to embed, one row per point.
    zvect : array-like
        One label per point, used as the scatter colour value.
    n_neighbors, min_distance, n_epochs :
        Passed to ``umap.UMAP`` as ``n_neighbors``, ``min_dist`` and ``n_epochs``.
    alpha_vect : float or array-like
        Passed to the scatter call as ``alpha``.
    ztitle : str
        Colour-bar label.
    save_name : str, optional
        If set, the figure is also written to this path.
    norm : bool
        L2-normalise each row of ``input_vect`` before embedding.

    Returns
    -------
    None. The plot is drawn on the current pyplot figure and shown.
    """
    if norm:
        # the small offset avoids dividing by zero for all-zero rows
        row_lengths = np.linalg.norm(input_vect, axis=1, keepdims=True)
        input_vect = input_vect / (row_lengths + 1e-10)

    # Fixed random_state so repeated calls with the same input agree
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_distance,
                        n_epochs=n_epochs, metric='cosine', random_state=42)
    embedding = reducer.fit_transform(input_vect)

    # Axis limits cut at the 0.1 / 99.9 percentiles so a few outliers do not set the view
    x_low, x_high = np.percentile(embedding[:, 0], [0.1, 99.9])
    y_low, y_high = np.percentile(embedding[:, 1], [0.1, 99.9])

    n_labels = len(np.unique(zvect))

    # Several qualitative palettes chained together so there are enough distinct colours
    palette = (plt.cm.tab20.colors + plt.cm.tab20b.colors
               + plt.cm.tab20c.colors + plt.cm.tab10.colors)
    label_cmap = mcolors.ListedColormap(palette[:n_labels])
    label_norm = mcolors.BoundaryNorm(boundaries=np.arange(n_labels + 1), ncolors=n_labels)

    points = plt.scatter(embedding[:, 0], embedding[:, 1], s=0.5, alpha=alpha_vect,
                         c=zvect, cmap=label_cmap, norm=label_norm)
    plt.colorbar(points, label=ztitle)
    plt.xlim(x_low, x_high)
    plt.ylim(y_low, y_high)
    plt.xlabel('UMAP #0')
    plt.ylabel('UMAP #1')
    plt.gca().grid(False)
    if save_name:
        plt.savefig(save_name, dpi=150, bbox_inches='tight')
    plt.show()


In [None]:
## Grid scan of UMAP n_neighbors and min_distance on the first `numap` data
## events, with a fixed number of epochs. `numap` is reused by the next scan cell.
numap=20000
for n_neighbors in [5, 10, 20, 30, 50]:
    for min_distance in [0, 0.01, 0.05, 0.1]:
        print(n_neighbors, min_distance)
        run_umap_cpu(data_processed['latent'][:numap], data_processed['clust_index'][:numap], n_neighbors=n_neighbors, min_distance=min_distance, \
                     alpha_vect=data_processed["clust_max"][:numap], n_epochs=1000)

In [None]:
## Grid scan of UMAP min_distance and n_epochs at fixed n_neighbors.
## Relies on `numap` having been set by the previous UMAP scan cell.
n_neighbors=50
for min_distance in [0.001, 0.05, 0.01]:
    for n_epochs in [500, 1000, 2000, 5000, 10000]:
        print(n_neighbors, min_distance, n_epochs)
        run_umap_cpu(data_processed['latent'][:numap], data_processed['clust_index'][:numap], n_neighbors=n_neighbors, min_distance=min_distance, \
                     alpha_vect=data_processed["clust_max"][:numap], n_epochs=n_epochs)

In [None]:
## Simple averages: per-cluster population and mean top-cluster value
## The *_clust_index / *_clust_vect names were not defined anywhere in this
## notebook, so they are taken from the processed dictionaries here.
## NOTE(review): assumes 'clust' holds one row of per-cluster values per event
## (it is what the overlap-matrix cell passes on) -- confirm against image_loop.
data_clust_index = data_processed['clust_index']
sim_clust_index = sim_processed['clust_index']
data_clust_vect = data_processed['clust']
sim_clust_vect = sim_processed['clust']

## Use the configured number of clusters rather than a second hard-coded copy
n_clusters = nclusters

## minlength keeps one entry per cluster even when the highest-numbered clusters are empty
data_counts = np.bincount(data_clust_index, minlength=n_clusters)
sim_counts = np.bincount(sim_clust_index, minlength=n_clusters)
## NOTE: this rebinds `data_frac`, which the checkpoint cell also uses as a config value
data_frac = data_counts / data_counts.sum()
sim_frac = sim_counts / sim_counts.sum()

# per-cluster mean confidence; clusters with no events stay NaN instead of
# triggering a mean-of-empty-slice warning
data_mean_conf = np.full(n_clusters, np.nan)
sim_mean_conf = np.full(n_clusters, np.nan)

for k in range(n_clusters):
    data_in_k = data_clust_index==k
    sim_in_k = sim_clust_index==k
    if data_in_k.any():
        data_mean_conf[k] = data_clust_vect[data_in_k, :].max(axis=1).mean()
    if sim_in_k.any():
        sim_mean_conf[k] = sim_clust_vect[sim_in_k, :].max(axis=1).mean()

print("Data cluster sizes:", data_counts)
print("Data fraction > 0.01:", (data_frac>0.01).sum())
print("Data mean max-prob per cluster:", data_mean_conf)

print("Sim cluster sizes:", sim_counts)
print("Sim fraction > 0.01:", (sim_frac>0.01).sum())
print("Sim mean max-prob per cluster:", sim_mean_conf)


In [None]:
## Actually run tsne (not always that useful)
## NOTE: near-duplicate of the earlier perplexity / exaggeration scan cell;
## the only difference is that perplexity 20 is left out of the list here.
## The perp/exag values assigned below are immediately replaced by the loop variables.
perp=30
exag=24
lr=200.0

for perp in [30, 40, 50, 100]:
    for exag in [8, 12, 16, 20]:
        print(perp, exag)
        data_tsne_results = run_tsne_skl(data_latent_subset, data_index_subset, data_alpha_subset, perp, exag, lr)

In [None]:
# A helper function to make column normalized histograms
from matplotlib.colors import LogNorm
def make_2D_histogram(x_vect, y_vect, norm='column', label_enum=Label):
    """Plot a column- or row-normalised 2D histogram of two integer-valued arrays.

    Parameters
    ----------
    x_vect, y_vect : numpy arrays
        Integer-valued arrays of equal length; every integer between the
        minimum and maximum of each array gets its own bin.
    norm : {'column', 'row'}
        'column' scales every x column to sum to 1, 'row' every y row.
        Any other value prints a message and returns without plotting.
    label_enum : optional
        Object whose ``name_from_index(i)`` gives the x tick label for integer
        ``i``. Pass None to keep the default numeric x ticks.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    if norm not in ('column', 'row'):
        print("Unknown norm option:", norm)
        return

    # Determine range of unique integer values
    x_min, x_max = x_vect.min(), x_vect.max()
    y_min, y_max = y_vect.min(), y_vect.max()

    # Define bin edges so each integer gets its own bin
    x_bins = np.arange(x_min, x_max + 2)  # +2 to include the last integer
    y_bins = np.arange(y_min, y_max + 2)

    # Compute the 2D histogram, then transpose so rows follow y and columns
    # follow x, which is the layout pcolormesh expects.
    # After the transpose H has shape (len(y_bins)-1, len(x_bins)-1).
    H, _, _ = np.histogram2d(x_vect, y_vect, bins=[x_bins, y_bins])
    H = H.T

    # axis=0 sums down each x column, axis=1 sums along each y row
    totals = H.sum(axis=0 if norm == 'column' else 1, keepdims=True)

    # `out` must be supplied together with `where`: without it the entries
    # of empty columns/rows are left uninitialised instead of being zero.
    H_normalized = np.divide(H, totals, out=np.zeros_like(H), where=totals != 0)

    plt.figure(figsize=(8, 6))
    mesh = plt.pcolormesh(x_bins, y_bins, H_normalized, cmap='viridis', shading='auto')
    plt.colorbar(mesh, label='Normalized Frequency (per '+norm+')')
    plt.ylabel("Cluster ID")

    if label_enum is not None:
        x_ticks = np.arange(x_min, x_max + 1)
        x_labels = [label_enum.name_from_index(i) for i in x_ticks]
        # NOTE(review): ticks sit on the right-hand bin edge (value + 1), not the
        # bin centre; presumably chosen to suit the rotated, right-aligned labels.
        plt.xticks(ticks=x_ticks + 1, labels=x_labels, rotation=45, ha='right')
    plt.yticks(ticks=y_bins[:-1])
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.show()

In [None]:
## `label_vect` / `clust_index` were never defined in this notebook; use the
## processed simulation outputs instead.
## NOTE(review): assumes the simulation sample is the intended one (the other
## plots take their labels from it) and that both entries are integer numpy arrays.
make_2D_histogram(sim_processed['labels'], sim_processed['clust_index'])

In [None]:
## Same histogram, normalised per row. `label_vect` / `clust_index` were never
## defined in this notebook; use the processed simulation outputs instead.
## NOTE(review): assumes the simulation sample is the intended one and that
## both entries are integer numpy arrays.
make_2D_histogram(sim_processed['labels'], sim_processed['clust_index'], 'row')

In [None]:
import importlib
import ME_analysis_libs
# Reload so that edits made to the module outside the notebook are picked up
importlib.reload(ME_analysis_libs)
from ME_analysis_libs import plot_cluster_examples
## Now pull out a bank of example images for each cluster
## (8 is passed as the fourth argument of plot_cluster_examples for every cluster)
for index in range(nclusters):
    print("Showing examples for cluster:", index, "which has", np.count_nonzero(data_processed['clust_index']==index), "values")
    plot_cluster_examples(single_data_dataset, data_processed['clust_index'], index, 8)#, data_processed['clust_max'])

    ## Or simulation
    ## NOTE(review): the disabled call below passes data_processed['clust_max']
    ## alongside the simulation dataset; it presumably should be sim_processed.
    #print("Showing examples for cluster:", index, "which has", np.count_nonzero(sim_processed['clust_index']==index), "values")
    #plot_cluster_examples(single_sim_dataset, sim_processed['clust_index'], index, 8, data_processed['clust_max'])    

In [None]:
## Dump out a large block of images for one cluster
## (cluster 18 here; the two 10s are presumably the grid dimensions -- confirm
## in ME_analysis_libs; the trailing comment holds an optional output file name)
from ME_analysis_libs import plot_cluster_bigblock
plot_cluster_bigblock(single_data_dataset, data_processed['clust_index'], 18, 10, 10, cluster_probs=data_processed['clust_max']) #, 'cluster_plots/v9_michel_like.png')

In [None]:
import json
from collections import defaultdict

## Dump out a file including the filenames and indices for the clustered images (for going back to the original files)
def dump_cluster_indices(index_label, cluster_labels, filenames, event_ids):
    """Write the events assigned to one cluster to ``cluster_<index_label>_events.json``.

    The JSON file maps each source file name to the list of event ids from
    that file whose cluster label equals ``index_label``. It is written to
    the current working directory.

    Parameters
    ----------
    index_label : int
        Cluster label to select.
    cluster_labels : array-like
        Cluster label of every event.
    filenames : array-like of str
        File name of every event, aligned with ``cluster_labels``.
    event_ids : array-like of int
        Event id of every event, aligned with ``cluster_labels``.
    """
    # Convert first: with a plain Python list, `cluster_labels == index_label`
    # is a single False rather than an element-wise mask, and the function
    # would silently write an empty file.
    cluster_labels = np.asarray(cluster_labels)
    indices = np.where(cluster_labels == index_label)[0]

    selected_filenames = np.asarray(filenames)[indices]
    selected_event_ids = np.asarray(event_ids)[indices]

    # Group by filename
    grouped = defaultdict(list)
    for fname, eid in zip(selected_filenames, selected_event_ids):
        ## Restore the old naming for ease of interpretation
        original_name = fname.replace("_images.h5", ".hdf5")
        grouped[original_name].append(int(eid))  # ensure JSON serializability

    # Save to JSON
    output_file = f'cluster_{index_label}_events.json'
    with open(output_file, 'w') as f:
        json.dump(grouped, f, indent=2)

    print(f"Saved grouped event list for cluster {index_label} to {output_file}")

In [None]:
## Write the per-file event lists for simulation clusters 16 and 17
## (one JSON file per cluster, in the current working directory)
dump_cluster_indices(16, sim_processed['clust_index'], sim_processed["filename"], sim_processed["event_id"])
dump_cluster_indices(17, sim_processed['clust_index'], sim_processed["filename"], sim_processed["event_id"])