# Member Classification Model

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import sys
from torch_geometric.data import DataLoader, Dataset, Data
import lightning.pytorch as pl
import seaborn as sns
import pandas as pd
import os
from tqdm import tqdm
import torch
import itertools
import yaml
from pytorch_lightning.loggers import WandbLogger

import matplotlib.pyplot as plt
from torch_geometric.utils import to_scipy_sparse_matrix
import scipy.sparse as sps
import xxhash
from torch_cluster import knn

from epic_clustering.utils import plot_clusters, get_cluster_pos
from epic_clustering.models import MemberClassification
from epic_clustering.scoring import weighted_v_score

## 1. Training Loop

Training took me about 2 hours on a single (A100) GPU. If you use a smaller GPU, you may need to decrease the batch size. The configuration I used for this submission is:

```
input_dir: /global/cfs/cdirs/m3443/data/PowerWeek/train/train/
project: PowerWeek_MemberClassification
checkpoint_dir: /global/cfs/cdirs/m3443/data/PowerWeek/checkpoints/

data_split: [2000, 10, 10]
batch_size: 20
input_features: 12
emb_hidden: 1024
nb_layer: 4
activation: ReLU

warmup: 10
lr: 0.01
patience: 30
max_epochs: 10
factor: 0.7
num_seeds: 40
```

In [2]:
# Read the hyperparameters from the YAML file; safe_load only constructs plain Python objects.
with open("member_classification.yaml") as f:
    member_classification_config = yaml.safe_load(f)
# Build the model from the config and run its "fit"-stage setup.
# NOTE(review): the recorded output suggests setup() converts the events to PyG data objects
# and creates the train/validation/test splits — confirm in MemberClassification.setup.
model = MemberClassification(member_classification_config)
model.setup(stage="fit")

Converting to PyG data objects


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 10020/10020 [00:34<00:00, 290.32it/s]

Loaded 10000 training events, 10 validation events and 10 testing events





In [None]:
# Log the run to Weights & Biases under the project named in the config.
logger = WandbLogger(project=member_classification_config["project"])
# Train on a single GPU.
# NOTE(review): max_epochs is hard-coded to 300 and is not read from the config, while the
# configuration listed in the markdown above shows `max_epochs: 10` — confirm which is intended.
trainer = pl.Trainer(devices=1, accelerator="gpu", max_epochs=300, logger=logger)
# No dataloaders are passed here, so the model presumably provides them itself — TODO confirm.
trainer.fit(model)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmurnanedaniel[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | network | Sequential | 3.2 M 
---------------------------------------
3.2 M     Trainable params
0         Non-trainable params
3.2 M     Total params
12.653    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:  69%|██████▉   | 344/500 [03:07<01:25,  1.83it/s, v_num=12gl]

## 2. Inference!

In [None]:
# Path of the trained classifier checkpoint that the next cell loads.
checkpoint_file = "/global/cfs/cdirs/m3443/data/PowerWeek/checkpoints/classifier.ckpt"

In [None]:
# Restore the trained model from the checkpoint (this replaces the model object built for training).
model = MemberClassification.load_from_checkpoint(checkpoint_file)

In [None]:
# Override the split restored with the checkpoint.
# NOTE(review): the three entries are presumably [train, validation, test] event counts — confirm.
model.hparams["data_split"] = [5000, 100, 100]

In [None]:
# Re-run the "fit"-stage setup after changing hparams; presumably this rebuilds the datasets
# (e.g. model.trainset, iterated further down) with the new split — TODO confirm.
model.setup(stage="fit")

In [None]:
# Load the raw hit table for as many events as the model's data split asks for.
input_dir = "/global/cfs/cdirs/m3443/data/PowerWeek/train/train"
num_events = sum(model.hparams["data_split"])

# NOTE(review): the `// 1000 + 1` file count presumably assumes about 1000 events per CSV — confirm.
csv_files = sorted(
    os.path.join(input_dir, name)
    for name in os.listdir(input_dir)
    if name.endswith('.csv')
)[:num_events//1000 + 1]

events_df = pd.concat([pd.read_csv(path) for path in csv_files])

# Keep only the first `num_events` event ids (in sorted order).
kept_events = sorted(events_df["event"].unique())[:num_events]

# Filter and cast in one expression: astype returns a new frame, so no column is ever
# assigned into a filtered slice (which would raise SettingWithCopyWarning here).
# The uint64 cast of the truth labels was marked "needed for some reason" by the original
# author; the reason is not recorded.
events_df = events_df[events_df["event"].isin(kept_events)].astype({"clusterID": np.uint64})

In [None]:
pd.options.mode.chained_assignment = None
def label_hits(event, events_df, pred_edges, max_dist=None):
    """Assign a cluster label to every hit of one event.

    Hits that appear in ``pred_edges`` are treated as seeds. Every other hit is
    linked to its nearest seed in the (posx, posy) plane, and the connected
    components of the resulting graph become the clusters. When the event has
    no seeds or no nonseeds the nearest-seed step is skipped, so an event with
    no accepted edges yields one cluster per hit instead of raising.

    Args:
        event: Event object; only ``event.event_id`` is read, to select the
            matching rows of ``events_df``.
        events_df: DataFrame of hits with at least the columns ``event``,
            ``hit_number``, ``posx``, ``posy`` (and ``posz`` when ``max_dist``
            is given).
        pred_edges: CPU integer tensor of shape (2, E) holding the accepted
            edges. Node ids are assumed to be ``hit_number`` values that also
            equal the row position of the hit within its event (0 .. N-1);
            the graph construction below relies on this -- TODO confirm
            against the dataset builder.
        max_dist: Optional cut-off. Seed-to-nonseed links whose 3D distance
            is not below ``max_dist`` are dropped.

    Returns:
        A copy of the event's rows with two added columns: ``tmp_clusterID``
        (component index within the event) and ``labelID`` (uint64 xxh64 hash
        of "<event>_<tmp_clusterID>", so labels do not clash across events).
    """
    # Copy so the new columns are written to an independent frame, not a slice of events_df.
    event_df = events_df[events_df.event == event.event_id].copy()
    hit_numbers = event_df.hit_number.values

    # Seeds are all hits touched by a predicted edge; everything else is a nonseed.
    seeds_idx = pred_edges.unique()
    is_seed = np.isin(hit_numbers, seeds_idx.long().numpy())

    # Hit numbers kept in row order, i.e. the same order as the positions handed to knn,
    # so knn's positional output maps straight back to the right hits.
    seed_hits = torch.from_numpy(hit_numbers[is_seed]).long()
    nonseed_hits = torch.from_numpy(hit_numbers[~is_seed]).long()

    if len(seed_hits) > 0 and len(nonseed_hits) > 0:
        xy = event_df[['posx', 'posy']].to_numpy()
        # knn(x, y, 1): for each nonseed (y) find the closest seed (x).
        # Row 0 of the result indexes y (nonseeds), row 1 indexes x (seeds).
        assignment = knn(torch.from_numpy(xy[is_seed]), torch.from_numpy(xy[~is_seed]), 1)
        # Convert 0, .., N positions back to hit numbers: row 0 = seed, row 1 = nonseed.
        nonseeds_to_seeds = torch.stack([seed_hits[assignment[1]], nonseed_hits[assignment[0]]])
    else:
        # Nothing to attach: either there are no seeds, or every hit already is one.
        nonseeds_to_seeds = torch.empty((2, 0), dtype=torch.long)

    if max_dist is not None:
        # Positions of THIS event only, so indexing by hit number picks the right hits.
        positions = torch.from_numpy(event_df[["posx", "posy", "posz"]].values)
        separation = torch.sqrt(torch.sum((positions[nonseeds_to_seeds[0]] - positions[nonseeds_to_seeds[1]])**2, dim=-1))
        nonseeds_to_seeds = nonseeds_to_seeds[:, separation < max_dist]

    # Add the seed-seed edges and the seed-nonseed edges into the same graph
    combined_graph = torch.cat([nonseeds_to_seeds, pred_edges], dim=-1)
    sparse_edges = to_scipy_sparse_matrix(combined_graph, num_nodes=len(event_df))

    # Perform a connected components algorithm on the graph
    _, candidate_labels = sps.csgraph.connected_components(sparse_edges, directed=False, return_labels=True)
    event_df['tmp_clusterID'] = candidate_labels.astype(np.int64)

    # Encode the labels to make sure they are unique across all events. The digests are
    # unsigned 64-bit values, so force uint64: letting pandas infer the dtype per event
    # can give int64 for some events and uint64 for others, which upcasts on concat.
    str_ids = event_df['event'].astype('str') + "_" + event_df['tmp_clusterID'].astype('str')
    event_df['labelID'] = np.array([xxhash.xxh64_intdigest(x, seed=0) for x in str_ids.values], dtype=np.uint64)

    return event_df

Let's test on the training data first

In [None]:
labelled_events_df = []
failed_events = []

# Move the model to the GPU once instead of on every iteration, and switch to eval mode so
# layers such as dropout or batch norm (if the network has any) behave deterministically.
model.cuda()
model.eval()

for event in tqdm(model.trainset):
    try:
        with torch.no_grad():
            edge_scores = model(event.x.cuda()).cpu().squeeze()
        # Keep only the edges the classifier scores above 0.6.
        labelled_events_df.append(label_hits(event, events_df, event.edge_index[:, edge_scores > 0.6]))
    except Exception as exc:
        # Stay best-effort (skip the event) but remember what went wrong. Catching Exception
        # rather than using a bare except lets Ctrl-C / kernel interrupt stop the loop.
        failed_events.append(f"{event}: {exc!r}")

if failed_events:
    print(f"Skipped {len(failed_events)} events that could not be labelled; first failure: {failed_events[0]}")

labelled_events_df = pd.concat(labelled_events_df)
print(f"Vscore: {weighted_v_score(labels_true=labelled_events_df['clusterID'], labels_pred=labelled_events_df['labelID'], labels_weight=labelled_events_df['E'])[2]}")

### Test Dataset

Now, to build the test set

In [None]:
# Same checkpoint path as in the inference section above, repeated so this section can be run on its own.
checkpoint_file = "/global/cfs/cdirs/m3443/data/PowerWeek/checkpoints/classifier.ckpt"

In [None]:
# Reload a fresh model from the checkpoint, discarding the hparams overrides made for the training-data check.
model = MemberClassification.load_from_checkpoint(checkpoint_file)

In [None]:
# Put all 10000 events in the first split slot (zero in the other two) and read from the test directory.
# NOTE(review): the first slot presumably feeds model.trainset, which the labelling loop below iterates — confirm.
model.hparams["data_split"] = [10000, 0, 0]
model.hparams["input_dir"] = "/global/cfs/cdirs/m3443/data/PowerWeek/test/test"

In [None]:
# Run the "fit"-stage setup with the overridden hparams; the test events are later read via model.trainset.
model.setup(stage="fit")

In [None]:
# Raw hit table of the test set, limited to the number of events in the model's data split.
input_dir = "/global/cfs/cdirs/m3443/data/PowerWeek/test/test"
num_events = sum(model.hparams["data_split"])

# Collect the CSV paths, order them by name and keep the leading files.
csv_files = [
    os.path.join(input_dir, name)
    for name in os.listdir(input_dir)
    if name.endswith('.csv')
]
csv_files.sort()
csv_files = csv_files[:num_events//1000 + 1]

# Read the files (with a progress bar) and stack them into one frame.
events_df = pd.concat([pd.read_csv(path) for path in tqdm(sorted(csv_files))])

# Restrict to the first `num_events` event ids in sorted order.
if num_events is not None:
    first_event_ids = sorted(events_df["event"].unique())[:num_events]
    events_df = events_df[events_df["event"].isin(first_event_ids)]

In [None]:
labelled_events_df = []

# Move the model to the GPU once instead of on every iteration, and switch to eval mode so
# layers such as dropout or batch norm (if the network has any) behave deterministically.
model.cuda()
model.eval()

for event in tqdm(model.trainset):
    try:
        with torch.no_grad():
            edge_scores = model(event.x.cuda()).cpu().squeeze()
        # Keep only the edges the classifier scores above 0.65.
        labelled_events_df.append(label_hits(event, events_df, event.edge_index[:, edge_scores > 0.65]))
    except Exception as exc:
        # Stay best-effort (the event's hits are handled as "missing rows" afterwards), but
        # report the actual error. Catching Exception rather than using a bare except lets
        # Ctrl-C / kernel interrupt stop the loop.
        print(f"Error with event {event}: {exc!r}")

labelled_events_df = pd.concat(labelled_events_df)

Some rows are missing: any event that raised an error in the loop above was skipped, so its hits never received a label. Let's add those hits back in with random labels...

In [None]:
# Hits of the raw test table whose uniqueID does not appear in the labelled output.
missing_rows = events_df[~events_df.uniqueID.isin(labelled_events_df.uniqueID)]

In [None]:
# Give every unlabelled hit a random label in [0, 1000000). A fixed-seed generator makes the
# submission reproducible across reruns, and `assign` builds a new frame instead of writing
# into a filtered slice. uint64 keeps the column unsigned like the xxh64 digests used for the
# other labels, so concatenating the two does not need a dtype upcast.
missing_rows = missing_rows.assign(
    labelID=np.random.default_rng(0).integers(0, 1000000, size=len(missing_rows), dtype=np.uint64)
)

In [None]:
# Append the randomly-labelled rows to the labelled hits.
# NOTE: not idempotent — re-running this cell appends the missing rows a second time.
labelled_events_df = pd.concat([labelled_events_df, missing_rows])

Save the data

In [None]:
# Store the predicted labels under the "clusterID" column, which is the column written to the output file.
labelled_events_df["clusterID"] = labelled_events_df["labelID"]

In [None]:
# Write only the hit identifier and its predicted cluster to the parquet file.
labelled_events_df[["uniqueID", "clusterID"]].to_parquet("membership_classification.parquet")