In [1]:
from metaspace import SMInstance
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from deepims_clust import DeepClustering
from deepims_clust.utils import size_adaption, size_adaption_symmetric

import umap
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


# Datasets

In [2]:
# METASPACE dataset IDs loaded and compared throughout this notebook.
evaluation_datasets = [
    '2022-12-07_02h13m50s',
    '2022-12-07_02h13m20s',
    '2022-12-07_02h10m45s',
    '2022-12-07_02h09m41s',
    '2022-12-07_02h08m52s',
    '2022-12-07_01h02m53s',
    '2022-12-07_01h01m06s',
]

In [3]:
# Annotation query shared by the results table and the ion images below.
# Both calls must use the same values: every image is looked up in a mapping
# built from the results table, and a mismatch would raise KeyError.
ANNOTATION_DATABASE = ("HMDB", "v4")
ANNOTATION_FDR = 0.2

training_results = {}  # dataset ID -> annotation results table (index reset)
training_images = {}   # dataset ID -> array of on-sample ion images
training_if = {}       # dataset ID -> formula+adduct strings, aligned with training_images
polarity = '+'         # not referenced in the cells visible here

sm = SMInstance()

for k in evaluation_datasets:
    ds = sm.dataset(id=k)
    results = ds.results(database=ANNOTATION_DATABASE, fdr=ANNOTATION_FDR).reset_index()
    training_results[k] = results

    annotation_images = ds.all_annotation_images(
        fdr=ANNOTATION_FDR, database=ANNOTATION_DATABASE, only_first_isotope=True
    )

    # formula+adduct -> True when the annotation is NOT flagged as offSample.
    onsample = dict(zip(results['formula'].str.cat(results['adduct']), ~results['offSample']))

    # Filter once and keep each ion label paired with its image, so the label
    # list and the image array cannot drift out of alignment.
    # `_images` is a private attribute of the image object; element 0 is taken
    # here -- presumably the first-isotope image, TODO confirm.
    kept = [
        (img.formula + img.adduct, img._images[0])
        for img in annotation_images
        if onsample[img.formula + img.adduct]
    ]
    training_if[k] = [ion for ion, _ in kept]
    training_images[k] = np.array([image for _, image in kept])

100%|████████████████████████████████████████| 293/293 [00:01<00:00, 229.99it/s]
100%|████████████████████████████████████████| 287/287 [00:01<00:00, 233.70it/s]
100%|████████████████████████████████████████| 251/251 [00:01<00:00, 225.04it/s]
100%|████████████████████████████████████████| 313/313 [00:01<00:00, 251.81it/s]
100%|████████████████████████████████████████| 311/311 [00:01<00:00, 247.02it/s]
100%|████████████████████████████████████████| 237/237 [00:01<00:00, 219.68it/s]
100%|████████████████████████████████████████| 241/241 [00:01<00:00, 216.66it/s]


# Data preprocessing

In [4]:
# Shape of each dataset's image stack before padding, in insertion order.
[images.shape for images in training_images.values()]

[(245, 74, 84),
 (221, 88, 88),
 (194, 41, 78),
 (237, 56, 85),
 (276, 71, 82),
 (201, 35, 55),
 (186, 60, 44)]

In [5]:
# Bring all datasets to a common image size; the shape listing below is the
# cell's displayed value and serves as a check of the result.
padding_images = size_adaption_symmetric(training_images)
[padded.shape for padded in padding_images.values()]

[(245, 88, 88),
 (221, 88, 88),
 (194, 88, 88),
 (237, 88, 88),
 (276, 88, 88),
 (201, 88, 88),
 (186, 88, 88)]

In [6]:
# Ions (formula+adduct strings) present in every evaluation dataset.
# Derived from `evaluation_datasets` instead of repeating the dataset IDs, so
# the intersection follows any change to that list.
tst = set.intersection(*(set(training_if[k]) for k in evaluation_datasets))

In [7]:
# Report how many ions are shared by all datasets (size of the set `tst`).
print(f'Intersection of all datasets: {len(tst)}')

Intersection of all datasets: 126


# Preparing data

In [8]:
# Dataset IDs whose images are placed in the training split.
training_dsid = [
    '2022-12-07_01h01m06s',
    '2022-12-07_02h13m20s',
    '2022-12-07_02h10m45s',
    '2022-12-07_02h09m41s',
    '2022-12-07_02h08m52s',
    '2022-12-07_01h02m53s',
    '2022-12-07_02h13m50s',
]

# Dataset IDs for the testing split; currently empty.
testing_dsid = []

In [9]:
# Accumulators per split: image stacks, plus one dataset ID and one ion label
# for every image in those stacks.
training_data, training_datasets, training_ions = [], [], []
testing_data, testing_datasets, testing_ions = [], [], []

for dsid, imgs in padding_images.items():

    if dsid in training_dsid:
        training_data.append(imgs)
        training_datasets.extend([dsid] * len(imgs))
        training_ions.extend(training_if[dsid])

    # NOTE(review): `testing_dsid` is not consulted here -- every dataset is
    # added to the testing split, including those used for training. Confirm
    # whether that is intended.
    testing_data.append(imgs)
    testing_datasets.extend([dsid] * len(imgs))
    testing_ions.extend(training_if[dsid])

# Collapse the per-dataset pieces into flat arrays along the first axis.
training_data = np.concatenate(training_data)
training_datasets = np.array(training_datasets)
training_ions = np.array(training_ions)

testing_data = np.concatenate(testing_data)
testing_datasets = np.array(testing_datasets)
testing_ions = np.array(testing_ions)

# Activation comparison

## Softmax

In [13]:
# Model for the softmax arm of the activation comparison.
model_softmax = DeepClustering(
    # Inputs: image stack with one dataset ID and one ion label per image.
    images=training_data,
    dataset_labels=training_datasets,
    ion_labels=training_ions,
    num_cluster=8,
    # Bound schedule: the recorded output reports an upper bound of 69.0 and a
    # lower bound of 61.0 after the 16 training epochs.
    initial_upper=93,
    initial_lower=37,
    upper_iteration=1.5,
    lower_iteration=1.5,
    dataset_specific_percentiles=True,
    random_flip=True,
    knn=True,
    k=5,
    # Optimisation settings.
    lr=0.0001,
    batch_size=128,
    pretraining_epochs=11,
    training_epochs=16,
    cae_encoder_dim=20,
    use_gpu=True,
    activation='softmax',
    random_seed=1225,
)

After 16 epochs, the upper bound will be: 69.0.
After 16 epochs, the lower bound will be: 61.0.


In [14]:
# Train the softmax model. The recorded output lists the pretraining-epoch
# losses followed by the training-epoch losses, and the call's value is 0.
model_softmax.train()

CAE Final conv size = 3136
CNNClust final conv dim = 289
Pretraining Epoch: 0 Loss: 0.019240
Pretraining Epoch: 1 Loss: 0.009439
Pretraining Epoch: 2 Loss: 0.008414
Pretraining Epoch: 3 Loss: 0.008081
Pretraining Epoch: 4 Loss: 0.007900
Pretraining Epoch: 5 Loss: 0.007604
Pretraining Epoch: 6 Loss: 0.007081
Pretraining Epoch: 7 Loss: 0.006920
Pretraining Epoch: 8 Loss: 0.006813
Pretraining Epoch: 9 Loss: 0.006759
Pretraining Epoch: 10 Loss: 0.006730
Training Epoch: 0 Loss: 0.023644
Training Epoch: 1 Loss: 0.014068
Training Epoch: 2 Loss: 0.010097
Training Epoch: 3 Loss: 0.008973
Training Epoch: 4 Loss: 0.008524
Training Epoch: 5 Loss: 0.008159
Training Epoch: 6 Loss: 0.008331
Training Epoch: 7 Loss: 0.008014
Training Epoch: 8 Loss: 0.008014
Training Epoch: 9 Loss: 0.008031
Training Epoch: 10 Loss: 0.007692
Training Epoch: 11 Loss: 0.007723
Training Epoch: 12 Loss: 0.007713
Training Epoch: 13 Loss: 0.007759
Training Epoch: 14 Loss: 0.007913
Training Epoch: 15 Loss: 0.007499


0

## ReLU

## Sigmoid

In [11]:
# Model for the "Sigmoid" section, with the same settings as `model_softmax`
# except that no `activation` is given.
# NOTE(review): the DeepClustering default activation therefore applies --
# confirm that it is sigmoid, as the section heading implies.
# NOTE(review): this cell's execution count (11) is lower than that of the
# softmax cells above (13, 14); re-run the notebook top to bottom to confirm
# the recorded outputs are reproducible in this order.
model = DeepClustering(
    # Inputs: image stack with one dataset ID and one ion label per image.
    images=training_data,
    dataset_labels=training_datasets,
    ion_labels=training_ions,
    num_cluster=8,
    # Bound schedule: the recorded output reports an upper bound of 69.0 and a
    # lower bound of 61.0 after the 16 training epochs.
    initial_upper=93,
    initial_lower=37,
    upper_iteration=1.5,
    lower_iteration=1.5,
    dataset_specific_percentiles=True,
    random_flip=True,
    knn=True,
    k=5,
    # Optimisation settings.
    lr=0.0001,
    batch_size=128,
    pretraining_epochs=11,
    training_epochs=16,
    cae_encoder_dim=20,
    use_gpu=True,
    random_seed=1225,
)

After 16 epochs, the upper bound will be: 69.0.
After 16 epochs, the lower bound will be: 61.0.
