# ***DESI Human Glioma Section Spectra Classification***

This notebook shows the process of scrion spectra classification of the DESI Human Glioma preprocessed dataset.

### ***Import packages***

Before we begin, let"s import all the necessary packages for this notebook.
First we add the directory which has our python files:

In [1]:
import sys
sys.path.insert(0, "/sise/assafzar-group/assafzar/Leor/NanoBiopsy")

Next we import all the necessary packages for this notebook:

In [2]:
import gc
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from skimage import (filters)
from tqdm.notebook import tqdm
from pyimzml.ImzMLParser import ImzMLParser, getionimage
from nnbiopsy.bn_vae import BNVAE

%matplotlib inline

### ***Constants definitions***

Next, let"s define some constant variables for this notebook:

In [3]:
# Define folder that contains the preprocessed dhg dataset
DHG_IN_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Preprocessed"
# Define file that contains clinical state anotations
LABELS_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Clinical_state_anotations.csv"

### ***Reading MSI clinical state anotations***

Next, lets read the clinical state anotations for each MSI:

In [4]:
# Read clinical state anotations csv
labels_df = pd.read_csv(LABELS_PATH)

# Filtser only sample_type section = "s"
labels_df = labels_df[labels_df["sample_type"] == "s"]

### ***Get all tissue spectra from all MSI:***

Next, let"s get all tissue spectra from all MSI:

In [None]:
# Create list to save spectras info
x_coordinates = []
y_coordinates = []
sample_names = []
sample_nums = []
labels = []
idxs = []

# Loop over each MSI
for index, msi_row in tqdm(labels_df.iterrows(), total=labels_df.shape[0], desc="MSI Loop"):
  # Parse the MSI file 
  with ImzMLParser(os.path.join(DHG_IN_PATH, f"{msi_row.file_name}.imzML")) as reader:
    # Get local TIC image of msi in mz region [600, 900]
    local_tic_img = getionimage(reader, 750, tol=150)

    # Threshold image to seperate tissue spectra from background
    smooth = filters.gaussian(local_tic_img, sigma=1.5)
    thresh_mean = filters.threshold_mean(smooth)
    thresh_img = local_tic_img > thresh_mean

    # Get sample number
    sample_num = int(msi_row.file_name.replace("HG ", "").replace("_", "-").split("-")[0])

    # Get sample label
    sample_label = int(msi_row.who_grade > 2)

    # Loop over each spectra
    for idx, (x,y,z) in tqdm(enumerate(reader.coordinates), total=len(reader.coordinates), desc="Spectra Loop"):
      # Check if spectra is tissue
      if thresh_img[y - 1, x - 1]:
        # Keep x coordinate of spectra
        x_coordinates.append(x)
        # Keep y coordinate of spectra
        y_coordinates.append(y)
        # Keep sample name of spectra
        sample_names.append(msi_row.file_name)
        # Keep sample number of spectra
        sample_nums.append(sample_num)
        # Keep sample label of spectra
        labels.append(sample_label)
        # Keep unique id of spectra
        idxs.append(idx)

# Convert to numpy array
x_coordinates = np.array(x_coordinates)
y_coordinates = np.array(y_coordinates)
sample_names = np.array(sample_names)
sample_nums = np.array(sample_nums)
labels = np.array(labels)
idxs = np.array(idxs)

MSI Loop:   0%|          | 0/24 [00:00<?, ?it/s]

In [30]:
parsers = { file_name: ImzMLParser(os.path.join(DHG_IN_PATH, f"{file_name}.imzML")) for file_name in labels_df.file_name.unique()}

#
def spectra_gen(indexes):
  for i in indexes:
    file_name = sample_names[i]
    idx = idxs[i]
    _, spectra = parsers[file_name].getspectrum(int(idx))
    yield [spectra, spectra]

#
def get_tf_dataset(ids):
  #
  dataset = tf.data.Dataset.from_generator(
    spectra_gen, output_types=(tf.float32, tf.float32), output_shapes=((None,), (None,)),
    args=ids)

  #
  dataset = dataset.shuffle(
    buffer_size=len(z), seed=0, reshuffle_each_iteration=True)
  
  #
  dataset = dataset.map(lambda i: tf.py_function(
    func=load_spectra, inp=[i], Tout=[tf.float32, tf.float32]),
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
  #
  dataset = dataset.batch(256).map(_fixup_shape)
  #
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

### ***LOOCV section spectra classification:***

Next, let"s apply LOOCV classification using a simple dense NN:

In [31]:
# Loop over each sample number
for exclude_sample in tqdm(np.unique(sample_nums)[:1]):
  # Clear graph
  K.clear_session()
  gc.collect()
  
  # Create filter for training data
  train_filter = (sample_nums != exclude_sample)
  
  # Get unique ids of training data
  train_ids = ids[train_filter]
  
  # Split training to train and validation
  train_ids, val_ids, _, _ = train_test_split(
    train_ids, labels[train_filter], test_size=0.2,
    random_state=0, stratify=labels[train_filter])
  
  # Create generators
  training_generator = get_tf_dataset(train_ids)
  validation_generator  = get_tf_dataset(val_ids)
  test_generator  = get_tf_dataset(ids=ids[~train_filter])
  
  # Create Callback to save the NN weights for best epoch by validation
  checkpoint_filepath = "/sise/assafzar-group/assafzar/Leor/NanoBiopsy/nnbiopsy/desi_human_glioma"
  model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor="val_loss",
    mode="min",
    save_best_only=True
  )
  
  # Create VAE model
  vae_model = BNVAE(92000, 512, 10)
  
  # Compile the VAE model
  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
  vae_model.compile(optimizer, loss=tf.keras.losses.MeanSquaredError())

  # Train the VAE model
  history = vae_model.fit(x=training_generator,
                          validation_data=validation_generator,
                          epochs=10,
                          callbacks=[model_checkpoint_callback])
  
  # Load the saved weights into the model
  vae_model.load_weights(checkpoint_filepath)
  
  # Evalute The NN on test set
  test_eval = vae_model.evaluate(x=test_generator)

  0%|          | 0/1 [00:00<?, ?it/s]

2022-05-02 12:14:48.007377: W tensorflow/core/framework/op_kernel.cc:1741] Invalid argument: ValueError: callback pyfunc_80 is not found
Traceback (most recent call last):

  File "/home/leorro/.conda/envs/tfgpu_jup/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 232, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_80 is not found


2022-05-02 12:14:48.007544: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Invalid argument: ValueError: callback pyfunc_80 is not found
Traceback (most recent call last):

  File "/home/leorro/.conda/envs/tfgpu_jup/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 232, in __call__
    raise ValueError("callback %s is not found" % token)

ValueError: callback pyfunc_80 is not found


	 [[{{node PyFunc}}]]


Epoch 1/10
     15/Unknown - 15s 1s/step - loss: nan   

KeyboardInterrupt: 

In [None]:
"""
predictions = {}
true_labels = {}

# Loop over each sample number
for exclude_sample in tqdm(np.unique(sample_nums)[:3]):
  # Clear graph
  K.clear_session()
  gc.collect()
  
  # Create filter for training data
  train_filter = (sample_nums != exclude_sample)
  
  # Get unique ids of training data
  train_ids = ids[train_filter]
  
  # Split training to train and validation
  train_ids, val_ids, y_train, y_val = train_test_split(
    train_ids, labels[train_filter], test_size=0.2,
    random_state=0, stratify=labels[train_filter])

  # Create dense NN for classification
  model = keras.Sequential(
      [
        keras.Input(shape=(intensities.shape[1],)),
        layers.Dense(512, activation="relu"),
        layers.Dense(512, activation="relu"),
        layers.Dense(256, activation="relu"),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(1, activation="sigmoid")
      ]
  )
  
  # Create Callback to save the NN weights for best validation accuracy epoch
  checkpoint_filepath = "/sise/assafzar-group/assafzar/Leor/NanoBiopsy/nnbiopsy/desi_human_glioma"
  model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
  )
  
  # Compile the NN
  model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

  # Train the NN
  history = model.fit(x=intensities[np.isin(ids, train_ids)],
                      y=y_train,
                      batch_size=512,
                      validation_data=(intensities[np.isin(ids, val_ids)], y_val),
                      epochs=3,
                      shuffle=True,
                      callbacks=[model_checkpoint_callback])
  
  # Load the saved weights into the model
  model.load_weights(checkpoint_filepath)
  
  # Evalute The NN on test set
  test_eval = model.evaluate(x=intensities[~train_filter],
                             y=labels[~train_filter],
                             batch_size=512)
  
  #
  predictions[exclude_sample] = model.predict(x=intensities[~train_filter]).ravel()
  
  #
  true_labels[exclude_sample] = labels[~train_filter]
"""

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(
  np.concatenate(list(true_labels.values())),
  np.concatenate(list(predictions.values())),
  pos_label=0)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, "b", label = f"AUC = {roc_auc:.2f}")
plt.legend(loc = "lower right")
plt.plot([0, 1], [0, 1],"r--")
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.show()