In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pyimzml.ImzMLParser import ImzMLParser
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from datetime import datetime
import gc

In [2]:
# Define folder that contains the preprocessed dhg dataset
DHG_IN_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Preprocessed"
# Define file to export
META_DATA_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Preprocessed/Metadata.csv"

In [3]:
# 
meta_data = pd.read_csv(META_DATA_PATH)
# 
meta_data = meta_data[meta_data.file_name.str.contains('s')]
# 
meta_data["label"] = (meta_data.who_grade > 2).astype(int)

In [4]:
#
parsers = { file_name: ImzMLParser(os.path.join(DHG_IN_PATH, f"{file_name}.imzML")) for file_name in meta_data.file_name.unique()}

In [5]:
class DataGenerator(keras.utils.Sequence):
  'Generates data for Keras'
  def __init__(self, df, parsers, batch_size=32,
               dim=(92000,), shuffle=True):
    'Initialization'
    self.dim = dim
    self.batch_size = batch_size
    self.df = df
    self.parsers = parsers
    self.shuffle = shuffle
    self.on_epoch_end()

  def __len__(self):
    'Denotes the number of batches per epoch'
    return int(np.floor(len(self.df) / self.batch_size))

  def __getitem__(self, index):
    'Generate one batch of data'
    # Generate indexes of the batch
    batch_df = self.df[index*self.batch_size:(index+1)*self.batch_size]
    # Generate data
    X, y = self.__data_generation(batch_df)
    return X, y

  def on_epoch_end(self):
    'Updates indexes after each epoch'
    if self.shuffle == True:
        self.df = self.df.sample(frac=1).reset_index(drop=True)

  def __data_generation(self, batch_df):
    'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
    # Initialization
    X = np.empty((self.batch_size, *self.dim))
    y = np.empty((self.batch_size), dtype=int)
    # Generate data
    for i, (_, row) in enumerate(batch_df.iterrows()):
      # get spectra
      _, spectra = self.parsers[row.file_name].getspectrum(row.idx)
      # Store sample
      X[i,] = spectra
      # Store class
      y[i] = row.label
    return X, y

In [None]:
#
now = datetime.now()
#
with open(f"/sise/assafzar-group/assafzar/Leor/log {now.strftime('%d-%m-%Y-%H-%M-%S')}.txt", "w") as file:
  #
  for exclude_sample in tqdm(meta_data.file_name.unique()):
    #
    gc.collect()
    # 
    meta_data_train, meta_data_val = train_test_split(
      meta_data[meta_data.file_name != exclude_sample], test_size=0.2)
    # 
    meta_data_test = meta_data[meta_data.file_name == exclude_sample]

    # Parameters
    params = {"dim": (92000,), "batch_size": 512,
              "shuffle": True, "parsers": parsers}

    # Generators
    training_generator = DataGenerator(meta_data_train, **params)
    validation_generator = DataGenerator(meta_data_val, **params)
    testing_generator = DataGenerator(meta_data_test, **params)

    #
    model = keras.Sequential(
        [
          keras.Input(shape=(92000,)),
          layers.Dense(512, activation='relu'),
          layers.Dense(512, activation='relu'),
          layers.Dense(256, activation='relu'),
          layers.Dense(256, activation='relu'),
          layers.Dense(128, activation='relu'),
          layers.Dense(1, activation='sigmoid')
        ]
    )
    # 
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # 
    history = model.fit(x=training_generator, validation_data=validation_generator, epochs=10)
    # 
    test_eval = model.evaluate(testing_generator)
    
    #
    file.write(f"Excluded: {exclude_sample}\n")
    file.write(f"Train loss: {history.history['loss']}\n")
    file.write(f"Train accuracy: {history.history['accuracy']}\n")
    file.write(f"Validation loss: {history.history['val_loss']}\n")
    file.write(f"Validation accuracy: {history.history['val_accuracy']}\n")
    file.write(f"Test loss: {test_eval[0]}\n")
    file.write(f"Test accuracy: {test_eval[1]}\n\n")

  0%|          | 0/24 [00:00<?, ?it/s]2022-04-23 18:26:55.608590: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-23 18:26:55.608674: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-23 18:26:55.608722: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (cs-cpu-22): /proc/driver/nvidia/version does not exist
2022-04-23 18:26:55.611911: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-23 18:27:06.715414: I tensorflow/compiler/mlir/mlir_graph_

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  4%|▍         | 1/24 [31:37<12:07:19, 1897.35s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  8%|▊         | 2/24 [1:01:57<11:27:11, 1874.17s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 12%|█▎        | 3/24 [1:31:49<10:47:22, 1849.66s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 17%|█▋        | 4/24 [2:01:09<10:07:34, 1822.73s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 21%|██        | 5/24 [2:29:52<9:27:38, 1792.57s/it] 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 25%|██▌       | 6/24 [2:58:41<8:52:05, 1773.64s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 29%|██▉       | 7/24 [3:26:07<8:11:38, 1735.22s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 33%|███▎      | 8/24 [3:54:57<7:42:19, 1733.75s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 38%|███▊      | 9/24 [4:23:47<7:13:08, 1732.58s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 42%|████▏     | 10/24 [4:54:41<6:52:45, 1768.98s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 46%|████▌     | 11/24 [5:24:10<6:23:18, 1769.09s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 50%|█████     | 12/24 [5:55:55<6:01:58, 1809.88s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 54%|█████▍    | 13/24 [6:26:15<5:32:21, 1812.88s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 58%|█████▊    | 14/24 [6:56:55<5:03:31, 1821.17s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 62%|██████▎   | 15/24 [7:26:31<4:31:07, 1807.50s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 67%|██████▋   | 16/24 [7:55:53<3:59:09, 1793.74s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 71%|███████   | 17/24 [8:23:45<3:25:01, 1757.40s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 75%|███████▌  | 18/24 [8:54:14<2:57:52, 1778.72s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 79%|███████▉  | 19/24 [9:24:37<2:29:20, 1792.06s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 83%|████████▎ | 20/24 [9:54:52<1:59:55, 1798.92s/it]

Epoch 1/10

In [33]:
output = []

with open(f"/sise/assafzar-group/assafzar/Leor/log {now.strftime('%d-%m-%Y-%H-%M-%S')}.txt", "r") as file:
  for _ in tqdm(meta_data.file_name.unique()):
    exclude_sample = file.readline().replace("Excluded: ", "").replace("\n","")
    train_loss = list(file.readline().replace("Train loss: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    train_accuracy = list(file.readline().replace("Train accuracy: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    val_loss = list(file.readline().replace("Validation  loss: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    val_accuracy = list(file.readline().replace("Validation accuracy: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    test_loss = file.readline().replace("Test loss: ", "").replace("\n","")
    test_accuracy = file.readline().replace("Test accuracy: ", "").replace("\n","")
    _ = file.readline()
    output.append([exclude_sample, float(train_loss), float(train_accuracy), float(val_loss), float(val_accuracy), float(test_loss), float(test_accuracy)])
    
new_df = pd.DataFrame(columns=["exclude_sample", "train_loss", "train_accuracy", "val_loss", "val_accuracy", "test_loss", "test_accuracy"], data=output)

100%|██████████| 24/24 [00:00<00:00, 15185.29it/s]


In [52]:
new_df[["train_accuracy", "val_accuracy", "test_accuracy"]].mean() * 100

train_accuracy    99.993682
val_accuracy      99.766077
test_accuracy     83.034261
dtype: float64

In [53]:
new_df[["train_loss", "val_loss", "test_accuracy"]].sem() * 100

train_loss       0.015288
val_loss         0.089735
test_accuracy    6.756506
dtype: float64

In [83]:
from sklearn.dummy import DummyClassifier

output2 = []

for exclude_sample in tqdm(meta_data.file_name.unique()):
    #
    gc.collect()
    # 
    meta_data_train, meta_data_val = train_test_split(
      meta_data[meta_data.file_name != exclude_sample], test_size=0.2)
    # 
    meta_data_test = meta_data[meta_data.file_name == exclude_sample]
    
    #
    X_train = meta_data_train.drop(["label", "histology", "who_grade"], axis=1)
    Y_train = meta_data_train["label"]
    
    #
    X_test = meta_data_test.drop(["label", "histology", "who_grade"], axis=1)
    Y_test = meta_data_test["label"]
    
    # 
    dummy_clf = DummyClassifier(strategy="most_frequent")
    
    # 
    dummy_clf.fit(X_train, Y_train)
    
    #
    output2.append([dummy_clf.score(X_train, Y_train), dummy_clf.score(X_test, Y_test)])

100%|██████████| 24/24 [00:06<00:00,  3.49it/s]


In [86]:
new_df_2 = pd.DataFrame(columns=["dummy_train_accuracy", "dummy_test_accuracy"], data=output2)

In [88]:
new_df_2[["dummy_train_accuracy", "dummy_test_accuracy"]].mean() * 100

dummy_train_accuracy    63.467914
dummy_test_accuracy     66.666667
dtype: float64

In [89]:
new_df_2[["dummy_train_accuracy", "dummy_test_accuracy"]].sem() * 100

dummy_train_accuracy    0.496154
dummy_test_accuracy     9.829464
dtype: float64