In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pyimzml.ImzMLParser import ImzMLParser
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from datetime import datetime
import gc

In [2]:
# Define folder that contains the preprocessed dhg dataset
DHG_IN_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Preprocessed"
# Define file to export
META_DATA_PATH = "/sise/assafzar-group/assafzar/Leor/DHG/Preprocessed/Metadata.csv"

In [3]:
# 
meta_data = pd.read_csv(META_DATA_PATH)
# 
meta_data = meta_data[meta_data.file_name.str.contains('r')]
# 
meta_data["label"] = (meta_data.who_grade > 2).astype(int)

In [4]:
#
parsers = { file_name: ImzMLParser(os.path.join(DHG_IN_PATH, f"{file_name}.imzML")) for file_name in meta_data.file_name.unique()}

In [5]:
class DataGenerator(keras.utils.Sequence):
  'Generates data for Keras'
  def __init__(self, df, parsers, batch_size=32,
               dim=(92000,), shuffle=True):
    'Initialization'
    self.dim = dim
    self.batch_size = batch_size
    self.df = df
    self.parsers = parsers
    self.shuffle = shuffle
    self.on_epoch_end()

  def __len__(self):
    'Denotes the number of batches per epoch'
    return max(1, int(np.floor(len(self.df) / self.batch_size)))

  def __getitem__(self, index):
    'Generate one batch of data'
    # Generate indexes of the batch
    batch_df = self.df[index*self.batch_size:(index+1)*self.batch_size]
    # Generate data
    X, y = self.__data_generation(batch_df)
    return X, y

  def on_epoch_end(self):
    'Updates indexes after each epoch'
    if self.shuffle == True:
        self.df = self.df.sample(frac=1).reset_index(drop=True)

  def __data_generation(self, batch_df):
    'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
    # Initialization
    X = np.empty((self.batch_size, *self.dim))
    y = np.empty((self.batch_size), dtype=int)
    # Generate data
    for i, (_, row) in enumerate(batch_df.iterrows()):
      # get spectra
      _, spectra = self.parsers[row.file_name].getspectrum(row.idx)
      # Store sample
      X[i,] = spectra
      # Store class
      y[i] = row.label
    return X, y

In [6]:
#
now = datetime.now()
#
with open(f"/sise/assafzar-group/assafzar/Leor/log {now.strftime('%d-%m-%Y-%H-%M-%S')}.txt", "w") as file:
  #
  for exclude_sample in tqdm(meta_data.file_name.unique()):
    #
    gc.collect()
    # 
    meta_data_train, meta_data_val = train_test_split(
      meta_data[meta_data.file_name != exclude_sample], test_size=0.2)
    # 
    meta_data_test = meta_data[meta_data.file_name == exclude_sample]

    # Parameters
    params = {"dim": (92000,), "batch_size": 1024,
              "shuffle": True, "parsers": parsers}

    # Generators
    training_generator = DataGenerator(meta_data_train, **params)
    validation_generator = DataGenerator(meta_data_val, **params)
    testing_generator = DataGenerator(meta_data_test, **params)

    #
    model = keras.Sequential(
        [
          keras.Input(shape=(92000,)),
          layers.Dense(512, activation='relu'),
          layers.Dense(512, activation='relu'),
          layers.Dense(256, activation='relu'),
          layers.Dense(256, activation='relu'),
          layers.Dense(128, activation='relu'),
          layers.Dense(1, activation='sigmoid')
        ]
    )
    # 
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # 
    history = model.fit(x=training_generator, validation_data=validation_generator, epochs=10)
    # 
    test_eval = model.evaluate(testing_generator)
    
    #
    file.write(f"Excluded: {exclude_sample}\n")
    file.write(f"Train loss: {history.history['loss']}\n")
    file.write(f"Train accuracy: {history.history['accuracy']}\n")
    file.write(f"Validation loss: {history.history['val_loss']}\n")
    file.write(f"Validation accuracy: {history.history['val_accuracy']}\n")
    file.write(f"Test loss: {test_eval[0]}\n")
    file.write(f"Test accuracy: {test_eval[1]}\n\n")

  0%|          | 0/24 [00:00<?, ?it/s]2022-04-24 21:23:00.359591: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-24 21:23:00.359626: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-24 21:23:00.359652: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (cs-cpu-01): /proc/driver/nvidia/version does not exist
2022-04-24 21:23:00.398812: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 21:23:06.601960: I tensorflow/compiler/mlir/ml

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  4%|▍         | 1/24 [41:43<15:59:40, 2503.51s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  8%|▊         | 2/24 [1:25:13<15:29:39, 2535.44s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 12%|█▎        | 3/24 [2:07:52<14:49:52, 2542.51s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 17%|█▋        | 4/24 [2:50:07<14:06:48, 2540.40s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 21%|██        | 5/24 [3:37:35<13:53:35, 2632.41s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 25%|██▌       | 6/24 [4:26:53<13:39:04, 2730.24s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 29%|██▉       | 7/24 [5:11:13<12:47:35, 2709.14s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 33%|███▎      | 8/24 [5:58:50<12:14:15, 2753.50s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 38%|███▊      | 9/24 [6:39:51<11:06:28, 2665.87s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 42%|████▏     | 10/24 [7:22:21<10:13:52, 2630.86s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 46%|████▌     | 11/24 [8:02:56<9:17:17, 2572.09s/it] 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 50%|█████     | 12/24 [8:44:56<8:31:21, 2556.75s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 54%|█████▍    | 13/24 [9:27:00<7:46:53, 2546.66s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 58%|█████▊    | 14/24 [10:08:53<7:02:45, 2536.55s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 62%|██████▎   | 15/24 [10:50:26<6:18:33, 2523.70s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 67%|██████▋   | 16/24 [11:30:18<5:31:12, 2484.05s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 71%|███████   | 17/24 [12:10:13<4:46:41, 2457.34s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 75%|███████▌  | 18/24 [12:53:14<4:09:26, 2494.41s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 79%|███████▉  | 19/24 [13:36:02<3:29:43, 2516.71s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 83%|████████▎ | 20/24 [14:19:14<2:49:16, 2539.11s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 88%|████████▊ | 21/24 [15:04:14<2:09:21, 2587.28s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 92%|█████████▏| 22/24 [15:47:25<1:26:16, 2588.47s/it]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


 96%|█████████▌| 23/24 [16:28:32<42:32, 2552.17s/it]  

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


100%|██████████| 24/24 [17:09:52<00:00, 2530.43s/it]


In [14]:
output = []

with open(f"/sise/assafzar-group/assafzar/Leor/log {now.strftime('%d-%m-%Y-%H-%M-%S')}.txt", "r") as file:
  for _ in tqdm(meta_data.file_name.unique()):
    exclude_sample = file.readline().replace("Excluded: ", "").replace("\n","")
    train_loss = list(file.readline().replace("Train loss: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    train_accuracy = list(file.readline().replace("Train accuracy: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    val_loss = list(file.readline().replace("Validation  loss: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    val_accuracy = list(file.readline().replace("Validation accuracy: ", "").replace("[", "").replace("]", "").replace("\n","").split(","))[-1]
    test_loss = file.readline().replace("Test loss: ", "").replace("\n","")
    test_accuracy = file.readline().replace("Test accuracy: ", "").replace("\n","")
    _ = file.readline()
    output.append([exclude_sample, float(train_loss), float(train_accuracy), float(val_loss), float(val_accuracy), float(test_loss), float(test_accuracy)])
    
new_df = pd.DataFrame(columns=["exclude_sample", "train_loss", "train_accuracy", "val_loss", "val_accuracy", "test_loss", "test_accuracy"], data=output)

100%|██████████| 24/24 [00:00<00:00, 26262.27it/s]


In [15]:
new_df[["train_accuracy", "val_accuracy", "test_accuracy"]].mean() * 100

train_accuracy    98.109323
val_accuracy      98.044840
test_accuracy     74.058024
dtype: float64

In [17]:
new_df[["train_accuracy", "val_accuracy", "test_accuracy"]].sem() * 100

train_accuracy    1.791221
val_accuracy      1.795072
test_accuracy     7.973695
dtype: float64

In [18]:
from sklearn.dummy import DummyClassifier

output2 = []

for exclude_sample in tqdm(meta_data.file_name.unique()):
    #
    gc.collect()
    # 
    meta_data_train, meta_data_val = train_test_split(
      meta_data[meta_data.file_name != exclude_sample], test_size=0.2)
    # 
    meta_data_test = meta_data[meta_data.file_name == exclude_sample]
    
    #
    X_train = meta_data_train.drop(["label", "histology", "who_grade"], axis=1)
    Y_train = meta_data_train["label"]
    
    #
    X_test = meta_data_test.drop(["label", "histology", "who_grade"], axis=1)
    Y_test = meta_data_test["label"]
    
    # 
    dummy_clf = DummyClassifier(strategy="most_frequent")
    
    # 
    dummy_clf.fit(X_train, Y_train)
    
    #
    output2.append([dummy_clf.score(X_train, Y_train), dummy_clf.score(X_test, Y_test)])

100%|██████████| 24/24 [00:06<00:00,  3.51it/s]


In [19]:
new_df_2 = pd.DataFrame(columns=["dummy_train_accuracy", "dummy_test_accuracy"], data=output2)

In [20]:
new_df_2[["dummy_train_accuracy", "dummy_test_accuracy"]].mean() * 100

dummy_train_accuracy    57.55302
dummy_test_accuracy     62.50000
dtype: float64

In [21]:
new_df_2[["dummy_train_accuracy", "dummy_test_accuracy"]].sem() * 100

dummy_train_accuracy     0.488037
dummy_test_accuracy     10.094661
dtype: float64