In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow import keras
import glob
import PIL, cv2
import os
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../scripts/')
from swin_transformer import final_model
%load_ext autoreload
%autoreload 2
import tensorflow.keras.backend as K

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Indices (into TensorFlow's list of physical GPUs) that this notebook may use.
allowed_gpus = [0]
gpus = tf.config.list_physical_devices('GPU')
# Keep only the requested indices that actually exist, so a machine with fewer
# (or no) GPUs does not fail with IndexError here.
final_gpu_list = [gpus[x] for x in allowed_gpus if -len(gpus) <= x < len(gpus)]
if len(final_gpu_list) < len(allowed_gpus):
    print(f'Requested GPU indices {allowed_gpus}, but only {len(gpus)} GPU(s) found; using {len(final_gpu_list)}.')
# Hide every GPU that was not selected above.
tf.config.set_visible_devices(final_gpu_list, 'GPU')
for gpu in final_gpu_list:
    # Allocate GPU memory on demand instead of reserving it all up front.
    tf.config.experimental.set_memory_growth(gpu, True)
strategy = tf.distribute.MirroredStrategy()
AUTO = tf.data.experimental.AUTOTUNE
# Number of replicas the strategy trains on; used to scale batch and buffer sizes.
REPLICAS = strategy.num_replicas_in_sync

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [3]:
# Load the training metadata table (image_id, center_id, patient_id, image_num, label).
csv_file = pd.read_csv(filepath_or_buffer='../../files/train.csv')

In [4]:
# Preview the first five rows of the metadata table.
csv_file.head(n=5)

Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE
2,00c058_0,11,00c058,0,LAA
3,01adc5_0,11,01adc5,0,LAA
4,026c97_0,4,026c97,0,CE


In [5]:
# Build each image's file path from its id: <resized_train dir>/<image_id>.png
csv_file['fixed_paths'] = '../../files/resized_train/' + csv_file['image_id'] + '.png'

In [6]:
# Preview the first five rows again, now including the `fixed_paths` column.
csv_file.head(n=5)

Unnamed: 0,image_id,center_id,patient_id,image_num,label,fixed_paths
0,006388_0,11,006388,0,CE,../../files/resized_train/006388_0.png
1,008e5c_0,11,008e5c,0,CE,../../files/resized_train/008e5c_0.png
2,00c058_0,11,00c058,0,LAA,../../files/resized_train/00c058_0.png
3,01adc5_0,11,01adc5,0,LAA,../../files/resized_train/01adc5_0.png
4,026c97_0,4,026c97,0,CE,../../files/resized_train/026c97_0.png


In [7]:
# Sanity check: print every expected image path that does not exist on disk.
# Nothing is printed when all files are present.
missing_paths = [path for path in csv_file['fixed_paths'].values if not os.path.exists(path)]
for path in missing_paths:
    print(path)

In [8]:
def split_datasets(csv_file, test_size = 0.01, random_state=None, stratify_col=None):
    """Split a DataFrame into train / validation / test subsets.

    The frame is split twice with ``train_test_split``: ``test_size`` of the
    rows are first held out as the test set, then the same ``test_size`` is
    applied to the remaining rows to obtain the validation set.

    Args:
        csv_file: DataFrame to split (one row per sample).
        test_size: Value forwarded as ``test_size`` to both splits.
        random_state: Optional seed forwarded to both splits so the partition
            is reproducible across kernel restarts. ``None`` (default) keeps
            the original unseeded behaviour.
        stratify_col: Optional column name. When given, both splits are
            stratified on that column. ``None`` (default) keeps the original
            unstratified behaviour.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames.
    """
    # NOTE(review): rows are split independently of `patient_id`; if a patient
    # has several images they may land in different subsets -- confirm whether
    # a group-wise split is needed.
    def _strata(frame):
        # Labels to stratify on, or None to disable stratification.
        return frame[stratify_col] if stratify_col is not None else None

    train, test = train_test_split(
        csv_file, test_size=test_size, random_state=random_state, stratify=_strata(csv_file)
    )
    train, val = train_test_split(
        train, test_size=test_size, random_state=random_state, stratify=_strata(train)
    )
    return train, val, test

In [9]:
def read_imgs(img, label):
    """Read one PNG file and rescale its pixel values.

    Args:
        img: Path of the PNG file to load.
        label: Label belonging to the image; returned unchanged.

    Returns:
        ``(image, label)`` where ``image`` is the file decoded with three
        channels and divided by 255.
    """
    raw_bytes = tf.io.read_file(img)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    return pixels / 255, label

In [10]:
def get_data(csv_file, repeat=True, shuffle=True, batch=True, batch_size=16):
    """Build a ``tf.data`` pipeline of ``(image, label)`` pairs from a metadata frame.

    Args:
        csv_file: DataFrame with a ``fixed_paths`` column (image file paths)
            and a ``label`` column.
        repeat: If True, the dataset repeats indefinitely.
        shuffle: If True, samples are shuffled and non-deterministic ordering
            is allowed.
        batch: If True, samples are grouped into batches.
        batch_size: Per-replica batch size; the actual batch size is
            ``batch_size * REPLICAS``.

    Returns:
        A prefetched ``tf.data.Dataset`` whose elements come from ``read_imgs``.
    """
    imgs = csv_file['fixed_paths'].values.tolist()
    # Binary target: 'CE' -> 1, every other label -> 0.
    labels = [1 if x == 'CE' else 0 for x in csv_file['label'].values.tolist()]
    tensor = tf.data.Dataset.from_tensor_slices((imgs, labels))
    # Only (path, label) pairs are cached here; images are decoded later in `map`.
    tensor = tensor.cache()
    if shuffle:
        # Shuffle BEFORE repeating so every epoch sees each sample once, and use
        # a buffer that covers the whole set (elements are just path strings, so
        # this is cheap). The old order (repeat -> shuffle with a small buffer)
        # blended epoch boundaries and only shuffled locally.
        tensor = tensor.shuffle(max(len(imgs), 1), reshuffle_each_iteration=True)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        tensor = tensor.with_options(opt)
    if repeat:
        tensor = tensor.repeat()
    tensor = tensor.map(read_imgs, num_parallel_calls=AUTO)
    if batch:
        tensor = tensor.batch(batch_size * REPLICAS)
    tensor = tensor.prefetch(AUTO)
    return tensor

In [11]:
# Hold out 5% of the rows for test, then 5% of the remainder for validation.
train, val, test = split_datasets(csv_file, 0.05)

In [12]:
# Training data: shuffled and repeated indefinitely (get_data defaults).
train_dataset = get_data(train)
# Evaluation data: one ordered pass each. With the defaults (repeat=True,
# shuffle=True) these datasets would never end, so validation or evaluation
# without an explicit step count would not terminate.
val_dataset = get_data(val, shuffle=False, repeat=False)
test_dataset = get_data(test, shuffle=False, repeat=False)

In [13]:
# Hyper-parameters that are later passed positionally to `final_model`.
input_shape = (256, 256, 3)
patch_size = (2, 2)
# Number of patches along each of the first two input dimensions.
num_patch_x, num_patch_y = (
    side // patch for side, patch in zip(input_shape, patch_size)
)
embed_dim, num_heads = 64, 8
window_size, shift_size = 2, 1
dropout_rate = 0.0
qkv_bias = True
num_mlp = 256

In [14]:
# Random float batch in [0, 1): ten arrays of shape (256, 256, 3).
test_arr = np.random.random_sample(size=(10, 256, 256, 3))

In [15]:
# Build the model inside the distribution strategy's scope.
with strategy.scope():
    model = final_model(
        input_shape,
        patch_size,
        embed_dim,
        num_heads,
        window_size,
        dropout_rate,
        qkv_bias,
        num_mlp,
        shift_size,
    )

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

In [16]:
# Reset Keras' global state before building a fresh model for this run.
K.clear_session()

# --- Optional TensorBoard / weight-saving callbacks (currently disabled) ---
# log_dir = f"{os.environ['tb_path']}segmentation/res50_baseline_bs_128_is_32/"
# if os.path.exists(log_dir) == False:
#     os.makedirs(log_dir)
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir)
# weights_path = f'/home/ubuntu/ml-data-training/ship_seg_weights/segmentation/res50_baseline_bs_128_is_32/'
# weights_save = CallbackForSavingModelWeights(weights_path)

# model_config_file
input_shape = (256, 256, 3)
patch_size = (2, 2)
num_patch_x, num_patch_y = (
    side // patch for side, patch in zip(input_shape, patch_size)
)
embed_dim, num_heads = 64, 8
window_size, shift_size = 2, 1
dropout_rate = 0.0
qkv_bias = True
num_mlp = 256

# Input pipelines: training repeats and shuffles (get_data defaults);
# validation is a single ordered pass.
batch_size = 4
train_dataset = get_data(train, batch_size=batch_size)
val_dataset = get_data(val, shuffle=False, repeat=False, batch_size=batch_size)

# NOTE(review): the recorded run of this cell stopped with ResourceExhaustedError
# while allocating a [65536, 256] float tensor in a Swin block's MLP.
# 65536 = batch_size (4) * 128 * 128 patches, so the 2x2 patch size on 256x256
# inputs is the likely driver -- consider larger patches or smaller inputs.
# TODO confirm on the target GPU.

# model training loop
with strategy.scope():
    model = final_model(
        input_shape,
        patch_size,
        embed_dim,
        num_heads,
        window_size,
        dropout_rate,
        qkv_bias,
        num_mlp,
        shift_size,
    )
    model_hist = model.fit(
        train_dataset,
        # The training dataset repeats forever, so one epoch is defined as a
        # full pass over `train` at the global batch size.
        steps_per_epoch=len(train) // (batch_size * REPLICAS),
        epochs=100,
        verbose=1,
        validation_data=val_dataset,
        # callbacks = [tensorboard_callback]
    )

Epoch 1/100


ResourceExhaustedError: Graph execution error:

Detected at node 'model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul' defined at (most recent call last):
    File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap
      self._bootstrap_inner()
    File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
      self.run()
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 1023, in train_step
      y_pred = self(x, training=True)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/ubuntu/mayo_strip_ai/notebooks/../scripts/swin_transformer.py", line 247, in call
      x = self.mlp(x)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/sequential.py", line 413, in call
      return super().call(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/layers/core/dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul'
Detected at node 'model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul' defined at (most recent call last):
    File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap
      self._bootstrap_inner()
    File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
      self.run()
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 1023, in train_step
      y_pred = self(x, training=True)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/ubuntu/mayo_strip_ai/notebooks/../scripts/swin_transformer.py", line 247, in call
      x = self.mlp(x)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/sequential.py", line 413, in call
      return super().call(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/usr/lib/python3/dist-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/lib/python3/dist-packages/keras/layers/core/dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul'
2 root error(s) found.
  (0) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[65536,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[ReadVariableOp/_140]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[65536,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/swin_transformer_1/sequential_1/dense_7/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_7455]