# Jupyter Notebook to train a model

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tf_explain.callbacks.grad_cam import GradCAMCallback

# use random seed to reproduce results
np.random.seed(42)
tf.random.set_seed(42)

2021-12-14 22:14:37.851189: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


### lists possible devices (CPU, GPU), used to check if GPU is recognized/exists

In [2]:
tf.config.get_visible_devices()

2021-12-14 22:14:41.881330: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-14 22:14:41.888562: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-14 22:14:41.963126: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-14 22:14:41.963932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.8225GHz coreCount: 20 deviceMemorySize: 7.93GiB deviceMemoryBandwidth: 298.32GiB/s
2021-12-14 22:14:41.963988: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2021-12-14 22:14:41.993918: I tensorflow/stream_executor/platform/d

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Methods

### used to save a trained model as a json file and its weights as a h5 file

In [3]:
def save_model(model, model_name):
    my_model = model.to_json()
    with open(f'./saved_models/{model_name}.json', "w") as file:
        file.write(my_model)
    # serialize weights to HDF5
    model.save_weights(f'./saved_models/{model_name}_weights.h5')

### used to build the base model using predefined architectures
currently: vgg16, xception, resnet

In [4]:
def build_base_model(architecture, weights):
    input = tf.keras.Input(shape=(224, 224, 3))
    if architecture == 'vgg16':
        return tf.keras.applications.vgg16.VGG16(weights=weights, include_top=False, input_tensor=input)
    if architecture == 'xception':
        return tf.keras.applications.xception.Xception(weights=weights, include_top=False, input_tensor=input)
    if architecture == 'resnet':
        return tf.keras.applications.resnet.ResNet50(weights=weights, include_top=False, input_tensor=input)

### gets base model as input and builds a new top layer and returns the model with custom top layers

In [5]:
def build_model(base_model):
    flat = keras.layers.Flatten(name='flatten')(base_model.output)
    dense_1 = keras.layers.Dense(1000)(flat)
    dropout = keras.layers.Dropout(0.25)(dense_1)
    batch = keras.layers.BatchNormalization()(dropout)
    output = keras.layers.Dense(1, activation='sigmoid')(batch)
    return tf.keras.Model(base_model.input, output)

### gets a model as input and returns a model compiled with the adam optimizer

In [6]:
def compile_model(model, alpha, beta1, beta2, metrics):
    optimizer = keras.optimizers.Adam(learning_rate=alpha, beta_1=beta1, beta_2=beta2)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=metrics)
    return model

### gets a model as input and trains it on the data-set with the defined callbacks and epochs

In [7]:
def train_model(model, train_set, validation_set, epochs, callback):
    return model.fit(train_set,
                     validation_data=validation_set,
                     epochs=epochs,
                     callbacks=callback)

### gets a model as input and changes its layers trainable attribute

In [8]:
def set_layers_trainable(trainable, input_model):
    for layer in input_model.layers:
        layer.trainable = trainable

## Data preprocessing

preprocessing of the images applied when loading image data set from disk with tensorflows flow_from_directory


In [9]:
image_gen = keras.preprocessing.image.ImageDataGenerator(rotation_range=20,  # rotate the image 20 degrees
                                                         width_shift_range=0.2,
                                                         height_shift_range=0.2,
                                                         rescale=1 / 255,  # Rescale the image by normalzing it.
                                                         shear_range=0.15,
                                                         # Shear means cutting away part of the image (max 20%)
                                                         zoom_range=0.15,  # Zoom in by 15% max
                                                         horizontal_flip=True,  # Allow horizontal flipping
                                                         fill_mode='nearest'
                                                         # Fill in missing pixels with the nearest filled value
                                                         )

path to the data set

In [10]:
train_data_path = './data/data_full/train'  #local notebook
validation_data_path = './data/data_full/val'  #local notebook
test_data_path = './data/data_full/test'  #local notebook

generate training set by loading the images from their directories with flow_from_directory
important: the folder structure has to match! i.e {train} -> {ok,def}
at the "same time" the data augmentation is applied on the images through the ImageDataGenerator

In [11]:
batch_size = 15
train_image_gen = image_gen.flow_from_directory(train_data_path,
                                                target_size=(224, 224),
                                                batch_size=batch_size,
                                                class_mode='binary')

Found 48271 images belonging to 2 classes.


In [12]:
valid_image_gen = image_gen.flow_from_directory(validation_data_path,
                                                target_size=(224, 224),
                                                batch_size=batch_size,
                                                class_mode='binary')

Found 6033 images belonging to 2 classes.


In [13]:
test_image_gen = image_gen.flow_from_directory(test_data_path,
                                               target_size=(224, 224),
                                               batch_size=batch_size,
                                               class_mode='binary')

Found 6036 images belonging to 2 classes.


# Training of the model

In [14]:
vgg16 = build_base_model('vgg16', 'imagenet')
vgg16_model = build_model(vgg16)
vgg16_model = compile_model(vgg16_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
set_layers_trainable(False, vgg16)
history = train_model(vgg16_model, train_image_gen, valid_image_gen, 5, [])
set_layers_trainable(True, vgg16)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", histogram_freq=5, write_graph=True, write_images=True)
custom_early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
vgg16_model = compile_model(vgg16_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
history = train_model(vgg16_model, train_image_gen, valid_image_gen, 100,
                      [custom_early_stopping, tensorboard_callback])
loss, accuracy, recall, precision, auc = vgg16_model.evaluate(test_image_gen)
# F1 score
f1 = 2 * ((precision * recall) / (precision + recall))

print(f"loss: {loss}, \n"
      f"accuracy: {accuracy}, \n"
      f"recall: {recall}, \n"
      f"precision: {precision}, \n"
      f"auc: {auc}, \n"
      f"F1: {f1}")

2021-12-14 22:14:57.069384: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-14 22:14:57.069764: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-14 22:14:57.069886: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1080 computeCapability: 6.1
coreClock: 1.8225GHz coreCount: 20 deviceMemorySize: 7.93GiB deviceMemoryBandwidth: 298.32GiB/s
2021-12-14 22:14:57.069922: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library li

Epoch 1/5


2021-12-14 22:14:59.749570: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2021-12-14 22:15:00.065581: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7
2021-12-14 22:15:00.961046: W tensorflow/stream_executor/gpu/asm_compiler.cc:63] Running ptxas --version returned 256
2021-12-14 22:15:00.983751: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: ptxas exited with non-zero error code 256, output: 
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2021-12-14 23:15:41.640556: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-14 23:15:41.640601: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-12-14 23:15:41.640947: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1365] Profiler found 1 GPUs
2021-12-14 23:15:41.649733: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcupti.so.10.1
2021-12-14 23:15:41.751164: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1415] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
2021-12-14 23:15:41.751350: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.


Epoch 1/100
   1/3219 [..............................] - ETA: 1:06:08 - loss: 0.0024 - accuracy: 1.0000 - recall: 0.0000e+00 - precision: 0.0000e+00 - auc: 0.0000e+00

2021-12-14 23:15:43.180591: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-14 23:15:43.180615: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-12-14 23:15:43.181058: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1415] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES


   3/3219 [..............................] - ETA: 14:00 - loss: 0.0015 - accuracy: 1.0000 - recall: 0.6667 - precision: 0.6667 - auc: 0.6667

2021-12-14 23:15:43.419661: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-12-14 23:15:43.432291: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 0 callback api events and 0 activity events. 
2021-12-14 23:15:43.437447: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-12-14 23:15:43.444705: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_12_14_23_15_43
2021-12-14 23:15:43.446454: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./logs/train/plugins/profile/2021_12_14_23_15_43/pop-os.trace.json.gz
2021-12-14 23:15:43.468245: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_12_14_23_15_43
2021-12-14 23:15:43.471072: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
loss: 0.00174473796505481, 
accuracy: 0.9993373155593872, 
recall: 0.9970030188560486, 
precision: 0.9989989995956421, 
auc: 0.9999971985816956, 
F1: 0.9980000112461148


In [15]:
save_model(vgg16_model, 'vgg16_full_data')

In [16]:
xception = build_base_model('xception', 'imagenet')
xception_model = build_model(xception)
xception_model = compile_model(xception_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
set_layers_trainable(False, xception)
history = train_model(xception_model, train_image_gen, valid_image_gen, 5, [])
set_layers_trainable(True, xception)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", histogram_freq=5, write_graph=True, write_images=True)
custom_early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
xception_model = compile_model(xception_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
history = train_model(xception_model, train_image_gen, valid_image_gen, 100,
                      [custom_early_stopping, tensorboard_callback])
loss, accuracy, recall, precision, auc = xception_model.evaluate(test_image_gen)
# F1 score
f1 = 2 * ((precision * recall) / (precision + recall))

print(f"loss: {loss}, \n"
      f"accuracy: {accuracy}, \n"
      f"recall: {recall}, \n"
      f"precision: {precision}, \n"
      f"auc: {auc}, \n"
      f"F1: {f1}")

Epoch 1/5


2021-12-15 06:56:20.617303: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.43GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-12-15 06:56:21.353271: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.39GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-12-15 06:56:21.523010: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.45GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-12-15 06:56:24.426324: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Alloc



2021-12-15 07:04:49.239290: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-12-15 07:04:49.917327: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.09GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-12-15 07:04:51.587959: W tensorflow/core/common_runtime/bfc_allocator.cc:248] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.40GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2021-12-15 08:24:13.078165: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-15 08:24:13.078195: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-12-15 08:24:13.078255: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1415] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_NOT_INITIALIZED
2021-12-15 08:24:13.078272: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.


Epoch 1/100
   1/3219 [..............................] - ETA: 2:59:28 - loss: 0.2328 - accuracy: 0.9333 - recall: 1.0000 - precision: 0.5000 - auc: 1.0000

2021-12-15 08:24:16.701621: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-15 08:24:16.701644: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-12-15 08:24:16.701702: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1415] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_NOT_INITIALIZED


   2/3219 [..............................] - ETA: 20:39 - loss: 0.1749 - accuracy: 0.9500 - recall: 1.0000 - precision: 0.6500 - auc: 1.0000  

2021-12-15 08:24:17.528764: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-12-15 08:24:17.541214: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 0 callback api events and 0 activity events. 
2021-12-15 08:24:17.545013: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-12-15 08:24:17.548816: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_12_15_08_24_17
2021-12-15 08:24:17.550144: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./logs/train/plugins/profile/2021_12_15_08_24_17/pop-os.trace.json.gz
2021-12-15 08:24:17.585278: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_12_15_08_24_17
2021-12-15 08:24:17.591682: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

KeyboardInterrupt: 

In [None]:
save_model(xception_model, 'xception_full_data')

In [None]:
resnet = build_base_model('resnet', 'imagenet')
resnet_model = build_model(resnet)
resnet_model = compile_model(resnet_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
set_layers_trainable(False, resnet)
history = train_model(resnet_model, train_image_gen, valid_image_gen, 5, [])
set_layers_trainable(True, resnet)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", histogram_freq=5, write_graph=True, write_images=True)
custom_early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
resnet_model = compile_model(resnet_model, 0.0001, 0.9, 0.999, ['accuracy', 'Recall', 'Precision', 'AUC'])
history = train_model(resnet_model, train_image_gen, valid_image_gen, 100,
                      [custom_early_stopping, tensorboard_callback])
loss, accuracy, recall, precision, auc = resnet_model.evaluate(test_image_gen)
# F1 score
f1 = 2 * ((precision * recall) / (precision + recall))

print(f"loss: {loss}, \n"
      f"accuracy: {accuracy}, \n"
      f"recall: {recall}, \n"
      f"precision: {precision}, \n"
      f"auc: {auc}, \n"
      f"F1: {f1}")

In [None]:
save_model(resnet_model, 'resnet_full_data')