# PREPROCESS

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import cv2
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import keras.backend as K
import keras
import gc
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Model
from keras import regularizers

In [None]:
# Load the training table (indexed by "Id") and attach, for every row, the
# image and segmentation arrays stored as per-example .npy files.
train_df = pd.read_csv('train/train_set.csv', index_col="Id")
# Column names are captured before the "img"/"seg" columns are appended below.
labels = train_df.columns
train_df["img"] = [np.load('train/img/train_{}.npy'.format(example_id)) for example_id in train_df.index]
train_df["seg"] = [np.load('train/seg/train_{}.npy'.format(example_id)) for example_id in train_df.index]
print("The training set contains {} examples.".format(len(train_df)))

In [None]:
# Loading the test data
# test_df = pd.read_csv('test/test_set.csv', index_col="Id")
# test_df["img"] = [np.load('test/img/test_{}.npy'.format(idx)) for idx, _ in test_df.iterrows()]
# test_df["seg"] = [-1 * np.ones(img.shape[:2], dtype=np.int8) for img in test_df["img"]]
# print("The test set contains {} examples.".format(len(test_df)))


In [None]:
# The first 20 columns of the table are used as the label columns (0/1 values
# per the original note). Keep them both as a DataFrame and as a plain array.
labels_df = train_df.iloc[:, :20]
train_labels = labels_df.to_numpy()

In [None]:
# Define a function to resize images
def resize_images(img_series, size=(224, 224)):
    """Resize every image held in a pandas Series.

    Args:
        img_series: Series whose elements are image arrays accepted by
            ``cv2.resize``.
        size: Target size, forwarded unchanged as the second positional
            argument of ``cv2.resize`` (OpenCV reads it as (width, height)).

    Returns:
        A Series with the same index whose elements are the resized images.
    """
    def _resize_one(img):
        return cv2.resize(img, size)

    return img_series.apply(_resize_one)

# Resize and convert the images to a numpy array
train_images = np.stack(resize_images(train_df["img"]).values)
# The cell that loads the test set is commented out, so `test_df` does not exist
# on a fresh kernel and the line below raised a NameError under
# "Restart & Run All". Re-enable it together with the test-loading cell.
# test_images = np.stack(resize_images(test_df["img"]).values)


In [None]:

# Hold out 10% of the full training set for validation; the fixed random_state
# makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_images,
    train_labels,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Normalization: divide pixel values by 255 and print the training maximum
# before and after as a sanity check.
# NOTE(review): Keras' InceptionResNetV2 `preprocess_input` scales inputs to
# [-1, 1]; this notebook feeds values divided by 255 instead - confirm intended.
# NOTE: re-running this cell divides the arrays again (it is not idempotent).
print(X_train.max())
X_train = X_train / 255
X_val = X_val / 255
print(X_train.max())

In [None]:
# Creating class weights for the weighted loss.
# One row per label column: [weight for class 0, weight for class 1], computed
# with scikit-learn's "balanced" heuristic on the training labels.
weights = np.array(
    [
        compute_class_weight(
            class_weight="balanced",
            classes=np.array([0, 1]),
            y=y_train[:, label_idx],
        )
        for label_idx in range(20)
    ]
).astype(np.float32)
print(weights)

In [None]:
# Weighted loss function
def get_weighted_loss(weights):
    """Build a class-weighted binary cross-entropy loss.

    Args:
        weights: Array indexed as ``weights[:, 0]`` (weight applied where the
            label is 0) and ``weights[:, 1]`` (weight applied where it is 1),
            one row per label.

    Returns:
        A ``weighted_loss(y_true, y_pred)`` callable that averages the weighted
        per-label cross-entropy over the last axis.
    """
    def weighted_loss(y_true, y_pred):
        # Exponents of 0/1 select exactly one of the two weights per element.
        negative_weight = weights[:, 0] ** (1 - y_true)
        positive_weight = weights[:, 1] ** (y_true)
        per_label = negative_weight * positive_weight * K.binary_crossentropy(y_true, y_pred)
        return K.mean(per_label, axis=-1)

    return weighted_loss

In [None]:
# F1 metric
def get_f1(y_true, y_pred):
    """F1 score computed over every element of the tensors passed in.

    Values are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` is added to each denominator to avoid division by zero.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted score tensor of the same shape.

    Returns:
        A scalar: 2 * precision * recall / (precision + recall + epsilon).
    """
    def _count(tensor):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    true_positives = _count(y_true * y_pred)
    possible_positives = _count(y_true)
    predicted_positives = _count(y_pred)
    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    return 2 * (precision * recall) / (precision + recall + eps)

In [None]:
## WARNING: deletes model and everything
# Run this to free memory after running a model.
# Drop the notebook-level reference first (a no-op when no model exists yet, so
# the cell no longer fails with NameError on a fresh kernel), then reset the
# Keras session and collect garbage so the released objects can be reclaimed.
# `K` and `gc` come from the import cell at the top of the notebook.
globals().pop("model", None)
K.clear_session()
gc.collect()

In [None]:
# Labels as float for the calculation in the weighted loss
y_train, y_val = (split.astype(np.float32) for split in (y_train, y_val))


# MODEL

In [None]:
# MODEL PARAMETERS
NAME = "InceptionResNetV2_augmented.h5"  # path handed to the ModelCheckpoint callback
EPOCHS = 50
BATCH_SIZE = 32
# Optimizer handed to model.compile; the SGD variant is kept as an alternative.
OPT = keras.optimizers.Adam(learning_rate=0.0001)
# OPT = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

In [None]:
# Checkpoint callback: after each epoch, write to NAME only when the validation
# F1 metric ("val_get_f1") improves on the best value seen so far.
save_best = ModelCheckpoint(
    NAME,
    monitor='val_get_f1',
    mode='max',
    save_best_only=True,
    save_freq="epoch",
    verbose=0,
)


In [None]:
# Creating data augmentation generator for training:
# random rotations, shifts, shear, zoom and horizontal flips.
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Validation batches are passed through without any augmentation.
validation_datagen = ImageDataGenerator()

In [None]:
# Importing the pretrained model of choice
from keras.applications import InceptionResNetV2
# from keras.applications import VGG16

In [None]:
# MODEL

# Pretrained backbone without its original classification head; "avg" pooling
# is requested so the backbone output can feed the dense layers directly.
base_model = InceptionResNetV2(
    weights="imagenet",     # "imagenet" when transfer learning
    include_top=False,
    pooling="avg",
    input_shape=(224, 224, 3),
)

# New head: dropout -> 1024-unit ReLU layer -> 20 independent sigmoid outputs.
x = Dropout(0.5)(base_model.output)
x = Dense(1024, activation="relu")(x)
predictions = Dense(20, activation="sigmoid")(x)

model = Model(inputs=base_model.input, outputs=predictions)

# TRAINING

In [None]:
# Freezing pretrained layers + Compiling model

# Keep the pretrained backbone fixed so that only the newly added head trains.
for layer in base_model.layers:
    layer.trainable = False

# Keras resolves the "accuracy" shortcut from the target/output shapes; with a
# 20-unit multi-label output that is categorical (argmax) accuracy, which does
# not measure per-label correctness. BinaryAccuracy thresholds each output
# independently. It is named "accuracy" so the history keys
# ('accuracy' / 'val_accuracy') used by the plotting cell stay the same.
model.compile(
    optimizer=OPT,
    loss=get_weighted_loss(weights),
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy"), get_f1],
)
# For recall and precision metrics
# model.compile(optimizer=OPT, loss=get_weighted_loss(weights), metrics=["accuracy", tf.keras.metrics.Precision(name='Precision'), 
#                        tf.keras.metrics.Recall(name='Recall')])

In [None]:
# TRAINING (the fully connected added layers)

# Batch iterators: augmented batches for training, untouched ones for validation.
train_batches = train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE)
val_batches = validation_datagen.flow(X_val, y_val, batch_size=BATCH_SIZE)

history = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=EPOCHS,
    shuffle=True,
    callbacks=[save_best],
)

In [None]:
# Plot loss: training vs. validation loss per epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
for series, fmt, legend_label in ((loss, 'y', 'Training loss'), (val_loss, 'r', 'Validation loss')):
    ax.plot(epochs, series, fmt, label=legend_label)
ax.set(title='Training and validation loss', xlabel='Epochs', ylabel='Loss')
ax.legend()
plt.show()

In [None]:
# Plot accuracy: training vs. validation accuracy per epoch
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
for series, fmt, legend_label in ((acc, 'y', 'Training accuracy'), (val_acc, 'r', 'Validation accuracy')):
    ax.plot(epochs, series, fmt, label=legend_label)
ax.set(title='Training and validation accuracy', xlabel='Epochs', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Plot F1 score: training vs. validation value of the get_f1 metric per epoch
f1 = history.history['get_f1']
val_f1 = history.history['val_get_f1']
epochs = range(1, len(f1) + 1)

fig, ax = plt.subplots()
for series, fmt, legend_label in ((f1, 'y', 'Training F1'), (val_f1, 'r', 'Validation F1')):
    ax.plot(epochs, series, fmt, label=legend_label)
ax.set(title='Training and validation F1', xlabel='Epochs', ylabel='F1')
ax.legend()
plt.show()

In [None]:
# When using the precision and recall metrics

# Plot precision
# The 'Precision' / 'val_Precision' history keys only exist when the model was
# compiled with tf.keras.metrics.Precision(name='Precision'). With the default
# compile call they are absent and this cell used to fail with a KeyError, so
# the plot is skipped in that case.
if 'Precision' in history.history:
    precision = history.history['Precision']
    val_precision = history.history['val_Precision']
    epochs = range(1, len(precision) + 1)
    plt.plot(epochs, precision, 'y', label='Training precision')
    plt.plot(epochs, val_precision, 'r', label='Validation precision')
    plt.title('Training and validation precision')
    plt.xlabel('Epochs')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
else:
    print("Precision was not tracked during training - skipping the precision plot.")

In [None]:
# When using the precision and recall metrics

# Plot recall
# The 'Recall' / 'val_Recall' history keys only exist when the model was
# compiled with tf.keras.metrics.Recall(name='Recall'). With the default
# compile call they are absent and this cell used to fail with a KeyError, so
# the plot is skipped in that case.
if 'Recall' in history.history:
    recall = history.history['Recall']
    val_recall = history.history['val_Recall']
    epochs = range(1, len(recall) + 1)
    plt.plot(epochs, recall, 'y', label='Training recall')
    plt.plot(epochs, val_recall, 'r', label='Validation recall')
    plt.title('Training and validation recall')
    plt.xlabel('Epochs')
    plt.ylabel('Recall')
    plt.legend()
    plt.show()
else:
    print("Recall was not tracked during training - skipping the recall plot.")

In [None]:
# Plot recall-precision trade-off
# Used to determine the threshold (manually)

from sklearn.metrics import f1_score, precision_score, recall_score

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Precision recall")
ax.set_ylabel("Precision")
ax.set_xlabel("Recall")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# The predicted probabilities do not depend on the threshold, so predict once
# instead of re-running inference on every loop iteration.
val_probs = model.predict(X_val)

recalls = []
precisions = []
# The loop variable is deliberately not called `threshold`, so re-running this
# cell cannot overwrite the manually chosen `threshold` used by later cells.
for candidate in np.arange(0, 1.1, 0.1):
    # Binarize into a fresh array; the raw probabilities stay untouched.
    y_pred_val = (val_probs >= candidate).astype(int)
    p = precision_score(y_val, y_pred_val, average='samples', zero_division=0)
    r = recall_score(y_val, y_pred_val, average='samples', zero_division=0)
    ax.annotate(str(round(candidate, 2)), (r, p))
    recalls.append(r)
    precisions.append(p)

ax.plot(recalls, precisions, label=NAME.split("_")[0])

plt.legend(loc="upper right")
plt.show()

In [None]:
# Recall - Precision - F1-score based on threshold
# F1-scores for each class
# (precision_score / recall_score / f1_score are imported in the
# precision-recall trade-off cell above.)


# SET THE THRESHOLD
threshold = 0.70



print(f'\n-------- Metrics for Pretrained {NAME.split("_")[0]}:---------\n ')

# Binarize the predicted probabilities at the chosen threshold.
y_pred_val = (model.predict(X_val) >= threshold).astype(int)

print('Precision: ', precision_score(y_val, y_pred_val, average='samples', zero_division=0)) 
print('Recall: ', recall_score(y_val, y_pred_val, average='samples')) 
print('F1-score: ', f1_score(y_val, y_pred_val, average='samples'), '\n') 

print('Per class F1 score: ')
f1_scores = f1_score(y_val, y_pred_val, average=None)
# With average=None the scores follow the column order of y_val, i.e. the order
# of the label columns in the training table. Pair them with `labels` in that
# same order; sorting the names first can attach scores to the wrong classes.
for class_name, class_f1 in zip(labels, f1_scores):
    print(class_name, round(class_f1, 2))

# INFERENCE

In [None]:
# Load the model if it was not trained
# model.load_weights(NAME)  # pass the NAME variable (checkpoint path), not the literal string "NAME"

In [None]:
# Predictions on validation set

preds = model.predict(X_val)

# Class names in the same column order as the model outputs / y_val.
# `lbls` was never defined before, so this cell raised a NameError.
lbls = list(labels[:preds.shape[1]])

# Using random images from validation set, change numb to specific number if wanted
numb = np.random.randint(0, len(X_val))


print(f"Predicted class probabilities: {preds[numb]}")
print(f"Ground truth: {y_val[numb]}")
# result = [num if num > 0.3 else 0 for num in preds[numb]]
# print(result)

# 1 for every class whose predicted probability exceeds the chosen threshold
# (`threshold` is set in the metrics cell above).
mask = [1 if num > threshold else 0 for num in preds[numb]]
max_class = lbls[np.argmax(preds[numb])]
true_idx = np.argwhere(y_val[numb] == 1).flatten()
true_classes = [lbls[idx] for idx in true_idx]

# Predicted probabilities of the ground-truth classes only (zero elsewhere).
c = y_val[numb] * np.array(preds[numb])

plt.imshow(X_val[numb])
plt.title(" - ".join([lbl for lbl, selected in zip(lbls, mask) if selected == 1]))
plt.text(100, 25, f"MAX: {max_class}, {np.format_float_positional(np.max(preds[numb]), precision=2)}", color="red", fontsize=15)
plt.text(100, 50, f"CORRECT: {' '.join(true_classes)}", color="blue", fontsize=15)
plt.text(100, 75, f"prob: {'   '.join([np.format_float_positional(score, precision=2) for score in c[np.nonzero(c)]])}", color="blue", fontsize=15)
plt.show()