## Importation des modules utiles

In [None]:
import numpy as np
import pandas as pd

import os
import itertools
from glob import glob
import cv2
import random
from copy import deepcopy

# Plot
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib

# manipulation image
from PIL import Image
import imageio

# augmentation image
from albumentations import Compose, VerticalFlip, HorizontalFlip, Rotate, GridDistortion

In [None]:
# Pour le deep learning
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, RepeatVector,TimeDistributed, Input, GlobalAveragePooling2D
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Flatten, Reshape, Dropout

# callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger, Callback
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.preprocessing.image import save_img
from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split

from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence

# modele Keras
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet121

# script python de fonctions utiles
from clouds_graph_functions import make_confusion_matrix

# metrics scikit learn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (f1_score,
                             precision_score,
                             recall_score)

## Définition du chemin d'accès aux données

In [None]:
# Root directory of the dataset (Kaggle input mount).
NUAGES_PATH = '/kaggle/input/understanding_cloud_organization/'

# Sub-directory holding the training images.
NUAGES_TRAIN_PATH = f'{NUAGES_PATH}train_images/'

# DataFrame

In [None]:
# Load train.csv from the dataset root into a DataFrame.
train_df = pd.read_csv(NUAGES_PATH + 'train.csv')

In [None]:
# Sorted list of every .jpg file found in the training image directory.
train_fns = sorted(glob(f'{NUAGES_TRAIN_PATH}*.jpg'))

print(f"Il y a {len(train_fns)} images dans le jeu d'entrainement.")

In [None]:
# Preview the first rows of the raw DataFrame.
train_df.head()

# Preprocessing

In [None]:
# Keep only the rows whose 'EncodedPixels' is not null. The explicit copy makes
# the column assignments below act on an independent frame instead of a slice
# of the raw CSV (avoids pandas' chained-assignment warning).
train_df = train_df[train_df['EncodedPixels'].notna()].copy()

# 'Image_Label' is split on '_': first piece -> image id, second piece -> class.
train_df['ImageId'] = train_df['Image_Label'].map(lambda x: x.split('_')[0])
train_df['ClassId'] = train_df['Image_Label'].map(lambda x: x.split('_')[1])

# Class names in order of first appearance.
classes = train_df['ClassId'].unique()

# One row per image, with the set of classes present in it.
train_df = train_df.groupby('ImageId')['ClassId'].agg(set).reset_index()

# One 0/1 indicator column per class.
for class_name in classes:
    train_df[class_name] = train_df['ClassId'].map(lambda present: int(class_name in present))

In [None]:
# Preview the first rows of the per-image DataFrame.
train_df.head()

In [None]:
# Order of the labels inside every label vector used below.
labels_list = ["Fish", "Flower", "Sugar", "Gravel"]

# Dictionary mapping each image id to its multi-hot label vector.
# The indicator columns are selected by name, in `labels_list` order, so the
# vector layout always matches the names printed next to the predictions
# (selecting by position would depend on the order classes first appear in the CSV).
img_2_ohe_vector = dict(zip(train_df['ImageId'], train_df[labels_list].values))

## Création d'un jeu d'entrainement et d'un jeu de validation

In [None]:
# Stratification key: the sorted list of classes present in each image, as a
# string, so that every label combination is split in the same proportion.
stratify_key = train_df['ClassId'].map(lambda present: str(sorted(present)))

# 80 % / 20 % split of the image ids, reproducible through the fixed seed.
train_images, val_images = train_test_split(
    train_df['ImageId'].values,
    test_size=0.2,
    stratify=stratify_key,
    random_state=10,
)

## Paramètres généraux

In [None]:
# Number of images per batch.
BATCH_SIZE = 12
# Size (height, width, channels) the images are resized to before entering the network.
HEIGHT, WIDTH, CHANNELS = 224, 224, 3
# Number of output labels.
NB_CLASSES = 4

## Création d'une classe DataGenerator

In [None]:
class DataGenenerator(Sequence):
    """Keras ``Sequence`` yielding ``(X, y)`` batches of images and label vectors.

    Images are read with OpenCV, resized to ``(resized_height, resized_width)``,
    optionally augmented, and divided by 255. Labels are looked up in the
    module-level ``img_2_ohe_vector`` dictionary.

    Args:
        images_list: file names (relative to ``folder_imgs``) to serve. When
            ``None``, every entry returned by ``os.listdir(folder_imgs)`` is used.
        folder_imgs: directory containing the images. A path that does not
            contain the substring ``'train'`` is treated as a test folder, for
            which no labels are looked up.
        batch_size: number of images per batch. A trailing incomplete batch is
            dropped (``len(self)`` is a floor division).
        shuffle: when true, the image list is reshuffled at the end of each epoch.
        augmentation: optional callable invoked as ``augmentation(image=img)``
            and returning a mapping with an ``'image'`` entry.
        resized_height, resized_width, num_channels: shape of one output image.
    """

    def __init__(self, images_list=None, folder_imgs=NUAGES_TRAIN_PATH,
                 batch_size=BATCH_SIZE, shuffle=True, augmentation=None,
                 resized_height=HEIGHT, resized_width=WIDTH, num_channels=CHANNELS):
        super().__init__()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augmentation = augmentation
        if images_list is None:
            self.images_list = os.listdir(folder_imgs)
        else:
            # Private list copy: shuffling must never reorder the caller's data.
            self.images_list = list(images_list)
        self.folder_imgs = folder_imgs
        self.len = len(self.images_list) // self.batch_size
        self.resized_height = resized_height
        self.resized_width = resized_width
        self.num_channels = num_channels
        self.num_classes = NB_CLASSES
        self.is_test = 'train' not in folder_imgs
        if not shuffle and not self.is_test:
            # Fixed order: the labels can be computed once.
            self.labels = [img_2_ohe_vector[img]
                           for img in self.images_list[:self.len * self.batch_size]]

    def __len__(self):
        """Return the number of complete batches per epoch."""
        return self.len

    def on_epoch_end(self):
        """Reshuffle the image list between epochs (hook called by Keras)."""
        if self.shuffle:
            random.shuffle(self.images_list)

    def on_epoch_start(self):
        """Backward-compatible alias of :meth:`on_epoch_end`.

        Keras never calls a hook with this name; it is kept only for callers
        that invoked it explicitly.
        """
        self.on_epoch_end()

    def __getitem__(self, idx):
        """Return batch number ``idx`` as a tuple ``(X, y)``.

        ``X`` has shape ``(batch, height, width, channels)`` with values divided
        by 255; ``y`` has shape ``(batch, num_classes)`` and stays all-zero for
        a test folder.

        Raises:
            IndexError: if ``idx`` is not in ``range(len(self))``.
            FileNotFoundError: if an image cannot be read.
        """
        if not 0 <= idx < self.len:
            raise IndexError(f"batch index {idx} out of range for {self.len} batches")
        current_batch = self.images_list[idx * self.batch_size: (idx + 1) * self.batch_size]
        X = np.empty((len(current_batch), self.resized_height, self.resized_width,
                      self.num_channels), dtype=np.float32)
        # Zeros rather than uninitialised memory: test batches carry no labels.
        y = np.zeros((len(current_batch), self.num_classes), dtype=np.float32)

        for i, image_name in enumerate(current_batch):
            path = os.path.join(self.folder_imgs, image_name)
            # NOTE: cv2.imread returns the channels in BGR order.
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f"Unable to read image: {path}")
            img = cv2.resize(img, (self.resized_width, self.resized_height)).astype(np.float32)
            if self.augmentation is not None:
                img = self.augmentation(image=img)['image']
            X[i, :, :, :] = img / 255.0
            if not self.is_test:
                y[i, :] = img_2_ohe_vector[image_name]
        return X, y

    def get_labels(self):
        """Return the label vectors of the served images, in their current order.

        Only the images belonging to complete batches are included.
        """
        images_current = self.images_list[:self.len * self.batch_size]
        return np.array([img_2_ohe_vector[img] for img in images_current])

## Augmentation des données

In [None]:
# Augmentation pipeline for the training images: flips, a rotation bounded by
# limit=20 and a grid distortion, combined in a Compose with p=1.
train_transforms = [
    VerticalFlip(),
    HorizontalFlip(),
    Rotate(limit=20),
    GridDistortion(),
]
albumentations_train = Compose(train_transforms, p=1)

## Création des instances de la classe DataGenerator

In [None]:
# Training batches: augmented, default shuffle setting.
train_generator = DataGenenerator(
    augmentation=albumentations_train,
    images_list=train_images,
)

# Validation batches: no augmentation, fixed order.
valid_generator = DataGenenerator(
    shuffle=False,
    images_list=val_images,
)

## Choix du modèle de base

In [None]:
# Available backbones, keyed by the name used in output file names.
MODELS = dict(
    vgg16=VGG16,
    resnet50=ResNet50,
    densenet121=DenseNet121,
)

# Key of the backbone used in the rest of the notebook.
CHOIX_MODEL = "vgg16"

## Définition des callbacks

In [None]:
# ModelCheckpoint callback: saves the model weights whenever val_loss improves.
checkpoint = ModelCheckpoint("./model_" + CHOIX_MODEL + ".h5",
                             monitor='val_loss',
                             mode='min',
                             save_best_only=True,
                             save_weights_only=True)

# EarlyStopping callback: stops after 5 epochs without a val_loss improvement
# of at least 0.01 and restores the best weights.
early_stopping = EarlyStopping(monitor='val_loss',
                               mode='min',
                               min_delta=0.01,
                               patience=5,
                               restore_best_weights=True,
                               verbose=1)

# ReduceLROnPlateau callback: divides the learning rate by 10 after 3 epochs
# without sufficient val_loss improvement.
# The improvement threshold is `min_delta`; the former `episilon` keyword was a
# misspelling, so the 0.01 threshold was never actually applied.
reduce_learning_rate = ReduceLROnPlateau(monitor='val_loss',
                                         mode='min',
                                         min_delta=0.01,
                                         patience=3,
                                         factor=0.1,
                                         cooldown=3,
                                         min_lr=1e-6,
                                         verbose=1)

# CSVLogger callback: writes the training history to a log file.
csv_logger = CSVLogger("./training_" + CHOIX_MODEL + ".log")

## Métriques spécifiques pour la classification multilabels

In [None]:
# Accuracy is not suited to multi-label classification, so other metrics are defined here.

class Metrics(Callback):
    """Print weighted F1, precision and recall on validation data after each epoch.

    Args:
        validation_generator: iterable yielding ``(X, y)`` batches.
        validation_steps: number of batches to evaluate; defaults to
            ``len(validation_generator)``.
        threshold: probability above which a label is predicted as present.

    The per-epoch values are accumulated in ``val_f1_scores``,
    ``val_precisions`` and ``val_recalls``.
    """

    def __init__(self, validation_generator, validation_steps=None, threshold=0.5):
        super().__init__()
        self.validation_generator = validation_generator
        self.validation_steps = validation_steps or len(validation_generator)
        self.threshold = threshold

    def on_train_begin(self, logs=None):
        """Reset the metric histories at the start of training."""
        self.val_f1_scores = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the validation metrics for the finished epoch."""
        y_true_batches = []
        y_prob_batches = []
        # Single pass over the validation batches: labels and predictions stay
        # aligned and only one batch of images is held in memory at a time.
        batches = itertools.islice(self.validation_generator, self.validation_steps)
        for x_batch, y_batch in batches:
            y_true_batches.append(y_batch)
            y_prob_batches.append(np.asarray(self.model.predict_on_batch(x_batch)))

        # np.vstack needs a real sequence of arrays, not a generator.
        y_true = np.vstack(y_true_batches).astype('int')
        y_pred = (np.vstack(y_prob_batches) > self.threshold).astype('int')

        _val_f1 = f1_score(y_true, y_pred, average='weighted')
        _val_recall = recall_score(y_true, y_pred, average='weighted')
        _val_precision = precision_score(y_true, y_pred, average='weighted')
        self.val_f1_scores.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(f" - val_f1_score: {_val_f1:.5f} - val_precision: {_val_precision:.5f} - val_recall: {_val_recall:.5f}")

## Création du modèle

In [None]:
# Backbone selected by CHOIX_MODEL, loaded with ImageNet weights and without
# its top layers.
base_model = MODELS[CHOIX_MODEL](
    weights="imagenet",
    include_top=False,
    input_shape=(HEIGHT, WIDTH, CHANNELS),
)

# Freeze every layer of the backbone.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

# Full model: backbone -> flatten -> one sigmoid unit per label.
# In multi-label classification an image may carry more than one label, and
# the probabilities must be independent from one label to another, hence the
# sigmoid activation.
model = Sequential([
    base_model,
    Flatten(),
    Dense(units=NB_CLASSES, activation="sigmoid"),
])

# Compile the model.
model.compile(optimizer="adam", loss="binary_crossentropy")

## Entrainement du modèle

### Chargement des poids du modèle si existant

In [None]:
# Location of previously trained weights provided as a notebook input.
input_model_path = "../input/clouds-classification/model_" + CHOIX_MODEL + ".h5"

# Best-effort load: training starts from the freshly built model when it fails.
# `Exception` (not a bare except) keeps KeyboardInterrupt/SystemExit propagating,
# and the reason for the failure is reported instead of being swallowed.
try:
    model.load_weights(input_model_path)
except Exception as exc:
    print("model.h5 non pré entrainé, non chargé")
    print(f"({type(exc).__name__}: {exc})")
else:
    print("model.h5 pré entrainé chargé!")

In [None]:
# Number of training epochs.
EPOCHS = 1

# `Model.fit` accepts a Sequence directly; `fit_generator` is deprecated.
# Metrics comes first in the callback list, so it runs before the others at
# the end of each epoch.
model_info = model.fit(train_generator,
                       validation_data=valid_generator,
                       callbacks=[Metrics(valid_generator), checkpoint, early_stopping,
                                  reduce_learning_rate, csv_logger],
                       epochs=EPOCHS,
                       verbose=1)

### Courbes de pertes

In [None]:
# Training and validation loss per epoch, drawn on the same figure.
plt.figure(figsize=(12, 6))
for history_key in ('loss', 'val_loss'):
    plt.plot(model_info.history[history_key])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('Model loss by epoch')
plt.legend(['train', 'valid'], loc='right');


# Prédictions et analyse des résultats

## Chargement des poids du modèle après entrainement

In [None]:
# Weights written by the ModelCheckpoint callback during this run.
output_model_path = "./model_" + CHOIX_MODEL + ".h5"

# Candidate weight files, tried in order: the weights produced by this run
# first, then the pre-trained weights supplied as input.
weight_candidates = (
    (output_model_path,
     "model.h5 en output chargé!",
     "model.h5 non entrainé en output, non chargé"),
    (input_model_path,
     "model.h5 pré entrainé chargé!",
     "model.h5 non pré entrainé en input, non chargé"),
)

weights_loaded = False
for weights_path, success_msg, failure_msg in weight_candidates:
    # Best-effort load; `Exception` rather than a bare except so that
    # KeyboardInterrupt/SystemExit still propagate, and the cause is shown.
    try:
        model.load_weights(weights_path)
    except Exception as exc:
        print(f"{failure_msg} ({type(exc).__name__}: {exc})")
    else:
        print(success_msg)
        weights_loaded = True
        break

if not weights_loaded:
    print("aucun model.h5 chargé")

## Test de prédiction sur une image

In [None]:
imageId = "002be4f.jpg"

# Build the input through the same generator that feeds the model everywhere
# else, so the image gets identical preprocessing (OpenCV read in BGR order,
# cv2.resize, division by 255). Loading it with PIL instead would hand the
# network RGB channels it was never fed by the generator.
single_image_generator = DataGenenerator(images_list=[imageId],
                                         batch_size=1,
                                         shuffle=False)
# Batch of one preprocessed image; the label part of the batch is not needed here.
img_preprocessed, _ = single_image_generator[0]

prediction = model.predict(img_preprocessed)
true_value = img_2_ohe_vector[imageId]

In [None]:
# Show the label order, then the predicted and true vectors.
print(f"\nCorrespondance labels: {labels_list}\n")
print("valeur predite: ", prediction)
print("valeur réelle: ", true_value)

## Test de prédiction sur plusieurs images

In [None]:
# The first five validation images, read from the training image directory.
test_dir = NUAGES_TRAIN_PATH
test_images = val_images[0:5]

# One image per batch, in a fixed order, so predictions line up with test_images.
check_generator = DataGenenerator(
    images_list=test_images,
    shuffle=False,
    batch_size=1,
)

batch_pred_labels = model.predict(check_generator, workers=1, verbose=1)
true_value_vect = []

# Grid of four images per row.
rows = (len(test_images) - 1) // 4 + 1
plt.figure(figsize=(15, 5 * rows))
for index, filename in enumerate(test_images):
    # Display the image in its grid cell.
    plt.subplot(rows, 4, index + 1)
    img = image.load_img(test_dir + filename, target_size=(HEIGHT, WIDTH))
    plt.imshow(img)
    plt.axis('off')
    plt.title(f"Image {index}")

    # Print the true label vector next to the predicted probabilities.
    print(f"\nCorrespondance labels: {labels_list}\n")
    print(f"vrais labels et valeurs predites de l'image {index} avec comme id: {filename}")
    true_value = img_2_ohe_vector[filename]
    prediction = batch_pred_labels[index]
    true_value_vect.append(true_value)
    print(true_value, ";", prediction)

# Matrice de confusion

### Génération des images test + prédiction des labels

In [None]:
# Evaluate on the whole validation split.
test_images = val_images

# One image per batch, in a fixed order, so that row i of the predictions
# corresponds to test_images[i].
check_generator = DataGenenerator(
    images_list=test_images,
    shuffle=False,
    batch_size=1,
)

batch_pred_labels = model.predict(check_generator, workers=1, verbose=1)

## Création des vecteurs binaires de valeurs vraies et de valeurs prédites en fonction d'un seuil

In [None]:
# Decision threshold: a label is predicted as present above this probability.
SEUIL = 0.5

# True label vectors, one row per test image.
true_values_vect = np.asarray(
    [img_2_ohe_vector[filename] for filename in test_images]
)

# Predicted probabilities binarised with the threshold, one row per test image.
prediction_values_vect = np.asarray(
    [np.where(batch_pred_labels[index] > SEUIL, 1, 0)
     for index in range(len(test_images))]
)

## Affichage des différentes matrices de confusion pour chaque label

In [None]:
# Names of the four cells of a binary confusion matrix, in row-major order.
labels = ["Vrai negatif", "Faux positif", "Faux negatif", "Vrai positif"]

# One binary confusion matrix per label.
for i in range(NB_CLASSES):
    # labels=[0, 1] forces a 2x2 matrix even when only one of the two values
    # occurs for this label; otherwise scikit-learn returns a 1x1 matrix that
    # no longer matches the four group names.
    cf_matrix = confusion_matrix(true_values_vect[:, i],
                                 prediction_values_vect[:, i],
                                 labels=[0, 1])
    categories = ["0", labels_list[i]]
    make_confusion_matrix(cf_matrix,
                          group_names=labels,
                          categories=categories,
                          title=labels_list[i],
                          figsize=(8, 8),
                          cmap="Blues")