In [1]:
# System libraries
import os.path
from pathlib import Path

# Import DS libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import train_test_split
import mlflow

# Tensorflow Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,models
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping,ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.python.saved_model import signature_constants

# Import Vision libraries
import PIL
from pathlib import Path
from PIL import UnidentifiedImageError

2022-09-15 18:47:26.831088: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-15 18:47:26.831125: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def generate_data_for_training(data_path, batch_size):
    """Build train / validation / test image iterators from a folder of images.

    Every ``.jpg`` / ``.png`` file found under ``data_path`` (recursively,
    extension matched case-insensitively) is labelled with the name of the
    directory that directly contains it. The files are split 80/20 into
    train and test sets, and 20% of the train set is held out for validation.

    Args:
        data_path: Root directory of the dataset (str or path-like).
        batch_size: Number of images per batch for all three iterators.

    Returns:
        A ``(train_images, val_images, test_images)`` tuple of iterators
        produced by ``ImageDataGenerator.flow_from_dataframe``. Images are
        resized to 224x224 RGB and labels are one-hot encoded
        (``class_mode='categorical'``).

    Raises:
        ValueError: If no image file is found under ``data_path``.
    """
    image_dir = Path(data_path)

    # Get filepaths and labels.
    # The suffix is compared case-insensitively so each file is listed exactly
    # once (separate '*.jpg' and '*.JPG' globs return duplicates on
    # case-insensitive filesystems). Sorting makes the seeded split below
    # reproducible regardless of the order the filesystem returns entries in.
    image_suffixes = {'.jpg', '.png'}
    image_paths = sorted(
        path for path in image_dir.rglob('*')
        if path.is_file() and path.suffix.lower() in image_suffixes
    )
    if not image_paths:
        raise ValueError("No .jpg/.png images found under '{}'".format(image_dir))

    image_df = pd.DataFrame({
        'Filepath': [str(path) for path in image_paths],
        # The label is the name of the folder that contains the image
        'Label': [path.parent.name for path in image_paths],
    })

    # Fix the class list once so that train, validation and test iterators
    # share identical class indices even if a class is absent from one split.
    class_names = sorted(image_df['Label'].unique())

    # Separate in train and test data
    train_df, test_df = train_test_split(image_df, test_size=0.2, shuffle=True, random_state=42)

    # The backbone used in this notebook is MobileNetV2, which expects pixel
    # values scaled to [-1, 1]. mobilenet_v3.preprocess_input is a pass-through
    # and would feed raw 0-255 pixels to the network.
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    # Train data generator (also serves the validation subset)
    train_generator = ImageDataGenerator(
        preprocessing_function=preprocess,
        validation_split=0.2
    )

    # Test data generator
    test_generator = ImageDataGenerator(
        preprocessing_function=preprocess
    )

    # Options shared by the three iterators
    flow_options = dict(
        x_col='Filepath',
        y_col='Label',
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='categorical',
        classes=class_names,
        batch_size=batch_size,
    )

    # Split the data into three categories.
    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=42,
        subset='training',
        **flow_options
    )

    val_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=42,
        subset='validation',
        **flow_options
    )

    # No shuffling, so predictions stay aligned with test_df row order
    test_images = test_generator.flow_from_dataframe(
        dataframe=test_df,
        shuffle=False,
        **flow_options
    )

    return train_images, val_images, test_images

In [3]:
### Pretrained model

# Preprocessing pipeline: resize to 224x224, then multiply pixel values by 1/255.
# NOTE(review): the model assembled later in this notebook does not route its
# inputs through this pipeline - confirm whether it is still needed.
resize_and_rescale = keras.Sequential()
resize_and_rescale.add(preprocessing.Resizing(224, 224))
resize_and_rescale.add(preprocessing.Rescaling(1. / 255))

# Load the pretrained MobileNetV2 backbone with ImageNet weights, without its
# classification head; global average pooling is applied to the final feature map.
pretrained_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)

# Freeze the backbone so its weights are not updated during training
pretrained_model.trainable = False

2022-09-15 18:47:30.922374: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-09-15 18:47:30.922988: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-15 18:47:30.923115: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-09-15 18:47:30.923218: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-09-15 18:47:30.923269: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

In [4]:
# Set path to data
data_path = "../data"

# You should ensure you have a local mlflow server running
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("sea_animals_classification")

# Train a classification head on top of the frozen backbone and record the
# hyperparameters, metrics and saved model in the MLflow run.
with mlflow.start_run(run_name="Experiment 1"):

    ### hyperparams
    denses = [256,256,19]   # units of the two hidden layers, then the output layer (one unit per class)
    dropout = [0.2, 0.2]    # dropout rate applied after each hidden layer
    adam_param = 0.00001    # Adam learning rate
    batch_size = 4

    ### Data Generators
    train_images, val_images, test_images = generate_data_for_training(data_path, batch_size)

    ### Model
    # The classification head is stacked directly on the backbone's pooled
    # output. resize_and_rescale is not applied here: the generators already
    # yield 224x224 images (target_size in generate_data_for_training), and a
    # tensor produced from it would not be part of the graph between `inputs`
    # and `outputs` anyway.
    inputs = pretrained_model.input
    x = Dense(denses[0], activation='relu')(pretrained_model.output)
    x = Dropout(dropout[0])(x)
    x = Dense(denses[1], activation='relu')(x)
    x = Dropout(dropout[1])(x)
    outputs = Dense(denses[2], activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)

    ### Mlflow logging params
    mlflow.log_param("adam", adam_param)
    mlflow.log_param("dropout", dropout)
    mlflow.log_param("nb_dense", denses)
    mlflow.log_param("batch_size", batch_size)
    print("Training parameters logged to tracking server.")

    # Create checkpoint callback: keep only the weights of the epoch with the
    # best validation accuracy
    checkpoint_path = "sea_animal_classification_model_checkpoint"
    checkpoint_callback = ModelCheckpoint(checkpoint_path,
                                        save_weights_only=True,
                                        monitor="val_accuracy",
                                        save_best_only=True)

    # Setup EarlyStopping callback to stop training if model's val_loss doesn't improve for 3 epochs
    early_stopping = EarlyStopping(monitor = "val_loss",
                                patience = 3,
                                restore_best_weights = True)

    model.compile(
        optimizer=Adam(adam_param),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        train_images,
        steps_per_epoch=len(train_images),
        validation_data=val_images,
        validation_steps=len(val_images),
        epochs=2,
        callbacks=[
            early_stopping,
            checkpoint_callback,
        ]
    )

    ### Evaluate model
    results = model.evaluate(test_images, verbose=1)

    # history.history maps each metric name to a list with one value per
    # completed epoch. Validation metrics live under the 'val_' prefixed keys;
    # 'loss' / 'accuracy' only ever contain training values. The last epoch is
    # reported, which also works when a single epoch was run.
    mlflow.log_metric("train_loss", history.history['loss'][-1])
    mlflow.log_metric("train_accuracy", history.history['accuracy'][-1]*100)
    mlflow.log_metric("val_loss", history.history['val_loss'][-1])
    mlflow.log_metric("val_accuracy", history.history['val_accuracy'][-1]*100)
    # model.evaluate returns [loss, accuracy], matching the compile() metrics
    mlflow.log_metric("test_loss", results[0])
    mlflow.log_metric("test_accuracy", results[1]*100)
    print("Metrics logged to tracking server.")

    tf.keras.models.save_model(model, "./model")
    print("Model saved locally")

    mlflow.log_artifacts("./model")
    print("Model artifact logged.")

    # Registering the model is only attempted when the tracking URI is not the
    # local server configured at the top of this cell.
    if mlflow.get_tracking_uri() != 'http://localhost:5000':
        try:
            tag=[tf.compat.v1.saved_model.tag_constants.SERVING]
            key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            mlflow.tensorflow.log_model(tf_saved_model_dir="./model",
                                    tf_meta_graph_tags=tag,
                                    tf_signature_def_key=key,
                                    artifact_path="model",
                                    registered_model_name="MobileNetV2")
            print("Model sent to registry.")
        except Exception as e:
            # Best effort: a registry failure must not discard the finished run
            print("Impossible to log model to registry: {}".format(e))
    else:
        print("No mlflow repository found. Not saving model to it.")

MlflowException: API request to http://localhost:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=sea_animals_classification (Caused by ResponseError('too many 500 error responses'))