In [1]:
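# Ensure the MLflow tracking server and its S3-compatible artifact store are
# deployed and reachable at the endpoints below before running this notebook.
# NOTE(review): the credentials set here look like local-development values;
# never commit real secrets to a notebook.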
%env MLFLOW_TRACKING_URI=http://localhost:5000
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ACCESS_KEY_ID=minio
%env AWS_SECRET_ACCESS_KEY=minio123

# System libraries
import os.path
from pathlib import Path
import time

# Import DS libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import train_test_split
import mlflow

# Tensorflow Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,models
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping,ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.python.saved_model import signature_constants

env: MLFLOW_TRACKING_URI=http://localhost:5000
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123


2022-09-19 19:03:03.169601: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-19 19:03:03.169649: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def generate_data_for_training(data_path, batch_size, image_size=(224, 224),
                               test_size=0.2, validation_split=0.2, seed=42):
    """Build the train / validation / test image iterators for a labelled image folder.

    Every ``.jpg`` / ``.JPG`` / ``.png`` / ``.PNG`` file found recursively under
    ``data_path`` is used, and the name of the directory that directly contains
    an image is taken as its class label.

    Args:
        data_path: Root directory of the dataset (string or path-like).
        batch_size: Number of images per batch yielded by each iterator.
        image_size: ``(height, width)`` every image is resized to.
        test_size: Fraction of all images held out for the test iterator.
        validation_split: Fraction of the remaining (non-test) images used
            for the validation iterator.
        seed: Seed used for the train/test split and for shuffling.

    Returns:
        A ``(train_images, val_images, test_images)`` tuple of iterators
        created with ``ImageDataGenerator.flow_from_dataframe``.

    Raises:
        ValueError: If no image file is found under ``data_path``.
    """
    image_dir = Path(data_path)

    # Collect the image files. The set removes the duplicates that the
    # upper/lower-case patterns produce on case-insensitive filesystems, and
    # sorting makes the seeded split below reproducible regardless of the
    # order in which the filesystem lists the files.
    patterns = ('**/*.JPG', '**/*.jpg', '**/*.png', '**/*.PNG')
    filepaths = sorted({path for pattern in patterns for path in image_dir.glob(pattern)})
    if not filepaths:
        raise ValueError("No .jpg/.png images found under '{}'".format(image_dir))

    # One row per image: its path and the name of its parent directory (the label)
    image_df = pd.DataFrame({
        'Filepath': [str(path) for path in filepaths],
        'Label': [path.parent.name for path in filepaths],
    })

    # Use one explicit class list for all three iterators so that the
    # label -> index mapping stays identical even if a split happens to
    # contain no sample of some class.
    class_names = sorted(image_df['Label'].unique())

    # Separate in train and test data
    train_df, test_df = train_test_split(
        image_df, test_size=test_size, shuffle=True, random_state=seed
    )

    # Train data generator (also provides the validation subset)
    train_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
        validation_split=validation_split
    )

    # Test data generator
    test_generator = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
    )

    # Options shared by the three flows
    flow_options = dict(
        x_col='Filepath',
        y_col='Label',
        target_size=image_size,
        color_mode='rgb',
        classes=class_names,
        class_mode='categorical',
        batch_size=batch_size,
    )

    # Split the data into three categories: training, validation and test.
    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=seed,
        subset='training',
        **flow_options
    )

    val_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=seed,
        subset='validation',
        **flow_options
    )

    # The test flow is not shuffled so predictions stay aligned with test_df rows
    test_images = test_generator.flow_from_dataframe(
        dataframe=test_df,
        shuffle=False,
        **flow_options
    )

    return train_images, val_images, test_images

In [3]:
### Pretrained model

# Preprocessing block: resize to 224x224, then multiply pixel values by 1/255
resize_and_rescale = tf.keras.Sequential()
resize_and_rescale.add(preprocessing.Resizing(224, 224))
resize_and_rescale.add(preprocessing.Rescaling(1./255))

# Load the pretrained MobileNetV2 backbone: ImageNet weights, no classification
# head, global average pooling applied to the last feature map
pretrained_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3)
)

# Freeze the backbone so that only layers added on top of it are trained
pretrained_model.trainable = False

2022-09-19 19:03:05.219383: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-09-19 19:03:05.219540: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-19 19:03:05.219585: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-09-19 19:03:05.219616: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-09-19 19:03:05.219645: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

In [4]:
# Set path to data
data_path = "../data"

# You should ensure you have a local mlflow server running
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("sea_animals_classification")

# Train a classification head on top of the frozen MobileNetV2 backbone and
# record the parameters, metrics and saved model of the run in MLflow.
with mlflow.start_run(run_name="Exploration"):

    ### hyperparams
    denses = [256, 256, 19]  # units of the two hidden layers and of the softmax output layer
    dropout = [0.2, 0.2]     # dropout rate applied after each hidden layer
    adam_param = 0.00001     # value passed to the Adam optimizer (learning rate)
    nb_epochs = 2
    batch_size = 4

    ### Data Generators
    train_images, val_images, test_images = generate_data_for_training(data_path, batch_size)

    # Fail early with a readable message instead of a shape error inside fit()
    # when the output layer size does not match the number of classes found.
    nb_classes = len(train_images.class_indices)
    if denses[-1] != nb_classes:
        raise ValueError(
            "Output layer has {} units but the data contains {} classes".format(denses[-1], nb_classes)
        )

    ### Model
    # The generators already resize images to 224x224 (target_size) and apply
    # mobilenet_v2.preprocess_input, so the backbone input is used directly and
    # no additional resize/rescale layer is wired into the graph.
    inputs = pretrained_model.input
    x = Dense(denses[0], activation='relu')(pretrained_model.output)
    x = Dropout(dropout[0])(x)
    x = Dense(denses[1], activation='relu')(x)
    x = Dropout(dropout[1])(x)
    outputs = Dense(denses[2], activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)

    ### Mlflow logging params
    mlflow.log_param("adam", adam_param)
    mlflow.log_param("dropout", dropout)
    mlflow.log_param("nb_dense", denses)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("nb_epochs", nb_epochs)
    print("Training parameters logged to tracking server.")

    model.compile(
        optimizer=Adam(learning_rate=adam_param),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    start_training_time = time.time()
    history = model.fit(
        train_images,
        steps_per_epoch=len(train_images),
        validation_data=val_images,
        validation_steps=len(val_images),
        epochs=nb_epochs
    )
    elapsed_training_time = time.time() - start_training_time

    ### Evaluate model
    # results is [loss, accuracy], following the metrics given to compile()
    results = model.evaluate(test_images, verbose=1)

    # history.history maps each metric name to a list with one value per epoch;
    # validation metrics are stored under the 'val_' prefix. Report the values
    # of the final epoch for both training and validation.
    mlflow.log_metric("Training time", elapsed_training_time)
    mlflow.log_metric("Training accuracy", history.history['accuracy'][-1]*100)
    mlflow.log_metric("Validation accuracy", history.history['val_accuracy'][-1]*100)
    mlflow.log_metric("Test accuracy", results[1]*100)
    print("Metrics logged to tracking server.")

    tf.keras.models.save_model(model, "./model")
    print("Model saved locally")

    # Uploading is best-effort: a failing artifact store must not discard the
    # parameters and metrics already logged for this run.
    try:
        mlflow.log_artifacts("./model")
        print("Model artifact logged.")
    except Exception as e:
        print("Impossible to log model artifacts: {}".format(e))

    # Register the SavedModel in the model registry (also best-effort)
    try:
        tag=[tf.compat.v1.saved_model.tag_constants.SERVING]
        key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        mlflow.tensorflow.log_model(tf_saved_model_dir="./model",
                                tf_meta_graph_tags=tag,
                                tf_signature_def_key=key,
                                artifact_path="model",
                                registered_model_name="MobileNetV2")
        print("Model sent to registry.")
    except Exception as e:
        print("Impossible to log model to registry: {}".format(e))

Found 7515 validated image filenames belonging to 19 classes.
Found 1878 validated image filenames belonging to 19 classes.
Found 2349 validated image filenames belonging to 19 classes.
Training parameters logged to tracking server.
Epoch 1/2


2022-09-19 19:03:06.557172: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/2
Metrics logged to tracking server.


2022-09-19 19:07:05.326752: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./model/assets




Model saved locally


2022/09/19 19:07:25 INFO mlflow.tensorflow: Validating the specified TensorFlow model by attempting to load it in a new TensorFlow graph...


Impossible to log model artifacts: Connection was closed before we received a valid response from endpoint URL: "http://localhost:9000/mlflow/1/8a01cfd416a449deb028dc6b29c3340a/artifacts/keras_metadata.pb".


2022/09/19 19:07:30 INFO mlflow.tensorflow: Validation succeeded!


Impossible to log model to registry: Connection was closed before we received a valid response from endpoint URL: "http://localhost:9000/mlflow/1/8a01cfd416a449deb028dc6b29c3340a/artifacts/model/conda.yaml".


In [7]:
# Retry uploading the locally saved model on its own.
# When no run is active, mlflow.log_artifacts() implicitly starts a new run and
# leaves it open, which makes a later mlflow.start_run() fail. Open the run
# explicitly when this cell has to create one, and always close it afterwards.
# NOTE: the artifacts then land in this retry run, not in the training run.
retry_owns_run = mlflow.active_run() is None
try:
    if retry_owns_run:
        mlflow.start_run(run_name="Exploration - artifact retry")
    mlflow.log_artifacts("./model")
    print("Model artifact logged.")
except Exception as e:
    print("Impossible to log model artifacts: {}".format(e))
finally:
    # Only close a run that this cell opened itself
    if retry_owns_run and mlflow.active_run() is not None:
        mlflow.end_run()

Impossible to log model artifacts: Connection was closed before we received a valid response from endpoint URL: "http://localhost:9000/mlflow/1/ca12e7533ab347a6a3bd710e86c87600/artifacts/keras_metadata.pb".
