<img src="https://lakefs.io/wp-content/uploads/2022/09/lakeFS-Logo.svg" alt="lakeFS logo" width=200/>

# ML Data Version Control and Reproducibility at Scale

## Setup

**(you shouldn't need to change anything in this section, just run it)**

### lakeFS endpoint and credentials

In [None]:
# lakeFS server endpoint and access keys; reused below for the LAKECTL_* environment
# variables, the .lakectl.yaml file and the Spark S3A configuration.
# NOTE(review): these look like sample/demo credentials - replace them for a real deployment.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

Change the Storage Namespace to a location in the bucket you’ve configured. The storage namespace is the location in the underlying storage where the data for the lakeFS repository will be stored.

In [None]:
# NOTE(review): `repo_name` is not defined in this file - presumably it is set by the notebook
# that runs this setup; confirm. The repository-creation cell appends "/{repo_name}" once more,
# so the effective storage namespace contains the repository name twice.
storageNamespace = 's3://example/' + repo_name # e.g. "s3://bucket"

### Are you running this demo in LOCAL container or in Databricks DISTRIBUTED cluster?

In [None]:
# Execution-mode switch read by the cells and helpers below ("LOCAL" or "DISTRIBUTED").
# Any other value makes the mode-dependent branches fall through without doing anything.
localOrDistributedComputing = "LOCAL" # LOCAL or DISTRIBUTED

### Demo dataset downloaded from [Kaggle](https://www.kaggle.com/c/airbus-ship-detection) and uploaded to the "airbus-ship-detection" folder in MinIO

In [None]:
# Location of the source dataset: bucket, region (used to build the S3 endpoint URL)
# and the key prefix under which the dataset files live.
bucketName = 'sample-data'
awsRegion = 'us-east-1'
prefix = "airbus-ship-detection/"

In [None]:
# Credentials passed to the boto3 S3 client created further down.
# NOTE(review): placeholder values - supply real keys before running.
aws_access_key_id = 'aaaaaaaaaaaaa'
aws_secret_access_key = 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'

### Import libraries

In [None]:
%xmode Minimal
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)

import lakefs
from lakefs.exceptions import NotFoundException
import os

import boto3
import random
import time
import datetime
from pyspark.sql.functions import substring_index, col, pandas_udf, collect_list, size, desc, base64
from PIL import Image
import io
import pandas as pd
import numpy as np
from petastorm.spark import SparkDatasetConverter, make_spark_converter
from petastorm import TransformSpec
from functools import partial
import torch
import torchvision
from torchvision import transforms
import pytorch_lightning as pl
import segmentation_models_pytorch as smp
import base64
import mlflow
from hyperopt import fmin, rand, hp, SparkTrials, STATUS_OK, space_eval, tpe
import gc
import ipywidgets as widgets
from IPython.display import Markdown as md

## Versioning Information

In [None]:
# Branch names used throughout the demo.
mainBranch = "main"
emptyBranch = "empty"
# Experiment branch name is derived from the execution mode, e.g. "LOCAL-experiment".
experimentBranch = localOrDistributedComputing + "-experiment"
# Default (empty) commit metadata.
commitMetadata=""
# Timestamp captured once when this cell runs; used as the prefix of the lakeFS tags created by the pipeline.
tagPrefix = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

### Set environment variables

In [None]:
# Publish the lakeFS connection settings through the LAKECTL_* environment variables.
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

### Define lakeFS UI Endpoint

In [None]:
# Pick the URL used to build links to the lakeFS UI. The two container-internal endpoints
# are mapped to 127.0.0.1 addresses; any other endpoint is used as-is.
if lakefsEndPoint.startswith('http://host.docker.internal'):
    lakefsUIEndPoint = 'http://127.0.0.1:8000'
elif lakefsEndPoint.startswith('http://lakefs:8000'):
    lakefsUIEndPoint = 'http://127.0.0.1:8003'
else:
    lakefsUIEndPoint = lakefsEndPoint

### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Verify connectivity and credentials by asking the lakeFS server for its version.
print("Verifying lakeFS credentials…")
try:
    v=lakefs.client.Client().version
except Exception as err:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate, and report the reason instead of hiding it.
    print(f"🛑 failed to get lakeFS version: {err}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository (exist_ok=True is passed, so re-running the cell is expected to be safe)
# and keep a handle to its main branch.
# NOTE(review): `repo_name` is not defined in this file - presumably set by the calling notebook.
repo = lakefs.Repository(repo_name).create(storage_namespace=f"{storageNamespace}/{repo_name}", default_branch=mainBranch, exist_ok=True)
branchMain = repo.branch(mainBranch)
print(repo)

### Configure lakectl for LOCAL

In [None]:
# In LOCAL mode, write a .lakectl.yaml in the working directory holding the endpoint and credentials.
if localOrDistributedComputing == "LOCAL":
    # A context manager guarantees the file is flushed and closed even if the write fails.
    with open(".lakectl.yaml", "w") as lakectl_config:
        lakectl_config.write(
            "credentials: \n"
            f"    access_key_id: {lakefsAccessKey}\n"
            f"    secret_access_key: {lakefsSecretKey}\n"
            "server: \n"
            f"    endpoint_url: {lakefsEndPoint}")

### Set up Spark for LOCAL

In [None]:
# In LOCAL mode, build a Spark session whose S3A filesystem points at the lakeFS endpoint
# (path-style access, lakeFS keys as S3 credentials) with the Delta Lake extension enabled.
if localOrDistributedComputing == "LOCAL":
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("lakeFS / Jupyter") \
        .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint) \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey) \
        .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey) \
        .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .getOrCreate()

#        .config("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0,org.mlflow.mlflow-spark") \
    spark.sparkContext.setLogLevel("INFO")

    # Bare expression inside the `if` block: it has no effect and is not echoed by the notebook.
    spark

### Create empty branch

In [None]:
# Create the "empty" branch from the main branch (exist_ok=True is passed, so re-runs are tolerated).
branchEmpty = repo.branch(emptyBranch).create(source_reference=mainBranch, exist_ok=True)

### Folder structure to implement Medallion Architecture

In [None]:
# Folder names for each layer of the pipeline; every name carries a trailing slash.
raw_data_folder = "raw/"
bronze_data_folder = "bronze/"
silver_data_folder = "silver/"
gold_data_folder = "gold/"
# Sub-folders for the training images and for the masks.
training_data_folder = "train_v2/"
mask_data_folder = "mask/"

### Create S3 client

In [None]:
# boto3 client for the bucket holding the source dataset, using the regional S3 endpoint.
s3 = boto3.client(
    's3',
    endpoint_url=f'https://s3.{awsRegion}.amazonaws.com',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

### Get the list of images in training dataset

In [None]:
def list_images():
    """Return the keys of all objects under the training-data prefix of the source bucket.

    Pages through ``s3.list_objects_v2`` until the listing is no longer truncated,
    prints the number of keys found and returns them as a list (empty when nothing matches).
    """
    # Every page must use the same prefix; otherwise continuation pages would
    # return keys from outside the training folder.
    request = {"Bucket": bucketName, "Prefix": prefix + training_data_folder}

    file_list = []
    while True:
        s3_result = s3.list_objects_v2(**request)
        # 'Contents' is absent when a page holds no objects.
        file_list.extend(entry['Key'] for entry in s3_result.get('Contents', []))
        if not s3_result.get('IsTruncated'):
            break
        request["ContinuationToken"] = s3_result['NextContinuationToken']

    print(f"Image count = {len(file_list)}")
    return file_list

### Import subset of training data to lakeFS repo

In [None]:
def import_images(file_list_random):
    """Import the given source prefixes plus the segmentation CSV into the experiment branch.

    Each entry of ``file_list_random`` is registered as an S3 prefix to import into the raw
    training folder; the CSV is imported into the raw folder. The import status is polled
    every two seconds (and printed) until it completes or reports an error.

    Raises:
        Exception: when the import status carries an error.
    """
    importer = branchExperimentBranchN.import_data(commit_message="import images")
    bucket_uri = "s3://"+bucketName+'/'
    image_destination = raw_data_folder+training_data_folder
    for source_key in file_list_random:
        importer.prefix(bucket_uri+source_key, destination=image_destination)

    importer.prefix(bucket_uri+prefix+'train_ship_segmentations_v2.csv', destination=raw_data_folder)

    importer.start()
    # Poll at least once, then keep polling while the import is still running without error.
    while True:
        time.sleep(2)
        status = importer.status()
        print(status)
        if status.completed or status.error is not None:
            break

    if status.error:
        raise Exception(status.error)

    print(f"\nImported a total of {status.ingested_objects} objects into branch {experimentBranchN}")

## Build the data pipeline

### Ingest raw images as bronze data set and save as Delta table

In [None]:
def bronze_images():
    """Load every ``*.jpg`` file under ``training_data_path`` as a Spark binary-file DataFrame."""
    jpg_reader = spark.read.format("binaryfile").option("pathGlobFilter", "*.jpg")
    return jpg_reader.load(training_data_path)

### Print Diff results

In [None]:
def print_diff_refs(diff_refs):
    """Lazily turn diff entries into ``[path, path_type, size_bytes, type]`` lists.

    Despite its name this function prints nothing; it returns a ``map`` iterator
    that the caller is expected to iterate.
    """
    def as_row(change):
        return [change.path, change.path_type, change.size_bytes, change.type]

    return map(as_row, diff_refs)

### Diff the branch to find uncommitted changes

In [None]:
def diff_branch(repo, repo_path, branch):
    """Show pending changes for the working data.

    In LOCAL mode this runs ``lakectl local status <repo_path>`` through the IPython
    shell escape. In DISTRIBUTED mode it prints one row per entry of the diff between
    the main branch and ``branch``. The ``repo`` argument is not used by this function.
    """
    if localOrDistributedComputing == "LOCAL":
        lakeFSLocalCommand = f"lakectl local status {repo_path}"
        # IPython shell escape: `$lakeFSLocalCommand` is expanded from the local variable above.
        ! $lakeFSLocalCommand
    elif localOrDistributedComputing == "DISTRIBUTED":
        for a in print_diff_refs(
            branchMain.diff(other_ref=branch)):
            print(a)

### Commit changes

In [None]:
def print_lakectl_response(response, lines):
    """Print the last ``lines`` entries of ``response`` (all of them if it is shorter)."""
    total = len(response)
    shown = min(lines, total)
    # Walk forward from the first entry of the tail; the range is empty when shown <= 0.
    for position in range(total - shown, total):
        print(response[position])

def get_commit_id(commit_response):
    """Extract the commit ID from the lines of a lakectl commit response.

    Every line is split on ':'; for the first line that has a field exactly equal
    to ``'ID'`` the second field is returned with surrounding whitespace stripped.
    Returns None when no such line exists.
    """
    split_lines = [line.split(':') for line in commit_response]
    for fields in split_lines:
        if 'ID' in fields:
            return fields[1].strip()
    return None

In [None]:
def commit(repo, repo_path, branch, commitMessage, metadata=""):
    """Commit pending changes and return the new commit ID.

    In LOCAL mode this runs ``lakectl local commit`` on ``repo_path`` through the IPython
    shell escape, prints the last 15 lines of its output and parses the ID from it
    (``repo`` and ``branch`` are not used on this path). In DISTRIBUTED mode it commits on
    ``branch`` of repository ``repo`` through the lakeFS SDK; an empty-string ``metadata``
    is replaced by an empty dict. Returns None implicitly for any other mode.
    """
    if localOrDistributedComputing == "LOCAL":
        # NOTE(review): message and metadata are wrapped in single quotes without escaping,
        # so a value containing a single quote would presumably break the shell command.
        lakeFSLocalCommand = f"lakectl local commit -m '{commitMessage}' --meta '{metadata}' {repo_path}"
        commit_response = ! $lakeFSLocalCommand
        # Print last 15 lines
        print_lakectl_response(commit_response, 15)
        return get_commit_id(commit_response)
    elif localOrDistributedComputing == "DISTRIBUTED":
        if metadata == "":
            metadata = {}
        ref = lakefs.Repository(repo).branch(branch).commit(message=commitMessage, metadata=metadata)
        return ref.get_commit().id

### Create lakeFS tag

In [None]:
def lakefs_set_tag(repo, tagID, branch):
    """Create tag ``tagID`` in repository ``repo`` pointing at ``branch`` and print the result."""
    created_tag = lakefs.Tag(repo, tagID).create(branch, exist_ok=True)
    print(created_tag)

### Enrich dataset and save as silver dataset

In [None]:
IMAGE_RESIZE = 320  # edge length of the resized square images; divisible by 32


@pandas_udf("binary")
def resize_image_udf(content_series):
    """Pandas UDF: resize each encoded image to IMAGE_RESIZE x IMAGE_RESIZE and re-encode it as JPEG.

    Entries for which opening, resizing or saving raises are mapped to None.
    """
    def resize_image(content):
        """resize image and serialize as jpeg"""
        try:
            decoded = Image.open(io.BytesIO(content))
            resized = decoded.resize((IMAGE_RESIZE, IMAGE_RESIZE), Image.NEAREST)
            jpeg_buffer = io.BytesIO()
            resized.save(jpeg_buffer, format='JPEG')
            return jpeg_buffer.getvalue()
        except Exception:
            # some images are invalid
            return None

    return content_series.apply(resize_image)

# Column metadata attached to image columns to enable the image preview.
image_meta = {"spark.contentAnnotation": '{"mimeType": "image/jpeg"}'}

def silver_images(df_bronze_images):
    """Build the silver image DataFrame with columns ``image_id`` and ``content``.

    ``image_id`` is the last '/'-separated segment of ``path``; ``content`` is the
    resized JPEG. Rows whose resized content is null are dropped.
    """
    image_id = substring_index(col('path'), '/', -1)
    resized_content = resize_image_udf(col("content")).alias("content", metadata=image_meta)
    return (
        df_bronze_images
        .withColumn("image_id", image_id)
        .withColumn("content", resized_content)
        .filter("content is not null")
        .select("image_id", "content")
    )

### Load the raw image mask

In [None]:
def bronze_mask():
    """Read the ship-segmentation CSV from the raw folder and rename its columns to snake_case."""
    csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
    annotations = csv_reader.csv(f"{raw_data_path}/train_ship_segmentations_v2.csv")
    with_image_id = annotations.withColumnRenamed("ImageId", "image_id")
    return with_image_id.withColumnRenamed("EncodedPixels", "encoded_pixels")

### Transforming masks into images

In [None]:
IMAGE_SIZE = 768  # default edge length used for decoded masks


def rle_decode(mask_rle, shape=(IMAGE_SIZE, IMAGE_SIZE)):
    """Decode a run-length-encoded mask string into a uint8 array of 0/255 values.

    ``mask_rle`` is a whitespace-separated sequence of ``start length`` pairs with
    1-based starts into the flattened image. The flat buffer is reshaped to ``shape``
    and the transpose of that array is returned.
    """
    tokens = mask_rle.split()
    # Starts are 1-based in the encoding; shift them to 0-based indices.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_ends = run_starts + np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 255
    return flat.reshape(shape).T


def mask(rle_masks):
    """Merge all RLE masks of one image into a single greyscale JPEG.

    Args:
        rle_masks: numpy array of run-length-encoded mask strings for one image.

    Returns:
        JPEG bytes of the combined mask, resized to IMAGE_RESIZE x IMAGE_RESIZE.

    Raises:
        TypeError: when ``rle_masks`` is not a numpy array.
    """
    if not isinstance(rle_masks, np.ndarray):
        raise TypeError(f"expected a numpy.ndarray of RLE strings, got {type(rle_masks)}")

    # uint8 matches the 0/255 values returned by rle_decode and the 8-bit "L" image mode;
    # an int8 buffer cannot represent 255.
    all_masks = np.zeros((IMAGE_SIZE, IMAGE_SIZE), dtype=np.uint8)
    for encoded in rle_masks.tolist():
        # Element-wise maximum keeps overlapping pixels at 255 instead of overflowing.
        np.maximum(all_masks, rle_decode(encoded), out=all_masks)

    image = Image.fromarray(all_masks, mode="L").resize((IMAGE_RESIZE, IMAGE_RESIZE), Image.NEAREST)
    output = io.BytesIO()
    image.save(output, format='JPEG')
    return output.getvalue()


@pandas_udf("binary")
def computeMaskUDF(s: pd.Series) -> pd.Series:
    """Pandas UDF that applies ``mask`` to every element of the series."""
    jpeg_masks = s.apply(mask)
    return jpeg_masks


def silver_mask(df_bronze_mask):
    """Build the silver mask DataFrame: one row per image with its boat count and mask image.

    Rows without encoded pixels are dropped, the remaining encodings are collected per
    ``image_id``, ``boat_number`` is the size of that list and ``mask`` is the rendered JPEG.
    """
    per_image = (
        df_bronze_mask
        .filter("encoded_pixels is not null")
        .groupBy("image_id")
        .agg(collect_list('encoded_pixels').alias('encoded_pixels'))
    )
    counted = per_image.withColumn("boat_number", size(col("encoded_pixels")))
    mask_column = computeMaskUDF(col("encoded_pixels")).alias("mask", metadata=image_meta)
    return counted.withColumn("mask", mask_column)

### Joining image and mask both as the gold layer

In [None]:
def gold_images(df_silver_images, df_silver_mask):
    """Join masks with images on ``image_id`` and keep the four gold-layer columns."""
    gold_columns = ["image_id", "boat_number", "mask", "content"]
    joined = df_silver_mask.join(df_silver_images, "image_id")
    return joined.select(*gold_columns)

### Decode the raw image bytes and apply standard ImageNet transforms

In [None]:
def transform_row(is_train, pd_batch):
  """
  Decode the encoded bytes in the `content` and `mask` columns into normalized arrays.

  The input and output of this function must be pandas dataframes.
  Data augmentation (random resized crop + horizontal flip) is applied to `content` for the
  training dataset only; `mask` always goes through the deterministic resize + center crop.
  The `image_id` and `boat_number` columns are dropped from the result.
  NOTE(review): in training mode the image is randomly cropped/flipped while the mask is
  not, so the two may no longer line up - confirm this is intended.
  """
  def decode(raw_bytes):
    return Image.open(io.BytesIO(raw_bytes))

  if is_train:
    image_geometry = [transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip()]
  else:
    image_geometry = [transforms.Resize(256), transforms.CenterCrop(224)]

  image_pipeline = transforms.Compose([
    transforms.Lambda(decode),
    *image_geometry,
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
  ])
  # The mask pipeline is the same for training and evaluation.
  mask_pipeline = transforms.Compose([
    transforms.Lambda(decode),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
  ])

  pd_batch['content'] = pd_batch['content'].map(lambda raw: image_pipeline(raw).numpy())
  pd_batch['mask'] = pd_batch['mask'].map(lambda raw: mask_pipeline(raw).numpy())
  without_id = pd_batch.drop(labels=['image_id'], axis=1)
  return without_id.drop(labels=['boat_number'], axis=1)

def get_transform_spec(is_train=True):
  """Build the petastorm `TransformSpec` that applies `transform_row` to each batch."""
  # The output shape of the `TransformSpec` is not automatically known by petastorm, so the
  # shape of the rewritten columns is declared in `edit_fields` and the order of the output
  # columns in `selected_fields`.
  output_fields = [
    ('content', np.float32, (3, 224, 224), False),
    ('mask', np.float32, (1, 224, 224), False),  # num_channels is 1 for greyscale images
  ]
  row_transform = partial(transform_row, is_train)
  return TransformSpec(row_transform, edit_fields=output_fields, selected_fields=['content', 'mask'])

### Model

In [None]:
class BoatModel(pl.LightningModule):
    """Lightning module wrapping a segmentation_models_pytorch model for binary segmentation.

    Batches are dicts with a ``content`` image tensor and a ``mask`` tensor. The loss is a
    binary Dice loss on logits; per-image and dataset-wise IoU are logged at the end of
    each epoch.

    NOTE(review): the ``*_epoch_end(outputs)`` hooks are only called by some PyTorch
    Lightning versions - verify against the installed one.
    """

    def __init__(self, arch, encoder_name, in_channels, out_classes, lr, **kwargs):
        """Create the underlying model.

        Args:
            arch: architecture name passed to ``smp.create_model``.
            encoder_name: encoder name; also selects the preprocessing mean/std.
            in_channels: number of input channels.
            out_classes: number of output classes.
            lr: learning rate used by the Adam optimizer.
            **kwargs: forwarded to ``smp.create_model``.
        """
        super().__init__()
        self.model = smp.create_model(
            arch, encoder_name=encoder_name, in_channels=in_channels, classes=out_classes, **kwargs
        )

        # Keep the learning rate so configure_optimizers() uses the value the caller asked for.
        self.lr = lr

        # preprocessing parameters for image
        params = smp.encoders.get_preprocessing_params(encoder_name)
        self.register_buffer("std", torch.tensor(params["std"]).view(1, 3, 1, 1))
        self.register_buffer("mean", torch.tensor(params["mean"]).view(1, 3, 1, 1))

        # for image segmentation dice loss could be the best first choice
        self.loss_fn = smp.losses.DiceLoss(smp.losses.BINARY_MODE, from_logits=True)

    def forward(self, image):
        """Normalize ``image`` with the encoder's mean/std and return the model's logits."""
        # normalize image here
        image = (image - self.mean) / self.std
        mask = self.model(image)
        return mask

    def shared_step(self, batch, stage):
        """Run one batch and return its loss and confusion-matrix statistics.

        ``stage`` is accepted for symmetry with ``shared_epoch_end`` but is not used here.
        """
        image = batch["content"]

        # Shape of the image should be (batch_size, num_channels, height, width)
        # if you work with grayscale images, expand channels dim to have [batch_size, 1, height, width]
        assert image.ndim == 4

        # Check that image dimensions are divisible by 32:
        # encoder and decoder are connected by `skip connections` and usually the encoder has 5 stages
        # of downsampling by factor 2 (2 ^ 5 = 32); e.g. for an image with shape 84x84 we would have
        # the following feature shapes in encoder and decoder: 84, 42, 21, 10, 5 -> 5, 10, 20, 40, 80
        # and we would get an error trying to concat these features
        h, w = image.shape[2:]
        assert h % 32 == 0 and w % 32 == 0

        mask = batch["mask"]

        # Shape of the mask should be [batch_size, num_classes, height, width]
        # for binary segmentation num_classes = 1
        assert mask.ndim == 4

        # Check that mask values are between 0 and 1, NOT 0 and 255 for binary segmentation
        #assert mask.max() <= 1.0 and mask.min() >= 0 # Commented to avoid error

        logits_mask = self.forward(image)

        # Predicted mask contains logits, and loss_fn param `from_logits` is set to True
        loss = self.loss_fn(logits_mask, mask)

        # Let's compute metrics for some threshold:
        # first convert mask values to probabilities, then
        # apply thresholding
        prob_mask = logits_mask.sigmoid()
        pred_mask = (prob_mask > 0.5).float()

        # We will compute the IoU metric in two ways
        #   1. dataset-wise
        #   2. image-wise
        # but for now we just compute true positive, false positive, false negative and
        # true negative 'pixels' for each image and class;
        # these values will be aggregated at the end of an epoch
        tp, fp, fn, tn = smp.metrics.get_stats(pred_mask.long(), mask.long(), mode="binary")

        return {
            "loss": loss,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn,
        }

    def shared_epoch_end(self, outputs, stage):
        """Aggregate the step statistics and log ``{stage}_per_image_iou`` and ``{stage}_dataset_iou``."""
        # aggregate step metrics
        tp = torch.cat([x["tp"] for x in outputs])
        fp = torch.cat([x["fp"] for x in outputs])
        fn = torch.cat([x["fn"] for x in outputs])
        tn = torch.cat([x["tn"] for x in outputs])

        # per image IoU means that we first calculate IoU score for each image
        # and then compute mean over these scores
        per_image_iou = smp.metrics.iou_score(tp, fp, fn, tn, reduction="micro-imagewise")

        # dataset IoU means that we aggregate intersection and union over whole dataset
        # and then compute IoU score. The difference between dataset_iou and per_image_iou scores
        # in this particular case will not be much, however for dataset
        # with "empty" images (images without target class) a large gap could be observed.
        # Empty images influence a lot on per_image_iou and much less on dataset_iou.
        dataset_iou = smp.metrics.iou_score(tp, fp, fn, tn, reduction="micro")

        metrics = {
            f"{stage}_per_image_iou": per_image_iou,
            f"{stage}_dataset_iou": dataset_iou,
        }

        self.log_dict(metrics, prog_bar=True)

    def training_step(self, batch, batch_idx):
        """Training step; delegates to ``shared_step``."""
        return self.shared_step(batch, "train")

    def training_epoch_end(self, outputs):
        """Log training IoU metrics for the epoch."""
        return self.shared_epoch_end(outputs, "train")

    def validation_step(self, batch, batch_idx):
        """Validation step; delegates to ``shared_step``."""
        return self.shared_step(batch, "valid")

    def validation_epoch_end(self, outputs):
        """Log validation IoU metrics for the epoch."""
        return self.shared_epoch_end(outputs, "valid")

    def test_step(self, batch, batch_idx):
        """Test step; delegates to ``shared_step``."""
        return self.shared_step(batch, "test")

    def test_epoch_end(self, outputs):
        """Log test IoU metrics for the epoch."""
        return self.shared_epoch_end(outputs, "test")

    def predict_step(self, batch, batch_idx):
        """Prediction step; returns the same statistics dict as the test step."""
        return self.shared_step(batch, "test")

    def predict_epoch_end(self, outputs):
        """Aggregate prediction outputs under the "test" stage name."""
        return self.shared_epoch_end(outputs, "test")

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

### Model wrapper that handles the data transformations originally captured in the transform spec

In [None]:
class CVModelWrapper(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper around a trained model.

  NOTE(review): ``self.transform`` is built in ``__init__`` but ``predict`` does not apply
  it - confirm whether ``model_input`` is expected to arrive already transformed.
  """

  def __init__(self, model):
    # instantiate model in evaluation mode
    self.model = model.eval()

    # define transformation pipeline
    trans = torchvision.transforms.Compose([
              torchvision.transforms.Lambda(lambda x: Image.open(io.BytesIO(x))),
              torchvision.transforms.Resize(256),
              torchvision.transforms.ToTensor(),
              torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225])
              ])
    self.transform = trans

  def predict(self, context, model_input):
    """Run the wrapped model over ``model_input`` and return one output per batch as a pandas Series.

    ``context`` is not used by this method.
    """
    # Collect outputs in a list created up front; the wrapped model is reached through `self`.
    outputs = []
    for batch in torch.utils.data.DataLoader(model_input):
      output = self.model(batch)
      #probs = torch.nn.functional.softmax(output, dim=1)[:,1]
      outputs.append(output)

    ser = pd.Series(outputs)
    return ser

### Train the base Model

In [None]:
# Batch size used for both the training and the validation dataloader.
BATCH_SIZE = 16

#mlflow.autolog(log_models=False)

def train_model(arch, encoder_name, lr, nested=False):
    """Train one BoatModel inside an MLflow run and return its validation per-image IoU.

    The run logs the parameters, the wrapped model, the ``valid_per_image_iou`` metric,
    a ``loss`` metric defined as ``1 - valid_per_image_iou`` and tags/links describing the
    lakeFS repository, branch and gold dataset tag.

    Reads these names from outside the function: ``converter_train``, ``converter_test``,
    ``gold_data_path``, ``lakefsUIEndPoint``, ``repo_name``, ``experimentBranchN`` and
    ``goldDatasetTagID``.
    NOTE(review): several of those are not defined in this file - presumably set by the
    calling notebook before this function is used; confirm.

    Args:
        arch: architecture name passed to BoatModel.
        encoder_name: encoder name passed to BoatModel.
        lr: learning rate passed to BoatModel and logged as a parameter.
        nested: forwarded to ``mlflow.start_run``.
    """
    with mlflow.start_run(nested=nested) as run:
        model = BoatModel(arch=arch, encoder_name=encoder_name, in_channels=3, out_classes=1, lr=lr)
        trainer = pl.Trainer(
            benchmark=True,
            num_sanity_val_steps=0,
            precision=32, # Changed precision from 16 to 32 to run model on CPUs
            #accelerator="gpu",
            #gpus="1",
            log_every_n_steps=100,
            default_root_dir="/tmp",
            max_epochs=1)

        mlflow.log_param("encoder", encoder_name)
        mlflow.log_param("arch", arch)
        mlflow.log_param("learning rate", lr)
        mlflow.set_tag("lakefs_demos", "image_segmentation")
        with converter_train.make_torch_dataloader(num_epochs=1, transform_spec=get_transform_spec(is_train=True),
                                                   batch_size=BATCH_SIZE) as train_dataloader, \
                converter_test.make_torch_dataloader(num_epochs=1, transform_spec=get_transform_spec(is_train=False),
                                                     batch_size=BATCH_SIZE) as valid_dataloader:
            trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=valid_dataloader)
            # Remove the trainer reference from the model before it is wrapped and logged.
            delattr(model, "trainer")
            # add model requirement
            reqs = mlflow.pytorch.get_default_pip_requirements() + [
                "git+https://github.com/qubvel/segmentation_models.pytorch", "pytorch-lightning==" + pl.__version__]
            mlflow.pyfunc.log_model(artifact_path="model", python_model=CVModelWrapper(model), pip_requirements=reqs)
            # log and return the model accuracy
            valid_metrics = trainer.validate(model, dataloaders=valid_dataloader, verbose=False)
            valid_per_image_iou = valid_metrics[0]['valid_per_image_iou']
            mlflow.log_metric("valid_per_image_iou", valid_per_image_iou)
            mlflow.log_metric("loss", 1 - valid_per_image_iou)

            # Log information about dataset and lakeFS tags/commits
            if localOrDistributedComputing == "LOCAL":
                dataset = mlflow.data.load_delta(path=f"{gold_data_path}/{training_data_folder}")
                mlflow.log_input(dataset, context="Gold Dataset")
            # Link to the gold dataset in the lakeFS UI at the tagged reference.
            lakefs_dataset_tag = f"{lakefsUIEndPoint}/repositories/{repo_name}/objects?ref={goldDatasetTagID}&path=gold/train_v2/"
            dictionary = {"lakefs_dataset": lakefs_dataset_tag}
            mlflow.log_dict(dictionary, "model/lakefs_dataset.json")
            mlflow.set_tag("lakefs_repo", repo_name)
            mlflow.set_tag("lakefs_branch", experimentBranchN)
            mlflow.set_tag("lakefs_dataset", lakefs_dataset_tag)

            return valid_per_image_iou

### Other helper functions

In [None]:
def commit_metadata_for_best_model(best_model, model_registered):
    """Build lakeFS commit metadata describing the registered best model.

    In LOCAL mode the metadata is a single comma-separated ``key=value`` string; in
    DISTRIBUTED mode it is a dict with the same keys, with the model URL built from the
    Databricks workspace URL in the Spark configuration.
    """
    if localOrDistributedComputing == "LOCAL":
        commitMetadata = ''.join([
            '::lakefs::Registered Model::url[url:ui]=http://127.0.0.1:5002/#/models/lakefs_demos_image_segmentation/versions/',
            str(model_registered.version),
            ',Model Name=',
            model_registered.name,
            ',Model Version=',
            str(model_registered.version),
            ',Model Run Name=',
            best_model["tags.mlflow.runName"],
            ',Model Metric valid_per_image_iou=',
            str(best_model["metrics.valid_per_image_iou"]),
        ])
    elif localOrDistributedComputing == "DISTRIBUTED":
        workspace_url = spark.conf.get("spark.databricks.workspaceUrl")
        model_url = 'https://'+workspace_url+'/#mlflow/models/lakefs_demos_image_segmentation/versions/' + str(model_registered.version)
        commitMetadata = {
            '::lakefs::Registered Model::url[url:ui]': model_url,
            'Model Name': model_registered.name,
            'Model Version': str(model_registered.version),
            'Model Run Name': best_model["tags.mlflow.runName"],
            'Model Metric valid_per_image_iou': str(best_model["metrics.valid_per_image_iou"]),
        }
    return commitMetadata

In [None]:
def display_gold_images_header():
    """Display a header row with the four column titles of the gold-image preview."""
    columns = [('image_id', '120px'), ('boats', '60px'), ('mask', '150px'), ('content', '150px')]
    labels = [
        widgets.Label(value=title, disabled=True, layout=widgets.Layout(width=width))
        for title, width in columns
    ]
    display(widgets.HBox(labels))
    
def display_gold_images_row(row):
    """Display one gold row: image id, boat count, mask image and content image."""
    cells = [
        widgets.Text(value=row['image_id'], disabled=True, layout=widgets.Layout(width='120px')),
        widgets.IntText(value=row['boat_number'], disabled=True, layout=widgets.Layout(width='60px')),
        widgets.Image(value=row['mask'], format='jpg', width=150, height=150),
        widgets.Image(value=row['content'], format='jpg', width=150, height=150),
    ]
    display(widgets.HBox(cells))
    
def display_gold_images_local(df_gold_images):
    """Collect the DataFrame to the driver and display a header followed by one widget row per record."""
    display_gold_images_header()
    for record in df_gold_images.collect():
        display_gold_images_row(record)
        
def display_gold_images(df_gold_images):
    """Show gold images: widget rows in LOCAL mode, the notebook's `display` in DISTRIBUTED mode."""
    mode = localOrDistributedComputing
    if mode == "LOCAL":
        display_gold_images_local(df_gold_images)
        return
    if mode == "DISTRIBUTED":
        display(df_gold_images)

In [None]:
# This works with Jupyter notebook but does not work with Databricks notebook
def display_capture(capture, lines):
    """Print the last ``lines`` lines of the captured stdout, then of the captured stderr.

    Args:
        capture: object exposing ``stdout`` and ``stderr`` strings.
        lines: maximum number of trailing lines to print per stream. Each stream is
            clamped independently, so a short stdout does not shorten the stderr tail.
    """
    for stream_text in (capture.stdout, capture.stderr):
        stream_lines = stream_text.split('\n')
        # Slice from an explicit start index: a plain [-lines:] would return
        # everything when lines == 0.
        first = max(len(stream_lines) - lines, 0)
        for line in stream_lines[first:]:
            print(line)

# Added for Local demo

In [None]:
# Single import source: the whole training-data prefix of the source bucket.
file_list_random = [prefix+training_data_folder]

## Work locally with smaller dataset or work with bigger dataset in Databricks cluster

In [None]:
# Root of the working data: a local directory in LOCAL mode, a lakefs:// URI on the
# experiment branch in DISTRIBUTED mode.
# NOTE(review): `repo_name` and `experimentBranchN` are not defined in this file -
# presumably set by the calling notebook. For any other mode `repo_path` stays undefined.
if localOrDistributedComputing == "LOCAL":
    repo_path = f"{repo_name}/lakefs_local"
elif localOrDistributedComputing == "DISTRIBUTED":
    repo_path = f"lakefs://{repo_name}/{experimentBranchN}"

# Per-layer paths. The folder names already end with "/", so these paths end with a slash too.
raw_data_path = f"{repo_path}/{raw_data_folder}"
training_data_path = f"{raw_data_path}{training_data_folder}"
bronze_data_path = f"{repo_path}/{bronze_data_folder}"
silver_data_path = f"{repo_path}/{silver_data_folder}"
gold_data_path = f"{repo_path}/{gold_data_folder}"

## Run the data pipeline

In [None]:
def data_pipeline():
    """Run the bronze -> silver -> gold pipeline, committing and tagging after every stage.

    Each stage writes a Delta table (overwrite mode), shows the pending changes, commits
    them to the experiment branch and creates a tag named
    ``{tagPrefix}-{experimentBranchN}-<stage>``. Before the gold table is saved, the ten
    rows with the highest ``boat_number`` are displayed.

    Returns:
        The tag ID created for the gold dataset.
    """
    print("############ Ingest raw images as bronze data set and save as Delta table ############")
    df_bronze_images = bronze_images()
    df_bronze_images.write.format("delta").mode("overwrite").save(f"{bronze_data_path}/{training_data_folder}")
    diff_branch(repo.id, repo_path, experimentBranchN)

    print("############ Commit bronze dataset to the lakeFS repository and tag it ############")
    commitMessage = 'Converted raw images to binary content and saved as Delta table'
    commit(repo.id, repo_path, experimentBranchN, commitMessage)
    lakefs_set_tag(repo.id, f"{tagPrefix}-{experimentBranchN}-bronze-images", experimentBranchN)

    print("############ Enrich dataset and save as silver dataset ############")
    df_silver_images = silver_images(df_bronze_images)
    df_silver_images.write.format("delta").mode("overwrite").save(f"{silver_data_path}/{training_data_folder}")
    diff_branch(repo.id, repo_path, experimentBranchN)

    print("############ Commit silver dataset to the lakeFS repository and tag it ############")
    commitMessage = 'Enriched dataset and saved as silver dataset'
    commit(repo.id, repo_path, experimentBranchN, commitMessage)
    lakefs_set_tag(repo.id, f"{tagPrefix}-{experimentBranchN}-silver-images", experimentBranchN)

    print("############ Load the raw image mask as bronze dataset ############")
    df_bronze_mask = bronze_mask()
    df_bronze_mask.write.format("delta").mode("overwrite").save(f"{bronze_data_path}/{mask_data_folder}")
    diff_branch(repo.id, repo_path, experimentBranchN)

    print("############ Commit bronze mask dataset to the lakeFS repository and tag it ############")
    commitMessage = 'Loaded the raw image mask and saved as Delta table'
    commit(repo.id, repo_path, experimentBranchN, commitMessage)
    lakefs_set_tag(repo.id, f"{tagPrefix}-{experimentBranchN}-bronze-mask", experimentBranchN)

    print("############ Transform masks into images ############")
    df_silver_mask = silver_mask(df_bronze_mask)
    df_silver_mask.write.format("delta").mode("overwrite").save(f"{silver_data_path}/{mask_data_folder}")
    diff_branch(repo.id, repo_path, experimentBranchN)

    print("############ Commit silver mask dataset to the lakeFS repository and tag it ############")
    commitMessage = 'Transformed masks into images'
    commit(repo.id, repo_path, experimentBranchN, commitMessage)
    lakefs_set_tag(repo.id, f"{tagPrefix}-{experimentBranchN}-silver-mask", experimentBranchN)

    print("############ To verify that pipeline ran successfully, join image and mask both as the gold layer and select top 10 images with maximum number of boats/ships ############")
    df_gold_images = gold_images(df_silver_images, df_silver_mask)
    display_gold_images(df_gold_images.orderBy(desc("boat_number")).limit(10))

    print("\n\n############ Save gold dataset ############")
    df_gold_images.write.format("delta").mode("overwrite").save(f"{gold_data_path}/{training_data_folder}")
    diff_branch(repo.id, repo_path, experimentBranchN)

    print("############ Commit gold dataset to the lakeFS repository and tag it ############")
    commitMessage = 'Joined image and mask both as the gold layer'
    commit(repo.id, repo_path, experimentBranchN, commitMessage)
    # The gold tag ID is returned to the caller.
    goldDatasetTagID = f"{tagPrefix}-{experimentBranchN}-gold-images"
    lakefs_set_tag(repo.id, goldDatasetTagID, experimentBranchN)

    return goldDatasetTagID