## Installing dependencies

In [None]:
!pip install lakefs_client


In [None]:
!pip install boto3

In [None]:
! pip install opencv-python

In [None]:
! pip install tensorflow

In [None]:
! pip install nbimporter

## Imports

In [None]:
import os
import json
import boto3
import joblib
import tempfile
from io import BytesIO
import nbimporter
import pprint

from datetime import date, time
from utils.ml_utils import *

import cv2
import numpy as np
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Conv2D,MaxPooling2D,Dense,Flatten,Dropout
from tensorflow.keras.layers import BatchNormalization

import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField, StringType

print("Loaded all libraries")

### Configuring Boto3, lakeFSClient and Spark

In [None]:
# boto3 handles for the S3-compatible endpoint. The low-level client and the
# resource API share the same endpoint and credentials, so the settings are
# declared once and reused for both.
_s3_connection = {
    'endpoint_url': 'http://host.docker.internal:8000',
    'aws_access_key_id': 'AKIAIOSFODNN7EXAMPLE',
    'aws_secret_access_key': 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY',
}

s3_client = boto3.client('s3', **_s3_connection)
s3_resource = boto3.resource('s3', **_s3_connection)

In [None]:
# The access key and secret key come from the docker-compose.yaml file used to
# spin up the "everything bagel" environment. They are also reused further down
# for Spark's S3A settings.
lakefsAccessKey = "AKIAIOSFODNN7EXAMPLE"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
lakefsEndPoint = "http://lakefs:8000"

# lakeFS API client configuration: the access key is used as the username and
# the secret key as the password.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint


### Configuring S3A Gateway

In [None]:
# Reuse (or start) the Spark context and point its S3A filesystem at lakeFS so
# that s3a:// paths resolve through the lakeFS endpoint.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

hadoop_conf = sc._jsc.hadoopConfiguration()
for option, value in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoop_conf.set(option, value)


In [None]:
# lakeFS API client used below for branch, diff, commit and merge operations.
client = LakeFSClient(configuration)

## Demo Steps

1. Show the dataset in MinIO
2. Create a new MinIO bucket for lakeFS repository
3. Create a lakeFS repository (ml-demo)
4. Import dataset into the lakeFS repo

## Experiment Configs

In [None]:
# lakeFS repository name; it is also passed as the bucket name in the S3 calls below.
repo_name = "ml-demo"

# Branch that both experiment branches are created from.
ingest_branch = "_main_imported"
# One branch per experiment.
exp1_branch = "experiment-1"
exp2_branch = "experiment-2"

# Destination branch for the final merge of the winning experiment.
prod_branch = "main"


In [None]:
# s3a:// URI of the repository root (not referenced again in the cells shown here).
file_path = f"s3a://{repo_name}"

# Dataset locations relative to the raw data folder.
images_path = "dogs_dataset_/images/Images"
annotations = "dogs_dataset_/annotations/Annotations"

# Top-level folder names inside each branch. The experiment params below build
# their paths from raw_path, artifact_path and metrics_path.
raw_path = "raw"
processed_path = "processed"
artifact_path = "artifacts"
metrics_path = "metrics"
training_code_path = "src"


## File utils

In [None]:
def get_img_from_s3(bucket, key):
    """Download one object from S3 and return it as a resized image.

    Args:
        bucket: Name of the bucket holding the object (callers in this
            notebook pass the lakeFS repository name).
        key: Full object key of the image.

    Returns:
        The result of ``resize_img`` applied to the decoded 3-channel image.

    Raises:
        ValueError: If the downloaded bytes cannot be decoded as an image.
    """
    file_stream = BytesIO()
    s3_resource.Bucket(bucket).Object(key).download_fileobj(file_stream)

    np_1d_array = np.frombuffer(file_stream.getbuffer(), dtype="uint8")
    img = cv2.imdecode(np_1d_array, cv2.IMREAD_COLOR)

    # imdecode signals an undecodable payload by returning None; fail with a
    # message that names the object instead of an opaque AttributeError.
    if img is None:
        raise ValueError(f"Could not decode s3://{bucket}/{key} as an image")

    return resize_img(img.copy())

In [None]:
def get_category_list_from_s3(bucket, branch, key, delimiter, n_cats):
    """List up to ``n_cats`` category "folders" directly below ``key``.

    Args:
        bucket: Bucket (lakeFS repository) to list.
        branch: Unused here; kept for interface compatibility. Callers in
            this notebook already include the branch in ``key``.
        key: Prefix under which each category has its own sub-prefix.
        delimiter: Path delimiter used to group keys into common prefixes.
        n_cats: Maximum number of category prefixes to return.

    Returns:
        A list of prefix strings, empty when nothing is found under ``key``.
    """
    # S3 reports as a common prefix everything up to the first delimiter that
    # follows the requested prefix. Without a trailing delimiter the only
    # common prefix would be the directory itself, not its children.
    prefix = key
    if key and delimiter and not key.endswith(delimiter):
        prefix = f"{key}{delimiter}"

    list_resp = s3_client.list_objects_v2(Bucket=bucket,
                                          Prefix=prefix,
                                          Delimiter=delimiter)

    # 'CommonPrefixes' is absent from the response when there are no matches.
    common_prefixes = list_resp.get('CommonPrefixes', [])

    return [entry['Prefix'] for entry in common_prefixes[:n_cats]]

In [None]:
def get_images_and_labels(bucket, category_list, n_images):
    """Load up to ``n_images`` images per category and label them by position.

    Args:
        bucket: Bucket (lakeFS repository) holding the images.
        category_list: Key prefixes, one per category. The label of every
            image is the index of its category in this list.
        n_images: Maximum number of objects to read per category.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays built from the loaded
        images and their integer labels.
    """
    img_list = []
    labels_list = []

    for index, category in enumerate(category_list):
        list_resp = s3_client.list_objects_v2(Bucket=bucket, Prefix=category)

        # 'Contents' is absent when the prefix holds no objects; skip the
        # category rather than failing with a KeyError.
        contents = list_resp.get('Contents', [])
        if not contents:
            print(f"No objects found under prefix {category!r}; skipping")
            continue

        for entry in contents[:n_images]:
            img_list.append(get_img_from_s3(bucket, entry['Key']))
            labels_list.append(index)

    return np.array(img_list), np.array(labels_list)

In [None]:
def load_training_data(params):
    """Load images and labels for the experiment described by ``params``.

    Args:
        params: Experiment settings. Reads the keys ``repo_name``,
            ``branch``, ``image_path``, ``delimiter``, ``n_cats`` and
            ``n_images``.

    Returns:
        The ``(images, labels)`` tuple produced by ``get_images_and_labels``.
    """
    category_list = get_category_list_from_s3(bucket=params['repo_name'],
                                              branch=params['branch'],
                                              key=params['image_path'],
                                              delimiter=params['delimiter'],
                                              n_cats=params['n_cats'])

    # Read from the repository named in params (not the notebook-level
    # global) so both listing steps always target the same bucket.
    images, labels = get_images_and_labels(bucket=params['repo_name'],
                                           category_list=category_list,
                                           n_images=params['n_images'])

    return images, labels

In [None]:
def model_load(model_name, bucket_name, key):
    """Fetch a joblib-serialized model from S3 and deserialize it.

    Args:
        model_name: File name of the model object.
        bucket_name: Bucket (lakeFS repository) holding the model.
        key: Prefix under which the model is stored; the object key is
            ``{key}/{model_name}``.

    Returns:
        The deserialized model object.
    """
    object_key = f"{key}/{model_name}"

    # NOTE: joblib.load unpickles the payload, which can execute arbitrary
    # code. Only load artifacts from a trusted repository.
    with tempfile.TemporaryFile() as buffer:
        s3_client.download_fileobj(Bucket=bucket_name, Key=object_key, Fileobj=buffer)
        buffer.seek(0)  # rewind before reading back what was just downloaded
        loaded = joblib.load(buffer)

    print(type(loaded))

    return loaded

In [None]:
def model_save(model, model_name, bucket_name, key):
    """Serialize ``model`` with joblib and upload it to S3.

    Args:
        model: Model object to serialize.
        model_name: File name for the model; also used as the local file name.
        bucket_name: Destination bucket (lakeFS repository).
        key: Prefix for the upload; the object key is ``{key}/{model_name}``.

    Returns:
        None.
    """
    # NOTE(review): this also writes a copy named `model_name` into the
    # current working directory - confirm the local copy is wanted.
    joblib.dump(model, model_name)

    object_key = f"{key}/{model_name}"
    print(model_name, bucket_name, object_key)

    # Serialize into a temporary file, then upload its full contents.
    with tempfile.TemporaryFile() as buffer:
        joblib.dump(model, buffer)
        buffer.seek(0)
        payload = buffer.read()
        s3_client.put_object(Body=payload, Bucket=bucket_name, Key=object_key)

In [None]:
def save_metrics(metrics, bucket_name, key):
    """Write the loss and accuracy of a run as JSON via Spark.

    Args:
        metrics: Mapping with ``loss`` and ``accuracy`` entries; both are
            stored as strings.
        bucket_name: Destination bucket (lakeFS repository).
        key: Path inside the bucket; output goes to ``s3a://{bucket_name}/{key}``.

    Returns:
        None.
    """
    row = (str(metrics['loss']), str(metrics['accuracy']))

    schema = StructType([
        StructField("loss", StringType(), True),
        StructField("accuracy", StringType(), True),
    ])

    df = spark.createDataFrame(data=[row], schema=schema)
    df.printSchema()
    df.show(truncate=False)

    # NOTE(review): written with the DataFrameWriter's default save mode -
    # confirm how a re-run against an existing path should behave.
    df.write.json(f"s3a://{bucket_name}/{key}")

In [None]:
def load_metrics(bucket_name, key):
    """Read back the metrics JSON stored at ``s3a://{bucket_name}/{key}``.

    Args:
        bucket_name: Bucket (lakeFS repository) holding the metrics.
        key: Path of the metrics inside the bucket.

    Returns:
        A tuple ``(loss, accuracy)`` taken from the first row read.
    """
    rows = spark.read.json(f"s3a://{bucket_name}/{key}").collect()
    first_row = rows[0]
    return first_row['loss'], first_row['accuracy']

# Experimentation Begins

## Experiment #1

In [None]:
# Settings for experiment #1. The data/IO keys are read by the helper functions
# in this notebook; the training keys are passed on to ml_pipeline
# (presumably defined in utils.ml_utils - verify there how each one is used).
params_exp1 ={
    'repo_name': repo_name,                                   # bucket / lakeFS repository
    'branch': exp1_branch,
    'image_path': f"{exp1_branch}/{raw_path}/{images_path}",  # prefix listed for categories
    'artifacts_path': f"{exp1_branch}/{artifact_path}",       # prefix the model is stored under
    'metrics_path': f"{exp1_branch}/{metrics_path}",          # where metrics JSON is written
    'model_name': "model.pkl",
    'delimiter': "/",
    'n_cats': 3,                                              # max categories loaded
    'n_images': 100,                                          # max images per category
    'is_shuffle':True,
    'is_normalize': False,
    'epochs': 200,
    'train_test_split_ratio': 0.2,                            # passed to split_train_test
    'optimizer': "adam",
    'loss': "sparse_categorical_crossentropy",
    'metrics': ["accuracy"]
}
# `params` is an alias, not a copy: later writes to params also change params_exp1.
params = params_exp1

### Set up lakeFS for experiment #1

#### Create a new branch: `experiment-1` from `_main_imported`

In [None]:
# Show the branches that exist before the experiment branch is created.
client.branches.list_branches(repo_name)

In [None]:
# Branch experiment-1 off the ingest branch, then list branches to confirm.
exp1_branch_request = models.BranchCreation(name=exp1_branch, source=ingest_branch)
client.branches.create_branch(repository=repo_name,
                              branch_creation=exp1_branch_request)
client.branches.list_branches(repo_name)

#### Load training data from lakeFS. 
#### Generate images and labels for training and Commit.

In [None]:
# Announce the load before it starts: reading the images can take a while,
# and the message previously appeared only after the work was already done.
print("Loading training data")
images, labels = load_training_data(params)

In [None]:
#TODO: Commit the training data after preprocessing under /processed

#### Train the model. 
#### Upload model metrics to lakeFS and commit. 

In [None]:
# Train experiment #1. ml_pipeline is not defined in this notebook
# (presumably it comes from utils.ml_utils); it returns the model and its metrics.
model1, metrics1 = ml_pipeline(params, images, labels)

In [None]:
# Write the metrics to the experiment branch (uncommitted until the commit cell below).
save_metrics(metrics1, repo_name, params['metrics_path'])

In [None]:
# Read the stored metrics back and record them next to the hyperparameters.
# The measured loss goes under its own key: params['loss'] holds the name of
# the loss function and must not be overwritten, otherwise the commit metadata
# (and any re-run of the pipeline with these params) loses that setting.
params['loss_value'], params['accuracy'] = load_metrics(repo_name, params['metrics_path'])
pprint.pprint(params)

In [None]:
# Fetch the uncommitted changes on the experiment branch.
client.branches.diff_branch(repository=repo_name, branch=exp1_branch).results

# Every metadata value is converted to a string before being attached to the commit.
commit_meta_params = {name: str(value) for name, value in params.items()}
print(commit_meta_params)

metrics_commit = models.CommitCreation(
    message=f"Saving model metrics to {exp1_branch}",
    metadata=commit_meta_params,
)
client.commits.commit(repository=repo_name,
                      branch=exp1_branch,
                      commit_creation=metrics_commit)

#### Upload model artifacts to lakeFS and commit. 

In [None]:
# Serialize model1 and upload it under the experiment's artifacts prefix.
model_save(model1, 
           params['model_name'], 
           params['repo_name'], 
           params['artifacts_path'])


In [None]:
# Fetch the uncommitted changes on the experiment branch.
client.branches.diff_branch(repository=repo_name, branch=exp1_branch).results

# Every metadata value is converted to a string before being attached to the commit.
commit_meta_params = {name: str(value) for name, value in params.items()}
print(commit_meta_params)

artifacts_commit = models.CommitCreation(
    message=f"Saving model artifacts to {exp1_branch}",
    metadata=commit_meta_params,
)
client.commits.commit(repository=repo_name,
                      branch=exp1_branch,
                      commit_creation=artifacts_commit)

#### Load the pickle file from lakeFS, and run predictions.

In [None]:
# Read the stored model back from the experiment branch.
model1_reloaded = model_load(params['model_name'], 
           params['repo_name'], 
           params['artifacts_path'])

In [None]:
# Split the data again to obtain a test set and predict on it with the reloaded model.
# NOTE(review): split_train_test is defined outside this notebook; if it shuffles
# randomly, this test set may differ from the one used during training - confirm.
x_train, x_test, y_train, y_test = split_train_test(images, labels, params['train_test_split_ratio'])
pred = model1_reloaded.predict(x_test)

pred.shape

In [None]:
# Show a 3x3 grid of randomly chosen test images, each titled with its true
# label and the class the model predicted for it.
plt.figure(1, figsize=(19, 10))

for position in range(1, 10):
    idx = np.random.randint(0, x_test.shape[0])

    plt.subplot(3, 3, position)
    plt.imshow(x_test[idx])

    # pred[idx] holds one score per class; the predicted class is the index of
    # the highest score. (The previous title printed label * score-of-true-
    # class, which is not a class label.)
    predicted_label = np.argmax(pred[idx])
    plt.title('Actual = {}, Predicted = {}'.format(y_test[idx], predicted_label))
    plt.xticks([])
    plt.yticks([])

plt.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

## Experiment #2

In [None]:
# Settings for experiment #2. Compared with experiment #1 it uses fewer images
# per category, normalization enabled, fewer epochs, a smaller split ratio and
# the adagrad optimizer. The training keys are passed on to ml_pipeline
# (presumably defined in utils.ml_utils - verify there how each one is used).
params_exp2 ={
    'repo_name': repo_name,                                   # bucket / lakeFS repository
    'branch': exp2_branch,
    'image_path': f"{exp2_branch}/{raw_path}/{images_path}",  # prefix listed for categories
    'artifacts_path': f"{exp2_branch}/{artifact_path}",       # prefix the model is stored under
    'metrics_path': f"{exp2_branch}/{metrics_path}",          # where metrics JSON is written
    'model_name': "model-exp2.pkl",
    'delimiter': "/",
    'n_cats': 3,                                              # max categories loaded
    'n_images': 50,                                           # max images per category
    'is_shuffle': True,
    'is_normalize': True,
    'epochs': 10,
    'train_test_split_ratio': 0.15,                           # passed to split_train_test
    'optimizer': "adagrad",
    'loss': "sparse_categorical_crossentropy",
    'metrics': ["accuracy"]
}
# `params` is an alias, not a copy: later writes to params also change params_exp2.
params = params_exp2

### Set up lakeFS for experiment #2

1. Create a new branch: `experiment-2` from `_main_imported`

In [None]:
# List branches, branch experiment-2 off the ingest branch, then list again to confirm.
client.branches.list_branches(repo_name)

exp2_branch_request = models.BranchCreation(name=exp2_branch, source=ingest_branch)
client.branches.create_branch(repository=repo_name,
                              branch_creation=exp2_branch_request)

client.branches.list_branches(repo_name)

#### Load training data from lakeFS. 
#### Generate images and labels for training and Commit.

In [None]:
# Load images and labels using the experiment #2 settings (replaces the arrays from experiment #1).
images, labels = load_training_data(params)

In [None]:
# TODO: Commit training data

#### Train the model. 
#### Upload model metrics to lakeFS and commit.

In [None]:
# Train experiment #2. ml_pipeline is not defined in this notebook
# (presumably it comes from utils.ml_utils); it returns the model and its metrics.
model2, metrics2 = ml_pipeline(params, images, labels)

In [None]:
# Write the metrics to the experiment branch (uncommitted until the commit cell below).
save_metrics(metrics2, repo_name, params['metrics_path'])

In [None]:
# Read the stored metrics back and record them next to the hyperparameters.
# The measured loss goes under its own key: params['loss'] holds the name of
# the loss function and must not be overwritten, otherwise the commit metadata
# (and any re-run of the pipeline with these params) loses that setting.
params['loss_value'], params['accuracy'] = load_metrics(repo_name, params['metrics_path'])
pprint.pprint(params)

In [None]:
# Fetch the uncommitted changes on the experiment branch.
client.branches.diff_branch(repository=repo_name, branch=exp2_branch).results

# Every metadata value is converted to a string before being attached to the commit.
commit_meta_params = {name: str(value) for name, value in params.items()}
pprint.pprint(commit_meta_params)

metrics_commit = models.CommitCreation(
    message=f"Saving model metrics to {exp2_branch}",
    metadata=commit_meta_params,
)
client.commits.commit(repository=repo_name,
                      branch=exp2_branch,
                      commit_creation=metrics_commit)

#### Upload model artifacts to lakeFS and commit.

In [None]:
# Serialize model2 and upload it under the experiment's artifacts prefix.
model_save(model2, 
           params['model_name'], 
           params['repo_name'], 
           params['artifacts_path'])

In [None]:
# Fetch the uncommitted changes on the experiment branch.
client.branches.diff_branch(repository=repo_name, branch=exp2_branch).results

# Every metadata value is converted to a string before being attached to the commit.
commit_meta_params = {name: str(value) for name, value in params.items()}
pprint.pprint(commit_meta_params)

artifacts_commit = models.CommitCreation(
    message=f"Saving model artifacts to {exp2_branch}",
    metadata=commit_meta_params,
)
client.commits.commit(repository=repo_name,
                      branch=exp2_branch,
                      commit_creation=artifacts_commit)

#### Load the pickle file from lakeFS, and run predictions.

In [None]:
# Read the stored model back from the experiment branch.
model2_reloaded = model_load(params['model_name'], 
           params['repo_name'], 
           params['artifacts_path'])

In [None]:
# Split the data again to obtain a test set and predict on it with the reloaded model.
# NOTE(review): split_train_test is defined outside this notebook; if it shuffles
# randomly, this test set may differ from the one used during training - confirm.
x_train, x_test, y_train, y_test = split_train_test(images, labels, params['train_test_split_ratio'])
pred = model2_reloaded.predict(x_test)

pred.shape

In [None]:
# Show a 3x3 grid of randomly chosen test images, each titled with its true
# label and the class the model predicted for it.
plt.figure(1, figsize=(19, 10))

for position in range(1, 10):
    idx = np.random.randint(0, x_test.shape[0])

    plt.subplot(3, 3, position)
    plt.imshow(x_test[idx])

    # pred[idx] holds one score per class; the predicted class is the index of
    # the highest score. (The previous title printed label * score-of-true-
    # class, which is not a class label.)
    predicted_label = np.argmax(pred[idx])
    plt.title('Actual = {}, Predicted = {}'.format(y_test[idx], predicted_label))
    plt.xticks([])
    plt.yticks([])

plt.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

### Compare models in both branches

In [None]:
# Pick the branch whose model scored the higher accuracy; experiment 2 wins ties.
exp1_is_better = metrics1['accuracy'] > metrics2['accuracy']
win_branch = exp1_branch if exp1_is_better else exp2_branch

In [None]:
# Merge the winning experiment branch into the production branch.
client.refs.merge_into_branch(repository=repo_name, 
                              source_ref=win_branch, 
                              destination_branch=prod_branch)

## DONE!!