# Integration of lakeFS with Labelbox

## Use Case: ML Reproducibility

## Setup Task: Import required Python packages

In [None]:
import os
import requests
from pathlib import Path
import os
import pandas as pd
import labelbox
import datetime
from tabulate import tabulate
from uuid import uuid4 ## to generate unique IDs
import json
from labelbox.schema.ontology import OntologyBuilder, Tool, Classification,Option
import random
from labelbox.data.annotation_types import (
    Label,
    Point,
    LabelList,
    ImageData,
    Rectangle,
    ObjectAnnotation,
)
from labelbox.data.serialization import NDJsonConverter
import time
from labelbox.schema.annotation_import import LabelImport

## Setup Task: lakeFS Upload Objects Function

In [None]:
def upload_files(repo, branch, path, files):
    """Upload local files to a lakeFS branch under a common path prefix.

    Every file is uploaded as ``<path>/<basename of the file>`` through the
    notebook-level lakeFS ``client``. The name of each file is printed
    before it is uploaded.

    Args:
        repo: Name of the lakeFS repository.
        branch: Name of the branch to upload to.
        path: Destination prefix inside the branch.
        files: Iterable of local file paths to upload.
    """
    for file in files:
        print(file)
        # Only a single file per upload call; it is passed as "content".
        # The context manager closes the handle even if the upload raises,
        # so file descriptors do not leak across a large batch.
        with open(file, 'rb') as content_to_upload:
            client.objects.upload_object(
                repository=repo,
                branch=branch,
                path=path + '/' + os.path.basename(file),
                content=content_to_upload,
            )

## Setup Task: lakeFS Stage Object Function

In [None]:
from lakefs_client.model.object_stage_creation import ObjectStageCreation
from lakefs_client.model.object_user_metadata import ObjectUserMetadata

def object_stage(source_uri, size_bytes, content_type):
    """Build the ``ObjectStageCreation`` request describing one object.

    Args:
        source_uri: Passed through as the object's ``physical_address``.
        size_bytes: Passed through as ``size_bytes``.
        content_type: Passed through as ``content_type``.

    Returns:
        An ``ObjectStageCreation`` with an empty checksum, ``mtime`` set to 1
        and user metadata built from ``key="version: v1"``.
    """
    # User metadata is optional for the staging request.
    user_metadata = ObjectUserMetadata(key="version: v1")
    return ObjectStageCreation(
        physical_address=source_uri,
        checksum="",
        size_bytes=size_bytes,
        mtime=1,
        metadata=user_metadata,
        content_type=content_type,
    )
 
def stage_objects(repo_name, importBranch, source_uri, path, size_bytes, content_type):
    """Stage one object on a lakeFS branch via ``client.objects.stage_object``.

    The staging request is built with ``object_stage`` from ``source_uri``,
    ``size_bytes`` and ``content_type``. A ``lakefs_client.ApiException`` is
    printed rather than re-raised, so callers are not interrupted by a
    failed staging call.

    Args:
        repo_name: Name of the lakeFS repository.
        importBranch: Branch on which the object is staged.
        source_uri: Physical address of the object.
        path: Path of the object inside the branch.
        size_bytes: Size of the object in bytes.
        content_type: Content type recorded for the object.
    """
    staging_request = object_stage(source_uri, size_bytes, content_type)
    try:
        client.objects.stage_object(repo_name, importBranch, path, staging_request)
    except lakefs_client.ApiException as api_error:
        print("Exception when calling objects->stage_object: %s\n" % api_error)

## Setup Task: Create S3 client

In [None]:
import boto3
# S3 client bound to the regional endpoint derived from ``awsRegion``;
# the AWS credentials are expected to be defined earlier in the notebook.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    endpoint_url=('https://s3.' + awsRegion + '.amazonaws.com'),
)

## Setup Task: Create lakeFS Python client

In [None]:
%xmode Minimal
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient
import datetime

# Connection settings for the lakeFS API: the endpoint first, then the
# access key / secret key pair used as username / password.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Shared lakeFS client used by the helper functions in this notebook.
client = LakeFSClient(configuration)

## Setup Task: S3A Gateway configuration

##### Note: lakeFS can be configured to work with Spark in two ways:
###### * [Access lakeFS using the S3A gateway](https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-s3a-gateway)
###### * [Access lakeFS using the lakeFS-specific Hadoop FileSystem](https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-lakefs-specific-hadoop-filesystem)

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the running SparkContext if there is one, otherwise create it,
# and wrap it in a SparkSession.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Point the S3A filesystem at the lakeFS endpoint with the lakeFS
# credentials, using path-style access.
_hadoop_conf = sc._jsc.hadoopConfiguration()
_hadoop_conf.set("fs.s3a.access.key", lakefsAccessKey)
_hadoop_conf.set("fs.s3a.secret.key", lakefsSecretKey)
_hadoop_conf.set("fs.s3a.endpoint", lakefsEndPoint)
_hadoop_conf.set("fs.s3a.path.style.access", "true")

## Setup Task: Create Labelbox Python client

In [None]:
# Labelbox client authenticated with the API key defined earlier in the notebook.
lb_client = labelbox.Client(api_key=LB_API_KEY)