<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# Import into a lakeFS repository from multiple paths

## Config

### lakeFS endpoint and credentials

Change these if you are using a lakeFS server other than the one provided in the samples repo.

In [None]:
# URL of the lakeFS server that the client below connects to.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io'
# Access key / secret key pair used to authenticate against that server.
# NOTE(review): these look like the sample environment's default credentials —
# replace them with your own when pointing at a different lakeFS server.
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Storage Information

If you're not using the lakeFS server provided in the samples repo, change the storage namespace to a location in the bucket you’ve configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Base location in the underlying object store. The repository name is appended
# to this value when the repository is created further down.
storageNamespace = 's3://example/import/' # e.g. "s3://bucket"

In [None]:
# Name of the lakeFS repository that is looked up (and created if missing) below.
repo_name = "multi-bucket-import"

## Setup

### Configuring lakeFSClient

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# Build the client configuration from the values set in the Config section:
# the access key is used as the username and the secret key as the password.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

# Client object used by the rest of the notebook for all lakeFS API calls.
lakefs = LakeFSClient(configuration)

In [None]:
# Print the version of the installed lakefs_client package for reference.
print(f"lakeFS client version: {lakefs_client.__version__}")

### Define lakeFS Repository

In [None]:
from lakefs_client.exceptions import NotFoundException

# Look the repository up first and only create it when lakeFS reports that it
# does not exist. NotFoundException must be handled before the more general
# ApiException so that "missing repository" is not treated as a hard failure.
try:
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    # Strip any trailing slash from the configured namespace so that joining it
    # with the repository name does not produce a '//' in the storage path.
    repo_storage_namespace = f"{storageNamespace.rstrip('/')}/{repo_name}"
    try:
        repo = lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=repo_storage_namespace,
            )
        )
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        # Re-raise so the notebook stops here with the real error. The previous
        # os._exit() call referenced a module that was never imported and would
        # have killed the kernel with a "success" exit status.
        raise
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    # Same as above: surface the failure instead of exiting the process.
    raise

## Import to a single repository from multiple paths

### Configure the source/target paths

In [None]:
# Branch that the import is run against; the objects are imported into this branch.
sourceBranch = "main"

# Import Sources and Destinations
# Two separate source prefixes; both are imported under the same destination
# prefix in the repository.
importSource1 = "s3://sample-data/stanfordogsdataset/Images" # e.g. "s3://sample-dog-images/Images/n02085620-Chihuahua/"
importSource2 = "s3://sample-data/stanfordogsdataset/Annotation" # e.g. "s3://sample-dog-images/Annotation/n02085620-Chihuahua/"
importDestination = "raw/" # will keep the original files in the raw directory


### Do the import

In [None]:
import time

# Start Import
# A single import request carries both source locations, together with the
# commit message and metadata defined here.
commit = CommitCreation(message="import objects", metadata={"key": "value"})
paths = [
    ImportLocation(type="common_prefix", path=importSource1, destination=importDestination),
    ImportLocation(type="common_prefix", path=importSource2, destination=importDestination),
]
import_creation = ImportCreation(paths=paths, commit=commit)
create_resp = lakefs.import_api.import_start(repo.id, sourceBranch, import_creation)

# Wait for import to finish
# Poll the import status every two seconds until it either fails or completes.
while True:
    status_resp = lakefs.import_api.import_status(repo.id, sourceBranch, create_resp.id)
    print(status_resp)
    # A failed import is reported through the status object's `error` field.
    # getattr() with a default is used because the field is absent while the
    # import is healthy. (The previous hasattr() check used an attribute name
    # that can never exist, so failures were silently ignored.)
    import_error = getattr(status_resp, "error", None)
    if import_error is not None:
        raise RuntimeError(f"Error in import: {import_error}")
    if status_resp.completed:
        print("Import completed Successfully. Data imported into branch:", sourceBranch)
        break
    time.sleep(2)

In [None]:
from IPython.display import Markdown as md

# The in-container hostname 'lakefs' is not reachable from the user's browser,
# so the default sample endpoint is mapped to localhost for the link; any other
# endpoint is used as-is.
if lakefsEndPoint == 'http://lakefs:8000':
    lakeFSWebUI = 'http://localhost:8000'
else:
    lakeFSWebUI = lakefsEndPoint

# Build the link from repo_name rather than a hard-coded repository name so it
# stays correct when the repository name is changed in the Config section.
# This must remain the last expression in the cell so that the Markdown renders.
md(f"### 👉🏻 View the objects in [lakeFS web UI]({lakeFSWebUI}/repositories/{repo_name}/objects)")

## More Questions?

**👉🏻 Join the lakeFS Slack group - https://lakefs.io/slack**