In [1]:
!pip install lakefs_client



In [2]:
lakefsAccessKey = ""
lakefsSecretKey = ""
lakefsEndPoint = "http://lakefs:8000"

In [3]:
storageNamespace = "s3://test-1"

In [4]:
sourceBranch = "main"
newBranch = "experiment1"

In [5]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

In [6]:
# lakeFS credentials and endpoint
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

In [7]:
client = LakeFSClient(configuration)

In [8]:
repo = "test-1"
client.branches.list_branches('example')

{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 1},
 'results': [{'commit_id': 'c311aa2593dcfea814f7557b7f360b4e9d36c929212694e913bde76cab4348e5',
              'id': 'main'}]}

In [9]:
client.repositories.create_repository(
    repository_creation=models.RepositoryCreation(
        name=repo,
        storage_namespace=storageNamespace,
        default_branch=sourceBranch))

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Request-Id': 'fa641ae4-9c65-4aa4-a12c-65168f3f19b2', 'Date': 'Thu, 08 Sep 2022 22:10:25 GMT', 'Content-Length': '153'})
HTTP response body: {"message":"failed to create repository: found lakeFS objects in the storage namespace(s3://test-1): lakeFS repositories can't share storage namespace"}



In [10]:
client.branches.create_branch(
    repository=repo,
    branch_creation=models.BranchCreation(
        name=newBranch,
        source=sourceBranch))

ApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Request-Id': 'ed0ccb64-d2c5-4a9b-8ed3-ba998c23800c', 'Date': 'Thu, 08 Sep 2022 22:10:45 GMT', 'Content-Length': '48'})
HTTP response body: {"message":"branch already exists: not unique"}



In [11]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", lakefsAccessKey)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", lakefsSecretKey)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", lakefsEndPoint)
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")

In [14]:
fileName = "temp.yaml"
dataPath = f"s3a://{repo}/{sourceBranch}/{fileName}"

df = spark.read.csv(dataPath)
df.show()

+--------------------+
|                 _c0|
+--------------------+
|name: NoUserColum...|
|     description: >-|
|  This webhook en...|
|  beginning with ...|
|                 on:|
|          pre-merge:|
|           branches:|
|              - main|
|              hooks:|
|  - id: pub_preve...|
|       type: webhook|
|    description: ...|
|         properties:|
|      url: "http:...|
|       query_params:|
|        disallow:...|
|        prefix: p...|
+--------------------+



In [15]:
newPath = "new_temp"
newDataPath = f"s3a://{repo}/{newBranch}/{newPath}"

df.write.csv(newDataPath)

In [16]:
client.commits.commit(
    repository=repo,
    branch=newBranch,
    commit_creation=models.CommitCreation(
        message='Saved CSV file!'))

{'committer': 'docker',
 'creation_date': 1662675169,
 'id': '1c0a8f7727873c9fe5b0f4195478183fc582887ff156b5a38a8709e69cbeea52',
 'message': 'Saved CSV file!',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['016f7fdcaed8d21eb9a4ddc4d07477c7cc3cea5c73ec35287c113b52b0fc72da']}

In [18]:
newPath1 = "test-path-parquet-2"
newDataPath1 = f"s3a://{repo}/{newBranch}/{newPath1}"

df.write.parquet(newDataPath1)

In [21]:
client.refs.merge_into_branch(repository='test-1', source_ref='experiment1', destination_branch='main')

ApiException: (412)
Reason: Precondition Failed
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Request-Id': '22716fce-4f8d-433e-900c-679524de84ae', 'Date': 'Thu, 08 Sep 2022 22:14:41 GMT', 'Content-Length': '358'})
HTTP response body: {"message":"pre-merge hook aborted, run id '20220908221441GUVUBeNM': 1 error occurred:\n\t* hook run id '0000_0000' failed on action 'ParquetOnlyInProduction' hook 'production_format_validator': Post \"lakefs-webhooks:5000/webhooks/format?allow=parquet\u0026allow=delta_lake\u0026prefix=production%2F\": unsupported protocol scheme \"lakefs-webhooks\"\n\n"}

