# Integration of lakeFS with Spark and Python

## Use Case: Isolated Testing Environment

## Access lakeFS using the S3A gateway

## Change your lakeFS credentials

In [None]:
# Placeholders: replace each value with the credentials of your lakeFS installation.
lakefsAccessKey = "<lakeFS Access Key>"
lakefsSecretKey = "<lakeFS Secret Key>"
# Endpoint example: 'https://username.aws_region_name.lakefscloud.io'
lakefsEndPoint = "<lakeFS Endpoint URL>"

## Storage Information
#### Change the Storage Namespace to a location in the bucket you've configured. The storage namespace is a location in the underlying storage where data for this repository will be stored.

In [None]:
# Location in the underlying object store where the repository data is kept.
# Example: "s3://username-lakefs-cloud/"
storageNamespace = "s3://<S3 Bucket Name>/"

## Versioning Information

In [None]:
# Branch the data is first uploaded to, and the branch created for the experiment.
sourceBranch, newBranch = "main", "experiment1"

# Path (inside the new branch) that receives the partitioned output.
newPath = "partitioned_data"

# Name of the CSV file to upload; it is also used as the object path in lakeFS.
fileName = "lakefs_test.csv"

## Working with the lakeFS Python client API

In [None]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# Build the client configuration in one step: the lakeFS access key / secret key
# act as the username / password, and the endpoint URL is the API host.
configuration = lakefs_client.Configuration(
    host=lakefsEndPoint,
    username=lakefsAccessKey,
    password=lakefsSecretKey,
)

# All later lakeFS API calls in this notebook go through this client object.
client = LakeFSClient(configuration)

## You can change the lakeFS repo name (use an existing repo, or provide a new repo name)

In [None]:
# Name of the lakeFS repository used by every API call and S3A path below.
repo = 'my-repo'

## If the above-mentioned repo already exists on your lakeFS server, you can skip the following step; otherwise, create a new repo:

In [None]:
# Create the repository, backed by the configured storage namespace and with
# the source branch as its default branch.
client.repositories.create_repository(
    repository_creation=models.RepositoryCreation(
        name=repo,
        storage_namespace=storageNamespace,
        default_branch=sourceBranch,
    ),
)

## Upload a file

In [None]:
import os

# The file to upload is looked up in the current user's home directory.
localFilePath = os.path.join(os.path.expanduser('~'), fileName)

# Only a single file can be sent per upload, and it must be named "content".
# The context manager closes the file handle once the upload call returns,
# including when the call raises, so the handle is no longer leaked.
with open(localFilePath, 'rb') as contentToUpload:
    uploadResult = client.objects.upload_object(
        repository=repo,
        branch=sourceBranch,
        path=fileName,
        content=contentToUpload,
    )

# Keep the call's return value as the last expression so the notebook cell
# still displays it, as it did when the call itself was the last statement.
uploadResult

## Commit changes and attach some metadata

In [None]:
# Commit the uncommitted changes on the source branch, attaching a message
# and a key/value metadata entry.
client.commits.commit(
    repository=repo,
    branch=sourceBranch,
    commit_creation=models.CommitCreation(
        message='Added my first file!',
        metadata={'using': 'python_api'},
    ),
)

## S3A Gateway configuration

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Obtain a SparkContext (an existing one is reused if available) and wrap it
# in a SparkSession.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Route S3A traffic to lakeFS: the lakeFS credentials and endpoint are set on
# the Hadoop configuration, and path-style access is switched on.
hadoopConf = sc._jsc.hadoopConfiguration()
for confKey, confValue in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoopConf.set(confKey, confValue)

## Reading data by using S3A Gateway

In [None]:
# Objects are addressed through the gateway as s3a://<repo>/<branch>/<path>.
dataPath = "s3a://{0}/{1}/{2}".format(
    repo,
    sourceBranch,
    fileName,
)

# Load the uploaded CSV into a DataFrame and print its first rows.
df = spark.read.csv(dataPath)
df.show()

# Experimentation Starts

## List the repository branches by using lakeFS Python client API

In [None]:
# Request the branch listing for the repository and show its `results` field.
client.branches.list_branches(
    repository=repo,
).results

## Create a new branch

In [None]:
# Create the experiment branch, using the source branch as its starting point.
client.branches.create_branch(
    repository=repo,
    branch_creation=models.BranchCreation(
        name=newBranch,
        source=sourceBranch,
    ),
)

## Partition the data and write to new branch by using S3A Gateway

In [None]:
# Output location on the new branch, addressed as s3a://<repo>/<branch>/<path>.
newDataPath = "s3a://{0}/{1}/{2}".format(
    repo,
    newBranch,
    newPath,
)

# Write the DataFrame back as CSV, partitioned by the "_c0" column.
partitionedWriter = df.write.partitionBy("_c0")
partitionedWriter.csv(newDataPath)

## Diffing a single branch will show all the uncommitted changes on that branch

In [None]:
# Diff the new branch by itself and show the `results` field of the response.
client.branches.diff_branch(
    repository=repo,
    branch=newBranch,
).results

## Commit changes and attach some metadata

In [None]:
# Commit the partitioned output on the new branch, attaching a message and a
# key/value metadata entry.
client.commits.commit(
    repository=repo,
    branch=newBranch,
    commit_creation=models.CommitCreation(
        message='Partitioned CSV file!',
        metadata={'using': 'python_api'},
    ),
)

## Diff between the new branch and the source branch

In [None]:
# Compare the two refs (source branch on the left, new branch on the right)
# and show the `results` field of the response.
client.refs.diff_refs(
    repository=repo,
    left_ref=sourceBranch,
    right_ref=newBranch,
).results

# Experimentation Completes

## Delete new branch

In [None]:
# Remove the experiment branch from the repository now that it is no longer needed.
client.branches.delete_branch(
    repository=repo,
    branch=newBranch,
)