### Installing the lakeFS Python client

In [None]:
# Shell escape: run pip to install the lakefs_client package imported further down.
!pip install lakefs_client


In [None]:
from datetime import date, time


In [None]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient


### Configuring lakeFSClient and Spark

In [None]:
# The access key and secret key below are the ones present in the
# docker-compose.yaml file that was used to spin up the "everything bagel".
lakefsEndPoint = "http://lakefs:8000"
lakefsAccessKey = "AKIAIOSFODNN7EXAMPLE"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

# lakeFS client configuration: the endpoint is the host, and the access key /
# secret key pair is supplied as the username / password.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey


In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the running SparkContext if there is one, and wrap it in a session.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Configure the s3a filesystem with the lakeFS credentials and endpoint,
# and switch it to path-style access.
hadoop_conf = sc._jsc.hadoopConfiguration()
for s3a_option, s3a_value in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoop_conf.set(s3a_option, s3a_value)


In [None]:
# Create the lakeFS API client from the `configuration` object; every
# `client.*` call in the rest of the notebook goes through it.
client = LakeFSClient(configuration)


## Creating Ingest and Staging branches

In [None]:
# Repository that the calls in this notebook operate on.
repo_name = "example"

# Branch names: production, plus the two working branches created from it.
prod_branch = "main"
staging_branch = "staging-area"
ingest_branch = "ingest-landing-area"


In [None]:
# List the repository's branches (this cell runs before the new branches are created).
client.branches.list_branches(repo_name)


In [None]:
# Create the ingest branch, using the production branch as its source.
ingest_branch_request = models.BranchCreation(name=ingest_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=ingest_branch_request)

In [None]:
# Create the staging branch, using the production branch as its source.
staging_branch_request = models.BranchCreation(name=staging_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=staging_branch_request)

In [None]:
# List the branches again, now that the ingest and staging branches have been requested.
client.branches.list_branches(repo_name)


## Uploading movies data to ingest branch

In [None]:
# Name of the local CSV file to ingest.
ingest_data = "movies.csv"

# Object path on the ingest branch: a `dt=<today's date>/` prefix followed by
# the file name. An f-string replacement field already formats the date the
# same way str() does, so no explicit str() call is needed.
ingest_path = f'dt={date.today()}/{ingest_data}'
ingest_path


In [None]:
# Upload the local CSV (opened in binary mode) to `ingest_path` on the ingest branch.
with open(f'./{ingest_data}', 'rb') as movies_file:
    client.objects.upload_object(
        repository=repo_name,
        branch=ingest_branch,
        path=ingest_path,
        content=movies_file,
    )


In [None]:
# Inspect what diff_branch reports for the ingest branch ahead of the commit.
ingest_changes = client.branches.diff_branch(repository=repo_name, branch=ingest_branch)
ingest_changes.results


In [None]:
# Commit the uploaded movies file on the ingest branch.
ingest_commit = models.CommitCreation(
    message="netflix movie data arrived at landing area (today's partition)"
)
client.commits.commit(
    repository=repo_name,
    branch=ingest_branch,
    commit_creation=ingest_commit,
)

## Uploading actions.yaml config file to staging branch

* The hooks config file `actions.yaml` needs to be uploaded to the branch on which the tests are run; i.e., we want to run data quality tests on the staging branch before merging the data into production.

* So add `_lakefs_actions/actions.yaml` to staging branch
* `actions.yaml` contains a pre-merge hook configured to validate file formats.

In [None]:
# Path prefix under which the hooks config file is stored on the branch.
hooks_prefix = "_lakefs_actions"
# File name of the hooks config; also the name of the local file that gets uploaded.
hooks_config_yaml = "actions.yaml"


In [None]:
# Upload the local hooks config file (opened in binary mode) to
# `<hooks_prefix>/<hooks_config_yaml>` on the staging branch.
with open(f'./{hooks_config_yaml}', 'rb') as hooks_file:
    client.objects.upload_object(
        repository=repo_name,
        branch=staging_branch,
        path=f'{hooks_prefix}/{hooks_config_yaml}',
        content=hooks_file,
    )


In [None]:
# Inspect what diff_branch reports for the staging branch ahead of the commit.
staging_changes = client.branches.diff_branch(repository=repo_name, branch=staging_branch)
staging_changes.results


In [None]:
# Commit the uploaded hooks config file on the staging branch.
hooks_commit = models.CommitCreation(
    message='Added hooks config file - actions.yaml to staging area'
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=hooks_commit,
)


## Extracting data from ingest branch for transformation

In [None]:
# Full s3a URI of the ingested file, built as s3a://<repository>/<branch>/<object path>.
ingest_long_path = f"s3a://{repo_name}/{ingest_branch}/{ingest_path}"
ingest_long_path


In [None]:
# Read the ingested CSV from the ingest branch with the "header" option enabled.
movies_df = spark.read.option("header", "true").csv(ingest_long_path)
print(movies_df.count())
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
movies_df.printSchema()


In [None]:
# Preview the data: show() is asked for 10 rows.
movies_df.show(10)

In [None]:
# Replace the DataFrame with a sample of it: no replacement, fraction 0.1,
# and a fixed seed of 0.
movies_df = movies_df.sample(withReplacement=False, fraction=0.1, seed=0)


## Loading transformed data into Staging Area/Branch

In [None]:
# Root s3a URI of the staging branch, built as s3a://<repository>/<branch>;
# output paths are appended to it.
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
staging_long_path


### Writing csv files to staging area

In [None]:
# Append the sampled data to the staging branch as CSV with a header row,
# partitioned by the "type" column.
csv_writer = movies_df.write.option("header", True).partitionBy("type").mode("append")
csv_writer.csv(f"{staging_long_path}/analytics/movies-by-type-csv")
    

In [None]:
# Commit the partitioned CSV output on the staging branch.
# The message is stored with the commit, so the spelling of "partitioned"
# is corrected here (it was "paritioned").
client.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=models.CommitCreation(
                          message='loaded partitioned movies csv to staging area'))


### Pushing csv files to Prod

In [None]:
# Merge the staging branch (source) into the production branch (destination).
# NOTE(review): the notebook text says actions.yaml defines a pre-merge hook,
# so this is presumably the call that triggers it — confirm against the hook config.
client.refs.merge_into_branch(repository=repo_name, 
                              source_ref=staging_branch, 
                              destination_branch=prod_branch)
