### Installing the lakeFS Python client

In [1]:
!pip install lakefs_client




In [2]:
from datetime import date, time


In [3]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient


### Configuring lakeFSClient and Spark

In [4]:
import os

# lakeFS credentials and endpoint.
# The defaults are the AccessKey and SecretKey present in the docker-compose.yaml file we used
# to spin up the everything bagel. To target another lakeFS installation, set the
# LAKEFS_ACCESS_KEY, LAKEFS_SECRET_KEY and LAKEFS_ENDPOINT environment variables rather than
# editing real credentials into the notebook.
lakefsAccessKey = os.environ.get("LAKEFS_ACCESS_KEY", "AKIAIOSFODNN7EXAMPLE")
lakefsSecretKey = os.environ.get("LAKEFS_SECRET_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
lakefsEndPoint = os.environ.get("LAKEFS_ENDPOINT", "http://lakefs:8000")

# Client configuration: the access key is passed as the username and the secret key as the password.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint


In [5]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the active SparkContext (or start one) and wrap it in a SparkSession.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Configure the s3a filesystem with the lakeFS credentials and endpoint,
# and enable path-style access.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoop_conf.set(conf_key, conf_value)


In [6]:
# Build the lakeFS API client from the `configuration` object (credentials + endpoint).
client = LakeFSClient(configuration)


## Creating Ingest and Staging branches

In [7]:
# Repository that every lakeFS call in this notebook operates on.
repo_name = "example"

# Branch names used by the pipeline: raw data is uploaded to the ingest branch, transformed
# data is written to the staging branch, and staging is merged into the production branch.
ingest_branch = "ingest-landing-area"
staging_branch = "staging-area"
prod_branch = "main"


In [8]:
# List the branches of the repository before creating the new ones.
client.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 1},
 'results': [{'commit_id': '9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af',
              'id': 'main'}]}

In [9]:
# Create the ingest branch, using the production branch as its source.
ingest_branch_spec = models.BranchCreation(name=ingest_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=ingest_branch_spec)

'9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af'

In [10]:
# Create the staging branch, using the production branch as its source.
staging_branch_spec = models.BranchCreation(name=staging_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=staging_branch_spec)

'9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af'

In [11]:
# List the branches again to confirm that the ingest and staging branches now exist.
client.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 3},
 'results': [{'commit_id': '9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af',
              'id': 'ingest-landing-area'},
             {'commit_id': '9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af',
              'id': 'main'},
             {'commit_id': '9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af',
              'id': 'staging-area'}]}

## Uploading movies data to ingest branch

In [12]:
# Local file to ingest; it is stored under a date partition named after today's date (dt=YYYY-MM-DD).
ingest_data = "movies.csv"

today_partition = date.today().isoformat()
ingest_path = f'dt={today_partition}/{ingest_data}'
ingest_path


'dt=2022-11-02/movies.csv'

In [13]:
# Upload the local movies file to the ingest branch under today's partition path.
with open(f'./{ingest_data}', 'rb') as movies_file:
    client.objects.upload_object(
        repository=repo_name,
        branch=ingest_branch,
        path=ingest_path,
        content=movies_file,
    )


In [14]:
# Show the pending changes on the ingest branch; the uploaded file should be listed as 'added'.
client.branches.diff_branch(repository=repo_name, 
                            branch=ingest_branch).results


[{'path': 'dt=2022-11-02/movies.csv',
  'path_type': 'object',
  'size_bytes': 1071619,
  'type': 'added'}]

In [15]:
# Commit the uploaded file on the ingest branch.
ingest_commit = models.CommitCreation(
    message="netflix movie data arrived at landing area (today's partition)"
)
client.commits.commit(
    repository=repo_name,
    branch=ingest_branch,
    commit_creation=ingest_commit,
)

{'committer': 'docker',
 'creation_date': 1667365963,
 'id': '28229cfc8f387d8207dcf83df4d0c9aa12d83bbe2af0fe074ee88386b8581e0e',
 'message': "netflix movie data arrived at landing area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af']}

## Uploading actions.yaml config file to staging branch

* We want to run data quality tests on the staging branch before merging the data into production. The hooks config file `actions.yaml` must be present on the branch on which the tests are run.

* So we add `_lakefs_actions/actions.yaml` to the staging branch.
* `actions.yaml` contains a pre-merge hook that validates the file format.

In [16]:
# Local hooks config file, and the repository prefix it is uploaded under
# (the file ends up at `_lakefs_actions/actions.yaml` on the branch).
hooks_config_yaml = "actions.yaml"
hooks_prefix = "_lakefs_actions"


In [17]:
# Upload the hooks config file to the staging branch under the hooks prefix.
with open(f'./{hooks_config_yaml}', 'rb') as actions_file:
    client.objects.upload_object(
        repository=repo_name,
        branch=staging_branch,
        path=f'{hooks_prefix}/{hooks_config_yaml}',
        content=actions_file,
    )


In [18]:
# Show the pending changes on the staging branch; the hooks config file should be listed as 'added'.
client.branches.diff_branch(repository=repo_name, 
                            branch=staging_branch).results


[{'path': '_lakefs_actions/actions.yaml',
  'path_type': 'object',
  'size_bytes': 420,
  'type': 'added'}]

In [19]:
# Commit the hooks config file on the staging branch.
hooks_commit = models.CommitCreation(
    message='Added hooks config file - actions.yaml to staging area'
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=hooks_commit,
)


{'committer': 'docker',
 'creation_date': 1667365963,
 'id': 'e9be752c525cb5ea37e25dd50106fbbf001f08d6692a970c47734be727b84189',
 'message': 'Added hooks config file - actions.yaml to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['9c133ddbfc5d23978b8fba7345b8f19a6e2ac7d7a7ec9511ef164018ae4624af']}

## Extracting data from ingest branch for transformation

In [20]:
# s3a URI of the ingested file, laid out as s3a://<repository>/<branch>/<object path>.
ingest_long_path = f"s3a://{repo_name}/{ingest_branch}/{ingest_path}"
ingest_long_path


's3a://example/ingest-landing-area/dt=2022-11-02/movies.csv'

In [21]:
# Read the ingested CSV from the ingest branch, treating the first line as the header row.
movies_df = spark.read.option("header","true").csv(ingest_long_path)
print(movies_df.count())
# printSchema() prints the schema itself and returns None, so wrapping it in print()
# would emit a stray "None" after the schema.
movies_df.printSchema()


8791
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

None


In [22]:
# Preview the first 10 rows of the ingested data.
movies_df.show(10)

+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|           director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|    Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|    Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|      Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|      Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|       Hai

In [23]:
# "Transformation" step: keep a sample of the rows (fraction 0.1, without replacement, seed 0).
# NOTE: this rebinds movies_df, so re-running the cell samples the already-sampled frame again.
movies_df = movies_df.sample(withReplacement=False, fraction=0.1, seed=0)


## Loading transformed data into Staging Area/Branch

In [24]:
# s3a URI of the staging branch root; the output datasets are written underneath it.
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
staging_long_path


's3a://example/staging-area'

## Scenario #1

### Writing parquet files to staging area

In [25]:
# Append the sampled movies to the staging branch as Parquet, partitioned by the "type" column.
parquet_target = f"{staging_long_path}/analytics/movies-by-type-parquet"
(
    movies_df.write
    .option("header", True)
    .partitionBy("type")
    .mode("append")
    .parquet(parquet_target)
)

In [26]:
# Commit the Parquet files written to the staging branch.
parquet_commit = models.CommitCreation(
    message='loaded paritioned movies parquet to staging area'
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=parquet_commit,
)


{'committer': 'docker',
 'creation_date': 1667365976,
 'id': '0d4de5b8a83357928c63e9e9e34b77ae86050125a67e2767b83e6dc90a48a2b8',
 'message': 'loaded paritioned movies parquet to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['e9be752c525cb5ea37e25dd50106fbbf001f08d6692a970c47734be727b84189']}

### Pushing parquet files to Prod

In [27]:
# Merge the staging branch into the production branch.
client.refs.merge_into_branch(repository=repo_name, 
                              source_ref=staging_branch, 
                              destination_branch=prod_branch)


{'reference': '900bb7b4c9800e511c8187189620a20a56998cc533e5f8eaed3d070b6e3eb553',
 'summary': {'added': 0, 'changed': 0, 'conflict': 0, 'removed': 0}}

## Scenario #2

### Writing csv files to staging area

In [28]:
# Append the sampled movies to the staging branch as CSV (with a header row),
# partitioned by the "type" column.
csv_target = f"{staging_long_path}/analytics/movies-by-type-csv"
(
    movies_df.write
    .option("header", True)
    .partitionBy("type")
    .mode("append")
    .csv(csv_target)
)
    

In [29]:
# Commit the CSV files written to the staging branch.
csv_commit = models.CommitCreation(
    message='loaded paritioned movies csv to staging area'
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=csv_commit,
)


{'committer': 'docker',
 'creation_date': 1667365978,
 'id': '6e76a6d1173908888beaf0fb6fa97f3908d82104095f7695a16fe576b375df6c',
 'message': 'loaded paritioned movies csv to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['0d4de5b8a83357928c63e9e9e34b77ae86050125a67e2767b83e6dc90a48a2b8']}

### Pushing csv files to Prod

In [30]:
# Try to merge the staging branch (which now contains CSV files) into the production branch.
# This call is expected to raise ApiException (412): the pre-merge hook rejects the merge.
client.refs.merge_into_branch(repository=repo_name, 
                              source_ref=staging_branch, 
                              destination_branch=prod_branch)


ApiException: (412)
Reason: Precondition Failed
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Request-Id': 'e134c599-c241-4396-b09f-5224a0e9a403', 'Date': 'Wed, 02 Nov 2022 05:12:58 GMT', 'Content-Length': '261'})
HTTP response body: {"message":"update branch main: pre-merge hook aborted, run id '5n99v9n42i5s773bapug': 1 error occurred:\n\t* hook run id '0000_0000' failed on action 'ParquetOnlyInProduction' hook 'production_format_validator': webhook request failed (status code: 400)\n\n"}



### Why did the merge operation fail?
If you look deeper into the error log, you'll see that the merge request failed with status code '412' (precondition failed). The pre-merge hook defined in `actions.yaml` (action `ParquetOnlyInProduction`) ran and blocked the merge into `main`, because the staging branch now contains csv files.
* Hint: You can see previous actions run [here](http://localhost:8000/repositories/example/actions)