# Managing the Data Lifecycle with lakeFS

## Efficient DataOps for High Quality Data Products

<img src="https://lakefs.io/wp-content/uploads/2022/06/what-is-lakefs-slide.png" width=800/>

<img src="https://lakefs.io/wp-content/uploads/2022/06/why-git-for-data-2.png" width=800/>

In [0]:
# lakeFS connection settings. The values below are placeholders; replace them
# with the endpoint and credentials of your own lakeFS installation.
lakefsEndPoint = "https://YourEndPoint/"  # e.g. 'https://username.azure_region_name.lakefscloud.io'
lakefsAccessKey = "AKIAlakeFSAccessKey"  # access key ID
lakefsSecretKey = "lakeFSSecretKey"  # secret access key

In [0]:
# Configuring Python Client

%xmode Minimal
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

client = LakeFSClient(configuration)

In [0]:
# Configuring environment variables

repo = "learn-lakefs-repo01"
sourceBranch = "main"
dataPath = "product-reviews"

# The storage namespace is the storage container URL followed by the repository name
storageNamespace = f"https://storage-account-name.blob.core.windows.net/storage-container-name/{repo}"

In [0]:
# Creating a repository

# Describe the repository first, then submit the creation request
repository_spec = models.RepositoryCreation(
    name=repo,
    storage_namespace=storageNamespace,
    default_branch=sourceBranch,
)
client.repositories.create_repository(repository_creation=repository_spec)

In [0]:
# Reading data from the Databricks datasets and inserting it into the newly created repository (creating initial data)

import_data_path = "/databricks-datasets/amazon/test4K/"
df = spark.read.parquet(import_data_path)

# Destination has the form lakefs://<repository>/<branch>/<path>
initial_data_uri = f"lakefs://{repo}/{sourceBranch}/{dataPath}"
df.write.format("parquet").save(initial_data_uri)


In [0]:
# Committing the changes

# Commit message and metadata for the initial upload
initial_commit = models.CommitCreation(
    message='Uploading intial data into lakefs',
    metadata={'using': 'python_api'},
)
client.commits.commit(
    repository=repo,
    branch=sourceBranch,
    commit_creation=initial_commit,
)

### Create a production-identical, isolated environment

In [0]:
# Review production data from your production "main" branch

# Note - This example uses static strings instead of parameters for an easier read

main_reviews_path = "lakefs://learn-lakefs-repo01/main/product-reviews/"
df = spark.read.parquet(main_reviews_path)
df.show()

In [0]:
# Create an isolated testing branch

# The new "experiment" branch is created from "main"
experiment_branch = models.BranchCreation(name="experiment", source="main")
client.branches.create_branch(
    repository="learn-lakefs-repo01",
    branch_creation=experiment_branch,
)

In [0]:
# Reading data from the experiment branch
experiment_reviews_path = "lakefs://learn-lakefs-repo01/experiment/product-reviews/"
df = spark.read.parquet(experiment_reviews_path)
df.show()

### Run ETL Data Pipelines in isolation
#### Delete 1-star and 5-star reviews & re-partition by rating

In [0]:
# Delete all overly happy or overly unhappy star ratings

# Both predicates are built from the `rating` column of the current `df`
not_one_star = df.rating != '1'
not_five_star = df.rating != '5'

# Drop the 1-star rows first, then the 5-star rows
df_no_1star = df.filter(not_one_star)
df_no_5star = df_no_1star.filter(not_five_star)

df = df_no_5star
df.show()

In [0]:
# Repartition by rating

# Written to a new path on the experiment branch, next to the original data set
partitioned_path = "lakefs://learn-lakefs-repo01/experiment/product-reviews_by_rating"
(
    df.write
    .partitionBy("rating")
    .format("parquet")
    .save(partitioned_path)
)

In [0]:
# Committing the changes to the experiment branch

# Commit metadata; the second entry records the URL of the originating notebook
# NOTE(review): the '::lakefs::...::url[url:ui]' key format is presumably what
# makes the lakeFS UI render the value as a link - confirm against lakeFS docs
commit_metadata = {
    'using': 'python_api',
    '::lakefs::DataBricks Notebook::url[url:ui]': 'https://adb-8911673420610391.11.azuredatabricks.net/?o=8911673420610391#notebook/3087183711593803/command/3087183711593821',
}
client.commits.commit(
    repository="learn-lakefs-repo01",
    branch="experiment",
    commit_creation=models.CommitCreation(
        message='Remove extreme reviews and repartitioned by stars',
        metadata=commit_metadata,
    ),
)

In [0]:
# Comparing the difference between both branches

from tabulate import tabulate

# Diff between "main" (left) and "experiment" (right)
# NOTE(review): only the entries in this single response are shown; confirm
# whether diff_refs paginates when the diff is large
diff = client.refs.diff_refs(
    repository="learn-lakefs-repo01",
    left_ref="main",
    right_ref="experiment")

# Build the rows as a list rather than a one-shot `map` iterator, so that
# `results` can still be inspected or printed again after the table is rendered
results = [
    [entry.path, entry.path_type, entry.size_bytes, entry.type]
    for entry in diff.results
]

print(tabulate(
    results,
    headers=['Path', 'Path Type', 'Size(Bytes)', 'Type']))

In [0]:
# Query rating breakdown

df = spark.read.parquet("lakefs://learn-lakefs-repo01/experiment/product-reviews_by_rating")

# Number of reviews per rating value
rating_counts = df.groupby("rating").count()
rating_counts.display()

### Merge Changes into Main
#### Once you are satisfied, merge into main

In [0]:
# Happy with the results? Merge into main

# Merge the "experiment" branch into "main"
refs_api = client.refs
refs_api.merge_into_branch(
    repository="learn-lakefs-repo01",
    source_ref="experiment",
    destination_branch="main",
)

### Unhappy with the changes? Don't merge into main
#### Delete the experiment branch

In [0]:
# Delete the experimentation branch

# Remove the "experiment" branch from the repository
branches_api = client.branches
branches_api.delete_branch(
    repository="learn-lakefs-repo01",
    branch="experiment",
)

<img src="https://lakefs.io/wp-content/uploads/2022/06/how-does-lakefs-work-1.png" width=800/>

#### More Questions?

###### Join the [lakeFS Slack group](https://lakefs.io/slack)