<img src="https://docs.lakefs.io/assets/logo.svg" alt="lakeFS logo" width=300/> 

# Creating Dev-Test environments with lakeFS branches

## Setup

### Installing lakeFS python client

In [1]:
from datetime import date, time

In [2]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient


### Configuring lakeFSClient and Spark

In [3]:
# AccessKey and SecretKey are present in the docker-compose.yaml file we used to spin up the everything bagel
lakefsAccessKey = "AKIAIOSFODNN7EXAMPLE"
lakefsSecretKey = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
lakefsEndPoint = "http://lakefs:8000"

# lakeFS credentials and endpoint
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint


In [4]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col,isnan,when,count

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", lakefsAccessKey)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", lakefsSecretKey)
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", lakefsEndPoint)
sc._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")


In [6]:
client = LakeFSClient(configuration)

### Define lakeFS Repository

_This should already exist; if not, go and create it through the lakeFS UI_

In [39]:
repo_name = "netflix"

client.repositories.create_repository(repository_creation=models.RepositoryCreation(name=repo_name,
                                                                                    storage_namespace="s3://example/netflix"))

{'creation_date': 1685004183,
 'default_branch': 'main',
 'id': 'netflix',
 'storage_namespace': 's3://example/netflix'}

## Creating Ingest and Staging branches

In [8]:
ingest_branch = "ingress-landing-area"
staging_branch = "staging-area"
prod_branch = "main"


In [9]:
client.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 1},
 'results': [{'commit_id': '1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce',
              'id': 'main'}]}

In [10]:
client.branches.create_branch(repository=repo_name, 
                              branch_creation=models.BranchCreation(name=ingest_branch, 
                                                                    source=prod_branch)
                             )

'1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce'

In [11]:
client.branches.create_branch(repository=repo_name, 
                              branch_creation=models.BranchCreation(name=staging_branch, 
                                                                    source=prod_branch)
                             )

'1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce'

In [12]:
client.branches.list_branches(repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 3},
 'results': [{'commit_id': '1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce',
              'id': 'ingress-landing-area'},
             {'commit_id': '1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce',
              'id': 'main'},
             {'commit_id': '1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce',
              'id': 'staging-area'}]}

## Load some sample data about Netflix movies

The daily partition lands in ingress path (branch)

In [13]:
ingest_data = "movies.csv"

ingest_path = f'dt={str(date.today())}/{ingest_data}'
ingest_path


'dt=2023-05-25/movies.csv'

In [14]:
with open(f'/data/{ingest_data}', 'rb') as f:
    client.objects.upload_object(repository=repo_name, 
                                 branch=ingest_branch, 
                                 path=ingest_path, 
                                 content=f
                                )


In [15]:
client.branches.diff_branch(repository=repo_name, 
                            branch=ingest_branch).results


[{'path': 'dt=2023-05-25/movies.csv',
  'path_type': 'object',
  'size_bytes': 1071619,
  'type': 'added'}]

In [16]:
client.commits.commit(repository=repo_name,
                      branch=ingest_branch,
                      commit_creation=models.CommitCreation(
                          message="netflix movie data arrived at landing area (today's partition)")
                     )

{'committer': 'everything-bagel',
 'creation_date': 1685003604,
 'id': '154ed1603ce11ec6fdb5e95a19c5eeebdfe0368e4513b4b1f9c05960fc4db5a3',
 'message': "netflix movie data arrived at landing area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce']}

## Copying daily partition from ingress to staging area (branch)

In [17]:
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
staging_long_path

's3a://example/staging-area'

In [18]:
csv_path = f"{staging_long_path}/raw/dt={str(date.today())}/csv"
csv_path

's3a://example/staging-area/raw/dt=2023-05-25/csv'

In [19]:
movies_df = spark.read.option("header","true").csv(f"s3a://{repo_name}/{ingest_branch}/{ingest_path}")

In [20]:
movies_df.write.option("header",True)\
        .mode("append")\
        .csv(csv_path)
    

In [21]:
client.branches.diff_branch(repository=repo_name, 
                            branch=staging_branch).results


[{'path': 'raw/dt=2023-05-25/csv/_SUCCESS',
  'path_type': 'object',
  'size_bytes': 1062839,
  'type': 'added'},
 {'path': 'raw/dt=2023-05-25/csv/part-00000-7a59cd7f-2b15-412d-8646-1b80fc493e63-c000.csv',
  'path_type': 'object',
  'size_bytes': 1062839,
  'type': 'added'}]

In [22]:
client.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=models.CommitCreation(
                          message="netflix movie data copied to staging area (today's partition)")
                     )

{'committer': 'everything-bagel',
 'creation_date': 1685003617,
 'id': '846c6072a80f3965b732f686c82d05e17ee28f50fd339148830147cf2b6e63dd',
 'message': "netflix movie data copied to staging area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['1ab81d7fdabd9840c36e85df46507a1b3eeb0121d404c8dbf80eec98639467ce']}

## Data Exploration and Cleaning in staging area (branch)

In [28]:
movies_df = spark.read.option("header","true").csv(csv_path)
df_columns=movies_df.columns


In [29]:
print(movies_df.count())
print(movies_df.printSchema())


8791
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

None


In [30]:
movies_df.show(10)

+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|           director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|    Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|    Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|      Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|      Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|       Hai

In [31]:
#movies_df = movies_df.sample(False,0.1,0)

## Null checks

In [32]:
movies_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_columns]).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       1|      1|         1|           1|     1|       2|        2|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



In [33]:
movies_df = movies_df.na.drop("any")

In [34]:
movies_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_columns]).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       0|      0|         0|           0|     0|       0|        0|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



## Writing Transformed Parquet files to staging area

In [35]:
movies_df.write.option("header",True)\
        .partitionBy("country")\
        .mode("append")\
        .parquet(f"{staging_long_path}/analytics/movies-by-country-parquet")
    

### View uncommitted changes and clean up the files not needed

Go to the lakeFS UI to inspect the uncommitted changes, e.g. http://localhost:8000/repositories/example/changes?ref=staging-area&prefix=analytics%2Fmovies-by-country-parquet%2F

## Commit the changes to staging 

In [36]:
client.commits.commit(repository=repo_name,
                      branch=staging_branch,
                      commit_creation=models.CommitCreation(
                          message='loaded paritioned movies parquet to staging area'))


{'committer': 'everything-bagel',
 'creation_date': 1685003820,
 'id': '32b04151698a19e4eccf0a1a48037f388a09eea37cd4d52021549b84b7ead3f5',
 'message': 'loaded paritioned movies parquet to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['846c6072a80f3965b732f686c82d05e17ee28f50fd339148830147cf2b6e63dd']}

## Merging Daily Data (Parquet files) to Prod

In [37]:
client.refs.merge_into_branch(repository=repo_name, 
                              source_ref=staging_branch, 
                              destination_branch=prod_branch)


{'reference': 'fe86e2f05c322cb275c2af02725b0f9664aadfc9172de531de3eaa1707a78800',
 'summary': {'added': 0, 'changed': 0, 'conflict': 0, 'removed': 0}}