In [0]:

import os

import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint.
# Each value is read from an environment variable when it is set, so real
# secrets never have to be typed into (and saved with) the notebook. The
# variable names LAKEFS_USERNAME / LAKEFS_PASSWORD / LAKEFS_HOST are names
# chosen for this notebook. When a variable is unset the original placeholder
# is kept and must be replaced by hand before running.
configuration = lakefs_client.Configuration()
configuration.username = os.environ.get('LAKEFS_USERNAME', '{your user name}')
configuration.password = os.environ.get('LAKEFS_PASSWORD', '{your password}')
configuration.host = os.environ.get('LAKEFS_HOST', 'https://{some-aws-host}.lakefs.io')


In [0]:

# Build the lakeFS API client from the `configuration` object set up in the first cell.
client = LakeFSClient(configuration)

In [0]:
%python
# List the branches of the 'dais22-adi' repository; the response is the cell output.
client.branches.list_branches(repository='dais22-adi')

Create new branch

In [0]:

# Describe the new branch first ('experiment-chaos', with 'main' as its source),
# then ask lakeFS to create it in the 'dais22-adi' repository.
branch_spec = models.BranchCreation(name='experiment-chaos', source='main')
client.branches.create_branch(repository='dais22-adi', branch_creation=branch_spec)

In [0]:

# lakeFS URIs used by Spark, in the form lakefs://<repository>/<branch>/.
_repo_uri = "lakefs://dais22-adi"
main_repo_path = f"{_repo_uri}/main/"
chaos_repo_path = f"{_repo_uri}/experiment-chaos/"

## Diffing a single branch will show all uncommitted changes on that branch:

In [0]:

# Diff the 'experiment-chaos' branch on its own and display the `.results`
# of the response (the branch's uncommitted changes).
branch_diff = client.branches.diff_branch(repository='dais22-adi', branch='experiment-chaos')
branch_diff.results

In [0]:

# Load the "genres" Parquet dataset from the experiment branch.
genre_df = spark.read.parquet(chaos_repo_path + "genres")

In [0]:
%python
# Load the "books" Parquet dataset from the experiment branch.
books_df = spark.read.parquet(chaos_repo_path + "books")

In [0]:

# Preview the first 10 book rows without truncating long cell values.
books_df.show(n=10, truncate=False)

In [0]:

# Preview the first 10 genre rows without truncating long cell values.
genre_df.show(n=10, truncate=False)

In [0]:
%sql
-- Drop the `books` table if it already exists; a later cell writes a table with this name.
DROP TABLE IF EXISTS books

In [0]:

# Register the books DataFrame as the table "books".
books_df.write.saveAsTable(name="books")

In [0]:
%sql
-- Drop the `genre` table if it already exists; a later cell writes a table with this name.
DROP TABLE IF EXISTS genre

In [0]:

# Register the genres DataFrame as the table "genre".
genre_df.write.saveAsTable(name="genre")

## Join operation

In [0]:

# First attempt at the combined view: genres on the LEFT, books matched by ISBN.
# With a left join every genre row is kept, so the book columns selected below
# (isbn, name, author) are null wherever a genre row has no matching book.
isbn_match = genre_df.isbn == books_df.isbn
data = (
    genre_df
    .join(books_df, isbn_match, "left")
    .select(books_df.isbn, books_df.name, books_df.author, genre_df.genre)
)

## Save the materialized view

In [0]:

# Persist the joined view as Parquet on the experiment branch, replacing any earlier copy.
books_dataset_path = chaos_repo_path + "books-dataset"
data.write.mode('overwrite').parquet(books_dataset_path)

In [0]:

# Preview the first 20 rows of the joined view without truncating long cell values.
data.show(n=20, truncate=False)

## Run quality checks on the experiment branch

In [0]:

from pyspark.sql.functions import col, isnan, when, count

# Quality check: for every column of `data`, count the rows whose value is NaN or NULL.
# The column list is taken from `data` itself; the original iterated over `df.columns`,
# but no `df` is defined in this notebook, so the cell raised NameError on a fresh kernel.
missing_per_column = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in data.columns
]
data.select(missing_per_column).show()

In [0]:

# Row count of the joined view `data`.
data.count()

In [0]:

# Row count of the genres input, for comparison with the joined view.
genre_df.count()

In [0]:

# Row count of the books input, for comparison with the joined view.
books_df.count()

In [0]:

# Render the genres DataFrame as a table (isbn, name, genre) to inspect it.
display(genre_df)

isbn,name,genre
32278888,Hello Spark Fans,classics
73825104,Fundamentals of Data Observability,adventure
73341143,"Data Engineering with Apache Spark, Delta Lake, and Lakehouse",adventure
54725104,Fundamentals of Data Observability,classics
42278345,Intro to Hive metastore,mystery
52278888,Reviving zookeper,drama
62278888,Life after Hadoop,crime
83825104,Fundamentals of Lakehouse,adventure
93825104,High Performance Yarn,fiction
54725222,Designing Data-Intensive Applications,classics


## Join operation: second try

In [0]:

# Second attempt: books on the LEFT this time, so every book row is kept and
# only `genre` can be null when a book has no matching genre row.
isbn_match = genre_df.isbn == books_df.isbn
data_v2 = (
    books_df
    .join(genre_df, isbn_match, "left")
    .select(books_df.isbn, books_df.name, books_df.author, genre_df.genre)
)

In [0]:

from pyspark.sql.functions import col, isnan, when, count

# Quality check on the second join: per-column count of NaN or NULL values.
# Iterate over `data_v2.columns`; the original used `df.columns`, but `df`
# is never defined in this notebook and the cell failed with NameError.
missing_per_column = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in data_v2.columns
]
data_v2.select(missing_per_column).show()

In [0]:
%python
# Row count of the genres input.
genre_df.count()

In [0]:

# Row count of the books input.
books_df.count()

In [0]:

# Row count of `data`, the result of the first join.
# NOTE(review): this section builds `data_v2`; confirm whether `data_v2.count()` was intended here.
data.count()

## Fix missing data

In [0]:

# Replace missing genres with the default "classics", then persist the FIXED frame.
# The original wrote `data` (the first join) here, which left `data_v3` unused and
# re-saved the unfixed dataset; write `data_v3` so the fix actually reaches the branch.
data_v3 = data_v2.fillna("classics", subset=["genre"])
data_v3.write.mode('overwrite').parquet(chaos_repo_path+"books-dataset")

In [0]:

# Diff 'experiment-chaos' again and display the `.results` of the response,
# to see what the writes above changed on the branch.
branch_diff = client.branches.diff_branch(repository='dais22-adi', branch='experiment-chaos')
branch_diff.results

# Git like interface - Branching out

![](https://docs.lakefs.io/assets/img/branching_7.png)
<!-- ![](https://miro.medium.com/max/1400/0*2N9qc0DlQmD_nK_Q.png) -->
<!-- <img src="https://miro.medium.com/max/1400/0*2N9qc0DlQmD_nK_Q.png" alt="drawing" style="width:50px;height:20px;"/> -->

## Cross collection consistency
We often need consistency between different data collections. A few examples may be:

* To join different collections in order to create a unified view of an account, a user or another entity we measure.
* To introduce the same data in different formats
* To introduce the same data with a different leading index or sorting due to performance considerations

![](https://docs.lakefs.io/assets/img/branching_8.png)

## We need to modify the materialized view!

In [0]:
%python
# Rebuild the view with books on the LEFT, matched to genres by ISBN, so every
# book row is kept and `genre` is null where no genre row matches.
isbn_match = genre_df.isbn == books_df.isbn
data_v2 = (
    books_df
    .join(genre_df, isbn_match, "left")
    .select(books_df.isbn, books_df.name, books_df.author, genre_df.genre)
)

In [0]:

from pyspark.sql.functions import col, isnan, when, count

# Per-column count of NaN or NULL values in the rebuilt view.
# Uses `data_v2.columns`: the original referenced `df.columns`, and `df` is
# not defined anywhere in this notebook, so the cell raised NameError.
missing_per_column = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in data_v2.columns
]
data_v2.select(missing_per_column).show()

In [0]:

# Overwrite the stored books dataset on the experiment branch.
# NOTE(review): this persists `data` (the first join), not `data_v2`, and `data_v3`
# is only assigned in the following cell — confirm which frame should be written.
books_dataset_path = chaos_repo_path + "books-dataset"
data.write.mode('overwrite').parquet(books_dataset_path)

In [0]:

# Fill missing values in the `genre` column with the default "classics".
data_v3 = data_v2.na.fill("classics", subset=["genre"])