### Installing lakeFS python client

In [2]:
!pip install lakefs_client




In [3]:
import os
from datetime import date, time


In [4]:
import lakefs_client
from lakefs_client import models
from lakefs_client.client import LakeFSClient


### Configuring lakeFSClient and Spark

In [5]:
# lakeFS credentials and endpoint.
# The defaults are the sample AccessKey / SecretKey present in the docker-compose.yaml
# file used to spin up the "everything bagel" environment. Set LAKEFS_ACCESS_KEY,
# LAKEFS_SECRET_KEY or LAKEFS_ENDPOINT in the environment to target another lakeFS
# installation without editing (or committing secrets into) this notebook.
lakefsAccessKey = os.environ.get("LAKEFS_ACCESS_KEY", "AKIAIOSFODNN7EXAMPLE")
lakefsSecretKey = os.environ.get("LAKEFS_SECRET_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
lakefsEndPoint = os.environ.get("LAKEFS_ENDPOINT", "http://lakefs:8000")

# The access key / secret key pair is passed as the client's username / password.
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint


In [6]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col,isnan,when,count

# Reuse (or start) the Spark context and wrap it in a session.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Configure the S3A filesystem with the lakeFS credentials and endpoint.
hadoop_conf = sc._jsc.hadoopConfiguration()
s3a_settings = {
    "fs.s3a.access.key": lakefsAccessKey,
    "fs.s3a.secret.key": lakefsSecretKey,
    "fs.s3a.endpoint": lakefsEndPoint,
    "fs.s3a.path.style.access": "true",
}
for setting_name, setting_value in s3a_settings.items():
    hadoop_conf.set(setting_name, setting_value)


In [7]:
# lakeFS API client built from the credentials/endpoint configuration defined above;
# every client.branches / client.objects / client.commits / client.refs call below uses it.
client = LakeFSClient(configuration)


# Create Netflix Movies Repository

In [44]:
# Name of the lakeFS repository used by every API call and s3a:// path in this notebook.
repo_name = "netflix-movies-data"

### TODO: Show how to create the bucket and repository from the UI

# Creating Ingest and Staging branches

In [46]:
# Branch that receives the uploaded daily file.
ingest_branch = "ingress-landing-area"
# Branch where the data is copied, cleaned and written as Parquet.
staging_branch = "staging-area"
# Source branch for the two branches above and the destination of the final merge.
prod_branch = "main"


In [47]:
# Branches present in the repository before the new ones are created.
client.branches.list_branches(repository=repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 1},
 'results': [{'commit_id': 'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1',
              'id': 'main'}]}

In [48]:
# Branch the ingest (landing) area off the production branch.
ingest_branch_spec = models.BranchCreation(name=ingest_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=ingest_branch_spec)

'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1'

In [49]:
# Branch the staging area off the production branch.
staging_branch_spec = models.BranchCreation(name=staging_branch, source=prod_branch)
client.branches.create_branch(repository=repo_name, branch_creation=staging_branch_spec)

'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1'

In [50]:
# List the branches again to confirm the two new ones exist.
client.branches.list_branches(repository=repo_name)


{'pagination': {'has_more': False,
                'max_per_page': 1000,
                'next_offset': '',
                'results': 3},
 'results': [{'commit_id': 'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1',
              'id': 'ingress-landing-area'},
             {'commit_id': 'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1',
              'id': 'main'},
             {'commit_id': 'c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1',
              'id': 'staging-area'}]}

# Netflix movies daily partition lands in ingress path (branch)

In [51]:
# Local file name of the daily drop; it is also used as the object name inside the partition.
ingest_data = "movies.csv"

# Object path inside the branch: a dt=<today's ISO date> partition followed by the file name.
ingest_path = f'dt={str(date.today())}/{ingest_data}'
ingest_path


'dt=2022-11-08/movies.csv'

In [52]:
# Upload the local CSV to the ingest branch under today's partition path.
local_csv = f'./{ingest_data}'
with open(local_csv, 'rb') as csv_file:
    client.objects.upload_object(
        repository=repo_name,
        branch=ingest_branch,
        path=ingest_path,
        content=csv_file,
    )


In [53]:
# Uncommitted changes on the ingest branch.
ingest_diff = client.branches.diff_branch(repository=repo_name, branch=ingest_branch)
ingest_diff.results


[{'path': 'dt=2022-11-08/movies.csv',
  'path_type': 'object',
  'size_bytes': 1071619,
  'type': 'added'}]

In [54]:
# Commit the uploaded file on the ingest branch.
ingest_commit = models.CommitCreation(
    message="netflix movie data arrived at landing area (today's partition)"
)
client.commits.commit(
    repository=repo_name,
    branch=ingest_branch,
    commit_creation=ingest_commit,
)

{'committer': 'docker',
 'creation_date': 1667944331,
 'id': 'b394ad9f2e240903f78b68bded3703727417d4d97c365cbc9b1bf861ac8f2c66',
 'message': "netflix movie data arrived at landing area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1']}

# Copying daily partition from ingress to staging area (branch)

In [55]:
# s3a:// URI of the staging branch root, in the form s3a://<repository>/<branch>.
staging_long_path = f"s3a://{repo_name}/{staging_branch}"
staging_long_path

's3a://netflix-movies-data-1002/staging-area'

In [56]:
# Destination for the raw CSV copy on the staging branch, partitioned by today's date.
# NOTE(review): date.today() is evaluated again here, independently of ingest_path;
# the two dates could differ if the notebook is run across midnight — confirm this is acceptable.
csv_path = f"{staging_long_path}/raw/dt={str(date.today())}/csv"
csv_path

's3a://netflix-movies-data-1002/staging-area/raw/dt=2022-11-08/csv'

In [57]:
# Read the committed daily file from the ingest branch; the first row holds the column names.
ingest_uri = f"s3a://{repo_name}/{ingest_branch}/{ingest_path}"
csv_reader = spark.read.option("header", "true")
movies_df = csv_reader.csv(ingest_uri)

In [58]:
# Append the data, with a header row, to the raw CSV location on the staging branch.
csv_writer = movies_df.write.option("header", True).mode("append")
csv_writer.csv(csv_path)
    

In [59]:
# Uncommitted changes on the staging branch.
staging_diff = client.branches.diff_branch(repository=repo_name, branch=staging_branch)
staging_diff.results


[{'path': 'raw/dt=2022-11-08/csv/_SUCCESS',
  'path_type': 'object',
  'size_bytes': 1062857,
  'type': 'added'},
 {'path': 'raw/dt=2022-11-08/csv/part-00000-4045f35d-2e66-43f9-860e-aa052bc3612e-c000.csv',
  'path_type': 'object',
  'size_bytes': 1062857,
  'type': 'added'}]

In [60]:
# Commit the copied CSV files on the staging branch.
staging_csv_commit = models.CommitCreation(
    message="netflix movie data copied to staging area (today's partition)"
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=staging_csv_commit,
)

{'committer': 'docker',
 'creation_date': 1667944334,
 'id': '5c046800964ccfd5a06d65a62d0ecb5e4209bd5feb45cb93cf0a175b07abb399',
 'message': "netflix movie data copied to staging area (today's partition)",
 'meta_range_id': '',
 'metadata': {},
 'parents': ['c103ed371e2b627584462e3f1f6f7609cdc916633e3192873810a4c83e01e7a1']}

# Data Exploration and Cleaning in staging area (branch)

In [61]:
# Re-load the data from the staging branch and keep the column names for the null checks.
movies_df = (
    spark.read
    .option("header", "true")
    .csv(csv_path)
)
df_columns = movies_df.columns


In [62]:
# Row count of the staged data.
print(movies_df.count())
# printSchema() prints the schema itself and returns None, so it must not be wrapped
# in print() — doing so emits a stray "None" line after the schema.
movies_df.printSchema()


8791
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

None


In [63]:
# Preview the first ten rows.
movies_df.show(n=10)

+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|           director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+-------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|    Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|    Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|      Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|      Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|       Hai

In [64]:
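# Optional: uncomment to continue with a 10% sample (without replacement, seed 0) for faster iteration.
# movies_df = movies_df.sample(False,0.1,0)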


## Null checks

In [65]:
# Per-column count of values that are NaN or NULL.
null_count_exprs = [
    count(when(isnan(column_name) | col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_columns
]
movies_df.select(null_count_exprs).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       1|      1|         1|           1|     1|       2|        2|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



In [66]:
# Drop every row that has a null in any column.
movies_df = movies_df.na.drop(how="any")

In [67]:
# Re-run the per-column NaN/NULL count on the cleaned frame.
remaining_null_exprs = []
for column_name in df_columns:
    is_missing = isnan(column_name) | col(column_name).isNull()
    remaining_null_exprs.append(count(when(is_missing, column_name)).alias(column_name))
movies_df.select(remaining_null_exprs).show()


+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|show_id|type|title|director|country|date_added|release_year|rating|duration|listed_in|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+
|      0|   0|    0|       0|      0|         0|           0|     0|       0|        0|
+-------+----+-----+--------+-------+----------+------------+------+--------+---------+



In [68]:
# Writing Transformed Parquet files to staging area

In [69]:
# Append the cleaned data to the staging branch as Parquet, one directory per country.
# The original .option("header", True) was dropped: "header" is a CSV option and has
# no effect on the Parquet writer.
parquet_path = f"{staging_long_path}/analytics/movies-by-country-parquet"
(
    movies_df.write
    .partitionBy("country")
    .mode("append")
    .parquet(parquet_path)
)
    

### TODO: View uncommitted changes and clean up the files that are not needed

In [40]:
# Commit the Parquet output on the staging branch.
# NOTE(review): 'paritioned' in the message is a typo for 'partitioned'; the string is left unchanged here.
parquet_commit = models.CommitCreation(
    message='loaded paritioned movies parquet to staging area'
)
client.commits.commit(
    repository=repo_name,
    branch=staging_branch,
    commit_creation=parquet_commit,
)


{'committer': 'docker',
 'creation_date': 1667942344,
 'id': '54a8404b3ca79e32449f832305cad18a596083f9dcca1609f1670d17d0fa99c2',
 'message': 'loaded paritioned movies parquet to staging area',
 'meta_range_id': '',
 'metadata': {},
 'parents': ['4f8859c53eb3e32059acb553ffda421c04458a983a464336ab523e0852f747e3']}

# Merging Daily Data (Parquet files) to Prod

In [70]:
# Merge the staging branch into the production branch.
merge_args = dict(
    repository=repo_name,
    source_ref=staging_branch,
    destination_branch=prod_branch,
)
client.refs.merge_into_branch(**merge_args)


{'reference': 'b0cb44d84eb77c90ff801c63497d907cc53fa47fa84dc73bad9bebcdee53a9ea',
 'summary': {'added': 0, 'changed': 0, 'conflict': 0, 'removed': 0}}

## TODO: Clean up the staging area in the UI