# Integration of lakeFS with Airflow via Hooks

## Use Case: Versioning DAGs and running pipelines from hooks with a configurable DAG version

## Versioning Information

In [None]:
# Branch names; both are exported to Airflow as variables further down.
sourceBranch = 'main'
newBranch = 'version1'

# Local folder that is added to sys.path in the import cell below.
local_path = 'airflow/DAG_Versioning'

# Folder names used to build paths on lakeFS.
dags_folder_on_lakefs = 'dags'
data_folder_on_lakefs = 'data'
actions_folder_on_lakefs = '_lakefs_actions'

# Data file name and the partitioned-data path, both rooted in the data folder.
fileName = 'lakefs_test.csv'
newPath = f"{data_folder_on_lakefs}/partitioned_data"

# DAG name and DAG template file name; both are exported to Airflow as variables.
dag_name = 'lakefs_versioning_dag'
dag_template_filename = 'lakefs_versioning_dag_template.py'

## Import Python packages

In [None]:
import lakefs_client
import lakefs_demo
import os
from airflow.models import DagBag

# sys is needed to extend the module search path below
import sys

# Put the local_path folder at the front of sys.path so that the
# lakefs_create_dag module imported next can be found there
sys.path.insert(0, './'+local_path)

# Must come after the sys.path change above; not called in this part of the notebook
from lakefs_create_dag import lakefs_create_dag

## Working with the lakeFS Python client API

###### Note: To learn more about lakeFS Python integration visit https://docs.lakefs.io/integrations/python.html

In [None]:
# Keep tracebacks in the notebook output short (exception name and message only)
%xmode Minimal
from lakefs_client import models
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
# lakefsAccessKey, lakefsSecretKey and lakefsEndPoint are not defined in this part of
# the notebook; they must already exist in the namespace when this cell runs
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

# Client built from the configuration above; used by the credential check that follows
client = LakeFSClient(configuration)

## Verify lakeFS credentials by getting lakeFS version

In [None]:
# Request the lakeFS version as a connectivity/credential check; the returned value is
# discarded. NOTE(review): presumably the call raises on bad credentials or an
# unreachable endpoint, so the second message is only printed on success — confirm.
print("Verifying lakeFS credentials")
client.config.get_lake_fs_version()
print("lakeFS credentials verified")

## S3A Gateway configuration

##### Note: lakeFS can be configured to work with Spark in two ways:
###### * Access lakeFS using the S3A gateway https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-s3a-gateway.
###### * Access lakeFS using the lakeFS-specific Hadoop FileSystem https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-lakefs-specific-hadoop-filesystem.

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Reuse the active SparkContext if there is one, otherwise create it,
# then wrap it in a SparkSession.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Apply the S3A settings (lakeFS credentials, endpoint, path-style access)
# to the Hadoop configuration, in the order listed.
hadoop_conf = sc._jsc.hadoopConfiguration()
for s3a_option, s3a_setting in (
    ("fs.s3a.access.key", lakefsAccessKey),
    ("fs.s3a.secret.key", lakefsSecretKey),
    ("fs.s3a.endpoint", lakefsEndPoint),
    ("fs.s3a.path.style.access", "true"),
):
    hadoop_conf.set(s3a_option, s3a_setting)

## Start Airflow

In [None]:
# Progress message only; the cells that follow stop, configure and launch Airflow
print("Starting Airflow")

In [None]:
# Stop any Airflow processes that are already running before starting a fresh instance.
# NOTE(review): pkill is issued three times, presumably to catch processes that are
# respawned or slow to exit — confirm whether a single call is enough.
! pkill airflow
! pkill airflow
! pkill airflow

In [None]:
# Select basic auth as the Airflow API auth backend via an environment variable set in
# this kernel, before Airflow is launched in the next cell
%env AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.basic_auth

In [None]:
%%script bash --bg --out script_out --err script_error
# Runs in the background (--bg) so the notebook can continue; stdout and stderr are
# exposed to the notebook as script_out and script_error.
airflow standalone

In [None]:
# Wait for Airflow to start
# NOTE(review): this is a fixed 20-second pause with no readiness check, so a slow
# machine may need a longer wait before the following cells succeed.
! sleep 20

In [None]:
# Printed after the fixed wait above; it does not itself check that Airflow is up
print("Airflow Started")

## Create Airflow connections for lakeFS

In [None]:
! airflow connections delete conn_lakefs
lakeFSConnectionCommand = 'airflow connections add conn_lakefs --conn-type=http --conn-host=' + lakefsEndPoint + ' --conn-extra=\'{"access_key_id":"' + lakefsAccessKey + '","secret_access_key":"' + lakefsSecretKey + '"}\''
! $lakeFSConnectionCommand

! airflow connections delete conn_spark
sparkConnectionCommand = 'airflow connections add conn_spark --conn-type=spark --conn-host=local[*]'
! $sparkConnectionCommand

## Set Airflow variables which are used by the demo workflow

In [None]:
# Publish the notebook's settings as Airflow variables. IPython substitutes each $name
# with the value of the Python variable of that name before the shell runs the command.
# lakefsAccessKey, lakefsSecretKey, lakefsEndPoint, lakefsUIEndPoint and repo are not
# defined in this part of the notebook and must already exist in the namespace.
# NOTE(review): the values are passed to the shell unquoted, so a value containing
# spaces or shell metacharacters would break the command.
! airflow variables set lakefsAccessKey $lakefsAccessKey
! airflow variables set lakefsSecretKey $lakefsSecretKey
! airflow variables set lakefsEndPoint $lakefsEndPoint
! airflow variables set lakefsUIEndPoint $lakefsUIEndPoint
! airflow variables set repo $repo
! airflow variables set sourceBranch $sourceBranch
! airflow variables set newBranch $newBranch
# The Airflow variable "fileName" receives the path inside the data folder,
# not the bare file name
fileName_on_lakefs = data_folder_on_lakefs + '/' + fileName
! airflow variables set fileName $fileName_on_lakefs
! airflow variables set newPath $newPath
# Literal value: the id of the lakeFS connection created in the previous section
! airflow variables set conn_lakefs 'conn_lakefs'
! airflow variables set dags_folder_on_lakefs $dags_folder_on_lakefs
! airflow variables set dag_name $dag_name
! airflow variables set dag_template_filename $dag_template_filename

# NOTE(review): os.getenv returns None when SPARK_HOME is unset, in which case the
# text "None" would be stored — confirm SPARK_HOME is always set in this environment.
spark_home = os.getenv('SPARK_HOME')
! airflow variables set spark_home $spark_home

## Copy DAG programs to Airflow DAGs directory and sync to Airflow database

In [None]:
# Copy the three *_auto.py DAG files from the DAG_Versioning folder into ./airflow/dags
! cp ./airflow/DAG_Versioning/lakefs_create_dag_auto.py ./airflow/dags
! cp ./airflow/DAG_Versioning/lakefs_delete_dag_auto.py ./airflow/dags
! cp ./airflow/DAG_Versioning/lakefs_trigger_dag_auto.py ./airflow/dags

# Load the DAGs (Airflow's bundled example DAGs excluded) and write them to the Airflow
# database. NOTE(review): DagBag is created without an explicit folder, so this
# presumably relies on ./airflow/dags being the configured DAGs folder — confirm.
dagbag = DagBag(include_examples=False)
dagbag.sync_to_db()

## Unpause Airflow DAGs

In [None]:
# Unpause the three DAGs by id.
# NOTE(review): the ids are assumed to be the ones declared inside the *_auto.py files
# copied in the previous section — confirm against those files.
! airflow dags unpause lakefs_create_dag
! airflow dags unpause lakefs_delete_dag
! airflow dags unpause lakefs_trigger_dag