# [Integration of lakeFS with Airflow](https://docs.lakefs.io/integrations/airflow.html)

## Import Python packages

In [None]:
%xmode Minimal
import lakefs
import os
import time
from airflow.models import DagBag
import lakefs_demo

## Set environment variables

In [None]:
# Publish the lakeFS endpoint and key pair as LAKECTL_* environment variables
# in a single call (entries are applied in the order written).
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

## Verify lakeFS credentials by getting the lakeFS version

In [None]:
# Check the lakeFS endpoint/credentials by asking the server for its version.
# On failure the kernel process is terminated so that no later cell runs.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the
    # underlying cause instead of hiding it.
    print("🛑 failed to get lakeFS version")
    print(f"{type(err).__name__}: {err}")
    # Presumably gives the messages above time to reach the notebook
    # front end before the process goes away.
    time.sleep(3)
    # os._exit ends the process immediately, without cleanup handlers.
    os._exit(0)
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

## S3A Gateway configuration

##### Note: lakeFS can be configured to work with Spark in two ways:
###### * Access lakeFS using the S3A gateway https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-s3a-gateway.
###### * Access lakeFS using the lakeFS-specific Hadoop FileSystem https://docs.lakefs.io/integrations/spark.html#access-lakefs-using-the-lakefs-specific-hadoop-filesystem.

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse the active SparkContext (or create one) and wrap it in a session.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

# Route the S3A filesystem to lakeFS: the lakeFS key pair and endpoint are
# written into the context's Hadoop configuration, with path-style access on.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", lakefsAccessKey)
hadoop_conf.set("fs.s3a.secret.key", lakefsSecretKey)
hadoop_conf.set("fs.s3a.endpoint", lakefsEndPoint)
hadoop_conf.set("fs.s3a.path.style.access", "true")

## Start Airflow

In [None]:
# Report whether the webserver PID file is already present; the cells below
# perform the actual start-up when it is not.
webserver_pid_exists = os.path.exists('./airflow/airflow-webserver.pid')
print("Airflow is running" if webserver_pid_exists else "Starting Airflow")

In [None]:
%%script bash --bg --out script_out --err script_error
# Initialise the Airflow database, create the admin user and start the
# webserver as a daemon -- unless the webserver PID file already exists.
FILE=./airflow/airflow-webserver.pid
if test -f "$FILE"; then
    echo "Airflow Webserver is running"
else
    # The credential expansions are double-quoted so that a user name or
    # password containing spaces or glob characters reaches the CLI as one
    # literal argument instead of being word-split by the shell.
    (echo "Starting Airflow Webserver";
    airflow db init;
    airflow users create \
        --username "$_AIRFLOW_WWW_USER_USERNAME" \
        --password "$_AIRFLOW_WWW_USER_PASSWORD" \
        --firstname "$_AIRFLOW_WWW_USER_USERNAME" \
        --lastname "$_AIRFLOW_WWW_USER_USERNAME" \
        --role Admin \
        --email admin@example.com;
    airflow webserver --port 8080 -D)
fi

In [None]:
# Block until the webserver PID file appears, polling every 10 seconds.
# The start-up runs in a background script cell, so a failure there is not
# visible here; a deadline keeps the notebook from hanging forever.
webserver_pid_path = './airflow/airflow-webserver.pid'
startup_deadline = time.monotonic() + 600  # generous: 10 minutes
while not os.path.exists(webserver_pid_path):
    if time.monotonic() >= startup_deadline:
        raise TimeoutError(
            f"Airflow webserver did not create {webserver_pid_path} "
            "within 10 minutes; inspect script_out / script_error for "
            "the start-up output"
        )
    time.sleep(10)
    print("Starting Airflow")

In [None]:
%%script bash --bg --out script_out --err script_error
# Launch the first scheduler as a daemon unless its PID file already exists.
pid_file=./airflow/airflow-scheduler1.pid
if [ ! -f "$pid_file" ]; then
    airflow scheduler --pid "$pid_file" -D
else
    echo "Airflow Scheduler1 is running"
fi

In [None]:
%%script bash --bg --out script_out --err script_error
# Launch the second scheduler as a daemon unless its PID file already exists.
pid_file=./airflow/airflow-scheduler2.pid
if [ ! -f "$pid_file" ]; then
    airflow scheduler --pid "$pid_file" -D
else
    echo "Airflow Scheduler2 is running"
fi

## Create Airflow connections for lakeFS and Spark

In [None]:
! airflow connections delete conn_lakefs
lakeFSConnectionCommand = 'airflow connections add conn_lakefs --conn-type=http --conn-host=' + lakefsEndPoint + ' --conn-extra=\'{"access_key_id":"' + lakefsAccessKey + '","secret_access_key":"' + lakefsSecretKey + '"}\''
! $lakeFSConnectionCommand > ./airflow/airflow-connection.txt
with open("./airflow/airflow-connection.txt", "r") as file:
    last_line = file.readlines()[-1]
print(last_line)

! airflow connections delete conn_spark
sparkConnectionCommand = 'airflow connections add conn_spark --conn-type=spark --conn-host=local[*]'
! $sparkConnectionCommand > ./airflow/airflow-connection.txt
with open("./airflow/airflow-connection.txt", "r") as file:
    last_line = file.readlines()[-1]
print(last_line)

## Set the Airflow variables used by the demo workflow

In [None]:
! airflow variables set lakefsAccessKey $lakefsAccessKey
! airflow variables set lakefsSecretKey $lakefsSecretKey
! airflow variables set lakefsEndPoint $lakefsEndPoint
! airflow variables set repo $repo
! airflow variables set sourceBranch $sourceBranch
! airflow variables set newBranch $newBranch
! airflow variables set conn_lakefs 'conn_lakefs'

spark_home = os.getenv('SPARK_HOME')
! airflow variables set spark_home $spark_home

if lakefsEndPoint.startswith('http://host.docker.internal'):
    lakefsUIEndPoint = lakefsEndPoint.replace('host.docker.internal','127.0.0.1')
else:
    lakefsUIEndPoint = lakefsEndPoint
! airflow variables set lakefsUIEndPoint $lakefsUIEndPoint