From 70b677719282a0d1ea269c225cb6c466f646781b Mon Sep 17 00:00:00 2001
From: GitHub Action
Date: Wed, 27 Apr 2022 21:10:44 +0200
Subject: [PATCH] docs: Update documentation to showcase new connection feature

---
 README.md                                 |  8 ++-
 docs/development.rst                      | 16 ++++--
 docs/getting_started.rst                  |  8 +--
 docs/introduction.rst                     | 61 ++++++++++++++++++++++-
 examples/airflow_connection_target_dag.py | 52 +++++++++++++++++++
 5 files changed, 134 insertions(+), 11 deletions(-)
 create mode 100644 examples/airflow_connection_target_dag.py

diff --git a/README.md b/README.md
index 1eb9aaf..e6c69d0 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ With poetry:
 poetry install
 ```
 
-Install any extras you need, and only those you need:
+Install with any necessary extras:
 
 ``` shell
 poetry install -E postgres -E redshift
 ```
@@ -85,6 +85,12 @@ As of the time of writing S3 is the only supported backend for dbt projects, but
 
 Each dbt execution produces one or more [JSON artifacts](https://docs.getdbt.com/reference/artifacts/dbt-artifacts/) that are valuable to produce meta-metrics, build conditional workflows, for reporting purposes, and other uses. airflow-dbt-python can push these artifacts to [XCom](https://airflow.apache.org/docs/apache-airflow/stable/concepts/xcoms.html) as requested via the `do_xcom_push_artifacts` parameter, which takes a list of artifacts to push.
 
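+As a quick sketch of how this looks in practice (the task id and project path below are illustrative), artifacts are requested by their file names:
+
+``` python
+from airflow_dbt_python.dbt.operators import DbtRunOperator
+
+dbt_run = DbtRunOperator(
+    task_id="dbt_run",
+    project_dir="/path/to/my/dbt/project/",
+    # Push these dbt artifacts to XCom once the task finishes.
+    do_xcom_push_artifacts=["manifest.json", "run_results.json"],
+)
+```
+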
+## Use Airflow connections as dbt targets (without a profiles.yml)
+
+[Airflow connections](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html) allow users to manage and store connection information, such as hostname, port, username, and password, for operators to use when accessing certain applications, like databases. Similarly, a dbt `profiles.yml` file stores connection information under each target key. `airflow-dbt-python` bridges the gap between the two and allows you to use connection information stored as an Airflow connection by specifying the connection id as the `target` parameter of any of the dbt operators it provides. What's more, if using an Airflow connection, the `profiles.yml` file may be entirely omitted (although keep in mind that a `profiles.yml` file may also contain configuration beyond target connection information).
+
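+As a minimal sketch (the connection id and project path are illustrative), pointing a dbt operator at an Airflow connection looks like this:
+
+``` python
+from airflow_dbt_python.dbt.operators import DbtRunOperator
+
+dbt_run = DbtRunOperator(
+    task_id="dbt_run",
+    # The id of an existing Airflow connection, used as the dbt target.
+    target="my_db_connection",
+    # No profiles.yml is required; if one is provided, the connection is merged
+    # into its existing targets.
+    profiles_dir=None,
+    project_dir="/path/to/my/dbt/project/",
+)
+```
+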
+See a complete example DAG [here](examples/airflow_connection_target_dag.py).
+
 # Motivation
 
 ## Airflow running in a managed environment
diff --git a/docs/development.rst b/docs/development.rst
index 66316dd..d5020c6 100644
--- a/docs/development.rst
+++ b/docs/development.rst
@@ -10,6 +10,8 @@ Poetry
 
 airflow-dbt-python uses `Poetry <https://python-poetry.org>`_ for project management. Ensure it's installed before running: see `Poetry's installation documentation <https://python-poetry.org/docs/#installation>`_.
 
+As of ``airflow-dbt-python`` version 0.14, we have moved the project to Poetry >= 1.2.0 to allow us to use dependency groups.
+
 Installing Airflow
 ------------------
 
@@ -26,22 +28,26 @@ Installing the ``airflow`` extra will fetch the latest version of Airflow with m
 
 .. code-block:: shell
 
-   cd airflow-dbt-python
    poetry install -E airflow
 
+Some features require Airflow providers. For example, any S3 backend operations require ``apache-airflow-providers-amazon``. These providers may be installed individually or with the ``airflow-providers`` extra:
+
+.. code-block:: shell
+
+   poetry install -E airflow-providers
 
 Building from source
 --------------------
 
 Clone the main repo and install it:
 
-
 .. code-block:: shell
 
    git clone https://github.com/tomasfarias/airflow-dbt-python.git
    cd airflow-dbt-python
-   poetry install
+   poetry install --with dev
 
+The dev dependency group includes development tools for code formatting, type checking, and testing.
 
 Pre-commit hooks
 ----------------
@@ -78,11 +84,11 @@ Requirements
 
 Unit tests interact with a `PostgreSQL <https://www.postgresql.org>`_ database as a target to run dbt commands. This requires PostgreSQL to be installed in your local environment. Installation instructions for all major platforms can be found here: https://www.postgresql.org/download/.
 
-Some unit tests require the `Amazon provider package for Airflow <https://airflow.apache.org/docs/apache-airflow-providers-amazon/>`_. Ensure it's installed via the ``amazon`` extra:
+Some unit tests require the `Amazon provider package for Airflow <https://airflow.apache.org/docs/apache-airflow-providers-amazon/>`_. Ensure it's installed via the ``airflow-providers`` extra:
 
 .. code-block:: shell
 
-   poetry install -E amazon
+   poetry install -E airflow-providers
 
 Running unit tests with pytest
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index c2a1c5a..edabcc1 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -54,18 +54,18 @@ airflow-dbt-python can also be built from source by cloning the main repo:
 
    git clone https://github.com/tomasfarias/airflow-dbt-python.git
    cd airflow-dbt-python
 
-And installing with ``poetry`` (without development dependencies):
+And installing with ``poetry``:
 
 .. code-block:: shell
 
-   poetry install --no-dev
+   poetry install
 
 As with ``pip``, any extra adapters can be installed:
 
 .. code-block:: shell
 
-   poetry install -E postgres -E redshift -E bigquery -E snowflake --no-dev
-   poetry install -E all --no-dev
+   poetry install -E postgres -E redshift -E bigquery -E snowflake
+   poetry install -E all
 
 Installing in MWAA
 ^^^^^^^^^^^^^^^^^^
diff --git a/docs/introduction.rst b/docs/introduction.rst
index 179ab75..67f8212 100644
--- a/docs/introduction.rst
+++ b/docs/introduction.rst
@@ -49,7 +49,7 @@ This way, artifacts may be pulled and operated on by downstream tasks. For examp
 
 .. code-block:: python
    :linenos:
-   :caption: example_dbt_artifacts.py
+   :caption: example_dbt_artifacts_dag.py
 
    import datetime as dt
 
@@ -87,3 +87,62 @@ This way, artifacts may be pulled and operated on by downstream tasks. For examp
        )
 
    dbt_run >> process_artifacts
+
+Use Airflow connections as dbt targets (without a profiles.yml)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+`Airflow connections <https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html>`_ allow users to manage and store connection information, such as hostname, port, username, and password, for operators to use when accessing certain applications, like databases. Similarly, a dbt ``profiles.yml`` file stores connection information under each target key.
+
+``airflow-dbt-python`` bridges the gap between the two and allows you to use connection information stored as an Airflow connection by specifying the connection id as the ``target`` parameter of any of the dbt operators it provides. What's more, if using an Airflow connection, the ``profiles.yml`` file may be entirely omitted (although keep in mind that a ``profiles.yml`` file may also contain configuration beyond target connection information).
+
+.. code-block:: python
+   :linenos:
+   :caption: airflow_connection_target_dag.py
+
+   import datetime as dt
+   import json
+
+   from airflow import DAG, settings
+   from airflow.models.connection import Connection
+   from airflow.utils.dates import days_ago
+   from airflow_dbt_python.dbt.operators import DbtRunOperator
+
+   # For illustration purposes, and to keep the example self-contained, we create
+   # a Connection using Airflow's ORM. However, any method of loading connections
+   # would work, like Airflow's UI, Airflow's CLI, or deployment scripts.
+   my_conn = Connection(
+       conn_id="my_db_connection",
+       conn_type="postgres",
+       description="A test postgres connection",
+       host="localhost",
+       login="username",
+       port=5432,
+       schema="my_dbt_schema",
+       password="password",  # pragma: allowlist secret
+       # Other dbt parameters can be added as extras
+       extra=json.dumps(dict(threads=4, sslmode="require")),
+   )
+   session = settings.Session()
+   session.add(my_conn)
+   session.commit()
+
+   with DAG(
+       dag_id="example_airflow_connection",
+       schedule_interval="0 * * * *",
+       start_date=days_ago(1),
+       catchup=False,
+       dagrun_timeout=dt.timedelta(minutes=60),
+   ) as dag:
+       dbt_run = DbtRunOperator(
+           task_id="dbt_run_hourly",
+           target="my_db_connection",
+           # Profiles file is not needed as we are using an Airflow connection.
+           # If a profiles file is used, the Airflow connection will be merged into
+           # the existing targets.
+           profiles_dir=None,  # Defaults to None so this may be omitted.
+           project_dir="/path/to/my/dbt/project/",
+           select=["+tag:hourly"],
+           exclude=["tag:deprecated"],
+       )
diff --git a/examples/airflow_connection_target_dag.py b/examples/airflow_connection_target_dag.py
new file mode 100644
index 0000000..e24db9d
--- /dev/null
+++ b/examples/airflow_connection_target_dag.py
@@ -0,0 +1,52 @@
+"""Sample basic DAG which showcases using an Airflow Connection as target."""
+import datetime as dt
+import json
+
+from airflow import DAG, settings
+from airflow.models.connection import Connection
+from airflow.utils.dates import days_ago
+
+from airflow_dbt_python.dbt.operators import DbtRunOperator
+
+# For illustration purposes, and to keep the example self-contained, we create
+# a Connection using Airflow's ORM. However, any method of loading connections
+# would work, like Airflow's UI, Airflow's CLI, or deployment scripts.
+my_conn = Connection(
+    conn_id="my_db_connection",
+    conn_type="postgres",
+    description="A test postgres connection",
+    host="localhost",
+    login="username",
+    port=5432,
+    schema="my_dbt_schema",
+    password="password",  # pragma: allowlist secret
+    # Other dbt parameters can be added as extras
+    extra=json.dumps(dict(threads=4, sslmode="require")),
+)
+
+
+if settings.Session is None:
+    settings.configure_orm()
+
+session = settings.Session()
+session.add(my_conn)
+session.commit()
+
+with DAG(
+    dag_id="example_airflow_connection",
+    schedule_interval="0 * * * *",
+    start_date=days_ago(1),
+    catchup=False,
+    dagrun_timeout=dt.timedelta(minutes=60),
+) as dag:
+    dbt_run = DbtRunOperator(
+        task_id="dbt_run_hourly",
+        target="my_db_connection",
+        # Profiles file is not needed as we are using an Airflow connection.
+        # If a profiles file is used, the Airflow connection will be merged into
+        # the existing targets.
+        profiles_dir=None,  # Defaults to None so this may be omitted.
+        project_dir="/path/to/my/dbt/project/",
+        select=["+tag:hourly"],
+        exclude=["tag:deprecated"],
+    )