In [None]:
print("hello world")

In [None]:
for i in range(10):
    # % is the modulus (remainder) operator
    # 10 % 3 = 1
    # 10 % 2 = 0
    # 10 % 4 = 2
    # so this if statement prints even numbers
    if i % 2 == 0:
        print(i)

# Aaron's demo

Run a dbt Cloud job from a Python notebook kicked off via Databricks API in a Python model that returns no real data.

https://www.loom.com/share/f649bc7dcb244d548269d4c1804e8db2 

## What happens when you `dbt run`?

1. The Python model code is compiled into the `target/` directory, just like SQL.

2. The code is shipped off and executed in the warehouse, just like SQL.

3. The Python code executes -- `dbutils.notebook.run(notebook, 600)` runs an existing notebook in the Databricks workspace

4. That notebook uses a secret for a dbt Cloud credential and makes an API request to execute a dbt Cloud job

In [None]:
# models/run_notebook.py in dbt project


def model(dbt, session):

    dbt.config(materialized="table")

    notebook = "linked_notebook_test"
    dbutils.notebook.run(notebook, 600)
    # this could be replaced with something like:
    # os.system("pip install --upgrade papermill")
    # os.system(f"papermill {notebook} -k python -")

    df = session.sql(
        """
        select 
            'snowflake_dbt_job_notebook' as notebook_triggered,
            current_timestamp() as runtime
        """
    )

    return df

In [None]:
# linked_notebook_test.ipynb in Databricks workspace

import os  # built-in Python module
import requests  # installed with `pip install requests`

from datetime import datetime  # built-in Python module

DBT_ACCOUNT_ID = 1
DBT_JOB_ID = 2

DBT_API_KEY = os.environ.get("API_KEY")

r = requests.post(
    url=f"https://cloud.getdbt.com/api/v2/accounts/{DBT_ACCOUNT_ID}/jobs/{DBT_JOB_ID}/run",
    headers={"Authorization": f"Token {DBT_API_KEY}"},
    json={
        "cause": f"Triggered by a Python notebook triggered by a dbt Python model at {datetime.now()}"
    },
)

r.json()

## How I (almost) solved a difficult problem

Trying to replicate `orders.sql` to `orders.py`.

In [3]:
# from orders.sql in jaffle_shop/models; parts cut out for brevity

sql = """
{% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}

select
    order_id,

    {% for payment_method in payment_methods -%}
    sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
    {% endfor -%}

    sum(amount) as total_amount

from payments

group by order_id
""".strip()

print(sql)

{% set payment_methods = ['credit_card', 'coupon', 'bank_transfer', 'gift_card'] %}

select
    order_id,

    {% for payment_method in payment_methods -%}
    sum(case when payment_method = '{{ payment_method }}' then amount else 0 end) as {{ payment_method }}_amount,
    {% endfor -%}

    sum(amount) as total_amount

from payments

group by order_id


In [8]:
py_sql_prefix = """
    select

        order_id,
        sum(amount) as total_amount,

""".lstrip()

py_sql_postfix = """

    from payments

    group by order_id
""".rstrip()

payment_methods = ["credit_card", "coupon", "bank_transfer", "gift_card"]

py_sql_middle = ""

for payment_method in payment_methods:
    py_sql_middle += f"\tsum(case when payment_method = {payment_method} then amount else 0 end) as {payment_method}_amount,\n"

py_sql_middle = py_sql_middle.rstrip(",\n") + "\n"

py_sql = py_sql_prefix + py_sql_middle + py_sql_postfix

print(py_sql)

select

        order_id,

	sum(case when payment_method = credit_card then amount else 0 end) as credit_card_amount,
	sum(case when payment_method = coupon then amount else 0 end) as coupon_amount,
	sum(case when payment_method = bank_transfer then amount else 0 end) as bank_transfer_amount,
	sum(case when payment_method = gift_card then amount else 0 end) as gift_card_amount,

        sum(amount) as total_amount

    from payments

    group by order_id


In [None]:
df.select(py_sql)

In [None]:
# tracking
import mlflow

# pydata/ml
import sklearn as sklearn

import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# viz
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# snowflake
import yaml
import snowflake.snowpark

from snowflake.snowpark import types
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, udf, sql_expr

In [None]:
# setup viz defaults
sns.set_theme()
sns.set(rc={"figure.figsize": (16, 18)})
sns.set_style("darkgrid")
plt.style.use(["dark_background"])

In [None]:
with open("/home/vscode/.dbt/profiles.yml", "r") as f:
    profiles = yaml.safe_load(f)
    dev_profile = profiles["snowflake"]["outputs"]["dev"]

conn_params = {
    "account": dev_profile["account"],
    "user": dev_profile["user"],
    "role": dev_profile["role"],
    "warehouse": dev_profile["warehouse"],
    "database": dev_profile["database"],
    "schema": dev_profile["schema"],
    "authenticator": dev_profile["authenticator"],
}
conn_params

In [None]:
s = Session.builder.configs(conn_params).create()

In [None]:
%%time

model = "raw_customers"

df = s.table(model)

df.show(5)

In [None]:
import logging

for logger_name in ("snowflake.snowpark", "snowflake.connector"):
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(threadName)s %(filename)s:%(lineno)d - %(funcName)s() - %(levelname)s - %(message)s"
        )
    )
    logger.addHandler(ch)

df.to_pandas()

In [None]:
%%time

model = "customers"

df = s.table(model)
df

In [None]:
df.show()

In [None]:
# dare to dance with the devil?
# df = df.toPandas()
# df = df.to_pandas()