In [1]:
import os
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from dateutil.parser import parse
import pyspark.sql.types as T
import pyspark.sql.functions as F
import pandas as pd
import psycopg2
import requests

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
try:
    spark.stop()
except Exception as e: 
    print(e)

spark = (
    SparkSession.builder \
    .appName("SparkLocalStackS3Integration") \
    .config("spark.hadoop.fs.s3a.endpoint", os.environ['AWS_ENDPOINT_URL'])\
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])\
    .config("spark.hadoop.fs.s3a.secret.key",os.environ["AWS_SECRET_ACCESS_KEY"])\
    .config("spark.hadoop.fs.s3a.path.style.access", "true")\
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")\
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")\
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

name 'spark' is not defined


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
pipeline_config_id = 1
DATALAKE_PATH = requests.get(f"http://backend:8000/configs/{pipeline_config_id}/").json()["upsert_write_path"]
print(DATALAKE_PATH)


s3a://my-storage-bucket/upsert-locations/postgres.public.task_instance_1_1


In [4]:
datalake_df = spark.read.format('delta').load(DATALAKE_PATH)

In [5]:
datalake_df.printSchema()

root
 |-- __op: string (nullable = true)
 |-- custom_operator_name: string (nullable = true)
 |-- dag_id: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- end_date: string (nullable = true)
 |-- executor: string (nullable = true)
 |-- executor_config: string (nullable = true)
 |-- external_executor_id: string (nullable = true)
 |-- hostname: string (nullable = true)
 |-- job_id: long (nullable = true)
 |-- map_index: long (nullable = true)
 |-- max_tries: long (nullable = true)
 |-- next_kwargs: string (nullable = true)
 |-- next_method: string (nullable = true)
 |-- operator: string (nullable = true)
 |-- pid: long (nullable = true)
 |-- pool: string (nullable = true)
 |-- pool_slots: long (nullable = true)
 |-- priority_weight: long (nullable = true)
 |-- queue: string (nullable = true)
 |-- queued_by_job_id: long (nullable = true)
 |-- queued_dttm: string (nullable = true)
 |-- rendered_map_index: string (nullable = true)
 |-- run_id: string (nullable = true)
 |

In [8]:
datalake_df.limit(10).toPandas()

                                                                                

Unnamed: 0,__op,custom_operator_name,dag_id,duration,end_date,executor,executor_config,external_executor_id,hostname,job_id,map_index,max_tries,next_kwargs,next_method,operator,pid,pool,pool_slots,priority_weight,queue,queued_by_job_id,queued_dttm,rendered_map_index,run_id,start_date,state,task_display_name,task_id,trigger_id,trigger_timeout,try_number,unixname,updated_at,__key,__topic,__partition,__offset,__timestamp,__timestampType,year,month,day
0,u,,continuous_python_dag,0.103745,2025-06-20T09:37:02.055085Z,,gAV9lC4=,d0b3381c-b484-498d-9210-50ead41d6bf6,F29B73BEE9F4,9,-1,1,,,PythonOperator,97,default_pool,1,1,default,1,2025-06-20T09:37:01.791883Z,,scheduled__2025-06-20T09:36:59.968862+00:00,2025-06-20 09:37:01.951340,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:02.061673Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,55,2025-06-20 09:37:02.885,0,2025,6,20
1,u,,continuous_python_dag,0.110174,2025-06-20T09:39:25.977925Z,,gAV9lC4=,92a9a85b-5c28-4b6b-89cb-45be2e80625c,F29B73BEE9F4,85,-1,1,,,PythonOperator,353,default_pool,1,1,default,1,2025-06-20T09:39:25.706210Z,,scheduled__2025-06-20T09:39:22.365746+00:00,2025-06-20 09:39:25.867751,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:25.984976Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,587,2025-06-20 09:39:26.431,0,2025,6,20
2,u,,continuous_python_dag,0.204726,2025-06-20T09:39:54.501605Z,,gAV9lC4=,16f7155c-7558-4025-9184-4105d68866f1,F29B73BEE9F4,103,-1,1,,,PythonOperator,414,default_pool,1,1,default,1,2025-06-20T09:39:53.984014Z,,scheduled__2025-06-20T09:39:52.294760+00:00,2025-06-20 09:39:54.296879,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:54.513067Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,713,2025-06-20 09:39:55.019,0,2025,6,20
3,u,,continuous_python_dag,0.133019,2025-06-20T09:37:33.633567Z,,gAV9lC4=,faf46252-488f-430b-ab1e-a031cfadfc67,F29B73BEE9F4,26,-1,1,,,PythonOperator,155,default_pool,1,1,default,1,2025-06-20T09:37:33.262826Z,,scheduled__2025-06-20T09:37:31.563692+00:00,2025-06-20 09:37:33.500548,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:33.640402Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,174,2025-06-20 09:37:34.028,0,2025,6,20
4,u,,continuous_python_dag,0.114372,2025-06-20T09:39:32.715930Z,,gAV9lC4=,a5bc2886-4c5c-4424-acbc-12a4ccca56a1,F29B73BEE9F4,89,-1,1,,,PythonOperator,372,default_pool,1,1,default,1,2025-06-20T09:39:32.440723Z,,scheduled__2025-06-20T09:39:29.739354+00:00,2025-06-20 09:39:32.601558,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:32.722738Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,615,2025-06-20 09:39:33.450,0,2025,6,20
5,u,,continuous_python_dag,0.105232,2025-06-20T09:37:13.224343Z,,gAV9lC4=,fa15031e-2641-4165-86be-f4b85f83e76d,F29B73BEE9F4,15,-1,1,,,PythonOperator,115,default_pool,1,1,default,1,2025-06-20T09:37:12.976663Z,,scheduled__2025-06-20T09:37:11.199651+00:00,2025-06-20 09:37:13.119111,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:13.230789Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,97,2025-06-20 09:37:13.951,0,2025,6,20
6,u,,continuous_python_dag,0.104708,2025-06-20T09:37:31.909040Z,,gAV9lC4=,1aeb8beb-f87e-46c3-82df-925a6377e051,F29B73BEE9F4,25,-1,1,,,PythonOperator,152,default_pool,1,1,default,1,2025-06-20T09:37:31.649664Z,,scheduled__2025-06-20T09:37:29.300928+00:00,2025-06-20 09:37:31.804332,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:31.915767Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,167,2025-06-20 09:37:32.520,0,2025,6,20
7,u,,continuous_python_dag,0.109979,2025-06-20T09:37:25.567907Z,,gAV9lC4=,ee9bddd0-6460-43db-a4df-a378078c30fa,F29B73BEE9F4,21,-1,1,,,PythonOperator,140,default_pool,1,1,default,1,2025-06-20T09:37:25.288499Z,,scheduled__2025-06-20T09:37:22.052345+00:00,2025-06-20 09:37:25.457928,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:25.574478Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,139,2025-06-20 09:37:26.490,0,2025,6,20
8,u,,continuous_python_dag,0.101752,2025-06-20T09:39:36.113179Z,,gAV9lC4=,85bfcd00-9298-4bc1-b062-6ec480e22f3c,F29B73BEE9F4,91,-1,1,,,PythonOperator,378,default_pool,1,1,default,1,2025-06-20T09:39:35.869610Z,,scheduled__2025-06-20T09:39:33.529777+00:00,2025-06-20 09:39:36.011427,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:36.119747Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,629,2025-06-20 09:39:36.960,0,2025,6,20
9,u,,continuous_python_dag,0.102536,2025-06-20T09:39:08.709492Z,,gAV9lC4=,1a6fb401-f475-47fb-b969-5f27980fde16,F29B73BEE9F4,76,-1,1,,,PythonOperator,326,default_pool,1,1,default,1,2025-06-20T09:39:08.451419Z,,scheduled__2025-06-20T09:39:05.533781+00:00,2025-06-20 09:39:08.606956,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:08.716092Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,524,2025-06-20 09:39:09.376,0,2025,6,20


In [6]:
spark.sql(f"""select * from delta.`{DATALAKE_PATH}` limit 10""").toPandas()

                                                                                

Unnamed: 0,__op,custom_operator_name,dag_id,duration,end_date,executor,executor_config,external_executor_id,hostname,job_id,map_index,max_tries,next_kwargs,next_method,operator,pid,pool,pool_slots,priority_weight,queue,queued_by_job_id,queued_dttm,rendered_map_index,run_id,start_date,state,task_display_name,task_id,trigger_id,trigger_timeout,try_number,unixname,updated_at,__key,__topic,__partition,__offset,__timestamp,__timestampType,year,month,day
0,u,,continuous_python_dag,0.103745,2025-06-20T09:37:02.055085Z,,gAV9lC4=,d0b3381c-b484-498d-9210-50ead41d6bf6,F29B73BEE9F4,9,-1,1,,,PythonOperator,97,default_pool,1,1,default,1,2025-06-20T09:37:01.791883Z,,scheduled__2025-06-20T09:36:59.968862+00:00,2025-06-20 09:37:01.951340,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:02.061673Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,55,2025-06-20 09:37:02.885,0,2025,6,20
1,u,,continuous_python_dag,0.110174,2025-06-20T09:39:25.977925Z,,gAV9lC4=,92a9a85b-5c28-4b6b-89cb-45be2e80625c,F29B73BEE9F4,85,-1,1,,,PythonOperator,353,default_pool,1,1,default,1,2025-06-20T09:39:25.706210Z,,scheduled__2025-06-20T09:39:22.365746+00:00,2025-06-20 09:39:25.867751,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:25.984976Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,587,2025-06-20 09:39:26.431,0,2025,6,20
2,u,,continuous_python_dag,0.204726,2025-06-20T09:39:54.501605Z,,gAV9lC4=,16f7155c-7558-4025-9184-4105d68866f1,F29B73BEE9F4,103,-1,1,,,PythonOperator,414,default_pool,1,1,default,1,2025-06-20T09:39:53.984014Z,,scheduled__2025-06-20T09:39:52.294760+00:00,2025-06-20 09:39:54.296879,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:54.513067Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,713,2025-06-20 09:39:55.019,0,2025,6,20
3,u,,continuous_python_dag,0.133019,2025-06-20T09:37:33.633567Z,,gAV9lC4=,faf46252-488f-430b-ab1e-a031cfadfc67,F29B73BEE9F4,26,-1,1,,,PythonOperator,155,default_pool,1,1,default,1,2025-06-20T09:37:33.262826Z,,scheduled__2025-06-20T09:37:31.563692+00:00,2025-06-20 09:37:33.500548,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:33.640402Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,174,2025-06-20 09:37:34.028,0,2025,6,20
4,u,,continuous_python_dag,0.114372,2025-06-20T09:39:32.715930Z,,gAV9lC4=,a5bc2886-4c5c-4424-acbc-12a4ccca56a1,F29B73BEE9F4,89,-1,1,,,PythonOperator,372,default_pool,1,1,default,1,2025-06-20T09:39:32.440723Z,,scheduled__2025-06-20T09:39:29.739354+00:00,2025-06-20 09:39:32.601558,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:32.722738Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,615,2025-06-20 09:39:33.450,0,2025,6,20
5,u,,continuous_python_dag,0.105232,2025-06-20T09:37:13.224343Z,,gAV9lC4=,fa15031e-2641-4165-86be-f4b85f83e76d,F29B73BEE9F4,15,-1,1,,,PythonOperator,115,default_pool,1,1,default,1,2025-06-20T09:37:12.976663Z,,scheduled__2025-06-20T09:37:11.199651+00:00,2025-06-20 09:37:13.119111,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:13.230789Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,97,2025-06-20 09:37:13.951,0,2025,6,20
6,u,,continuous_python_dag,0.104708,2025-06-20T09:37:31.909040Z,,gAV9lC4=,1aeb8beb-f87e-46c3-82df-925a6377e051,F29B73BEE9F4,25,-1,1,,,PythonOperator,152,default_pool,1,1,default,1,2025-06-20T09:37:31.649664Z,,scheduled__2025-06-20T09:37:29.300928+00:00,2025-06-20 09:37:31.804332,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:31.915767Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,167,2025-06-20 09:37:32.520,0,2025,6,20
7,u,,continuous_python_dag,0.109979,2025-06-20T09:37:25.567907Z,,gAV9lC4=,ee9bddd0-6460-43db-a4df-a378078c30fa,F29B73BEE9F4,21,-1,1,,,PythonOperator,140,default_pool,1,1,default,1,2025-06-20T09:37:25.288499Z,,scheduled__2025-06-20T09:37:22.052345+00:00,2025-06-20 09:37:25.457928,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:37:25.574478Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,139,2025-06-20 09:37:26.490,0,2025,6,20
8,u,,continuous_python_dag,0.101752,2025-06-20T09:39:36.113179Z,,gAV9lC4=,85bfcd00-9298-4bc1-b062-6ec480e22f3c,F29B73BEE9F4,91,-1,1,,,PythonOperator,378,default_pool,1,1,default,1,2025-06-20T09:39:35.869610Z,,scheduled__2025-06-20T09:39:33.529777+00:00,2025-06-20 09:39:36.011427,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:36.119747Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,629,2025-06-20 09:39:36.960,0,2025,6,20
9,u,,continuous_python_dag,0.102536,2025-06-20T09:39:08.709492Z,,gAV9lC4=,1a6fb401-f475-47fb-b969-5f27980fde16,F29B73BEE9F4,76,-1,1,,,PythonOperator,326,default_pool,1,1,default,1,2025-06-20T09:39:08.451419Z,,scheduled__2025-06-20T09:39:05.533781+00:00,2025-06-20 09:39:08.606956,success,run_my_continuous_task,run_my_continuous_task,,,1,root,2025-06-20T09:39:08.716092Z,"{""dag_id"":""continuous_python_dag"",""task_id"":""r...",postgres.public.task_instance,0,524,2025-06-20 09:39:09.376,0,2025,6,20
