In [2]:
import sys
from pathlib import Path

In [3]:
# Application checkout inside the Spark container and its importable source directory.
project_root = Path("/opt/spark/app")
src_path = project_root.joinpath("src")

# Put `src` at the front of the import path, but only once, so that
# re-running this cell never stacks duplicate entries.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, expr
from orchestration.orchestrator import run_pipeline

In [5]:
# Obtain the session for this notebook (an already-running one is reused),
# registered under the pipeline's application name.
session_builder = SparkSession.builder.appName("naas_request_pipeline")
spark = session_builder.getOrCreate()

In [5]:
# Synthetic source data: 100 rows spread over 4 partitions, each holding a
# SHA-256 digest of a freshly generated UUID in a single `tracking_id` column.
tracking_id_col = sha2(expr("uuid()"), 256).alias("tracking_id")
df = spark.range(100).repartition(4).select(tracking_id_col)
df.show()

+--------------------+
|         tracking_id|
+--------------------+
|abce912f920630981...|
|78e11b8596a1f065c...|
|68580b6422ac28049...|
|4148448068adc97be...|
|1bbe54924860975e4...|
|15bd63aec360e920a...|
|6ad76290cf25a377f...|
|f663bb7219764f7a6...|
|4a20accc645abd971...|
|58762be6a92a0ce94...|
|c9e9ad8be8e2e3410...|
|0c7a5272594789e58...|
|199ac9e89bc580376...|
|3a4c968fe4de26000...|
|10c3c08c97968d707...|
|00eeafa9b31bcbacb...|
|2f23d41399009e728...|
|ffcbfda30cf543245...|
|5b4febd2ffdef79b5...|
|022863bc9336f8b66...|
+--------------------+
only showing top 20 rows



In [6]:
# Pipeline configuration file used for this demo run.
config_path = project_root.joinpath("configs", "examples", "naas_demo.yml")

In [7]:
# Run the pipeline over the synthetic frame, using `tracking_id` as the
# source identifier column.
# NOTE(review): the captured run log reports the request URL as
# "/https://naas.isalman.dev/no" — the leading slash looks unintended;
# confirm how the URL is assembled from the config.
run_pipeline(spark=spark, config_path=config_path, source_df=df, source_id="tracking_id")

2026-02-06 00:07:13,434 [INFO] [PipelineOrchestrator]: Authentication does not have a runtime service... skipping
2026-02-06 00:07:13,435 [INFO] [PipelineOrchestrator]: Request from URL: /https://naas.isalman.dev/no
2026-02-06 00:07:13,436 [INFO] [TableManager]: Creating database naas
2026-02-06 00:07:21,589 [INFO] [TableManager]: Created Delta table: naas.naas_api_response
2026-02-06 00:07:23,596 [INFO] [BatchProcessor]: ➤ Attempt 1: Processing 1 batches
2026-02-06 00:07:23,597 [INFO] [BatchProcessor]:     → Processing batch 1/1
2026-02-06 00:07:24,613 [INFO] numexpr.utils: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2026-02-06 00:07:24,614 [INFO] numexpr.utils: NumExpr defaulting to 8 threads.
✓ All API requests processed
2026-02-06 00:07:37,849 [INFO] [PipelineOrchestrator]: Pipeline run finished


In [6]:
# Read back the table the pipeline run reported creating, and preview it.
response_table = "naas.naas_api_response"
response_df = spark.table(response_table)
response_df.show()

+--------------------+--------------------+--------------------+------+--------------------+--------------+------------------+-----------+--------------------+--------------------+-------+-------------+--------+--------------------+--------------------+
|          request_id|            row_hash|                 url|method|     request_headers|request_params|  request_metadata|status_code|    response_headers|           body_text|success|error_message|attempts|   response_metadata|       _request_time|
+--------------------+--------------------+--------------------+------+--------------------+--------------+------------------+-----------+--------------------+--------------------+-------+-------------+--------+--------------------+--------------------+
|78e11b8596a1f065c...|7f2e88c5cb3078682...|/https://naas.isa...|   GET|{"Accept": "appli...|            {}|{"vendor": "NaaS"}|        200|{"Date": "Fri, 06...|{"reason":"Thank ...|   true|         NULL|       1|{"connection_warm...|2026-0

In [7]:
# List the column names of the response table as the cell's displayed value.
response_df.columns

['request_id',
 'row_hash',
 'url',
 'method',
 'request_headers',
 'request_params',
 'request_metadata',
 'status_code',
 'response_headers',
 'body_text',
 'success',
 'error_message',
 'attempts',
 'response_metadata',
 '_request_time']

In [8]:
# Count stored responses per `status_code` value and print the tally.
status_code_counts = response_df.groupBy("status_code").count()
status_code_counts.show()

+-----------+-----+
|status_code|count|
+-----------+-----+
|        200|  100|
+-----------+-----+



In [14]:
# Pull a single row to inspect one untruncated response body.
response_df.select("body_text").take(1)

[Row(body_text='{"reason":"I\'ve been looking forward to reorganizing my button collection instead."}')]

In [15]:
# Pull a single row to inspect one untruncated response-metadata JSON string.
response_df.select("response_metadata").take(1)

[Row(response_metadata='{"connection_warmup": {"warmed_up": false, "warmup_error": null, "warmup_timeout": 10}, "logs": ["-> GET /https://naas.isalman.dev/no", "[RetryMiddleware] Attempt 1/10 -> GET /https://naas.isalman.dev/no", "<- 200 /https://naas.isalman.dev/no"], "json": {"valid": true, "error": null}, "timing": {"total_seconds": 1.21}}')]