In [1]:
# autotime reports the wall-clock duration of every cell after it runs
# (the "time: ..." line printed under each cell).
%load_ext autotime
# autoreload in mode 2 picks up edits to imported modules without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

time: 14.4 ms (started: 2024-12-23 16:50:57 -03:00)


In [2]:
# Used to make our results easier to read
import pandas as pd
from pprint import pprint
from apache_beam.options.pipeline_options import PipelineOptions
from IPython.display import Image, SVG, display
import requests

from vulkan_public.spec.dependency import INPUT_NODE
from vulkan.beam.local.convert import build_beam_policy
from vulkan.core.policy import Policy

from test_policy.policy import demo_policy as policy_definition

time: 1.69 s (started: 2024-12-23 16:50:57 -03:00)


In [3]:
import os
import socket
import subprocess
import platform
import signal


def is_server_running(host: str, port: int, timeout: float = 1.0) -> bool:
    """Check whether something accepts TCP connections on ``host:port``.

    The probe goes through ``socket.create_connection``, which resolves
    ``host`` and tries every address returned (IPv4 and IPv6). A server bound
    only to ``::1`` is therefore detected when ``host`` is ``"localhost"``,
    which a socket hard-wired to ``AF_INET`` would miss.

    Args:
        host (str): The hostname or IP address of the server.
        port (int): The port number to check.
        timeout (float): Timeout in seconds for each connection attempt
            (default is 1.0). When ``host`` resolves to several addresses the
            timeout applies to each attempt separately.

    Returns:
        bool: True if a connection could be established, False if it was
        refused, timed out, or failed with any other ``OSError`` (including
        name-resolution failures).
    """
    try:
        # The context manager closes the probe connection immediately.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        # socket.timeout, ConnectionRefusedError and socket.gaierror are all
        # OSError subclasses, so a single clause covers every failure mode.
        return False

        
def render_pipeline(
    pipeline_builder,
    output_image_path: str,
    render_port: int = -1,
    run_id: str = "test-id",
):
    """Render the pipeline graph to a file and return it for inline display.

    Builds the pipeline with Beam's ``RenderRunner`` selected, runs it so the
    graph is written to ``output_image_path``, and wraps that file in an
    ``IPython.display.SVG`` object.

    Args:
        pipeline_builder: Object exposing
            ``build(backfill_id=..., pipeline_options=...)`` and returning
            something with a ``run()`` method (presumably a Beam pipeline —
            confirm against ``build_beam_policy``).
        output_image_path (str): Where the rendered graph is written. The
            file is returned wrapped in ``SVG``, so it should be an SVG path.
        render_port (int): Port forwarded to ``--render_port``. When a
            positive port is already in use on localhost, a message is
            printed and ``-1`` is forwarded instead.
        run_id (str): Identifier passed to the builder as ``backfill_id``.
            Defaults to ``"test-id"``, the value that used to be hard-coded.

    Returns:
        IPython.display.SVG: The rendered graph loaded from
        ``output_image_path``.
    """
    # Only probe when a real port was requested; non-positive values are
    # forwarded to the runner untouched.
    if render_port > 0 and is_server_running("localhost", render_port, timeout=1):
        msg = (
            f"There is already a server running on port {render_port}. \n"
            "Skipping starting a server for this render to avoid conflict."
        )
        print(msg)
        render_port = -1

    args = [
        "--runner=apache_beam.runners.render.RenderRunner",
        f"--render_output={output_image_path}",
        f"--render_port={render_port}",
    ]
    options = PipelineOptions(args)

    pipeline = pipeline_builder.build(backfill_id=run_id, pipeline_options=options)
    pipeline.run()
    return SVG(output_image_path)

time: 12.1 ms (started: 2024-12-23 16:50:59 -03:00)


In [4]:
def run_local(pipeline_builder, run_id: str = "test-id"):
    """Run the pipeline locally with Beam's DirectRunner and load its output.

    Args:
        pipeline_builder: Object exposing
            ``build(backfill_id=..., pipeline_options=...)`` and an
            ``output_path`` attribute pointing at the parquet output.
        run_id (str): Identifier passed to the builder as ``backfill_id``.

    Returns:
        pandas.DataFrame: The contents of ``pipeline_builder.output_path``
        read as parquet.
    """
    options = PipelineOptions(["--runner=DirectRunner"])

    pipeline = pipeline_builder.build(backfill_id=run_id, pipeline_options=options)

    # Wait for the run to finish before touching the output, so the parquet
    # files are completely written when they are read back.
    # NOTE(review): assumes build() returns a Beam Pipeline, whose run()
    # yields a PipelineResult exposing wait_until_finish() — confirm.
    pipeline.run().wait_until_finish()

    return pd.read_parquet(pipeline_builder.output_path)

time: 12 ms (started: 2024-12-23 16:50:59 -03:00)


In [5]:
# Take the first row of the sample file as a one-record pipeline input.
# `month` and `tax_id` are identifiers, so they are parsed as strings up
# front: casting after numeric inference would already have dropped leading
# zeros (or added a ".0" suffix if the column was inferred as float).
# NOTE(review): with dtype=str a missing value stays null instead of becoming
# the literal string "nan" — confirm that downstream code expects this.
df = pd.read_csv(
    "simple_bkt_lg.csv",
    dtype={"month": str, "tax_id": str},
).iloc[0:1]

df.to_parquet("input.parquet")

time: 15.7 ms (started: 2024-12-23 16:50:59 -03:00)


In [6]:
# Build the Policy object from the demo policy definition imported from
# test_policy.policy.
policy = Policy.from_definition(policy_definition)

time: 12 ms (started: 2024-12-23 16:50:59 -03:00)


In [7]:
# Configure the local Beam build of the policy:
#   - the policy's input node reads the single-row "input.parquet" file,
#   - SCORE_CUTOFF is supplied as a config variable,
#   - results are written under "./output/".
# Column schema of the input, kept for reference only (not passed to the builder):
# schema = {"month": "str", "tax_id": "str", "score": "int", "default": "int"}
data_sources = {INPUT_NODE: "input.parquet"}
config_variables = {"SCORE_CUTOFF": 500}

builder = build_beam_policy(
    policy, 
    data_sources=data_sources, 
    output_path="./output/", 
    config_variables=config_variables,
) 

time: 11.7 ms (started: 2024-12-23 16:50:59 -03:00)


In [8]:
# Render the pipeline graph to "dag.svg" and display it inline, asking for
# the render server on port 21111.
# NOTE(review): the recorded run of this cell failed with
# "PicklingError: logger cannot be pickled" while dill was serializing a
# functools.partial bound to a VulkanExecutionContext — presumably the
# context holds a logger; confirm and fix in the library, then re-run.
output_image_path = "dag.svg"

render_pipeline(builder, output_image_path, render_port=21111)



T4: <class 'vulkan.beam.nodes.BeamTransformFn'>
INFO:dill:T4: <class 'vulkan.beam.nodes.BeamTransformFn'>
# T4
INFO:dill:# T4
D2: <dict object at 0x11567aa80>
INFO:dill:D2: <dict object at 0x11567aa80>
Fu: functools.partial(<function branch_condition_1 at 0x11510d4e0>, context=<vulkan.core.context.VulkanExecutionContext object at 0x115110230>)
INFO:dill:Fu: functools.partial(<function branch_condition_1 at 0x11510d4e0>, context=<vulkan.core.context.VulkanExecutionContext object at 0x115110230>)
F2: <function _create_ftype at 0x1094611c0>
INFO:dill:F2: <function _create_ftype at 0x1094611c0>
# F2
INFO:dill:# F2
T1: <class 'functools.partial'>
INFO:dill:T1: <class 'functools.partial'>
F2: <function _load_type at 0x109460fe0>
INFO:dill:F2: <function _load_type at 0x109460fe0>
# F2
INFO:dill:# F2
# T1
INFO:dill:# T1
F2: <function branch_condition_1 at 0x11510d4e0>
INFO:dill:F2: <function branch_condition_1 at 0x11510d4e0>
# F2
INFO:dill:# F2
D2: <dict object at 0x11515c4c0>
INFO:dill:D2: <

PicklingError: logger cannot be pickled

time: 788 ms (started: 2024-12-23 16:50:59 -03:00)


In [9]:
# Execute the policy locally and preview the first rows of its output.
# NOTE(review): the recorded run of this cell failed with
# "PicklingError: logger cannot be pickled" (same dill trace as the render
# cell), so no output table was produced — re-run once that is resolved.
result = run_local(builder)

result.head()

T4: <class 'vulkan.beam.nodes.BeamTransformFn'>
INFO:dill:T4: <class 'vulkan.beam.nodes.BeamTransformFn'>
# T4
INFO:dill:# T4
D2: <dict object at 0x11568bd40>
INFO:dill:D2: <dict object at 0x11568bd40>
Fu: functools.partial(<function branch_condition_1 at 0x11510d4e0>, context=<vulkan.core.context.VulkanExecutionContext object at 0x115110230>)
INFO:dill:Fu: functools.partial(<function branch_condition_1 at 0x11510d4e0>, context=<vulkan.core.context.VulkanExecutionContext object at 0x115110230>)
F2: <function _create_ftype at 0x1094611c0>
INFO:dill:F2: <function _create_ftype at 0x1094611c0>
# F2
INFO:dill:# F2
T1: <class 'functools.partial'>
INFO:dill:T1: <class 'functools.partial'>
F2: <function _load_type at 0x109460fe0>
INFO:dill:F2: <function _load_type at 0x109460fe0>
# F2
INFO:dill:# F2
# T1
INFO:dill:# T1
F2: <function branch_condition_1 at 0x11510d4e0>
INFO:dill:F2: <function branch_condition_1 at 0x11510d4e0>
# F2
INFO:dill:# F2
D2: <dict object at 0x116d92640>
INFO:dill:D2: <

PicklingError: logger cannot be pickled

time: 366 ms (started: 2024-12-23 16:52:18 -03:00)
