In [1]:
from orcapod.execution_engines import RayEngine
import orcapod as op
import pyarrow as pa

In [2]:
# Ray Client endpoint ("ray://" scheme) of the cluster head service.
ray_address = "ray://raycluster-op-test-kuberay-head-svc.ray.svc.cluster.local:10001"

# Execution engine handed to `.run(...)` in the cells below.
ray_engine = RayEngine(ray_address)

2025-08-10 23:27:14,560	INFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
    Ray: 2.48.0
    Python: 3.13.5
This process on Ray Client was started with:
    Ray: 2.48.0
    Python: 3.13.3



[36m(autoscaler +28s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[36m(autoscaler +28s)[0m Adding 5 node(s) of type workergroup.
[36m(autoscaler +28s)[0m Resized to 6 CPUs, 5 GPUs.
[36m(autoscaler +28s)[0m No available node types can fulfill resource requests {'CPU': 1.0}*44. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +34s)[0m No available node types can fulfill resource requests {'CPU': 1.0}*22. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +49s)[0m No available node types can fulfill resource requests {'CPU': 1.0}*11. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +55s)[0m No available node types can fulfill resource requests {'CPU': 1.0}*22. Add suitable node types to this cluster to resolve this issue.
[36m(autoscaler +1m0s)[0m No available node types can fulfill resource requests {'CPU': 1.0}*27. A

In [3]:
# Fifty sample rows: for each id, x is twice the id and y is three times the id.
input_rows = [{"id": i, "x": i * 2, "y": i * 3} for i in range(50)]

# Wrap the rows in an Arrow table and expose them as a stream tagged by "id".
input_table = pa.Table.from_pylist(input_rows)
input_stream = op.streams.TableStream(input_table, tag_columns=["id"])

In [4]:
from pathlib import Path


@op.function_pod("sum")
def add_numbers(x: int, y: int) -> int:
    """
    Return the sum of ``x`` and ``y`` after a short artificial delay.

    Args:
        x: First addend.
        y: Second addend.

    Returns:
        ``x + y``.
    """
    # Imported inside the function body, as in the original cell.
    # NOTE(review): presumably so the function stays self-contained when it
    # is executed on a remote worker -- confirm.
    from time import sleep

    # Pause for 0.2 seconds on every call before computing the result.
    sleep(0.2)
    total = x + y
    return total

First, we run the function pod synchronously, without passing an execution engine:

In [5]:
# Apply the function pod to the input stream, run it without passing an
# execution engine, and show the resulting table as the cell output.
result_stream1 = add_numbers(input_stream)
result_stream1.run()
result_stream1.as_df()

id,sum
i64,i64
0,0
1,5
2,10
3,15
4,20
…,…
45,225
46,230
47,235
48,240


Next, we run the same computation using the Ray engine:

In [9]:
# Same computation on the same input, but this time the Ray engine is
# passed to run(). The displayed table should match the synchronous result.
result_stream2 = add_numbers(input_stream)
result_stream2.run(ray_engine)
result_stream2.as_df()

id,sum
i64,i64
0,0
1,5
2,10
3,15
4,20
…,…
45,225
46,230
47,235
48,240


**NOTE**: Depending on node availability and how Ray is configured, you may *not* see any improvement in running speed for the example above (it may even take longer due to overhead!). If you don't seem to be getting any speedup, please consult your Ray cluster administrator.

## Integration with pipeline system

In [10]:
# Make sure we are starting with a clean slate: remove the "./test_store"
# directory if it is left over from a previous run. ignore_errors=True keeps
# this from raising when the directory does not exist yet.
import shutil

shutil.rmtree("./test_store", ignore_errors=True)

In [11]:
# Create a store at "./test_store" (the directory cleared in the previous
# cell) and a pipeline named "pipeline_with_ray" that is given this store.
store = op.stores.BatchedDeltaTableArrowStore("./test_store")
pipeline = op.Pipeline("pipeline_with_ray", store)

In [12]:
# Invoke the function pod inside the pipeline's context manager.
# NOTE(review): presumably this registers the invocation with the pipeline,
# since the result is later read back via `pipeline.add_numbers` -- confirm.
with pipeline:
    result_stream = add_numbers(input_stream)

In [13]:
# Run the pipeline, passing the Ray engine created at the top of the notebook.
pipeline.run(ray_engine)

In [14]:
# Access the `add_numbers` attribute of the pipeline and show its results
# as a table.
pipeline.add_numbers.as_df()

id,sum
i64,i64
0,0
1,5
2,10
3,15
4,20
…,…
45,225
46,230
47,235
48,240
