In [1]:
import orcapod as op

In [2]:
from orcapod.data.datagrams import DictDatagram, ArrowDatagram, DictPacket, ArrowPacket

In [3]:
# Sample record used to build a datagram.
# NOTE(review): judging by the cell outputs further down, the underscore-prefixed
# keys are not all treated alike -- `_source_value` surfaces as the source info
# for `value`, and `__something` only reappears with include_all_info=True.
# Confirm the exact prefix conventions against the orcapod datagram docs.
data = dict(
    name="orcapod",
    __something="there",
    _another_kind=5,
    value=42,
    _source_value="Japan",
)

In [4]:
# Wrap the plain dict defined above in a dict-backed datagram.
dict_datagram = DictDatagram(data)

In [5]:
# Convert the datagram to its table form.
# NOTE(review): include_all_info=True presumably keeps the meta/source/context
# columns in addition to the data columns -- confirm against as_table's docs.
table = dict_datagram.as_table(include_all_info=True)

In [6]:
# Build a stream over the table, using the "name" column as the tag.
stream = op.streams.ImmutableTableStream(table, tag_columns=["name"])

# Print each tag and its packet on separate lines. The loop variables are left
# bound to the last pair on purpose: later cells inspect `p`.
for t, p in stream:
    print(t, p, sep="\n")

{'name': 'orcapod'}
{'_another_kind': 5, 'value': 42}


In [7]:
p.source_info()

{'_another_kind': None, 'value': 'Japan'}

In [None]:
stream.as_table(include_source=True)

NameError: name 'stream' is not defined

In [71]:
p.as_dict(include_all_info=True)

{'_another_kind': 5,
 'value': 42,
 '_context_key': 'std:v0.1.0:default',
 '__something': 'there',
 '_source__another_kind': None,
 '_source_value': None}

In [51]:
p

ArrowPacket(data={'_another_kind': 5, 'value': 42}, meta_columns=1, context='std:v0.1.0:default')

In [52]:
p.source_info()

{'_another_kind': None, 'value': None}

In [2]:
# Two toy sources with N entries each. Both are tagged by an integer `id`, so
# they share the same tag schema but carry different packet contents.
N = 10

# Integer packets: y is always x + 1.
stream = op.SyncStreamFromLists(
    label="MySource",
    tag_typespec={"id": int},
    packet_typespec={"x": int, "y": int},
    tags=[dict(id=idx) for idx in range(N)],
    packets=[dict(x=idx, y=idx + 1) for idx in range(N)],
)

# String packets: a pair of words numbered by the same index as the tag.
word_stream = op.SyncStreamFromLists(
    label="HelloWorld",
    tag_typespec={"id": int},
    packet_typespec={"word1": str, "word2": str},
    tags=[dict(id=idx) for idx in range(N)],
    packets=[dict(word1=f"hello {idx}", word2=f"world {idx}") for idx in range(N)],
)

In [None]:
# Iterating a stream yields (tag, packet) pairs, one per entry.
for tag, packet in stream:
    print(tag, packet)

{'id': 0} {'x': 0, 'y': 1}
{'id': 1} {'x': 1, 'y': 2}
{'id': 2} {'x': 2, 'y': 3}
{'id': 3} {'x': 3, 'y': 4}
{'id': 4} {'x': 4, 'y': 5}
{'id': 5} {'x': 5, 'y': 6}
{'id': 6} {'x': 6, 'y': 7}
{'id': 7} {'x': 7, 'y': 8}
{'id': 8} {'x': 8, 'y': 9}
{'id': 9} {'x': 9, 'y': 10}


In [3]:
# Same iteration over the string-valued stream: one (tag, packet) pair per entry.
for tag, packet in word_stream:
    print(tag, packet)

{'id': 0} {'word1': 'hello 0', 'word2': 'world 0'}
{'id': 1} {'word1': 'hello 1', 'word2': 'world 1'}
{'id': 2} {'word1': 'hello 2', 'word2': 'world 2'}
{'id': 3} {'word1': 'hello 3', 'word2': 'world 3'}
{'id': 4} {'word1': 'hello 4', 'word2': 'world 4'}
{'id': 5} {'word1': 'hello 5', 'word2': 'world 5'}
{'id': 6} {'word1': 'hello 6', 'word2': 'world 6'}
{'id': 7} {'word1': 'hello 7', 'word2': 'world 7'}
{'id': 8} {'word1': 'hello 8', 'word2': 'world 8'}
{'id': 9} {'word1': 'hello 9', 'word2': 'world 9'}


## Defining function pods

Now we define our own function pods to perform simple computations.
Defining a function pod is simple:
1. Define a regular function with type annotations.
2. Decorate it with `op.function_pod`, passing in the name ('key') for the output value(s).

In [4]:
@op.function_pod("total")
def total(x: int, y: int) -> int:
    """Return the sum of ``x`` and ``y``; the result is emitted under the key ``total``."""
    return x + y


@op.function_pod("delta")
def delta(x: int, y: int) -> int:
    """Return ``2 * y - x``; the result is emitted under the key ``delta``."""
    return 2 * y - x


@op.function_pod("mult")
def mult(x: int, y: int) -> int:
    """Return the product of ``x`` and ``y``; the result is emitted under the key ``mult``."""
    return x * y


@op.function_pod("concat_string")
def concat(x: str, y: str) -> str:
    """Return ``x`` followed by ``y``; the result is emitted under the key ``concat_string``."""
    return x + y


Wrapped functions are now `FunctionPod` objects and expect to be called with streams as inputs. You can still access the original function through the pod's `function` attribute.

In [5]:
# This won't work: a function pod expects streams as input, not raw values.
# The TypeError is the point of the demonstration, so it is caught and shown
# here instead of aborting a "Restart Kernel -> Run All".
try:
    total(5, 6)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Expected SyncStream, got int for stream 5

In [None]:
# but you can access original function this way
total.function(5, 6)

11

In [None]:
# Passing a stream into a pod does NOT immediately trigger execution, but rather returns another stream

total_stream = total(stream)

Iterating through the stream or calling `flow` triggers the computation.

In [6]:
for tag, packet in total_stream:
    print(tag, packet)

NameError: name 'total_stream' is not defined

In [7]:
total_stream.flow()

NameError: name 'total_stream' is not defined

If you try to pass in an incompatible stream (stream whose packets don't match the expected inputs of the function), you will immediately get an error.

In [8]:
# word_stream carries `word1`/`word2` strings, but `total` expects integer `x`
# and `y`, so the call is rejected up front. The TypeError is the point of the
# demonstration, so it is caught and shown rather than left to stop the notebook.
try:
    total_stream = total(word_stream)
except TypeError as err:
    print(f"TypeError: {err}")

Key 'word1' not found in parameter types.


TypeError: Input packet types {'word1': <class 'str'>, 'word2': <class 'str'>} is not compatible with the function's expected input types {'x': <class 'int'>, 'y': <class 'int'>}

In [9]:
# you can check the tag and packet types of the stream
stream.types()

({'id': int}, {'x': int, 'y': int})

In [10]:
# you can check the tag and packet types of the stream
# NOTE(review): this cell is an exact duplicate of the previous one and could be removed.
stream.types()

({'id': int}, {'x': int, 'y': int})

## Defining pipeline

We will now piece together multiple function pods into a pipeline. We do this by instantiating a `Pipeline` object. We will store the results into a simple data store.

In [11]:
# Use a simple data store backed by a Delta table (per the class name), rooted
# at ./delta_store relative to the notebook's working directory.
# NOTE(review): batch_size=100 presumably controls how many records are buffered
# before a flush -- confirm against DeltaTableArrowDataStore.
from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore

pipeline_store = DeltaTableArrowDataStore("./delta_store", batch_size=100)

In [12]:
pipeline = op.Pipeline("test_pipeline", pipeline_store)


Now that we have a pipeline object, we can use it to define our pipeline by simply "chaining" together function pod calls.

In [13]:
with pipeline:
    # Pod calls made inside this block show up as nodes on the pipeline.
    total_stream = total(stream)
    delta_stream = delta(stream)
    # `mult` takes inputs named `x` and `y`, so each upstream output key is
    # renamed before the two streams are passed in together.
    mult_stream = mult(
        total_stream.map({"total": "x"}), delta_stream.map({"delta": "y"})
    )

And that's it! Now the elements of the pipeline are available as properties on the pipeline.

By default, the function pods are made available under the function's name in the pipeline.

In [14]:
pipeline.run()

Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!


Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!


AssertionError: Memoizing single packet return 2 packets!

In [16]:
pipeline.total

FunctionPodNode<FunctionPod:<function _original_total at 0x7f94f6424c20>>

In [18]:
pipeline.mult

FunctionPodNode<FunctionPod:<function _original_mult at 0x7fa97bea8a40>>

Other implicitly created nodes, such as the join of two streams, are made available under the name of the corresponding operator class (e.g. `Join`).

In [19]:
pipeline.Join

KernelNode<Join()>

You can list all nodes through the `nodes` property.

In [20]:
pipeline.nodes

{'MySource': KernelNode<StreamSource>,
 'total': FunctionPodNode<FunctionPod:<function _original_total at 0x7fa97bea8900>>,
 'delta': FunctionPodNode<FunctionPod:<function _original_delta at 0x7fa97bea8860>>,
 'MapPackets_0': KernelNode<packets(total ⇒ x)>,
 'MapPackets_1': KernelNode<packets(delta ⇒ y)>,
 'Join': KernelNode<Join()>,
 'mult': FunctionPodNode<FunctionPod:<function _original_mult at 0x7fa97bea8a40>>}

You can easily rename any node using the pipeline's `rename` method.

In [21]:
# Give the auto-generated mapper nodes friendlier labels.
# NOTE(review): MapPackets_1 is the `delta ⇒ y` mapper, so "delta_map" may be a
# less confusing label than "mult_map".
pipeline.rename("MapPackets_0", "total_map")
pipeline.rename("MapPackets_1", "mult_map")

In [22]:
pipeline.nodes

{'MySource': KernelNode<StreamSource>,
 'total': FunctionPodNode<FunctionPod:<function _original_total at 0x7fa97bea8900>>,
 'delta': FunctionPodNode<FunctionPod:<function _original_delta at 0x7fa97bea8860>>,
 'Join': KernelNode<Join()>,
 'mult': FunctionPodNode<FunctionPod:<function _original_mult at 0x7fa97bea8a40>>,
 'total_map': KernelNode<packets(total ⇒ x)>,
 'mult_map': KernelNode<packets(delta ⇒ y)>}

Renaming does NOT change the structure of the pipeline in any way -- it simply changes how a node is labeled, for your convenience.

### Running pipeline and accessing results

Since we just created the pipeline, there are no results associated with any node. You can get a [Polars](https://pola.rs) DataFrame view of a node's results through its `df` attribute.

In [23]:
pipeline.total.df

Flushing triggered!!


Before we run, the source nodes are also not "recorded" and thus will appear empty.

In [24]:
pipeline.MySource.df

Flushing triggered!!


We can trigger the entire pipeline to run and record all results by simply calling the `run` method.

In [18]:
pipeline.run()

Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!
Flushing triggered!!


Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!


AssertionError: Memoizing single packet return 2 packets!

In [24]:
pipeline.MySource.df

id,x,y
i64,i64,i64
0,0,1
1,1,2
2,2,3
3,3,4
4,4,5
5,5,6
6,6,7
7,7,8
8,8,9
9,9,10


In [25]:
pipeline.total.df

id,total
i64,i64
0,1
1,3
2,5
3,7
4,9
5,11
6,13
7,15
8,17
9,19
