In [1]:
import ray

In [2]:
# Start (or attach to) a local Ray instance.
# ignore_reinit_error=True keeps this cell re-runnable: a bare ray.init()
# raises if Ray has already been initialised in the current kernel.
ray.init(ignore_reinit_error=True)

2023-03-21 13:47:45,189	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Python version:,3.8.16
Ray version:,2.3.0


In [3]:
# 10,000 small records: "name" is the index as a string, "data" the index itself.
items = [dict(name=str(n), data=n) for n in range(10000)]

In [4]:
items[:2]

[{'name': '0', 'data': 0}, {'name': '1', 'data': 1}]

In [5]:
len(items)

10000

Create a distributed data collection using `ray` from `items`

In [6]:
import ray

In [7]:
# Build a Ray dataset from the in-memory list of record dicts.
ds = ray.data.from_items(items)

In [8]:
ds.show(5)

{'name': '0', 'data': 0}
{'name': '1', 'data': 1}
{'name': '2', 'data': 2}
{'name': '3', 'data': 3}
{'name': '4', 'data': 4}


##### Example 2

In [9]:
# Turn the dataset into a windowed DatasetPipeline using the default window
# settings (the log below reports the resulting windows and blocks per window).
pipe = ds.window()

2023-03-21 13:47:48,735	INFO dataset.py:3881 -- Created DatasetPipeline with 20 windows: 7390b min, 8000b max, 7944b mean
2023-03-21 13:47:48,737	INFO dataset.py:3891 -- Blocks per window: 10 min, 10 max, 10 mean
2023-03-21 13:47:48,740	INFO dataset.py:3913 -- ✔️  This pipeline's per-window parallelism is high enough to fully utilize the cluster.
2023-03-21 13:47:48,743	INFO dataset.py:3930 -- ✔️  This pipeline's windows likely fit in object store memory without spilling.


In [10]:
# Square each record's "data" field, keep only the even squares, then emit
# every surviving square followed by its cube.
result = (
    pipe
    .map(lambda row: row["data"] ** 2)
    .filter(lambda sq: sq % 2 == 0)
    .flat_map(lambda sq: [sq, sq**3])
)

In [11]:
result.show(10)

2023-03-21 13:47:49,142	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[map->filter->flat_map]


[dataset]: Run `pip install tqdm` to enable progress reporting.


2023-03-21 13:47:50,407	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[map->filter->flat_map]


0
0
4
64
16
4096
36
46656
64
262144


##### Example 3

In [19]:
import math
import time

In [21]:
def objective(x, y):
    """Return the root mean square (quadratic mean) of ``x`` and ``y``.

    Computes ``sqrt((x**2 + y**2) / 2)``. The value is never negative and is
    zero only when both inputs are zero.
    """
    mean_of_squares = (x**2 + y**2) / 2
    return math.sqrt(mean_of_squares)

In [33]:
# `tune` has to be in scope before the search space is built; importing it
# here lets this cell run on a fresh kernel (Restart & Run All) instead of
# relying on an import cell that appears further down the notebook.
from ray import tune

# Search space: an exhaustive grid over five candidate values for each of
# "x" and "y".
config = {
    "x": tune.grid_search([-1, -0.5, 0, 0.5, 1]),
    "y": tune.grid_search([-1, -0.5, 0, 0.5, 1]),
}

In [44]:
objective(x=1, y=2)

1.5811388300841898

In [45]:
objective(x=2, y=3)

2.5495097567963922

In [46]:
config

{'x': {'grid_search': [-1, -0.5, 0, 0.5, 1]},
 'y': {'grid_search': [-1, -0.5, 0, 0.5, 1]}}

Find the `x` and `y` values that produce the lowest output of the `objective` function

**Hint**: `tune.report()`

In [47]:
from ray import tune

In [48]:
def training_function(config):
    """Tune trainable: evaluate ``objective`` at one point and report it.

    Args:
        config: Mapping with the trial's hyperparameters under the keys
            ``"x"`` and ``"y"``.

    The result is reported to Tune under the metric name ``"score"``;
    nothing is returned.
    """
    tune.report(score=objective(config["x"], config["y"]))

In [None]:
# Launch the experiment: `config` holds grid_search entries for both "x" and
# "y", which Tune expands into one trial per combination. The returned
# analysis object is queried for the best configuration in the next cell.
result = tune.run(training_function, config=config)

In [51]:
# Ask for the trial configuration with the lowest reported "score" (mode="min").
result.get_best_config(metric="score", mode="min")

{'x': 0, 'y': 0}