In [0]:
import requests
import uuid
import yt.wrapper as yt

In [1]:
# Configure the environment needed to run this notebook.
import uuid
import yt.wrapper as yt

username = yt.get_user_name()
if yt.exists(f"//sys/users/{username}/@user_info/home_path"):
    # The user node carries a home path: work in a fresh directory under it
    # on the distributed file system.
    # Reuse `username` so the path read here is the same one checked above
    # (and the user name is not requested a second time).
    user_info = yt.get(f"//sys/users/{username}/@user_info")
    homedir = user_info["home_path"]
    # Find the available VM presets: pool trees are grouped by name suffix.
    available_pool_trees = user_info["available_pool_trees"]
    cpu_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("cpu")] or ["default"]
    # NOTE(review): unlike the CPU list, the GPU lists have no fallback and may
    # be empty when the user has no matching pool tree — confirm this is intended.
    h100_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("h100")]
    h100_8_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("h100_8")]
    workdir = f"{homedir}/tmp/demo_workdir/{uuid.uuid4().hex}"
else:
    # No per-user info on this cluster: fall back to fixed pool tree names and
    # a scratch directory under //tmp.
    cpu_pool_trees = ["default"]
    h100_pool_trees = ["gpu_h100"]
    h100_8_pool_trees = ["gpu_h100"]
    workdir = f"//tmp/examples/{uuid.uuid4().hex}"

# The random uuid suffix gives every run its own working directory.
yt.create("map_node", workdir, recursive=True, ignore_existing=True)
print("Current working directory:", workdir)

Current working directory: //home/equal_amethyst_vulture/tmp/demo_workdir/5f5f6e12f7a64e6ba9028884db5783c6


In [2]:
# Download a public list of first names (one name per line).
NAMES_URL = 'https://raw.githubusercontent.com/dominictarr/random-name/refs/heads/master/first-names.txt'
response = requests.get(NAMES_URL, timeout=60)
# Fail loudly on an HTTP error instead of writing an error page into the table.
response.raise_for_status()

# splitlines() handles both "\r\n" and "\n" line endings; blank entries (such as
# the one left after a trailing newline) are dropped so that no row with an
# empty name reaches the story generator.
names = [
    line.strip()
    for line in response.content.decode("utf-8").splitlines()
    if line.strip()
]

# Store the names as a one-column table in the working directory.
yt.create("table", f"{workdir}/names", ignore_existing=True)
yt.write_table(f"{workdir}/names", [{"name": name} for name in names])

In [3]:
class StoriesGenerator:
    """Callable mapper: for every input row, generate a short story about ``row["name"]``.

    The model and tokenizer are not loaded in ``__init__``; they are loaded on
    the first call and then reused for all later rows handled by the same
    instance.
    """

    def __init__(self):
        # Nothing heavy is loaded yet; the first __call__ takes care of it.
        self.model_loaded = False

    def _load_model(self):
        """Import the ML libraries, pick a device, and load the model and tokenizer."""
        from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
        import torch

        # Prefer the GPU when one is visible, otherwise run on CPU.
        if torch.cuda.is_available():
            device_name = "cuda"
        else:
            device_name = "cpu"
        self.device = torch.device(device_name)

        pretrained = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M")
        self.model = pretrained.to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
        self.model_loaded = True

    def __call__(self, row):
        """Yield one ``{"story": text}`` row generated from the input row's ``name``."""
        import sys

        if not self.model_loaded:
            self._load_model()

        child_name = row["name"]
        prompt = f"{child_name} was a little child "
        # Log the prompt to stderr, keeping it separate from the yielded output rows.
        print("Prompt {}".format(prompt), file=sys.stderr)

        encoded_prompt = self.tokenizer.encode(prompt, return_tensors="pt")
        input_ids = encoded_prompt.to(self.device)
        generated = self.model.generate(input_ids, max_length=100, num_beams=1)
        story = self.tokenizer.decode(generated[0], skip_special_tokens=True)

        yield {"story": story}

# Input and output tables of the map operation.
names_table = f"{workdir}/names"
tales_table = f"{workdir}/tales"

# Create the output table; force=True is passed so that a table left over from
# an earlier run does not get in the way.
yt.create("table", tales_table, force=True)

# Resources requested for each mapper job: one GPU and four CPU cores.
mapper_resources = {
    "gpu_limit": 1,
    "cpu_limit": 4.0,
}

# A single job is requested and scheduled in the H100 pool trees.
map_spec = {
    "job_count": 1,
    "pool_trees": h100_pool_trees,
    "mapper": mapper_resources,
}

# Kept as the last expression so the cell displays the returned operation object.
yt.run_map(StoriesGenerator(), names_table, tales_table, spec=map_spec)

2025-06-19 19:38:48,551	INFO	Operation started: https://playground.tracto.ai/playground/operations/68bf1bec-6e479302-24dd03e8-6506a5ef/details


2025-06-19 19:38:48,597	INFO	( 0 min) operation 68bf1bec-6e479302-24dd03e8-6506a5ef starting


2025-06-19 19:38:49,202	INFO	( 0 min) Unrecognized spec: {'enable_partitioned_data_balancing': false, 'mapper': {'title': 'StoriesGenerator'}}


2025-06-19 19:38:51,607	INFO	( 0 min) operation 68bf1bec-6e479302-24dd03e8-6506a5ef: running=0     completed=0     pending=1     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-06-19 19:38:55,215	INFO	( 0 min) operation 68bf1bec-6e479302-24dd03e8-6506a5ef: running=1     completed=0     pending=0     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-06-19 20:08:27,158	INFO	(29 min) operation 68bf1bec-6e479302-24dd03e8-6506a5ef completed


2025-06-19 20:08:27,220	INFO	(29 min) Alerts: {'low_cpu_usage': {'code': 1, 'message': "Average CPU usage of some of your job types is significantly lower than requested 'cpu_limit'. Consider decreasing cpu_limit in spec of your operation", 'attributes': {'pid': 1, 'tid': 11773568412224608766, 'thread': 'Controller:12', 'fid': 18445936631659228141, 'host': 'ca-0.controller-agents.nebius-playground.svc.kyt.k8s.nebius.yt', 'datetime': '2025-06-19T20:08:22.961010Z', 'trace_id': '779c4e02-586a060b-21845564-e43b82f', 'span_id': 2348756362661160422}, 'inner_errors': [{'code': 1, 'message': 'Jobs of task "map" use 25.27% of requested cpu limit', 'attributes': {'pid': 1, 'tid': 11773568412224608766, 'thread': 'Controller:12', 'fid': 18445936631659228141, 'host': 'ca-0.controller-agents.nebius-playground.svc.kyt.k8s.nebius.yt', 'datetime': '2025-06-19T20:08:22.960993Z', 'trace_id': '779c4e02-586a060b-21845564-e43b82f', 'span_id': 2348756362661160422, 'cpu_time': 1790574, 'cpu_limit': 4.0, 'exec

<yt.wrapper.operation_commands.Operation at 0x7f07c5e3e780>