In [1]:
from runner import run_read, run_transform, run_generate_dataset
from generator import generate_partial_request

In [2]:
# All document counts below were chosen for a chunk size of ~128 MB.
# Each depth maps to five document counts, one per sparsity level (same order).
_sparsity_levels = (0.0, 0.1, 0.2, 0.3, 0.4)
_docs_counts_by_depth = {
    0: (625000, 675000, 725000, 775000, 825000),
    1: (95000, 105000, 122500, 150000, 182500),
    2: (17500, 22500, 30000, 39000, 55000),
    3: (3750, 5500, 8500, 12500, 22500),
    4: (400, 600, 1000, 1800, 3500),
    5: (85, 135, 270, 550, 1250),
    6: (22, 42, 85, 205, 550),
}

# Flattened list of dataset specs, ordered by depth first and sparsity second.
datasets = [
    {'depth': depth, 'sparsity': sparsity, 'docs_count': docs_count}
    for depth, counts in _docs_counts_by_depth.items()
    for sparsity, docs_count in zip(_sparsity_levels, counts)
]

# Ratios passed to generate_partial_request when building read requests.
partial_request_ratios = [0.1, 0.5, 0.8, 1.0]

In [3]:
# Benchmark CLI executable, given relative to the notebook's working directory.
binary_path = "../build/bin/cli"

# Dataset directories: per-depth schemas and the generated document files.
schemas_path = "./datasets/schemas"
ds_path = "./datasets/documents"

# NOTE(review): temp_path is not referenced by any of the visible cells; the
# read benchmark builds its input path from a literal "./temp" prefix instead.
temp_path = "./datasets/temp"

In [4]:
def gen(ds):
    """Generate the document file for one dataset spec.

    Paths are built from the notebook-level ``binary_path``, ``ds_path`` and
    ``schemas_path`` settings, so changing the config cell also changes where
    generation reads and writes.

    Args:
        ds: One entry of ``datasets``: a dict with the keys ``'depth'``,
            ``'sparsity'`` and ``'docs_count'``.

    Returns:
        None. The result of ``run_generate_dataset`` is discarded.
    """
    # Documents are named "<depth>-<sparsity>.json"; the benchmark cell uses
    # the same naming scheme when it looks the files up again.
    documents_file = f"{ds_path}/{ds['depth']}-{ds['sparsity']:.1f}.json"
    # One schema per depth, shared by all sparsity levels of that depth.
    schema_file = f"{schemas_path}/schema-{ds['depth']}.json"

    # NOTE(review): presumably the second argument is the output file and the
    # third the schema to read -- confirm against runner.run_generate_dataset.
    run_generate_dataset(
        binary_path,
        documents_file,
        schema_file,
        docs_count=ds["docs_count"],
        sparsity=ds["sparsity"],
    )

In [5]:
import os

# One-time setup, intentionally left disabled: creates the ./datasets directory
# tree and then calls gen() for every entry in `datasets`.
# Uncomment for a first run on a fresh checkout. os.mkdir raises
# FileExistsError if a directory already exists, so do not re-run it as-is.
# NOTE(review): this creates ./datasets/temp, while the read benchmark looks
# for its inputs under ./temp -- confirm which directory is intended.
# os.mkdir("./datasets")
# os.mkdir("./datasets/documents")
# os.mkdir("./datasets/schemas")
# os.mkdir("./datasets/temp")
# for ds in datasets:
#     gen(ds)

In [10]:
from tqdm.notebook import tqdm

import json

# Number of repeated read measurements per (dataset, format, ratio).
probes_cnt = 5

# Index of the first dataset to benchmark. 31 keeps the behaviour of the
# original `datasets[31:]` slice, so only the entries from that index onward
# are measured; set it to 0 to benchmark every dataset.
first_dataset_idx = 31

# results[ds_key][fmt]["read_<ratio>"] -> list of "read_duration_ns" values,
# one per probe. Reset on every run of this cell.
results = {}

for ds in tqdm(datasets[first_dataset_idx:], desc="Dataset", leave=True):
    ds_key = f"{ds['depth']}-{ds['sparsity']:.1f}"
    ds_results = results[ds_key] = {}

    schema_path = f"{schemas_path}/schema-{ds['depth']}.json"
    with open(schema_path) as f:
        schema = json.load(f)

    # `fmt` rather than `format`, so the builtin is not shadowed for later cells.
    for fmt in tqdm(["json", "bson", "columnar"], desc=f"Processing {ds_key}", leave=True):
        fmt_results = ds_results[fmt] = {}

        # Disabled write benchmark, kept for reference: it transforms the JSON
        # dataset into `fmt` and records the write duration.
        # input_path = f"{ds_path}/{ds_key}.json"
        # fmt_results["write"] = []
        # for i in tqdm(range(probes_cnt), desc="Probe write", leave=False):
        #     output_path = f"./temp/out-{ds_key}.{fmt}"
        #     result = run_transform(binary_path, input_path, "json", output_path, fmt, schema_path=schema_path)
        #     fmt_results["write"].append(result["write_duration_ns"])

        # NOTE(review): the literal "./temp" differs from temp_path
        # ("./datasets/temp"); left unchanged because the transformed files
        # are expected wherever the write step above put them -- confirm.
        read_input_path = f"./temp/out-{ds_key}.{fmt}"
        for r in tqdm(partial_request_ratios, desc=f"Processing {fmt} request ratios", leave=True):
            durations = fmt_results[f"read_{r:.1f}"] = []

            for _ in tqdm(range(probes_cnt), desc=f"Probes for {r:.1f}"):
                # A new partial request is generated for every probe, as before.
                partial_request = generate_partial_request(schema, r)
                result = run_read(
                    binary_path,
                    read_input_path,
                    fmt,
                    schema_path=schema_path,
                    partial_request=partial_request,
                )
                durations.append(result["read_duration_ns"])


Dataset:   0%|          | 0/4 [00:00<?, ?it/s]

Processing 6-0.1:   0%|          | 0/3 [00:00<?, ?it/s]

Processing json request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing bson request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing columnar request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing 6-0.2:   0%|          | 0/3 [00:00<?, ?it/s]

Processing json request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing bson request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing columnar request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing 6-0.3:   0%|          | 0/3 [00:00<?, ?it/s]

Processing json request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing bson request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

Processing columnar request ratios:   0%|          | 0/4 [00:00<?, ?it/s]

Probes for 0.1:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.5:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 0.8:   0%|          | 0/5 [00:00<?, ?it/s]

Probes for 1.0:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
import json

# Persist the timings collected in `results` as a single JSON document,
# overwriting any earlier file of the same name.
with open("./read_results_mmap.json", "w+") as results_file:
    json.dump(results, results_file)