Merge 2497534 into b3a7583

nfrmtk · web-flow · commit 6fff3e8e0974 · 2025-09-29T11:07:56.000Z
diff --git a/ydb/core/kqp/tools/join_perf/benchmark_settings.cpp b/ydb/core/kqp/tools/join_perf/benchmark_settings.cpp
@@ -2,7 +2,7 @@
 
 namespace NKikimr::NMiniKQL {
 
-TString CaseName(ETestedJoinAlgo algo, ETestedJoinKeyType keyType, const TBenchmarkSettings::TPreset& preset,
+TString CaseName(ETestedJoinAlgo algo, ETestedJoinKeyType keyType, const TPreset& preset,
                  TTableSizes size) {
     TString algoName = [&] {
         switch (algo) {
@@ -37,28 +37,40 @@ TString CaseName(ETestedJoinAlgo algo, ETestedJoinKeyType keyType, const TBenchm
 }
 
 namespace NBenchmarkSizes {
-TVector<TTableSizes> ExponentialSizeIncrease() {
-    TVector<TTableSizes> ret;
+TPreset ExponentialSizeIncrease(int samples, int scale) { // builds the "ExpGrowth" preset: 8 sizes, each double the previous one
+    TPreset ret;
+    ret.PresetName = "ExpGrowth";
     int init = 1 << 18;
+    init *= scale; // smallest size is (1 << 18) * scale rows; plain int math, no overflow check
     for (int index = 0; index < 8; index++) {
         int thisNum = init * (1 << index);
-        ret.emplace_back(thisNum, thisNum);
+        for (int _ = 0; _ < samples; ++_){ // the same (left, right) size is appended `samples` times
+            ret.Cases.emplace_back(thisNum, thisNum);
+        }
     }
     return ret;
 }
 
-TVector<TTableSizes> LinearSizeIncrease() {
-    TVector<TTableSizes> ret;
-    int init = 1 << 22;
+TPreset LinearSizeIncrease(int samples, int scale) { // builds the "LinearGrowth" preset: 8 sizes, multiples 1..8 of the base size
+    TPreset ret;
+    ret.PresetName = "LinearGrowth";
+    int init = 1 << 18;
+    init *= scale; // base size is (1 << 18) * scale rows; plain int math, no overflow check
     for (int index = 1; index < 9; index++) {
         int thisNum = init * index;
-        ret.emplace_back(thisNum, thisNum);
+        for (int _ = 0; _ < samples; ++_){ // the same (left, right) size is appended `samples` times
+            ret.Cases.emplace_back(thisNum, thisNum);
+        }
     }
     return ret;
 }
 
-TVector<TTableSizes> VerySmallSizes() {
-    return {{512, 512}, {1024, 1024}};
+TPreset VerySmallSizes(int samples, int) { // builds the "VerySmall" preset: fixed 512- and 1024-row cases; the scale argument is ignored
+    TPreset ret;
+    ret.PresetName = "VerySmall";
+    for (int _ = 0; _ < samples; ++_) ret.Cases.emplace_back(512, 512); // each size repeated `samples` times, like the other presets
+    for (int _ = 0; _ < samples; ++_) ret.Cases.emplace_back(1024, 1024);
+    return ret;
 }
 } // namespace NBenchmarkSizes
 
diff --git a/ydb/core/kqp/tools/join_perf/benchmark_settings.h b/ydb/core/kqp/tools/join_perf/benchmark_settings.h
@@ -12,25 +12,25 @@ struct TTableSizes {
     int Left;
     int Right;
 };
+struct TPreset { // a named group of benchmark cases
+    TVector<TTableSizes> Cases; // one entry per benchmark case: left and right table sizes
+    TString PresetName; // label of the preset, e.g. "ExpGrowth"
+};
 
 struct TBenchmarkSettings {
-    struct TPreset {
-        TVector<TTableSizes> Cases;
-        TString PresetName;
-    };
 
     TVector<TPreset> Presets;
     TSet<ETestedJoinKeyType> KeyTypes;
     TSet<ETestedJoinAlgo> Algorithms;
 };
 
-TString CaseName(ETestedJoinAlgo algo, ETestedJoinKeyType keyType, const TBenchmarkSettings::TPreset& preset,
+TString CaseName(ETestedJoinAlgo algo, ETestedJoinKeyType keyType, const TPreset& preset,
                  TTableSizes size);
 
 namespace NBenchmarkSizes {
-TVector<TTableSizes> ExponentialSizeIncrease();
-TVector<TTableSizes> LinearSizeIncrease();
-TVector<TTableSizes> VerySmallSizes();
+TPreset ExponentialSizeIncrease(int samples, int scale); // sizes (1 << 18) * scale * 2^k for k = 0..7, each repeated `samples` times
+TPreset LinearSizeIncrease(int samples, int scale); // sizes (1 << 18) * scale * k for k = 1..8, each repeated `samples` times
+TPreset VerySmallSizes(int samples, int scale); // fixed 512-row and 1024-row cases
 } // namespace NBenchmarkSizes
 
 } // namespace NKikimr::NMiniKQL
diff --git a/ydb/core/kqp/tools/join_perf/graph.py b/ydb/core/kqp/tools/join_perf/graph.py
@@ -3,6 +3,8 @@
 import matplotlib.pyplot as plt
 import sys
 import os
+import numpy as np
+import math
 from pathlib import Path
 if len(sys.argv) < 2:
     print("usage: python3 graph.py folder/file.jsonl")
@@ -24,6 +26,15 @@
             'key_type': name_parts[1]
         }
     )
+# is_time_sampled = only_needed[0]["input_data_flavour"].startswith("Sampling")
+def geo_mean_70percent_lowest(series):
+    """Geometric mean of the positive values among the lowest 70% (count rounded up) of `series`; NaN when there are none."""
+    lowest = series.nsmallest(math.ceil(len(series) * 0.7))
+    kept = lowest[lowest > 0]
+    if kept.empty:
+        return np.nan
+    return np.exp(np.log(kept).mean())
+
 df = pd.DataFrame(only_needed)
 df = df.drop('run_name', axis=1)
 images_root_base = str(Path.home())+"/.join_perf/images"
@@ -42,15 +53,15 @@
         print(graph_name)
         subset = df[(df["input_data_flavour"] == data_flavour) & 
             (df["key_type"] == key_type)]
-        print(subset)
         fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 8), sharex=True)
         
         for name, group in subset.groupby('join_algorithm'):
+            # one point per table size (repeated samples aggregated); ordered by size so the line is drawn left to right
+            group = group.groupby('left_table_size')['time'].apply(geo_mean_70percent_lowest).sort_index()
             axes.plot(
-                group['left_table_size'], 
-                group['time'], 
-                label=name,
-                marker='o'
+                group.index,
+                group.values,
+                label=name, marker='o'
             )
         axes.set_ylabel('time')
         axes.set_xlabel('left_rows')
@@ -65,5 +76,4 @@
         plt.savefig(log_images + "/" + graph_name + ".jpeg")
 
 print(f"images without y-axis log scaling are written to {simple_images}") 
-print(f"images with y-axis log scaling are written to {log_images}") 
-
+print(f"images with y-axis log scaling are written to {log_images}") 
diff --git a/ydb/core/kqp/tools/join_perf/main.cpp b/ydb/core/kqp/tools/join_perf/main.cpp
@@ -36,27 +36,30 @@ int main(int argc, char** argv) {
     opts.AddHelpOption('h');
 
     NKikimr::NMiniKQL::TBenchmarkSettings params;
-    opts.AddLongOption('s', "benchmark_sizes")
-        .Help("left and right table sizes to choose for joins benchmark. visit NBenchmarkSizes namespace in "
-              "benchmark_settings.cpp to see exact values")
+    NKikimr::NMiniKQL::TPreset(*presetWithSamples)(int, int) = &NKikimr::NMiniKQL::NBenchmarkSizes::VerySmallSizes; // same as the "small" default of --case, so the pointer is never indeterminate
+    int samples = 1;
+    int scale = 1;
+    opts.AddHelpOption().Help("visit NBenchmarkSizes namespace in benchmark_settings.cpp for explanation");
+    opts.AddLongOption('c', "case")
+        .Help("left and right table sizes to choose for joins benchmark.")
         .Choices({"exp", "linear", "small"})
         .DefaultValue("small")
         .Handler1([&](const NLastGetopt::TOptsParser* option) {
             auto val = TStringBuf(option->CurVal());
-            auto preset = [&]() -> NKikimr::NMiniKQL::TBenchmarkSettings::TPreset {
+            presetWithSamples = [&]() {
                 if (val == "exp") {
-                    return {NKikimr::NMiniKQL::NBenchmarkSizes::ExponentialSizeIncrease(), "ExpGrowth"};
+                    return &NKikimr::NMiniKQL::NBenchmarkSizes::ExponentialSizeIncrease;
                 } else if (val == "linear") {
-                    return {NKikimr::NMiniKQL::NBenchmarkSizes::LinearSizeIncrease(), "LinearGrowth"};
+                    return &NKikimr::NMiniKQL::NBenchmarkSizes::LinearSizeIncrease;
                 } else if (val == "small") {
-                    return {NKikimr::NMiniKQL::NBenchmarkSizes::VerySmallSizes(), "VerySmall"};
+                    return &NKikimr::NMiniKQL::NBenchmarkSizes::VerySmallSizes;
                 } else {
                     Y_ABORT("unknown option for benchmark_sizes");
                 }
             }();
-            params.Presets.push_back(preset);
         });
-
+    opts.AddLongOption('s', "samples").Help("number representing how much to repeat single case. useful for noise reduction.").DefaultValue(1).StoreResult(&samples);
+    opts.AddLongOption("scale").Help("multiplier for the smallest table size of the exp and linear cases (base is 1 << 18 rows)").DefaultValue(1).StoreResult(&scale); // default is 1: presets compute (1 << 18) * scale in int, so a default of 1 << 18 overflowed
     params.Algorithms = {
         NKikimr::NMiniKQL::ETestedJoinAlgo::kBlockMap,
         // NKikimr::NMiniKQL::ETestedJoinAlgo::kBlockHash,
@@ -70,6 +73,7 @@ int main(int argc, char** argv) {
     };
 
     NLastGetopt::TOptsParseResult parsedOptions(&opts, argc, argv);
+    params.Presets.push_back(presetWithSamples(samples, scale));
     AddLittleLeftTablePreset(params);
 
     auto benchmarkResults = NKikimr::NMiniKQL::RunJoinsBench(params);