In [1]:
!pip install pandas seaborn scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import glob
from scipy import stats
import numpy as np

# Turn on pandas copy-on-write mode for the whole session.
pd.set_option("mode.copy_on_write", True)

In [3]:
# Load the category sheet and convert it to {column name: {row label: value}}.
categories_df = pd.read_csv("~/Downloads/2024-12-18 _ Benchmark Results - Categories.csv")
d = categories_df.to_dict()


def extract_values(l):
    """Return the values stored under column ``l`` of ``d``, ordered by row label."""
    column = d[l]
    return [column[row_label] for row_label in sorted(column)]


# Lookup table: (workload, operation) -> category.
# If the same pair appears more than once, the last row wins.
category_map = dict(
    zip(
        zip(extract_values('Workload'), extract_values('Operation')),
        extract_values('Category'),
    )
)

In [4]:
# Load the raw benchmark export and keep only the rows for the service_time metric.
df = pd.read_csv("~/Downloads/2024-12-10 _ Benchmark Results - raw.csv")
is_service_time = df['name'] == 'service_time'
df = df[is_service_time]

# Tag each row with its category, looked up by (workload, operation).
# Pairs that are absent from category_map get None.
df['category'] = [
    category_map.get(pair, None)
    for pair in zip(df['workload'], df['operation'])
]

# Exclude the distribution versions that are not part of this comparison.
excluded_versions = df["distribution-version"].isin(["8.8.1", "8.11.4"])
df = df[~excluded_versions]
print(df['distribution-version'].unique())
df.head()

['2.18.0' '2.17.1' '8.15.4' '2.16.0']


Unnamed: 0,user-tags\.run-group,environment,user-tags\.engine-type,distribution-version,workload,workload_subtype,test-procedure,user-tags\.run,operation,name,...,value\.90_0,workload\.target_throughput,workload\.number_of_replicas,workload\.bulk_indexing_clients,workload\.max_num_segments,user-tags\.shard-count,user-tags\.replica-count,workload\.query_data_set_corpus,workload\.target_index_body,category
60,2024-12-02 2:37:20,gh-nightly-1733107040,OS,2.18.0,noaa_semantic_search,,hybrid-query-aggs-no-index,0,aggs-query-min-avg-sum-hybrid,service_time,...,2.164169,,0.0,,8.0,6,0,,,Aggregation
61,2024-12-02 2:37:20,gh-nightly-1733107040,OS,2.18.0,noaa_semantic_search,,hybrid-query-aggs-no-index,0,aggs-query-range-numeric-significant-terms-hybrid,service_time,...,1.829712,,0.0,,8.0,6,0,,,Aggregation
62,2024-12-02 2:37:20,gh-nightly-1733107040,OS,2.18.0,noaa_semantic_search,,hybrid-query-aggs-no-index,0,aggs-query-date-histo-geohash-grid-hybrid-one-...,service_time,...,1.465304,,0.0,,8.0,6,0,,,Aggregation
63,2024-12-02 2:37:20,gh-nightly-1733107040,OS,2.18.0,noaa_semantic_search,,hybrid-query-aggs-no-index,0,aggs-query-min-avg-sum-hybrid-one-subquery-med...,service_time,...,1.427159,,0.0,,8.0,6,0,,,Aggregation
64,2024-12-02 2:37:20,gh-nightly-1733107040,OS,2.18.0,noaa_semantic_search,,hybrid-query-aggs-no-index,0,aggs-query-term-min-hybrid-one-subquery-medium...,service_time,...,1.432932,,0.0,,8.0,6,0,,,Aggregation


In [5]:
# Keep only the two distribution versions that are compared against each other.
df_engine = df[df["distribution-version"].isin(["8.15.4", "2.18.0"])]

# Columns that identify one (engine, version, operation, category) combination.
# NOTE: groupby drops rows whose key is missing (pandas default dropna=True), so
# operations without a category do not appear in the result.
group_columns = ["user-tags\\.engine-type", "distribution-version", "operation", "category"]

# One record per workload and group; "p90th" is the mean of the group's value\.90_0 column.
results = []
for workload in ["pmc", "nyc_taxis", "noaa", "big5", "noaa_semantic_search"]:
    df_workload = df_engine[df_engine['workload'] == workload]

    for group_key, group in df_workload.groupby(group_columns):
        engine_type, engine_version, operation, category = group_key
        results.append(dict(
            workload=workload,
            engine_type=engine_type,
            engine_version=engine_version,
            operation=operation,
            category=category,
            p90th=group["value\\.90_0"].mean(),
        ))

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,workload,engine_type,engine_version,operation,category,p90th
0,pmc,ES,8.15.4,articles_monthly_agg_cached,Aggregation,1.218702
1,pmc,ES,8.15.4,articles_monthly_agg_uncached,Aggregation,6.788204
2,pmc,ES,8.15.4,default,General Operations,3.103733
3,pmc,ES,8.15.4,phrase,Text Querying,3.634737
4,pmc,ES,8.15.4,scroll,General Operations,257.126625
...,...,...,...,...,...,...
171,noaa_semantic_search,OS,2.18.0,aggs-query-term-min-hybrid-one-subquery-medium...,Aggregation,1.462272
172,noaa_semantic_search,OS,2.18.0,hybrid-query-only-range,Hybrid Query,1.495748
173,noaa_semantic_search,OS,2.18.0,hybrid-query-only-range-large-subset,Hybrid Query,1.466839
174,noaa_semantic_search,OS,2.18.0,hybrid-query-only-range-medium-subset,Hybrid Query,1.480840


In [22]:
def _print_section(title, rows, sort_key, describe_gap):
    """Print one report section: two blank lines, a rule, a header with the row count, then the rows.

    title        -- text placed after "# " in the header line.
    rows         -- tuples of (category, workload, engine_type, operation, es_val, os_val, pct_diff).
    sort_key     -- key function used to order the rows.
    describe_gap -- maps pct_diff to the text printed after the slower engine's name.
    """
    print()
    print()
    print("====================")
    print(f"# {title} ({len(rows)})")
    for category, workload, engine_type, operation, es_val, os_val, pct_diff in sorted(rows, key=sort_key):
        # Built separately: reusing the same quote character inside an f-string
        # replacement field is a syntax error before Python 3.12.
        label = f"{workload} / {operation}"
        print(f"{category:<25} | {label:<60} | ES {es_val:>7.2f}ms | OS {os_val:>7.2f}ms | {engine_type} {describe_gap(pct_diff)}")


close_results = []
big_results = []
threshold = 0.05          # differences below this fraction count as "close"
huge_threshold_pct = 500  # differences above this percentage are listed again as "huge"
threshold_pct = threshold * 100

for (workload, operation, category), group in results_df.groupby(["workload", "operation", "category"]):
    es_group = group[group["engine_type"] == "ES"]
    os_group = group[group["engine_type"] == "OS"]

    # Only operations measured on both engines can be compared.
    if es_group.empty or os_group.empty:
        continue

    es_val = es_group["p90th"].iloc[0]
    os_val = os_group["p90th"].iloc[0]

    # A missing measurement cannot be compared; previously this silently reused the
    # pct_diff of the preceding group (or raised NameError on the first group).
    if pd.isna(es_val) or pd.isna(os_val):
        continue

    # Percentage by which the slower engine exceeds the faster one.
    # Equal values are an explicit 0% difference (previously pct_diff was left unassigned).
    if es_val == os_val:
        pct_diff = 0.0
    else:
        slower_val, faster_val = max(es_val, os_val), min(es_val, os_val)
        pct_diff = slower_val / faster_val * 100 - 100

    row = (
        category,
        workload,
        # Engine with the larger (slower) value; an exact tie is labelled "OS" with 0%.
        "ES" if es_val > os_val else "OS",
        operation,
        es_val,
        os_val,
        pct_diff,
    )
    if pct_diff < threshold_pct:
        close_results.append(row)
    else:
        big_results.append(row)

_print_section(
    f"< {threshold:.0%} difference",
    close_results,
    lambda x: (x[0], x[-1]),
    lambda pct: f"slower by {pct:.0f}%",
)

_print_section(
    f"> {threshold:.0%} difference",
    big_results,
    lambda x: (x[0], x[-1]),
    lambda pct: f"slower by {pct:.0f}%",
)

huge_results = [x for x in big_results if x[-1] > huge_threshold_pct]
_print_section(
    "Huge differences",
    huge_results,
    lambda x: (x[-1]),
    lambda pct: f"{pct/100:.0f}x slower",
)





# < 5% difference (16)
Date Histogram            | noaa / date-histo-string-significant-terms-via-default-strategy | ES 2665.47ms | OS 2726.23ms | OS slower by 2%
Date Histogram            | noaa / date-histo-numeric-terms                              | ES 2102.18ms | OS 2150.12ms | OS slower by 2%
Date Histogram            | noaa / date-histo-string-significant-terms-via-global-ords   | ES 2661.66ms | OS 2724.63ms | OS slower by 2%
Date Histogram            | noaa / date-histo-string-terms-via-global-ords               | ES 2363.10ms | OS 2442.72ms | OS slower by 3%
Date Histogram            | noaa / date-histo-string-terms-via-default-strategy          | ES 2358.76ms | OS 2438.84ms | OS slower by 3%
Date Histogram            | noaa / date-histo-entire-range                               | ES    2.29ms | OS    2.21ms | ES slower by 4%
Range Queries             | big5 / range_field_conjunction_big_range_big_term_query      | ES    0.84ms | OS    0.84ms | ES slower by 0%
Range Queries