Currently Release Management uses a sys-perf comparison on the Performance Discovery plugin as: <br><br>

<center>$ 50 \% \lt \left ( 100 \times {\LARGE \frac{y_{rc}}{y_{ga}}} \right )  \lt 150 \% $ </center>

where $ \large y_{rc}$ is the measurement of the new release candidate, and $ \large y_{ga}$ is the measurement of the last point release.

The proposal is to use the new variables: <br>
<center> <b>percent</b> = $  100 \times \LARGE \left ( \frac{y_{rc} - \bar{y}}{\bar{y}} \right ) $ </center>
<br>
<center><b>z_score</b> = $ \LARGE \frac{y_{rc} - \bar{y}}{\sigma_y} $ </center>

where $\large \bar{y} $ and $ \large \sigma_y $ refer to the mean and standard deviation since the last <b>Change Point</b>.

Advantages:
- more accurate as it uses more of the time series data
- can tighten the current filter from $\pm 50 \%$
- reduces signal to noise
- Use different limits for iop/s and latency

Disadvantages:
- takes more time to run the analysis: 
    -  4 minutes to load all the 4.4.7/4.4.8 tasks over REST
    -  6 minutes to run the mean/standard deviation algorithm on the analytics database (3,000 charts)
- Not all the legacy data is available (started over a year ago, but some tests have been broken)
    
To Do:
- Pick the limits - run on different branches over the next few releases
    - abs(percent) > 25% | abs(z_score) > 2
- Understand the new metrics
    - system cpu user (%) - mean
    - ss mem resident (MiB) - mean
    - Data - disk xvde utilization (%) - mean
    - Journal - disk xvdf utilization (%) - mean


In [1]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from more_itertools import pairwise
import re
import requests
import json
import yaml
import os
from jupyter_datatables import init_datatables_mode

from nblib import data, perfdisclib



In [4]:
build_a = "sys_perf_4.4_abb6b9c2bf675e9e2aeaecba05f0f8359d99e203" 
build_a_label = '4.4.7'
build_b = "sys_perf_4.4_83b8bb8b6b325d8d8d3dfd2ad9f744bdad7d6ca0"
build_b_label = '4.4.8'

In [5]:
perf_disc_info = perfdisclib.Info(max_tasks=4000, max_tests=20000, batch=100, build_a=build_a, build_a_label=build_a_label, build_b=build_b, build_b_label=build_b_label)
perf_disc_analysis = perfdisclib.Analysis(perf_disc_info)

In [6]:
client = data.PerfAtlasClient().conn()

In [7]:
%%time

# Get the list of tasks from the 2 commits.

dfa = perf_disc_analysis.read_task_list(perf_disc_analysis.pd_info.build_a)
dfb = perf_disc_analysis.read_task_list(perf_disc_analysis.pd_info.build_b)

Fetching tasks for sys_perf_4.4_abb6b9c2bf675e9e2aeaecba05f0f8359d99e203
Tasks fetched: 4000
Finished fetching tasks
Fetching tasks for sys_perf_4.4_83b8bb8b6b325d8d8d3dfd2ad9f744bdad7d6ca0
Tasks fetched: 4000
Finished fetching tasks
CPU times: user 2.16 s, sys: 414 ms, total: 2.57 s
Wall time: 40.3 s


In [14]:
# task_filters = [re.compile("^(canary_|fio_|iperf|NetworkBandwidth|[01]_1c_avg_latency|[01]_1c_max_latency|oplog1|finishing|CleanUp|"
#     "Setup|Quiesce|GennyOverhead|ShardCollection|EnableSharding|genny_canaries|nop_)"),
#     re.compile("Setup|ActorFinished|ActorStarted")]

task_filers = [re.compile("^(CleanUp|canary|fio|iperf|NetworkBandwidth|finishing|Setup|Quiesce|GennyOverhead)"), re.compile('ActorFinished|ActorStarted|Setup')]

In [15]:
%%time

dfa1, dfb1 = perf_disc_analysis.filter_and_merge(dfa, dfb, task_filters)

  print(df_filtered.test.str.contains(task_filter))

  df_filtered = df_filtered[~df_filtered.test.str.contains(task_filter)]



dfa length = 35323, dfb length = 38354
20       False
21       False
22       False
23       False
24       False
         ...  
40854    False
40855    False
40856    False
40857    False
40858    False
Name: test, Length: 35323, dtype: bool
20       False
21       False
22       False
23       False
24       False
         ...  
40854    False
40855    False
40856    False
40857    False
40858    False
Name: test, Length: 35139, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
44265    False
44266    False
44267    False
44268    False
44269    False
Name: test, Length: 38354, dtype: bool
0        False
1        False
2        False
3        False
4        False
         ...  
44265    False
44266    False
44267    False
44268    False
44269    False
Name: test, Length: 38220, dtype: bool
filtered dfa length = 35139, filtered dfb length = 38220
length of merged comparison = 24543
length after de-dup = 22500
length after keeping int

In [10]:
%%time

# Filter and merge the tasks from the 2 commits

def filter_canaries(dframe):

    dframe_filtered = dframe[~dframe.test.str.match('CleanUp|canary|fio|iperf|NetworkBandwidth|finishing|Setup|Quiesce|GennyOverhead')]
    dframe_filtered = dframe_filtered[~dframe_filtered.test.str.contains('ActorFinished|ActorStarted|Setup')]
    return dframe_filtered

print('dfa length = ', len(dfa),' dfb length = ', len(dfb))

dfa = filter_canaries(dfa)
dfb = filter_canaries(dfb)

print('filtered ', len(dfa),' ', len(dfb))

dfa["args"]= dfa["args"].apply(json.dumps)
dfb["args"]= dfb["args"].apply(json.dumps)

# merge our results together:
comparison = dfa.merge(dfb, on=["project","variant","task","test","measurement","args"])

print('length of merged comparison = ', len(comparison))

found_ts = comparison[["project","variant","task","test","measurement","args"]]

# We drop duplicates since there could be multiple executions for the same combination of the properties below.
found_ts = found_ts.drop_duplicates()

print('length after de-dup = ', len(found_ts))

# keep the interesting metrics
found_ts = found_ts[found_ts["measurement"].isin(['AverageLatency',
                                                  'ops_per_sec',
                                                  'system cpu user (%) - mean',
                                                  'ss mem resident (MiB) - mean',
                                                  'Data - disk xvde utilization (%) - mean',
                                                  'Journal - disk xvdf utilization (%) - mean'])]

print('length after keeping interesting metrics = ', len(found_ts))



dfa length =  40859  dfb length =  44270
filtered  35323   38354
length of merged comparison =  24719
length after de-dup =  22550
length after keeping interesting metrics =  1756
CPU times: user 336 ms, sys: 19.6 ms, total: 355 ms
Wall time: 358 ms


In [18]:
pd.concat([dfa,dfa1]).drop_duplicates(keep=False)

Unnamed: 0,project,variant,task,test,measurement,args,execution,value
20,sys-perf-4.4,linux-shard-lite,bestbuy_query,find_project_6_sort_unindexed_skip_limit_1-noAgg,ops_per_sec,"{""thread_level"": 1}",1,2.954792e-01
21,sys-perf-4.4,linux-shard-lite,bestbuy_query,find_project_6_sort_indexed_skip_limit_1-noAgg,ops_per_sec,"{""thread_level"": 32}",1,2.572742e+03
22,sys-perf-4.4,linux-shard-lite,bestbuy_query,find_project_6_sort_indexed_skip_limit_1-noAgg,ops_per_sec,"{""thread_level"": 16}",1,2.578922e+03
23,sys-perf-4.4,linux-shard-lite,bestbuy_query,find_project_6_sort_indexed_skip_limit_1-noAgg,ops_per_sec,"{""thread_level"": 1}",1,2.746334e+02
24,sys-perf-4.4,linux-shard-lite,bestbuy_query,find_project_skip_limit-noAgg,ops_per_sec,"{""thread_level"": 1}",1,1.774970e+01
...,...,...,...,...,...,...,...,...
40371,sys-perf-4.4,linux-standalone,union_with,AddCollections.DatabaseOperation.1.1,ErrorsTotal,"""null""",0,0.000000e+00
40372,sys-perf-4.4,linux-standalone,union_with,AddCollections.DatabaseOperation.1.1,OperationsTotal,"""null""",0,5.000000e+00
40373,sys-perf-4.4,linux-standalone,union_with,AddCollections.DatabaseOperation.1.1,DocumentsTotal,"""null""",0,0.000000e+00
40374,sys-perf-4.4,linux-standalone,union_with,AddCollections.DatabaseOperation.1.1,SizeTotal,"""null""",0,0.000000e+00


In [16]:
# From Alex Costas: Algorithm to look up time series from the anaytics node in able to characterize 
# the stable region of results around build_a.

def get_stable_region(commit_date, ts, cps):
       
    true_positive_orders = {
        cp["order"]
        for cp in cps
        if cp["triage"]["triage_status"] == "true_positive"
    }
    len_ts = len(ts["data"])
    stable_region_bounds = (
        [0]
        + [idx for idx, datum in enumerate(ts["data"]) if datum["order"] in true_positive_orders]
        + [len_ts]
    )

    start = end = 0

    # if base commit before or after the entire time series, get the closest stable region
    if commit_date < ts["data"][0]["commit_date"]:
        # first stable region
        start = stable_region_bounds[0]
        end = stable_region_bounds[1]

    if commit_date > ts["data"][len_ts - 1]["commit_date"]:
        # last stable region
        start = stable_region_bounds[-2]
        end = stable_region_bounds[-1]

    for start_bound, end_bound in pairwise(stable_region_bounds):
        if (
            ts["data"][start_bound]["commit_date"]
            <= commit_date
            <= ts["data"][end_bound - 1]["commit_date"]
        ):
            start = start_bound
            end = end_bound
    return [datum["value"] for datum in ts["data"][start:end]]

In [None]:
%%time

# Calculate the means and std dev for the Zscores
# Must be on VPN to read the analytics DB
print('')
# limit number of tests
found_ts = found_ts[0:max_tests]

total = len(found_ts)

stable_mean = []
stable_std = []
stable_length = []

date_a = client["expanded_metrics"]["versions"].find_one({"version_id": build_a})["commit_date"]
date_b = client["expanded_metrics"]["versions"].find_one({"version_id": build_b})["commit_date"]

for index, row in found_ts.iterrows():
    # some tests do not have threads.
    if row["args"] == "null":
            row["args"] = "{}"
    ts = client["expanded_metrics"]["time_series"].find_one({
            "project": row["project"],
            "variant": row["variant"],
            "task": row["task"],
            "test": row["test"],
            "args": json.loads(row["args"]),
            "measurement": row["measurement"],
        })
    cps = list(client["expanded_metrics"]["change_points"].find({
            "time_series_info.project": row["project"],
            "time_series_info.variant": row["variant"],
            "time_series_info.task": row["task"],
            "time_series_info.test": row["test"],
            "time_series_info.args": json.loads(row["args"]),
            "time_series_info.measurement": row["measurement"],
    }))
    
    try:
      stable_region = get_stable_region(date_a, ts, cps)
      stable_mean.append(np.mean(stable_region))
      stable_std.append(np.std(stable_region))
      stable_length.append(len(stable_region))
    except:
        # no stable region found
        print('')
        print('no stable region found for ', len(stable_length))
        print('')
        stable_mean.append(np.nan)
        stable_std.append(np.nan)
        stable_length.append(0)
        pass
    
    print('{}/{}'.format(len(stable_length), total), end='\r')

print('')
found_ts.insert(0, "stable_mean", stable_mean)
found_ts.insert(1, "stable_std", stable_std)
found_ts.insert(2, "stable_length", stable_length)


In [None]:
# merge the results together:
comparison = comparison.merge(found_ts, on=["project","variant","task","test","measurement","args"])

#comparison["difference"] = comparison["value_y"] - comparison["value_x"]
#comparison["percentage_change"] = ((comparison["value_y"] / comparison["value_x"]) * 100) - 100
#comparison["difference_from_stable_mean"] = comparison["value_y"] - comparison["stable_mean"]
comparison["percent"] = ((comparison["value_y"] / (1.E-3+comparison["stable_mean"])) * 100) - 100
comparison["z_score"] = (comparison["value_y"] - comparison["stable_mean"]) / (1.E-3+comparison["stable_std"])

In [None]:
# save the data to CSV
with open("compare.csv", "w") as csv:
    comparison.to_csv(csv)
    

In [None]:
# histogram the Zscores
%matplotlib inline

import seaborn as sns
sns.set_theme()

plt.rcParams["figure.figsize"] = (14,14)

comparison["z_score"].hist(by=comparison["measurement"])


In [None]:
# histogram the % changes
%matplotlib inline

import seaborn as sns
sns.set_theme()

plt.rcParams["figure.figsize"] = (14,14)

comparison["percent"].hist(by=comparison["measurement"])

In [None]:
# scatter plots 

# %matplotlib widget
# %matplotlib ipympl

# %matplotlib inline
# loses Engineering format
# import mpld3
# mpld3.enable_notebook()
from matplotlib.ticker import EngFormatter

params = {'legend.fontsize': 'large',
          'figure.figsize': (12, 8),
         'axes.labelsize': 16,
         'axes.titlesize': 16,
         'xtick.labelsize':14,
         'ytick.labelsize':14
         }
plt.rcParams.update(params)

fig, axs = plt.subplots(3,2, figsize=(12,12))
fig.subplots_adjust(hspace = .5, wspace=.5)

axs = axs.ravel()
i=0
for t in ['AverageLatency',
'ops_per_sec',
'system cpu user (%) - mean',
'ss mem resident (MiB) - mean',
'Data - disk xvde utilization (%) - mean',
'Journal - disk xvdf utilization (%) - mean']:
    axs[i].yaxis.set_major_formatter(EngFormatter()) 
    axs[i].set_title(t)
    axs[i].set(xlabel="percent", ylabel="z_score")
    axs[i].scatter(comparison["percent"][(comparison["measurement"] == t)],
             comparison["z_score"][(comparison["measurement"] == t)], s=5)
    i=i+1
    



In [None]:
def plot_timeseries(row_num):
    
    # put chart on a new pop-up    
    from IPython import get_ipython
    # %matplotlib widget
    %matplotlib qt

    project = comparison.loc[row_num, 'project']
    variant = comparison.loc[row_num, 'variant']
    task = comparison.loc[row_num, 'task']
    test = comparison.loc[row_num, 'test']
    measurement = comparison.loc[row_num, 'measurement']    
    args = comparison.loc[row_num, 'args']
    value_x = comparison.loc[row_num, 'value_x']
    value_y = comparison.loc[row_num, 'value_y']
    z_score = comparison.loc[row_num, 'z_score']
    percent = comparison.loc[row_num, 'percent']
    stable_mean = comparison.loc[row_num, 'stable_mean']
    stable_std = comparison.loc[row_num, 'stable_std']

    time_series = client["expanded_metrics"]["time_series"].find_one(
        { "project": project,
          "variant": variant, 
          "test": test, 
          "task": task, 
          "measurement": measurement,
         "args": json.loads(args)
        }
    )
        
    dates = [time_series_point["commit_date"] for time_series_point in time_series["data"]]
    values = [time_series_point["value"] for time_series_point in time_series["data"]]

    params = {'legend.fontsize': 'x-large',
          'figure.figsize': (16, 6),
         'axes.labelsize': 24,
         'axes.titlesize': 24,
         'xtick.labelsize':10,
         'ytick.labelsize':18}
    plt.rcParams.update(params)

    plt.suptitle(variant+' '+task+' '+test, fontsize=16)
    plt.title("z_score = {:.2f}".format(z_score)+"  percent = {:.2f}".format(percent), fontsize=10, loc='left')
    plt.plot(dates, values)
    
    plt.xlabel("Commit Date")
    plt.ylabel(time_series["measurement"])
    
    # add marks for the commits
    plt.axvline(date_a, color="green", linestyle="dotted")
    plt.text(date_a, value_x, build_a_label, rotation=90, fontsize=20)
    plt.axhline(value_x, color="green", linestyle="dotted" )
    plt.axvline(date_b, color="red", linestyle="dashed")
    plt.text(date_b, value_y, build_b_label, rotation=90, fontsize=20)
    plt.axhline(value_y, color="red", linestyle="dashed" )
    plt.axhline(stable_mean, color="purple", linestyle="dashdot" )
    plt.axhspan(stable_mean-stable_std, stable_mean+stable_std, facecolor="purple", alpha=0.05)
    
    plt.show()
    

In [None]:
# print the table as a qgrid

# increase size of output window
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 48em; }</style>"))

comparison = comparison.sort_values(by=['z_score', 'percent'], ignore_index=True)

df = pd.DataFrame(comparison)

pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 20)
pd.options.display.float_format = '{:.2f}'.format
# qgrid floating format
pd.set_option('display.precision', 3)

# add filter here to remove ok looking z_score & percentage differences

ddf = df[[ 'variant', 'task', 'test', 'measurement',  'z_score', 'percent', 
          'value_x', 'value_y', 'stable_mean', 'stable_length', 'stable_std', 'args']]

# save to disk
with open(f"selected_tasks_{build_a_label}_{build_b_label}.csv", "w") as csv:
    ddf.to_csv(csv)
    
import ipydatagrid

info_grid = ipydatagrid.DataGrid(ddf)

# display plot when row is selected
def on_row_selected(change):    
    plot_timeseries(change.new[0])
    
info_grid.observe(on_row_selected, names=['_selected_rows'])

print('Click on a row to see the time-series')
info_grid