In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Column names applied, in order, to the headerless log CSV once it is loaded.
log_headers = [
    "timesteps",
    "action",
    "latency",
    "file_path",
    "num_query",
]

In [6]:
def parse_func_name(func_name_str):
    """Extract the bare function name from a raw log field.

    The text after the last ":" is taken and its first and last characters
    (the surrounding quotes in ``Func:'load_index'``) are dropped, giving
    ``load_index``. Any field containing the word "completed" is returned
    unchanged.
    """
    _, _, after_last_colon = func_name_str.rpartition(":")
    return func_name_str if "completed" in func_name_str else after_last_colon[1:-1]

def parse_latency(time_str):
    """Convert a latency field to ``float`` after discarding its last character.

    NOTE(review): the dropped character is presumably a unit suffix such as
    "s" -- confirm against whatever writes the log.
    """
    numeric_part = time_str[:-1]
    return float(numeric_part)

def create_gantt_chart(load_index_hbars, search_index_hbars, figsize=(8, 1), ax=None, vline=True):
    """Draw a two-row Gantt chart: index loads below, index searches above.

    Parameters
    ----------
    load_index_hbars, search_index_hbars : sequence of (start, length)
        Bar specifications handed to ``Axes.broken_barh``; loads are drawn in
        dark orange and searches in green.
    figsize : tuple
        Size of the figure created when ``ax`` is None; ignored otherwise.
    ax : matplotlib Axes, optional
        Axes to draw on. When None, a new figure and axes are created.
    vline : bool
        When True, draw a dashed black vertical line at the end
        (start + length) of the last search bar. Nothing is drawn if there
        are no search bars.

    Returns
    -------
    ``(fig, ax)`` when this function created the figure, otherwise just ``ax``.
    """
    created_figure = ax is None
    if created_figure:
        fig, ax = plt.subplots(figsize=figsize)

    bar_width = 0.5
    bar_gap = 0.0
    load_y = 1
    search_y = load_y + bar_width + bar_gap

    # broken_barh takes [(x, length), ...] and (y, height).
    ax.broken_barh(load_index_hbars, (load_y, bar_width), facecolors='darkorange')
    ax.broken_barh(search_index_hbars, (search_y, bar_width), facecolors='green')

    # Centre each tick label on its row.
    ax.set_yticks([load_y + bar_width / 2, search_y + bar_width / 2], ['load', 'search'])

    # Guard against an empty search list: indexing [-1] would raise IndexError.
    if vline and len(search_index_hbars) > 0:
        last_start, last_length = search_index_hbars[-1][0], search_index_hbars[-1][1]
        ax.axvline(x=last_start + last_length, color='black', linestyle='--', alpha=1)

    if created_figure:
        return fig, ax
    return ax

In [7]:
# The log file has no header row, so read it raw and label the columns ourselves.
raw_log = pd.read_csv('../logs/st_index_1_batch.log', header=None)
raw_log.columns = log_headers

df = raw_log.assign(
    # clean up the raw text fields
    action=lambda frame: frame["action"].map(parse_func_name),
    latency=lambda frame: frame["latency"].map(parse_latency),
    # running total of latency, i.e. the point at which each row finishes
    cumulative_latency=lambda frame: frame["latency"].cumsum(),
    # each row starts where the running total stood before it was added
    start_time=lambda frame: frame["cumulative_latency"] - frame["latency"],
)

df.head()

Unnamed: 0,timesteps,action,latency,file_path,num_query,cumulative_latency,start_time
0,2024-04-10 22:55:39,load_index,0.000382,shards/idxs/embeds_centroids.index,0,0.000382,0.0
1,2024-04-10 22:55:39,query_index,0.056684,shards/idxs/embeds_centroids.index,10000,0.057066,0.000382
2,2024-04-10 22:55:39,load_index,0.004468,shards/idxs/embeds_333.index,0,0.061534,0.057066
3,2024-04-10 22:55:39,query_index,0.004908,shards/idxs/embeds_333.index,1048,0.066443,0.061534
4,2024-04-10 22:55:39,load_index,0.005058,shards/idxs/embeds_797.index,0,0.071501,0.066443


In [12]:
# Build the ranking {index file path: num_query of its first "query_index" row},
# ordered from most to least queried.
# NOTE(review): the previous comment described the score as
# num_query * (number of query_index calls), but the code has only ever used
# the first row's num_query -- confirm which is intended.
query_index_df = df[df["action"] == "query_index"]

# Kept for any later cell that still refers to it.
unique_fps = np.unique(query_index_df["file_path"])

# One pass over the frame instead of re-filtering it once per file path:
# keep the first row seen for each file, then sort by score. Ties fall back to
# ascending file path, the same order the earlier np.unique + stable sort gave.
first_query_per_index = query_index_df.drop_duplicates(subset="file_path", keep="first")
ranked = first_query_per_index.sort_values(
    by=["num_query", "file_path"], ascending=[False, True]
)

# .tolist() yields plain Python scalars so the dict displays cleanly.
index_ranking = dict(zip(ranked["file_path"], ranked["num_query"].tolist()))
index_ranking

{'shards/idxs/embeds_centroids.index': 10000,
 'shards/idxs/embeds_64.index': 3111,
 'shards/idxs/embeds_102.index': 3108,
 'shards/idxs/embeds_868.index': 3070,
 'shards/idxs/embeds_111.index': 3062,
 'shards/idxs/embeds_11.index': 3057,
 'shards/idxs/embeds_963.index': 3018,
 'shards/idxs/embeds_101.index': 3008,
 'shards/idxs/embeds_555.index': 2934,
 'shards/idxs/embeds_194.index': 2891,
 'shards/idxs/embeds_646.index': 2772,
 'shards/idxs/embeds_628.index': 2767,
 'shards/idxs/embeds_198.index': 2707,
 'shards/idxs/embeds_407.index': 2640,
 'shards/idxs/embeds_857.index': 2627,
 'shards/idxs/embeds_637.index': 2613,
 'shards/idxs/embeds_557.index': 2582,
 'shards/idxs/embeds_104.index': 2482,
 'shards/idxs/embeds_946.index': 2382,
 'shards/idxs/embeds_79.index': 2250,
 'shards/idxs/embeds_947.index': 2243,
 'shards/idxs/embeds_740.index': 2137,
 'shards/idxs/embeds_356.index': 2107,
 'shards/idxs/embeds_527.index': 2003,
 'shards/idxs/embeds_824.index': 1938,
 'shards/idxs/embeds_