In [7]:
import json
import matplotlib.pyplot as plt
import prettytable

def _metric_value(field):
    """Return the leading number of a ``"<value> <unit>"`` benchmark field.

    Surrounding whitespace (padding, trailing newline) is ignored, and the
    unit is dropped regardless of its name or length.

    Raises:
        ValueError: if the field does not start with a parseable number.
    """
    value, _, _unit = field.strip().partition(" ")
    return float(value)


def load_data(file_name):
    """Parse benchmark result rows from ``go test -json`` output.

    Expects the output of::

        go test -bench=BenchmarkQuantizationRecall -run=^$ ./adapters/repos/db/vector/compressionhelpers/ -count=1 -benchtime=1x -json

    Each line of the file is one JSON event. Only events whose ``Output``
    is a tab-separated result line with at least eight fields are used; the
    first field must look like ``<prefix>|<dataset>|<description>|...``.
    Blank lines and events that do not match this shape are skipped.

    Args:
        file_name: path of the file holding the JSON event stream.

    Returns:
        A list of dicts with the keys ``dataset``, ``algorithm`` (the first
        two characters of the description), ``bits``, ``description``,
        ``recall100@100`` and ``recall100@200``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        ValueError: if a metric field of a result line is not numeric.
    """
    rows = []
    with open(file_name, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue
            event = json.loads(line)
            output = event.get("Output") if isinstance(event, dict) else None
            if not isinstance(output, str) or "\t" not in output:
                continue

            tabs = output.split("\t")
            # Fields 3, 6 and 7 are read below, so eight fields are required.
            if len(tabs) < 8:
                continue

            name_parts = tabs[0].split("|")
            if len(name_parts) < 3:
                continue
            dataset, description = name_parts[1:3]

            rows.append({
                "dataset": dataset,
                "algorithm": description[:2],
                "bits": _metric_value(tabs[3]),
                "description": description,
                "recall100@100": _metric_value(tabs[6]),
                "recall100@200": _metric_value(tabs[7]),
            })
    return rows

def extract_column(dataset, property, rows):
    """Collect one field from every row that belongs to ``dataset``.

    Args:
        dataset: value that a row's ``"dataset"`` entry must equal.
        property: key of the field to collect from each matching row.
        rows: iterable of dict-like rows, each with a ``"dataset"`` key.

    Returns:
        A list with ``row[property]`` for each matching row, in the order
        the rows appear in ``rows``.
    """
    return [row[property] for row in rows if row["dataset"] == dataset]

def plot(file_name, dataset):
    """Scatter-plot recall100@100 against bits/dimension for one dataset.

    Rows are loaded with ``load_data`` and filtered to ``dataset``. Points
    are coloured by their ``algorithm`` value, each point is annotated with
    the first two characters of its description, and a legend maps colours
    to algorithms. Algorithms without an entry in the colour map are drawn
    in a neutral fallback colour instead of raising ``KeyError``.

    Args:
        file_name: path of the benchmark output file passed to ``load_data``.
        dataset: name of the dataset whose rows are plotted; also used as
            the plot title.
    """
    rows = load_data(file_name)
    bits = extract_column(dataset, "bits", rows)
    algorithm = extract_column(dataset, "algorithm", rows)
    description = extract_column(dataset, "description", rows)
    rec100at100 = extract_column(dataset, "recall100@100", rows)
    colormap = {"BQ": "green", "PQ": "blue", "SQ": "purple", "RQ": "orange"}
    fallback_color = "gray"

    _fig, ax = plt.subplots()
    # One scatter call per algorithm so that each gets its own legend entry;
    # dict.fromkeys keeps the algorithms unique and in first-seen order.
    for name in dict.fromkeys(algorithm):
        xs = [b for b, a in zip(bits, algorithm) if a == name]
        ys = [r for r, a in zip(rec100at100, algorithm) if a == name]
        ax.scatter(xs, ys, color=colormap.get(name, fallback_color), label=name)
    ax.grid(True)
    for desc, x, y in zip(description, bits, rec100at100):
        # Offset the label slightly to the right of its point.
        ax.annotate(desc[:2], (x + 0.2, y))
    ax.set_ylabel("recall100@100")
    ax.set_xlabel("bits/dimension")
    # Add legend with colors (skipped when there is nothing to label).
    if algorithm:
        ax.legend()

    ax.set_title(dataset)
    plt.show()

# Algorithm / Dataset -> recall
def ascii_table(file_name, datasets):
    """Print a table of recall figures, one column per dataset.

    The "Algorithm" and "Bits" columns come from the rows of
    ``datasets[0]``. Every dataset then contributes one column whose cells
    read ``recall100@100 (recall100@200)``, both with three decimals. Cells
    are matched to rows by position, so each dataset is expected to list
    its configurations in the same order as the first one.

    A divider is requested before each row whose bits value differs from
    the previous row's (the comparison starts from the value 1).

    Args:
        file_name: path of the benchmark output file passed to ``load_data``.
        datasets: list of dataset names; must contain at least one entry.
    """
    records = load_data(file_name)

    header = ["Algorithm", "Bits"] + datasets

    reference = datasets[0]
    table_columns = [
        extract_column(reference, "description", records),
        extract_column(reference, "bits", records),
    ]
    for name in datasets:
        recall_at_100 = extract_column(name, "recall100@100", records)
        recall_at_200 = extract_column(name, "recall100@200", records)
        table_columns.append(
            [f"{r1:.3f} ({r2:.3f})" for (r1, r2) in zip(recall_at_100, recall_at_200)]
        )

    table = prettytable.PrettyTable()
    table.field_names = header

    last_bits = 1
    for index in range(len(table_columns[0])):
        cells = [column[index] for column in table_columns]
        # Column 1 holds the bits value; separate groups of equal bits.
        if cells[1] != last_bits:
            table.add_divider()
            last_bits = cells[1]
        table.add_row(cells)

    print(table)

# Datasets to show as table columns. Commented-out entries are left out of
# the report. The first active entry also supplies the "Algorithm" and
# "Bits" label columns in ascii_table.
datasets = [
    # "sift-128-euclidean",
    # "glove-200-angular",
    # "gist-960-euclidean",
    # "dbpedia-100k-openai-ada002-euclidean",
    # "dbpedia-100k-openai-ada002-angular",
    # "dbpedia-100k-openai-3large-dot",
    "dbpedia-500k-openai-ada002-euclidean",
    "dbpedia-openai-1000k-angular",
    "sphere-1M-meta-dpr",
    "snowflake-msmarco-arctic-embed-m-v1.5-angular",
    ]
# Print the recall table for the benchmark results stored in "fastrq.txt".
ascii_table("fastrq.txt", datasets)



+----------------------------+------+--------------------------------------+------------------------------+--------------------+-----------------------------------------------+
|         Algorithm          | Bits | dbpedia-500k-openai-ada002-euclidean | dbpedia-openai-1000k-angular | sphere-1M-meta-dpr | snowflake-msmarco-arctic-embed-m-v1.5-angular |
+----------------------------+------+--------------------------------------+------------------------------+--------------------+-----------------------------------------------+
| FastRQ-Rank0-Deterministic | 4.0  |            0.880 (0.996)             |        0.889 (0.997)         |   0.742 (0.914)    |                 0.937 (1.000)                 |
+----------------------------+------+--------------------------------------+------------------------------+--------------------+-----------------------------------------------+
| FastRQ-Rank0-Deterministic | 6.0  |            0.970 (1.000)             |        0.973 (1.000)         |   0.933