# Load datasets

In [None]:
import os
import time
import numpy as np
import pandas as pd
import pickle
import psutil
import gc

In [None]:
# Colab only: mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the dataset folder on Drive to confirm which pickle files are available.
!ls /content/drive/MyDrive/Colab\ Notebooks/CMPE\ 295\ RAG\ System/dataset

chunks		vs_test.pkl   vs_train_valid.pkl
vs_query10.pkl	vs_train.pkl  vs_valid.pkl


In [None]:
# Helper: read a single pickled object back from disk.
def load_pickle(file_name):
    """Open `file_name` in binary mode and return the object unpickled from it.

    Only point this at files you produced yourself: unpickling can run
    arbitrary code stored in the file.
    """
    with open(file_name, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [None]:
# Read the test split and the merged train+valid split from Drive.
test_data = load_pickle('/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/dataset/vs_test.pkl')
train_valid_data = load_pickle('/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/dataset/vs_train_valid.pkl')

# Transpose each loaded object into a frame and keep only column 0
# (the column whose values are embedded later in this notebook).
train_valid_df = pd.DataFrame(train_valid_data).T[[0]]
test_df = pd.DataFrame(test_data).T[[0]]

# Stack train+valid on top of test and renumber the rows 0..N-1.
combined_df = pd.concat(
    [train_valid_df, test_df],
    axis=0,
    ignore_index=True,
)

print("Combined dataset shape:", combined_df.shape)
print(combined_df.head())

In [None]:
# Flat Python list of the column-0 values with missing entries removed; this is what gets embedded.
dataset_texts = combined_df[0].dropna().tolist()

# Embedding

## Prep

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import torch

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Echo GPU availability so the choice is visible in the cell output.
print(torch.cuda.is_available())

True


In [None]:
# Model identifier handed to SentenceTransformer by the embedding helpers below.
EMBEDDING_MODEL_NAME = "Alibaba-NLP/gte-Qwen2-7B-instruct"

## Testing

In [None]:
def check_dimensions_and_estimate_time(
    dataset_texts,
    sample_size=1000,
    batch_size=32,
    model_name=EMBEDDING_MODEL_NAME,
    device=DEVICE
):
    """Embed a small prefix of the dataset to learn the vector width and throughput.

    Args:
        dataset_texts: sequence of strings to sample from (the first rows are used).
        sample_size: maximum number of rows to embed for the measurement.
        batch_size: batch size passed to `SentenceTransformer.encode`.
        model_name: model identifier passed to `SentenceTransformer`.
        device: device string passed to `SentenceTransformer`.

    Returns:
        dict with keys "dimension" and "sample_time_seconds", or None when
        `dataset_texts` is empty.
    """
    # Check for an empty dataset BEFORE loading the model: loading pulls
    # multi-gigabyte weights, which is pointless when there is nothing to embed.
    n = len(dataset_texts)
    if n == 0:
        print("Dataset is empty!")
        return

    # 1) Load model
    print(f"Loading model: {model_name} on device={device}")
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
    model.max_seq_length = 1024

    # pick up to 'sample_size' from the start
    sample_size = min(sample_size, n)
    sample_texts = dataset_texts[:sample_size]
    print(f"Sample size: {sample_size} (out of {n}).")

    # 2) Time the embedding (perf_counter is monotonic, unlike wall-clock time.time)
    start_time = time.perf_counter()
    emb_sample = model.encode(
        sample_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    elapsed = time.perf_counter() - start_time

    # 3) Print dimension (0 when nothing was embedded)
    dim = emb_sample.shape[1] if emb_sample.shape[0] > 0 else 0
    print(f"Embedding dimension: {dim}")
    print(f"Embedding {sample_size} rows took {elapsed:.2f} seconds.")

    # 4) Extrapolate linearly from the sample to 1M rows
    if sample_size > 0:
        factor = 1_000_000 / sample_size
        est = elapsed * factor
        print(f"Approx time for 1M rows: {est/60:.2f} min (~{est/3600:.2f} hrs)")

    return {
        "dimension": dim,
        "sample_time_seconds": elapsed
    }


In [None]:
# Smoke test on the first 1000 texts: prints the embedding width and a rough full-run time.
# batch_size=8 overrides the function's default of 32.
stats = check_dimensions_and_estimate_time(
    dataset_texts=dataset_texts,
    sample_size=1000,
    batch_size=8,
    model_name=EMBEDDING_MODEL_NAME,
    device=DEVICE
)

Loading model: Alibaba-NLP/gte-Qwen2-7B-instruct on device=cuda


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

Sample size: 1000 (out of 1000000).


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Embedding dimension: 3584
Embedding 1000 rows took 80.33 seconds.
Approx time for 1M rows: 1338.84 min (~22.31 hrs)


## Embedding 1M Rows

In [None]:
def embed_dataset_in_chunks(
    dataset_texts,
    out_dir,
    chunk_size=10000,
    st_batch_size=16,
    model_name="Alibaba-NLP/gte-Qwen2-7B-instruct",
    device="cuda",
    start_chunk_id=0
):
    """
    Splits dataset_texts into chunks of 'chunk_size', starting at 'start_chunk_id'.
    For each chunk, we embed in sub-batches (batch_size=st_batch_size) via SentenceTransformer,
    save a pickled DataFrame [text, embedding] => gte_Qwen2_7B_instruct_{chunk_id}.pkl.

    Args:
        dataset_texts: sequence of strings to embed.
        out_dir: directory that receives one pickle per chunk (created if missing).
        chunk_size: rows per output file; must be positive.
        st_batch_size: batch size passed to `SentenceTransformer.encode`.
        model_name: model identifier passed to `SentenceTransformer`.
        device: device string passed to `SentenceTransformer`.
        start_chunk_id: first chunk to process, so an interrupted run can be
            resumed; rows before start_chunk_id * chunk_size are skipped.

    Raises:
        ValueError: if chunk_size is not positive or start_chunk_id is negative.
    """
    # Reject arguments that would otherwise make the loop silently do nothing
    # while still reporting success.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if start_chunk_id < 0:
        raise ValueError(f"start_chunk_id must be >= 0, got {start_chunk_id}")

    # Create output dir
    os.makedirs(out_dir, exist_ok=True)

    n = len(dataset_texts)
    print(f"Total texts: {n}, chunk_size={chunk_size}, st_batch_size={st_batch_size}, start_chunk_id={start_chunk_id}")

    # Initialize chunk_id and start_idx
    chunk_id  = start_chunk_id
    start_idx = chunk_id * chunk_size  # e.g. chunk_id=26 => start at row 260000

    # Nothing left to do (empty dataset or resume point past the end):
    # return before paying for the model load.
    if start_idx >= n:
        print("Nothing to embed: start position is at or past the end of the dataset.")
        return

    # Load model only once we know there is work for it
    print(f"Loading model: {model_name} on device={device}")
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)
    model.max_seq_length = 1024

    while start_idx < n:
        end_idx = min(start_idx + chunk_size, n)
        chunk_data = dataset_texts[start_idx:end_idx]
        c_size     = len(chunk_data)

        print(f"\nChunk {chunk_id}: rows {start_idx}..{end_idx-1}, size={c_size}")

        # Embed in sub-batches; result has one row per text in the chunk
        embeddings = model.encode(
            chunk_data,
            batch_size=st_batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        # Build a DataFrame: one row per text, embedding stored as a list
        df_chunk = pd.DataFrame({
            "text": chunk_data,
            "embedding": [list(vec) for vec in embeddings]
        })

        # Save atomically: write to a temporary name, then rename. An
        # interrupted session therefore never leaves a truncated *.pkl that
        # a later load would pick up.
        chunk_filename = f"gte_Qwen2_7B_instruct_{chunk_id}.pkl"
        chunk_path = os.path.join(out_dir, chunk_filename)
        tmp_path = chunk_path + ".tmp"
        df_chunk.to_pickle(tmp_path)
        os.replace(tmp_path, chunk_path)
        print(f"Saved chunk_{chunk_id} => {chunk_path}, shape={df_chunk.shape}")

        # free memory before the next chunk
        del df_chunk, chunk_data, embeddings
        torch.cuda.empty_cache()
        gc.collect()

        # move to the next chunk
        start_idx += chunk_size
        chunk_id += 1

    print("\nAll chunks embedded and saved!")


In [None]:
# Drive folder that receives the per-chunk dataset embedding pickles.
ALIBABANLP_EMB_PATH  = '/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Alibaba_NLP_3584/dataset'

# Create the folder (and any missing parents) if it does not exist yet.
os.makedirs(ALIBABANLP_EMB_PATH,  exist_ok=True)

In [None]:
# Embed the dataset in 10000-row chunks, resuming at chunk 71 (row 710000 = 71 * 10000).
# NOTE(review): this presumes chunks 0-70 were already written to ALIBABANLP_EMB_PATH by an
# earlier run; set start_chunk_id=0 for a run from scratch.
embed_dataset_in_chunks(
    dataset_texts = dataset_texts,
    out_dir       = ALIBABANLP_EMB_PATH,
    chunk_size    = 10000,
    st_batch_size = 16,
    model_name    = EMBEDDING_MODEL_NAME,
    device        = "cuda",
    start_chunk_id= 71
)

## Embedding Questions

In [None]:
def load_csv_file(csv_path, column_name="context"):
    """Read a UTF-8 CSV file and return the values of one column as a list.

    Args:
        csv_path: path of the CSV file to read.
        column_name: header of the column to extract.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8')
    wanted_column = frame[column_name]
    return wanted_column.tolist()

In [None]:
# Only the questions are embedded in this section; the context-loading line is left disabled.
# contexts = load_csv_file("/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/qnc/combined/context_final.csv", column_name="context")
questions= load_csv_file("/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/qnc/combined/question_final.csv", column_name="question")

In [None]:
def embed_questions_and_save(
    question_list,
    out_path,
    batch_size=32,
    model_name="Alibaba-NLP/gte-Qwen2-7B-instruct",
    device="cuda"
):
    """
    Encode every entry of 'question_list' with the model's "query" prompt and
    pickle one DataFrame with columns [text, embedding] to 'out_path'.

    Returns None. When 'question_list' is empty a message is printed and
    nothing is loaded or written.

    NOTE(review): unlike the dataset embedder in this notebook, no
    max_seq_length is set on the model here - confirm that is intended.
    """
    # Guard clause: no model load and no output file for an empty input.
    if len(question_list) == 0:
        print("No questions to embed.")
        return

    print(f"Loading model for question embedding: {model_name} on {device}")
    encoder = SentenceTransformer(model_name, device=device, trust_remote_code=True)

    encode_options = dict(
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        prompt_name="query",
    )
    vectors = encoder.encode(question_list, **encode_options)

    # One row per question; each embedding is stored as a plain list.
    columns = {
        "text": question_list,
        "embedding": [list(vector) for vector in vectors],
    }
    question_frame = pd.DataFrame(columns)
    question_frame.to_pickle(out_path)
    print(f"Saved questions to {out_path}, shape={question_frame.shape}")


In [None]:
# Drive folder for the question embeddings.
QUESTIONS_DIR_PATH  = '/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Alibaba_NLP_3584/questions'

# Create the folder (and any missing parents) if it does not exist yet.
os.makedirs(QUESTIONS_DIR_PATH,  exist_ok=True)

# Full path of the single pickle that holds all question embeddings (inside the folder above).
QUESTIONS_EMB_PATH  = '/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Alibaba_NLP_3584/questions/gte_Qwen2_7B_instruct_questions.pkl'

In [None]:
# Embed all loaded questions and write them to QUESTIONS_EMB_PATH.
# batch_size=16 overrides the function's default of 32.
embed_questions_and_save(
    question_list=questions,
    out_path=QUESTIONS_EMB_PATH,
    batch_size=16,
    model_name=EMBEDDING_MODEL_NAME,
    device="cuda"
)

# RAG (POST-Embedding)

## Preparation

### Import

In [None]:
import os
import gc
import numpy as np
import pandas as pd
import time

from sklearn.decomposition import PCA
# import umap
# import faiss
import hnswlib
from sklearn.neighbors import NearestNeighbors


### Variables

In [None]:
# Names of the dimensionality-reduction methods passed as `method_name` to the experiment helpers.
# NOTE(review): make sure each spelling here matches a branch name in reduce_embeddings.
PCA_Method="pca"
UMAP_Method="umap"
TRUNCATE_Method="truncate"

# Target embedding widths used as `target_dim` in the experiments.
DIM_4096 = 4096
DIM_2048 = 2048
DIM_1024 = 1024
DIM_512  = 512
DIM_256  = 256
DIM_128  = 128
DIM_64   = 64
DIM_32   = 32
DIM_16   = 16

### Load Embedding

In [None]:
import glob

In [None]:
def load_chunked_embeddings(folder):
    """Load every `gte_Qwen2_7B_instruct_*.pkl` chunk in `folder` into one DataFrame.

    Chunks are concatenated in numeric chunk-id order (0, 1, 2, ..., 10, ...),
    so the row order matches the order in which the texts were embedded. A
    plain lexicographic sort would put chunk 10 before chunk 2.

    Args:
        folder: directory containing the per-chunk pickles.

    Returns:
        DataFrame with a fresh 0..N-1 index; an empty frame with columns
        ["text", "embedding"] when no chunk file is found.
    """
    def _chunk_order(path):
        # Sort key: numeric suffix after the last "_" when present; files
        # without a numeric suffix sort after all numbered chunks, by name.
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem.rsplit("_", 1)[-1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), path)
        return (1, 0, path)

    pattern = os.path.join(folder, "gte_Qwen2_7B_instruct_*.pkl")
    pkl_files = sorted(glob.glob(pattern), key=_chunk_order)

    if not pkl_files:
        print("No chunk files found in", folder)
        return pd.DataFrame(columns=["text","embedding"])

    # NOTE: read_pickle can execute code embedded in the file; only load chunks you wrote yourself.
    df_list = [pd.read_pickle(pf) for pf in pkl_files]
    return pd.concat(df_list, ignore_index=True)

In [None]:
def load_questions(pkl_path):
    """Return the object stored in the pickle at `pkl_path`, read through pandas."""
    stored_questions = pd.read_pickle(pkl_path)
    return stored_questions

In [None]:
def load_contexts(csv_path):
    """Read a UTF-8 CSV and return its 'context' column as a Python list."""
    context_table = pd.read_csv(csv_path, encoding='utf-8')
    context_column = context_table['context']
    return context_column.tolist()

### Dimension Reduction

In [None]:
def reduce_embeddings(
    dataset_embs,
    question_embs,
    method_name,      # "pca", "truncate"/"truncate_first", "truncate_last", "truncate_four"
    target_dim,
    umap_kwargs=None,
    pca_kwargs=None
):
    """Reduce dataset and question embeddings to `target_dim` columns.

    Supported `method_name` values (case-insensitive):
      * "pca"            - PCA fitted on the dataset, then applied to the questions.
      * "truncate_first" - keep the first `target_dim` columns.
      * "truncate"       - alias of "truncate_first". This is the spelling the
                           notebook's TRUNCATE_Method constant uses, and it was
                           previously rejected as unknown.
      * "truncate_last"  - keep the last `target_dim` columns.
      * "truncate_four"  - keep four equal-width column slices starting at
                           0%, 25%, 50% and 75% of the original width.

    Args:
        dataset_embs: 2-D array of dataset embeddings (rows x D).
        question_embs: 2-D array of question embeddings with the same D.
        method_name: one of the names above.
        target_dim: number of output columns; must be positive.
        umap_kwargs: accepted for call-site compatibility; there is no UMAP
            branch in this function, so it is ignored.
        pca_kwargs: extra keyword arguments forwarded to `PCA`.

    Returns:
        (dataset_red, question_red), both float32.

    Raises:
        ValueError: unknown method, non-positive target_dim, target_dim larger
            than D for a truncation method, or target_dim not divisible by 4
            for "truncate_four".
    """
    if pca_kwargs is None:
        pca_kwargs = {}

    method = method_name.lower()
    if method == "truncate":
        method = "truncate_first"

    # A zero/negative width would silently produce empty or mis-sliced arrays.
    if target_dim <= 0:
        raise ValueError(f"target_dim must be positive, got {target_dim}")

    D = dataset_embs.shape[1]

    # PCA
    if method == "pca":
        pca_model  = PCA(n_components=target_dim, random_state=42, **pca_kwargs)
        dataset_red  = pca_model.fit_transform(dataset_embs)
        question_red = pca_model.transform(question_embs)

    # Truncation variants share the same width check
    elif method in ("truncate_first", "truncate_last", "truncate_four"):
        if target_dim > D:
            raise ValueError(f"target_dim {target_dim} > original dim {D}")

        if method == "truncate_first":
            cols = slice(0, target_dim)
        elif method == "truncate_last":
            cols = slice(D - target_dim, D)
        else:
            if target_dim % 4 != 0:
                raise ValueError("truncate_four requires target_dim divisible by 4")
            seg = target_dim // 4                      # width of each slice
            anchors = [0, 0.25, 0.50, 0.75]            # slice starts at 0%, 25%, 50%, 75% of D
            cols = np.hstack([
                np.arange(int(a * D), int(a * D) + seg)
                for a in anchors
            ])

        dataset_red  = dataset_embs[:, cols]
        question_red = question_embs[:, cols]

    else:
        raise ValueError(f"Unknown method_name '{method_name}'")

    # copy=False: no extra copy when the data is already float32
    return dataset_red.astype(np.float32, copy=False), question_red.astype(np.float32, copy=False)

### Retrieval Functions

In [None]:
def knn_search(dataset_embs, question_embs, top_k=1):
    """Nearest-neighbour lookup with scikit-learn's NearestNeighbors.

    Fits on `dataset_embs`, queries with `question_embs`, and returns only the
    neighbour index array (one row per question, `top_k` columns); the
    distances are discarded.
    """
    searcher = NearestNeighbors(n_neighbors=top_k, algorithm='auto')
    searcher.fit(dataset_embs)
    _distances, neighbor_idx = searcher.kneighbors(
        question_embs,
        n_neighbors=top_k,
        return_distance=True,
    )
    return neighbor_idx  # shape (Q, top_k)

In [None]:
# def hnsw_search(dataset_embs, question_embs, m=32, ef_construction=200, top_k=1):

#     n, dim = dataset_embs.shape
#     index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_L2)
#     index.hnsw.efConstruction = ef_construction

#     index.add(dataset_embs)

#     index.hnsw.efSearch = 64

#     distances, indices = index.search(question_embs, top_k)
#     return indices

In [None]:
def build_hnswlib_index(dataset_embs, space="l2", ef_construction=200, M=16, ef_search=64):
    """Build an hnswlib index over `dataset_embs` and return it ready to query.

    Args:
        dataset_embs: 2-D array (N x dim); row i is stored under label i.
        space: distance space name passed to `hnswlib.Index`.
        ef_construction: build-time parameter passed to `init_index`.
        M: graph connectivity parameter passed to `init_index`.
        ef_search: query-time `ef` set on the index before it is returned.
            Defaults to 64, the value that used to be hard-coded here.

    Returns:
        The populated `hnswlib.Index`.
    """
    N, dim = dataset_embs.shape

    # 1) Create index object
    index = hnswlib.Index(space=space, dim=dim)

    # 2) Init index - max_elements must be known up front
    index.init_index(
        max_elements=N,
        ef_construction=ef_construction,
        M=M
    )

    # 3) Add items, labelled 0..N-1 so results map straight back to row numbers
    index.add_items(dataset_embs, np.arange(N))

    # 4) Query-time ef, now caller-tunable
    index.set_ef(ef_search)
    return index

In [None]:
def hnswlib_search(ds_red, qs_red, space="l2", ef_construction=200, M=16, top_k=1, ef_search=None):
    """Build an HNSW index over `ds_red`, query it with `qs_red`, and time both steps.

    Args:
        ds_red: 2-D array of dataset vectors to index.
        qs_red: 2-D array of query vectors.
        space, ef_construction, M: forwarded to `build_hnswlib_index`.
        top_k: number of neighbours requested per query.
        ef_search: optional query-time `ef`; when given it replaces the value
            the index builder set. None keeps the builder's setting.

    Returns:
        (labels, build_seconds, search_seconds) - a 3-tuple. `labels` is the
        neighbour-label array returned by `knn_query`; callers that only need
        the labels must unpack the tuple.
    """
    # 1) Build (timed)
    t0 = time.perf_counter()
    index = build_hnswlib_index(ds_red, space=space, ef_construction=ef_construction, M=M)
    if ef_search is not None:
        index.set_ef(ef_search)
    dt_hnsw_build  = time.perf_counter() - t0

    # 2) Query all questions in one batch (timed); distances are discarded
    t0 = time.perf_counter()
    labels, _ = index.knn_query(qs_red, k=top_k)
    dt_hnsw_search  = time.perf_counter() - t0

    return labels, dt_hnsw_build, dt_hnsw_search


### Evaluation / Comparison Function

In [None]:
def evaluate_retrieval(indices, df_dataset, contexts):
    """
    Top-1 exact-match accuracy of a retrieval run.

    indices: array-like of shape (Q, k) from knn or hnsw; only column 0 (the
             best hit) is used. Values are row positions into df_dataset.
    df_dataset: the big dataset with a "text" column, rows addressed by position.
    contexts: ground truth - contexts[i] is the 'correct' text for question i.

    A question counts as correct when the retrieved text == contexts[i].
    Returns the fraction of correct questions (0.0 when contexts is empty,
    instead of dividing by zero).
    """
    Q = len(contexts)
    if Q == 0:
        return 0.0

    best = np.asarray(indices)[:, 0]
    # Pull the text column out once; a per-question df.iloc[...]["text"]
    # lookup materialises a whole row each time and is far slower.
    texts = df_dataset["text"].to_numpy()

    correct = 0
    for i in range(Q):
        # int() so unsigned label dtypes (e.g. from hnswlib) index cleanly
        if texts[int(best[i])] == contexts[i]:
            correct += 1
    return correct / Q

### End-to-End Experiment Functions

In [None]:
def run_complete_experiment(
    df_dataset,           # DataFrame with [text, embedding], shape (N,2)
    dataset_embs,
    question_embs,
    contexts,
    method_name,
    target_dim,
    do_knn=True,
    do_hnsw=True,
    umap_kwargs=None,
    pca_kwargs=None
):
    """Reduce the embeddings, run KNN and/or HNSW retrieval, and print the accuracies.

    Args:
        df_dataset: dataset frame used to look up the retrieved texts.
        dataset_embs, question_embs: full-width embedding matrices.
        contexts: ground-truth text per question.
        method_name, target_dim, umap_kwargs, pca_kwargs: forwarded to
            `reduce_embeddings`.
        do_knn, do_hnsw: which retrieval back-ends to run.

    Returns:
        The accuracy in percent (rounded to 4 decimals) of the LAST back-end
        that ran: HNSW when do_hnsw is true, otherwise KNN. None when both
        flags are false.
    """
    # Defined up front so the function can still return when both flags are off.
    result = None

    # 1) Dimension reduction
    ds_red, qs_red = reduce_embeddings(
        dataset_embs,
        question_embs,
        method_name=method_name,
        target_dim=target_dim,
        umap_kwargs=umap_kwargs,
        pca_kwargs=pca_kwargs
    )

    # 2) KNN retrieval
    if do_knn:
        idx_knn = knn_search(ds_red, qs_red)
        knn_acc = evaluate_retrieval(idx_knn, df_dataset, contexts)
        print(f"KNN Accuracy, dim={target_dim}, method={method_name}: {knn_acc*100:.4f}%")
        result = round(knn_acc*100, 4)

    # 3) HNSW retrieval
    if do_hnsw:
        # hnswlib_search returns (labels, build_seconds, search_seconds);
        # only the labels go to the evaluator. Passing the whole tuple on
        # fails when it is indexed as indices[i, 0].
        hnsw_out = hnswlib_search(ds_red, qs_red)
        idx_hnsw = hnsw_out[0] if isinstance(hnsw_out, tuple) else hnsw_out
        hnsw_acc = evaluate_retrieval(idx_hnsw, df_dataset, contexts)
        print(f"HNSW Accuracy, dim={target_dim}, method={method_name}: {hnsw_acc*100:.4f}%")
        result = round(hnsw_acc*100, 4)

    # Drop the reduced matrices before returning to release memory
    del ds_red, qs_red
    gc.collect()

    return result


In [None]:
def run_complete_experiment_with_time(
    df_dataset,           # DataFrame with [text, embedding], shape (N,2)
    dataset_embs,
    question_embs,
    contexts,
    method_name,
    target_dim,
    do_knn=False,
    do_hnsw=False,
    umap_kwargs=None,
    pca_kwargs=None
):
    """Timed variant of the experiment: reduce, optionally retrieve, report durations.

    Both retrieval back-ends are OFF by default; enable them with do_knn / do_hnsw.

    Returns a dict with keys "dt_reduce", "knn_accuracy", "dt_knn",
    "hnsw_accuracy", "dt_hnsw_build", "dt_hnsw_search". Times are seconds
    rounded to 2 decimals, accuracies are percentages rounded to 4 decimals,
    and entries belonging to a back-end that did not run are None.
    """
    print(f"\n=== {method_name.upper()}  |  dim={target_dim} ===")

    # --- dimension reduction, timed ---
    reduce_started = time.perf_counter()
    ds_red, qs_red = reduce_embeddings(
        dataset_embs,
        question_embs,
        method_name=method_name,
        target_dim=target_dim,
        umap_kwargs=umap_kwargs,
        pca_kwargs=pca_kwargs
    )
    dt_reduce_sec = time.perf_counter() - reduce_started
    print(f"Dimension reduction: {dt_reduce_sec:.2f} sec")

    # Start with every retrieval entry "not run"; fill in as back-ends execute.
    summary = {
        "dt_reduce": round(dt_reduce_sec, 2),
        "knn_accuracy": None,
        "dt_knn": None,
        "hnsw_accuracy": None,
        "dt_hnsw_build": None,
        "dt_hnsw_search": None,
    }

    # --- exact KNN: only the search itself is timed, not the scoring ---
    if do_knn:
        knn_started = time.perf_counter()
        idx_knn = knn_search(ds_red, qs_red, top_k=1)
        dt_knn_sec = time.perf_counter() - knn_started

        knn_acc = evaluate_retrieval(idx_knn, df_dataset, contexts)
        print(f"KNN   time: {dt_knn_sec:.2f} sec   |   acc: {knn_acc*100:.2f}%\n")
        summary["knn_accuracy"] = round(knn_acc*100, 4)
        summary["dt_knn"] = round(dt_knn_sec, 2)

    # --- HNSW: build and search times come back from hnswlib_search ---
    if do_hnsw:
        idx_hnsw, dt_hnsw_build_sec, dt_hnsw_search_sec = hnswlib_search(ds_red, qs_red)
        hnsw_acc = evaluate_retrieval(idx_hnsw, df_dataset, contexts)
        print(f"HNSW Build time: {dt_hnsw_build_sec} sec   |   HNSW Search time: {dt_hnsw_search_sec} sec   |   acc: {hnsw_acc*100:.2f}%\n")
        summary["hnsw_accuracy"] = round(hnsw_acc*100, 4)
        summary["dt_hnsw_build"] = round(dt_hnsw_build_sec, 2)
        summary["dt_hnsw_search"] = round(dt_hnsw_search_sec, 2)

    # Release the reduced matrices before returning
    del ds_red, qs_red
    gc.collect()

    return summary



In [None]:
# Colab only: mount Google Drive again so this RAG section can also be run on its own.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load every saved dataset-embedding chunk into one DataFrame.
# (The commented path below points at a different embedding set and is kept disabled.)
# dataset_folder = "/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Linq/4096/dataset"
ALIBABANLP_EMB_PATH  = '/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Alibaba_NLP_3584/dataset'
df_dataset = load_chunked_embeddings(ALIBABANLP_EMB_PATH)
print("Dataset loaded:", df_dataset.shape)

Dataset loaded: (1000000, 2)


In [None]:
# Load the pickled question embeddings written by the "Embedding Questions" section.
QUESTIONS_EMB_PATH = '/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/embeddings/Alibaba_NLP_3584/questions/gte_Qwen2_7B_instruct_questions.pkl'
df_questions = load_questions(QUESTIONS_EMB_PATH)
print("Questions loaded:", df_questions.shape)

Questions loaded: (2470, 2)


In [None]:
# Ground-truth contexts: evaluate_retrieval compares contexts[i] with the text retrieved for question i.
# NOTE(review): this relies on the CSV rows being in the same order as the question embeddings - confirm.
contexts_csv = "/content/drive/MyDrive/Colab Notebooks/CMPE 295 RAG System/MyProject/qnc/combined/context_final.csv"
contexts = load_contexts(contexts_csv)
print("Contexts loaded:", len(contexts))

Contexts loaded: 2470


In [None]:
# Stack the per-row embedding lists into dense float32 matrices (rows x embedding width).
# copy=False: when np.vstack already yields float32, astype returns that array
# instead of allocating a second full-size copy of the dataset matrix.
dataset_embs = np.vstack(df_dataset["embedding"].values).astype("float32", copy=False)
question_embs= np.vstack(df_questions["embedding"].values).astype("float32", copy=False)

In [None]:
import plotly.graph_objects as go

def plot_accuracy_dicts(acc_dicts: dict, title="Accuracy vs Dimension"):
    """Draw one accuracy-vs-dimension curve per entry of `acc_dicts` and show the figure.

    Args:
        acc_dicts: mapping of curve label -> {dimension: accuracy in percent}.
        title: figure title.

    The x axis lists the union of all dimensions from high to low; a curve
    that has no value for a dimension gets a "(missing)" hover label and the
    line is drawn across the gap.
    """
    # union of all dimensions, sorted high→low
    dim_union = set()
    for per_dim in acc_dicts.values():
        dim_union.update(per_dim)
    all_dims = sorted(dim_union, reverse=True)

    def _hover_label(label, d, acc):
        if acc is None:
            return f"Dim {d}<br>{label}<br>(missing)"
        return f"Dim {d}<br>{label}<br>{acc:.2f}%"

    symbols = ["circle", "square", "triangle-up", "diamond", "cross"]
    fig = go.Figure()
    for position, (label, per_dim) in enumerate(acc_dicts.items()):
        y_vals = [per_dim.get(d, None) for d in all_dims]
        hover_text = [_hover_label(label, d, acc) for d, acc in zip(all_dims, y_vals)]
        trace = go.Scatter(
            x=all_dims,
            y=y_vals,
            mode="markers+lines",
            marker=dict(symbol=symbols[position % len(symbols)], size=8),
            name=label,
            text=hover_text,
            hoverinfo="text",
            connectgaps=True
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=title,
        xaxis_title="Embedding Dimension",
        yaxis_title="Accuracy (%)",
        yaxis=dict(range=[0, 100]),
        template="plotly_white",
        hovermode="closest",
        legend_title="Method"
    )
    fig.show()


In [None]:
def plot_time_dicts(time_dicts, title="Runtime vs Dimension (seconds)"):
    """Draw one runtime-vs-dimension curve per entry of `time_dicts` and show the figure.

    Args:
        time_dicts: mapping of curve label -> {dimension: elapsed seconds}.
        title: figure title.

    Dimensions are plotted from high to low; a missing value gets a
    "(missing)" hover label and the line is joined across the gap.
    """
    seen_dims = set()
    for per_dim in time_dicts.values():
        seen_dims.update(per_dim)
    dims = sorted(seen_dims, reverse=True)

    symbols = ["circle", "square", "triangle-up", "diamond", "cross"]
    fig = go.Figure()

    for position, label in enumerate(time_dicts):
        per_dim = time_dicts[label]
        y_vals = [per_dim.get(d) for d in dims]

        hover_text = []
        for d, val in zip(dims, y_vals):
            if val is not None:
                hover_text.append(f"Dim {d}<br>{label}<br>{val:.2f} s")
            else:
                hover_text.append(f"Dim {d}<br>{label}<br>(missing)")

        trace = go.Scatter(
            x=dims,
            y=y_vals,
            mode="markers+lines",
            marker=dict(symbol=symbols[position % len(symbols)], size=8),
            name=label,
            text=hover_text,
            hoverinfo="text",
            connectgaps=True         # join across missing points
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=title,
        xaxis_title="Embedding Dimension",
        yaxis_title="Elapsed Time (seconds)",
        template="plotly_white",
        hovermode="closest",
        legend_title="Curve"
    )
    fig.show()

## Experiment 1

### 3584

In [None]:
# dataset_embs = np.vstack(df_dataset["embedding"].values).astype(np.float32)  # shape (N, 3584)
# question_embs= np.vstack(df_questions["embedding"].values).astype(np.float32)# shape (Q, 3584)

In [None]:
# Baseline: exact KNN on the full-width embeddings, with no dimension reduction.
print(f"\n--- KNN retrieval for dim=3584 ---")
idx_knn = knn_search(dataset_embs, question_embs, top_k=1)
acc_knn = evaluate_retrieval(idx_knn, df_dataset, contexts)
print(f"KNN Accuracy for dim=3584: {acc_knn*100:.2f}%")



--- KNN retrieval for dim=3584 ---
KNN Accuracy for dim=3584: 95.02%


In [None]:
# Baseline: approximate HNSW search on the full-width embeddings, with no dimension reduction.
print(f"\n--- HNSW retrieval for dim=3584 ---")
# hnswlib_search returns (labels, build_seconds, search_seconds); evaluate_retrieval
# needs only the labels array, so unpack instead of passing the whole tuple.
idx_hnsw, _dt_hnsw_build, _dt_hnsw_search = hnswlib_search(dataset_embs, question_embs)
acc_hnsw = evaluate_retrieval(idx_hnsw, df_dataset, contexts)
print(f"HNSW Accuracy for dim=3584: {acc_hnsw*100:.2f}%")



--- HNSW retrieval for dim=3584 ---
HNSW Accuracy for dim=3584: 93.32%


In [None]:
# Force a garbage-collection pass before the reduced-dimension experiments start.
gc.collect()

0

### 2048

In [None]:
# Experiment 1: PCA down to 2048 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_2048
)

KNN Accuracy, dim=2048, method=pca: 94.98%
HNSW Accuracy, dim=2048, method=pca: 93.40%


In [None]:
# Experiment 1: TRUNCATE_Method down to 2048 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_2048
)

KNN Accuracy, dim=2048, method=truncate: 94.82%
HNSW Accuracy, dim=2048, method=truncate: 93.28%


### 1024

In [None]:
# Experiment 1: PCA down to 1024 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_1024
)

KNN Accuracy, dim=1024, method=pca: 94.57%
HNSW Accuracy, dim=1024, method=pca: 92.96%


In [None]:
# Experiment 1: TRUNCATE_Method down to 1024 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_1024
)

KNN Accuracy, dim=1024, method=truncate: 94.49%
HNSW Accuracy, dim=1024, method=truncate: 92.63%


### 512

#### Run 1

In [None]:
# Experiment 1: PCA down to 512 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_512
)

KNN Accuracy, dim=512, method=pca: 93.52%
HNSW Accuracy, dim=512, method=pca: 91.54%


In [None]:
# Experiment 1: TRUNCATE_Method down to 512 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_512
)

KNN Accuracy, dim=512, method=truncate: 93.81%
HNSW Accuracy, dim=512, method=truncate: 91.66%


### 256

#### Run 1

In [None]:
# Experiment 1: PCA down to 256 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_256
)

KNN Accuracy, dim=256, method=pca: 92.15%
HNSW Accuracy, dim=256, method=pca: 89.47%


In [None]:
# Experiment 1: TRUNCATE_Method down to 256 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_256
)

KNN Accuracy, dim=256, method=truncate: 92.79%
HNSW Accuracy, dim=256, method=truncate: 89.23%


### 128

In [None]:
# Experiment 1: PCA down to 128 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_128
)

KNN Accuracy, dim=128, method=pca: 88.42%
HNSW Accuracy, dim=128, method=pca: 86.03%


In [None]:
# Experiment 1: TRUNCATE_Method down to 128 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_128
)

KNN Accuracy, dim=128, method=truncate: 88.79%
HNSW Accuracy, dim=128, method=truncate: 80.12%


### 64

In [None]:
# Experiment 1: PCA down to 64 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_64
)

KNN Accuracy, dim=64, method=pca: 78.14%
HNSW Accuracy, dim=64, method=pca: 75.51%


In [None]:
# Experiment 1: TRUNCATE_Method down to 64 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_64
)

KNN Accuracy, dim=64, method=truncate: 78.42%
HNSW Accuracy, dim=64, method=truncate: 62.19%


### 32

In [None]:
# Experiment 1: PCA down to 32 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_32
)

KNN Accuracy, dim=32, method=pca: 50.40%
HNSW Accuracy, dim=32, method=pca: 49.68%


In [None]:
# Experiment 1: TRUNCATE_Method down to 32 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_32
)

KNN Accuracy, dim=32, method=truncate: 39.55%
HNSW Accuracy, dim=32, method=truncate: 34.70%


### 16

In [None]:
# Experiment 1: PCA down to 16 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=PCA_Method,
    target_dim=DIM_16
)

KNN Accuracy, dim=16, method=pca: 14.53%
HNSW Accuracy, dim=16, method=pca: 14.45%


In [None]:
# Experiment 1: TRUNCATE_Method down to 16 dims; KNN and HNSW both run (the function's defaults).
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name=TRUNCATE_Method,
    target_dim=DIM_16
)

KNN Accuracy, dim=16, method=truncate: 6.56%
HNSW Accuracy, dim=16, method=truncate: 6.52%


### Plots and Trends

In [None]:
# Accuracy tables for Experiment 1, keyed by embedding dimension (values in percent).
# The numbers are typed in by hand from the printed outputs of the cells above, so they
# must be updated manually if an experiment is re-run.
# The 3584 entries are the unreduced baselines (KNN 95.02, HNSW 93.32), shared by both methods.

# PCA + exact KNN
pca_knn = {
    16: 14.53,
    32: 50.4,
    64: 78.14,
    128: 88.42,
    256: 92.15,
    512: 93.52,
    1024: 94.57,
    2048: 94.98,
    3584: 95.02,
}

# PCA + HNSW
pca_hnsw = {
    16: 14.45,
    32: 49.68,
    64: 75.51,
    128: 86.03,
    256: 89.47,
    512: 91.54,
    1024: 92.96,
    2048: 93.4,
    3584: 93.32,
}

# Truncation + exact KNN
trunc_knn = {
    16: 6.56,
    32: 39.55,
    64: 78.42,
    128: 88.79,
    256: 92.79,
    512: 93.81,
    1024: 94.49,
    2048: 94.82,
    3584: 95.02,
}

# Truncation + HNSW
trunc_hnsw = {
    16: 6.52,
    32: 34.7,
    64: 62.19,
    128: 80.12,
    256: 89.23,
    512: 91.66,
    1024: 92.63,
    2048: 93.28,
    3584: 93.32,
}


In [None]:
import plotly.graph_objects as go

In [None]:
# Union of the dimension keys of all four accuracy tables, ordered for the x axis.
all_dims = sorted(
    set().union(pca_knn, pca_hnsw, trunc_knn, trunc_hnsw),
    reverse=True            # high‑to‑low on X axis
)

In [None]:
def make_y(dict_obj, dim_list):
    """Return the accuracies stored in ``dict_obj`` aligned to ``dim_list``.

    Args:
        dict_obj: Mapping of embedding dimension -> accuracy.
        dim_list: Dimensions, in the order the values should be returned.

    Returns:
        A list with one entry per element of ``dim_list``. Dimensions that are
        missing from ``dict_obj`` yield ``None`` (not NaN), which is what the
        hover-text code checks for with ``acc is not None``.
    """
    return [dict_obj.get(d) for d in dim_list]

In [None]:
traces_cfg = [
    ("PCA KNN",        pca_knn,    "square"),
    ("PCA HNSW",       pca_hnsw,   "circle"),
    ("Trunc KNN",      trunc_knn,  "square"),
    ("Trunc HNSW",     trunc_hnsw, "circle"),
]

In [None]:
fig = go.Figure()

In [None]:
# Add one markers+lines trace per (legend label, accuracy dict, marker symbol) entry.
for label, dct, marker in traces_cfg:
    y_vals = make_y(dct, all_dims)

    # Hover text per point; dimensions without a measurement are labelled as missing.
    hover_text = [
        f"Dim&nbsp;{dim}<br>{label}<br>Acc: {acc:.2f}%"
        if acc is not None else f"Dim&nbsp;{dim}<br>{label}<br>(missing)"
        for dim, acc in zip(all_dims, y_vals)
    ]

    scatter = go.Scatter(
        x=all_dims,
        y=y_vals,
        mode="markers+lines",
        marker={"symbol": marker, "size": 8},
        name=label,
        text=hover_text,
        hoverinfo="text",
        connectgaps=True,  # keep the line continuous across missing dimensions
    )
    fig.add_trace(scatter)

In [None]:
fig.update_layout(
    title="Retrieval Accuracy vs Embedding Dimension",
    xaxis_title="Embedding Dimension",
    yaxis_title="Accuracy (%)",
    yaxis=dict(range=[0, 100]),
    template="plotly_white",
    hovermode="closest",
    legend_title="Method",
)

## Experiment 2

In [None]:
# Dimensions to sweep: steps of 8 from 32 up to 248, then steps of 32 from 256 to 512.
dimArr = list(range(32, 256, 8))
print(dimArr)
dimArr.extend(range(256, 513, 32))
print(dimArr)

[32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]
[32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 288, 320, 352, 384, 416, 448, 480, 512]


### Truncation - Keep the first portion

#### KNN - Pre-Testing

In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=DIM_64,
    do_hnsw=False
)

KNN Accuracy, dim=64, method=truncate_first: 78.42%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=96, # 96 = 64 + 32
    do_hnsw=False
)

KNN Accuracy, dim=96, method=truncate_first: 84.98%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=128, # 96 + 32 = 128
    do_hnsw=False
)

KNN Accuracy, dim=128, method=truncate_first: 88.79%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=160, # 96 + 32 -> 128 + 32 -> 160
    do_hnsw=False
)

KNN Accuracy, dim=160, method=truncate_first: 90.61%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=192, # 96 + 32 -> 128 + 32 -> 160 + 32 -> 192
    do_hnsw=False
)

KNN Accuracy, dim=192, method=truncate_first: 91.58%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=224,
    do_hnsw=False
)

KNN Accuracy, dim=224, method=truncate_first: 92.23%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=256,
    do_hnsw=False
)

KNN Accuracy, dim=256, method=truncate_first: 92.79%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=288,
    do_hnsw=False
)

KNN Accuracy, dim=288, method=truncate_first: 93.08%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=320,
    do_hnsw=False
)

KNN Accuracy, dim=320, method=truncate_first: 92.71%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=352,
    do_hnsw=False
)

KNN Accuracy, dim=352, method=truncate_first: 93.16%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=384,
    do_hnsw=False
)

KNN Accuracy, dim=384, method=truncate_first: 93.40%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=416,
    do_hnsw=False
)

KNN Accuracy, dim=416, method=truncate_first: 93.60%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=448,
    do_hnsw=False
)

KNN Accuracy, dim=448, method=truncate_first: 93.68%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=480,
    do_hnsw=False
)

KNN Accuracy, dim=480, method=truncate_first: 93.72%


In [None]:
run_complete_experiment(
    df_dataset,
    dataset_embs,
    question_embs,
    contexts,
    method_name="truncate_first",
    target_dim=512,
    do_hnsw=False
)

KNN Accuracy, dim=512, method=truncate_first: 93.81%


#### KNN

In [None]:
# KNN-only sweep (HNSW disabled) of the "truncate_first" method over every
# dimension in dimArr; maps target dimension -> value returned by the experiment.
trunc_first_knn = {}
for dim in dimArr:
  acc = run_complete_experiment(
    df_dataset, dataset_embs, question_embs, contexts,
    method_name="truncate_first", target_dim=dim, do_hnsw=False,
  )
  trunc_first_knn[dim] = acc


KNN Accuracy, dim=32, method=truncate_first: 39.5547%
KNN Accuracy, dim=40, method=truncate_first: 50.6478%
KNN Accuracy, dim=48, method=truncate_first: 62.7126%
KNN Accuracy, dim=56, method=truncate_first: 71.7814%
KNN Accuracy, dim=64, method=truncate_first: 78.4211%
KNN Accuracy, dim=72, method=truncate_first: 76.8826%
KNN Accuracy, dim=80, method=truncate_first: 80.5263%
KNN Accuracy, dim=88, method=truncate_first: 83.1579%
KNN Accuracy, dim=96, method=truncate_first: 84.9798%
KNN Accuracy, dim=104, method=truncate_first: 86.5587%
KNN Accuracy, dim=112, method=truncate_first: 87.8543%
KNN Accuracy, dim=120, method=truncate_first: 88.3806%
KNN Accuracy, dim=128, method=truncate_first: 88.7854%
KNN Accuracy, dim=136, method=truncate_first: 89.5547%
KNN Accuracy, dim=144, method=truncate_first: 89.6356%
KNN Accuracy, dim=152, method=truncate_first: 90.0810%
KNN Accuracy, dim=160, method=truncate_first: 90.6073%
KNN Accuracy, dim=168, method=truncate_first: 90.8502%
KNN Accuracy, dim=1

In [None]:
print(trunc_first_knn)

{32: 39.5547, 40: 50.6478, 48: 62.7126, 56: 71.7814, 64: 78.4211, 72: 76.8826, 80: 80.5263, 88: 83.1579, 96: 84.9798, 104: 86.5587, 112: 87.8543, 120: 88.3806, 128: 88.7854, 136: 89.5547, 144: 89.6356, 152: 90.081, 160: 90.6073, 168: 90.8502, 176: 91.0526, 184: 91.3765, 192: 91.5789, 200: 91.7409, 208: 91.8623, 216: 91.9838, 224: 92.2267, 232: 92.4291, 240: 92.5911, 248: 92.8745, 256: 92.7935, 288: 93.0769, 320: 92.7126, 352: 93.1579, 384: 93.4008, 416: 93.6032, 448: 93.6842, 480: 93.7247, 512: 93.8057}


In [None]:
for k, v in trunc_first_knn.items():
  print(f"{k}: {v},")

32: 39.5547,
40: 50.6478,
48: 62.7126,
56: 71.7814,
64: 78.4211,
72: 76.8826,
80: 80.5263,
88: 83.1579,
96: 84.9798,
104: 86.5587,
112: 87.8543,
120: 88.3806,
128: 88.7854,
136: 89.5547,
144: 89.6356,
152: 90.081,
160: 90.6073,
168: 90.8502,
176: 91.0526,
184: 91.3765,
192: 91.5789,
200: 91.7409,
208: 91.8623,
216: 91.9838,
224: 92.2267,
232: 92.4291,
240: 92.5911,
248: 92.8745,
256: 92.7935,
288: 93.0769,
320: 92.7126,
352: 93.1579,
384: 93.4008,
416: 93.6032,
448: 93.6842,
480: 93.7247,
512: 93.8057,


#### HNSW

### Truncation - Keep the last portion

#### KNN

In [None]:
# KNN-only sweep (HNSW disabled) of the "truncate_last" method over every
# dimension in dimArr; maps target dimension -> value returned by the experiment.
trunc_last_knn = {}
for dim in dimArr:
  acc = run_complete_experiment(
    df_dataset, dataset_embs, question_embs, contexts,
    method_name="truncate_last", target_dim=dim, do_hnsw=False,
  )
  trunc_last_knn[dim] = acc


KNN Accuracy, dim=32, method=truncate_last: 43.0364%
KNN Accuracy, dim=40, method=truncate_last: 55.5870%
KNN Accuracy, dim=48, method=truncate_last: 66.4777%
KNN Accuracy, dim=56, method=truncate_last: 73.6842%
KNN Accuracy, dim=64, method=truncate_last: 80.0810%
KNN Accuracy, dim=72, method=truncate_last: 82.3482%
KNN Accuracy, dim=80, method=truncate_last: 84.5344%
KNN Accuracy, dim=88, method=truncate_last: 86.1943%
KNN Accuracy, dim=96, method=truncate_last: 88.0162%
KNN Accuracy, dim=104, method=truncate_last: 88.5425%
KNN Accuracy, dim=112, method=truncate_last: 89.5951%
KNN Accuracy, dim=120, method=truncate_last: 90.2834%
KNN Accuracy, dim=128, method=truncate_last: 90.9717%
KNN Accuracy, dim=136, method=truncate_last: 91.0526%
KNN Accuracy, dim=144, method=truncate_last: 91.6599%
KNN Accuracy, dim=152, method=truncate_last: 92.0243%
KNN Accuracy, dim=160, method=truncate_last: 91.9433%
KNN Accuracy, dim=168, method=truncate_last: 92.3482%
KNN Accuracy, dim=176, method=truncat

In [None]:
print(trunc_last_knn)

{32: 43.0364, 40: 55.587, 48: 66.4777, 56: 73.6842, 64: 80.081, 72: 82.3482, 80: 84.5344, 88: 86.1943, 96: 88.0162, 104: 88.5425, 112: 89.5951, 120: 90.2834, 128: 90.9717, 136: 91.0526, 144: 91.6599, 152: 92.0243, 160: 91.9433, 168: 92.3482, 176: 92.5101, 184: 92.5911, 192: 92.6721, 200: 93.0364, 208: 93.2389, 216: 93.1984, 224: 93.3198, 232: 93.1579, 240: 93.1579, 248: 93.2794, 256: 93.7247, 288: 93.7247, 320: 93.6437, 352: 93.8866, 384: 93.8462, 416: 94.0486, 448: 94.0081, 480: 94.17, 512: 94.2105}


In [None]:
for k, v in trunc_last_knn.items():
  print(f"{k}: {v},")

32: 43.0364,
40: 55.587,
48: 66.4777,
56: 73.6842,
64: 80.081,
72: 82.3482,
80: 84.5344,
88: 86.1943,
96: 88.0162,
104: 88.5425,
112: 89.5951,
120: 90.2834,
128: 90.9717,
136: 91.0526,
144: 91.6599,
152: 92.0243,
160: 91.9433,
168: 92.3482,
176: 92.5101,
184: 92.5911,
192: 92.6721,
200: 93.0364,
208: 93.2389,
216: 93.1984,
224: 93.3198,
232: 93.1579,
240: 93.1579,
248: 93.2794,
256: 93.7247,
288: 93.7247,
320: 93.6437,
352: 93.8866,
384: 93.8462,
416: 94.0486,
448: 94.0081,
480: 94.17,
512: 94.2105,


#### HNSW

### Truncation - Four different slices

#### KNN

In [None]:
# KNN-only sweep (HNSW disabled) of the "truncate_four" method over every
# dimension in dimArr; maps target dimension -> value returned by the experiment.
trunc_four_knn = {}
for dim in dimArr:
  acc = run_complete_experiment(
    df_dataset, dataset_embs, question_embs, contexts,
    method_name="truncate_four", target_dim=dim, do_hnsw=False,
  )
  trunc_four_knn[dim] = acc


KNN Accuracy, dim=32, method=truncate_four: 37.3684%
KNN Accuracy, dim=40, method=truncate_four: 51.2551%
KNN Accuracy, dim=48, method=truncate_four: 60.6883%
KNN Accuracy, dim=56, method=truncate_four: 68.4615%
KNN Accuracy, dim=64, method=truncate_four: 74.5344%
KNN Accuracy, dim=72, method=truncate_four: 79.1093%
KNN Accuracy, dim=80, method=truncate_four: 82.9960%
KNN Accuracy, dim=88, method=truncate_four: 84.8178%
KNN Accuracy, dim=96, method=truncate_four: 86.1134%
KNN Accuracy, dim=104, method=truncate_four: 87.8138%
KNN Accuracy, dim=112, method=truncate_four: 88.3401%
KNN Accuracy, dim=120, method=truncate_four: 88.9069%
KNN Accuracy, dim=128, method=truncate_four: 90.0000%
KNN Accuracy, dim=136, method=truncate_four: 90.4453%
KNN Accuracy, dim=144, method=truncate_four: 90.9312%
KNN Accuracy, dim=152, method=truncate_four: 91.2146%
KNN Accuracy, dim=160, method=truncate_four: 91.3765%
KNN Accuracy, dim=168, method=truncate_four: 90.8097%
KNN Accuracy, dim=176, method=truncat

In [None]:
print(trunc_four_knn)

{32: 37.3684, 40: 51.2551, 48: 60.6883, 56: 68.4615, 64: 74.5344, 72: 79.1093, 80: 82.996, 88: 84.8178, 96: 86.1134, 104: 87.8138, 112: 88.3401, 120: 88.9069, 128: 90.0, 136: 90.4453, 144: 90.9312, 152: 91.2146, 160: 91.3765, 168: 90.8097, 176: 90.9717, 184: 91.1741, 192: 91.417, 200: 91.7409, 208: 92.0648, 216: 92.1053, 224: 92.2672, 232: 92.6316, 240: 92.9555, 248: 92.915, 256: 93.0364, 288: 92.8745, 320: 93.4413, 352: 93.5628, 384: 93.7247, 416: 93.8866, 448: 94.1296, 480: 94.0891, 512: 94.0891}


In [None]:
for k, v in trunc_four_knn.items():
  print(f"{k}: {v},")

32: 37.3684,
40: 51.2551,
48: 60.6883,
56: 68.4615,
64: 74.5344,
72: 79.1093,
80: 82.996,
88: 84.8178,
96: 86.1134,
104: 87.8138,
112: 88.3401,
120: 88.9069,
128: 90.0,
136: 90.4453,
144: 90.9312,
152: 91.2146,
160: 91.3765,
168: 90.8097,
176: 90.9717,
184: 91.1741,
192: 91.417,
200: 91.7409,
208: 92.0648,
216: 92.1053,
224: 92.2672,
232: 92.6316,
240: 92.9555,
248: 92.915,
256: 93.0364,
288: 92.8745,
320: 93.4413,
352: 93.5628,
384: 93.7247,
416: 93.8866,
448: 94.1296,
480: 94.0891,
512: 94.0891,


#### HNSW

### PCA

### Graphs

#### Truncation - keeping first vs last vs four

In [None]:
trunc_first_knn = {
    32: 39.5547,
    40: 50.6478,
    48: 62.7126,
    56: 71.7814,
    64: 78.4211,
    72: 76.8826,
    80: 80.5263,
    88: 83.1579,
    96: 84.9798,
    104: 86.5587,
    112: 87.8543,
    120: 88.3806,
    128: 88.7854,
    136: 89.5547,
    144: 89.6356,
    152: 90.081,
    160: 90.6073,
    168: 90.8502,
    176: 91.0526,
    184: 91.3765,
    192: 91.5789,
    200: 91.7409,
    208: 91.8623,
    216: 91.9838,
    224: 92.2267,
    232: 92.4291,
    240: 92.5911,
    248: 92.8745,
    256: 92.7935,
    288: 93.0769,
    320: 92.7126,
    352: 93.1579,
    384: 93.4008,
    416: 93.6032,
    448: 93.6842,
    480: 93.7247,
    512: 93.8057
}

trunc_last_knn = {
    32: 43.0364,
    40: 55.587,
    48: 66.4777,
    56: 73.6842,
    64: 80.081,
    72: 82.3482,
    80: 84.5344,
    88: 86.1943,
    96: 88.0162,
    104: 88.5425,
    112: 89.5951,
    120: 90.2834,
    128: 90.9717,
    136: 91.0526,
    144: 91.6599,
    152: 92.0243,
    160: 91.9433,
    168: 92.3482,
    176: 92.5101,
    184: 92.5911,
    192: 92.6721,
    200: 93.0364,
    208: 93.2389,
    216: 93.1984,
    224: 93.3198,
    232: 93.1579,
    240: 93.1579,
    248: 93.2794,
    256: 93.7247,
    288: 93.7247,
    320: 93.6437,
    352: 93.8866,
    384: 93.8462,
    416: 94.0486,
    448: 94.0081,
    480: 94.17,
    512: 94.2105
}

trunc_four_knn = {
    32: 37.3684,
    40: 51.2551,
    48: 60.6883,
    56: 68.4615,
    64: 74.5344,
    72: 79.1093,
    80: 82.996,
    88: 84.8178,
    96: 86.1134,
    104: 87.8138,
    112: 88.3401,
    120: 88.9069,
    128: 90.0,
    136: 90.4453,
    144: 90.9312,
    152: 91.2146,
    160: 91.3765,
    168: 90.8097,
    176: 90.9717,
    184: 91.1741,
    192: 91.417,
    200: 91.7409,
    208: 92.0648,
    216: 92.1053,
    224: 92.2672,
    232: 92.6316,
    240: 92.9555,
    248: 92.915,
    256: 93.0364,
    288: 92.8745,
    320: 93.4413,
    352: 93.5628,
    384: 93.7247,
    416: 93.8866,
    448: 94.1296,
    480: 94.0891,
    512: 94.0891
}

In [None]:
acc_dicts = {
    "Truncate - Keep First": trunc_first_knn,
    "Truncate - Keep Last": trunc_last_knn,
    "Truncate - Four": trunc_four_knn
}

In [None]:
plot_accuracy_dicts(acc_dicts, "Truncation - Keeping head; tail; four different area 0, 25, 50, 75")

#### OpenAI vs Alibaba-NLP

In [None]:
openai_truncate_dims=[512, 504, 496, 488, 480, 472, 464, 456, 448, 440, 432, 424, 416, 408, 400, 392, 384, 376, 368, 360, 352, 344, 336, 328, 320, 312, 304, 296, 288, 280, 272, 264, 256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32]
openai_back_accuracys_rounded=[93.48178, 93.60324, 93.60324, 93.48178, 93.23887, 93.23887, 93.23887, 93.23887, 93.27935, 93.36032, 93.19838, 93.31984, 93.11741, 92.95547, 92.95547, 92.87449, 92.79352, 92.59109, 92.63158, 92.63158, 92.79352, 92.79352, 92.79352, 92.91498, 92.91498, 92.83401, 92.59109, 92.46964, 92.38866, 92.38866, 92.14575, 92.46964, 92.38866, 92.38866, 92.14575, 92.02429, 92.06478, 91.74089, 91.74089, 91.49798, 91.17409, 90.64777, 90.5668, 90.5668, 90.0, 89.4332, 89.06883, 88.78543, 88.13765, 87.48988, 86.39676, 84.77733, 82.91498, 80.64777, 77.00405, 71.82186, 65.87045, 62.22672, 52.55061, 39.4332, 24.93927]
openai_front_accuracys_rounded=[93.40081, 93.4413, 93.36032, 93.52227, 93.56275, 93.23887, 93.23887, 93.27935, 93.23887, 93.36032, 93.15789, 93.31984, 93.03644, 93.27935, 93.40081, 93.23887, 93.23887, 93.23887, 93.15789, 92.95547, 92.99595, 93.11741, 92.83401, 92.95547, 92.95547, 92.79352, 92.75304, 92.63158, 92.67206, 92.51012, 92.34818, 92.14575, 92.02429, 91.86235, 91.90283, 91.98381, 91.7004, 91.21457, 91.01215, 90.8502, 90.40486, 90.32389, 89.75709, 90.0, 89.55466, 89.59514, 89.39271, 88.54251, 88.17814, 87.04453, 85.91093, 84.8583, 83.64372, 81.49798, 78.82591, 75.34413, 70.72874, 66.80162, 57.85425, 45.4251, 29.35223]

In [None]:
# Pair each OpenAI truncation dimension with its accuracy (dimension -> accuracy),
# once for the "front" series and once for the "back" series.
openai_truncate_front = dict(zip(openai_truncate_dims, openai_front_accuracys_rounded))

openai_truncate_back = dict(zip(openai_truncate_dims, openai_back_accuracys_rounded))

In [None]:
acc_dicts_openai_alibaba_front = {
    "OpenAI Embedding":  openai_truncate_front,
    "Alibaba-NLP": trunc_first_knn
}

In [None]:
acc_dicts_openai_alibaba_back = {
    "OpenAI Embedding":  openai_truncate_back,
    "Alibaba-NLP": trunc_last_knn
}

In [None]:
plot_accuracy_dicts(acc_dicts_openai_alibaba_front, "Truncation - Keeping head/front: OpenAI vs Alibaba-NLP")

In [None]:
plot_accuracy_dicts(acc_dicts_openai_alibaba_back, "Truncation - Keeping Back: OpenAI vs Alibaba-NLP")

## Experiment 3

In [None]:
# Dimensions to sweep: steps of 16 below 256, steps of 32 below 512,
# then steps of 128 from 512 up to and including 1024.
dimArr = list(range(32, 256, 16))
print(dimArr)
dimArr.extend(range(256, 512, 32))
print(dimArr)
dimArr.extend(range(512, 1025, 128))
print(dimArr)


[32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]
[32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480]
[32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 768, 896, 1024]


In [None]:
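# Reference only (not executed): the result keys read from
# run_complete_experiment_with_time in the cells below — presumably a copy of
# that function's return statement; verify against its definition.
# return {
#     "dt_reduce": round(dt_reduce_min, 2),
#     "knn_accuracy": knn_accuracy if do_knn else None,
#     "dt_knn": round(dt_knn_min, 2) if do_knn else None,
#     "hnsw_accuracy": hnsw_accuracy if do_hnsw else None,
#     "dt_hnsw_build": round(dt_hnsw_build_min, 2) if do_hnsw else None,
#     "dt_hnsw_search": round(dt_hnsw_search_min, 2) if do_hnsw else None
# }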

### KNN Time Complexity

In [None]:
trunc_first_knn_acc = {}
trunc_time = {}
knn_time = {}
trunc_and_knn_time = {}

In [None]:
# Sweep "truncate_first" over dimArr, recording KNN accuracy plus the reduction
# and KNN timings reported for each dimension.
# NOTE(review): the timing unit is unclear from here — the run log labels values
# "sec" while the commented result spec names them `*_min`; confirm the unit
# before labelling the time plots.
for dim in dimArr:
  result = run_complete_experiment_with_time(
    df_dataset, dataset_embs, question_embs, contexts,
    method_name="truncate_first",
    target_dim=dim,
    do_knn=True,
  )
  trunc_first_knn_acc[dim] = result['knn_accuracy']
  trunc_time[dim] = reduce_dt = result['dt_reduce']
  knn_time[dim] = knn_dt = result['dt_knn']
  trunc_and_knn_time[dim] = reduce_dt + knn_dt


=== TRUNCATE_FIRST  |  dim=32 ===
Dimension reduction: 0.00 sec
KNN   time: 3.19 sec   |   acc: 39.55%


=== TRUNCATE_FIRST  |  dim=48 ===
Dimension reduction: 0.00 sec
KNN   time: 3.41 sec   |   acc: 62.71%


=== TRUNCATE_FIRST  |  dim=64 ===
Dimension reduction: 0.00 sec
KNN   time: 4.04 sec   |   acc: 78.42%


=== TRUNCATE_FIRST  |  dim=80 ===
Dimension reduction: 0.00 sec
KNN   time: 4.19 sec   |   acc: 80.53%


=== TRUNCATE_FIRST  |  dim=96 ===
Dimension reduction: 0.00 sec
KNN   time: 4.70 sec   |   acc: 84.98%


=== TRUNCATE_FIRST  |  dim=112 ===
Dimension reduction: 0.00 sec
KNN   time: 5.39 sec   |   acc: 87.85%


=== TRUNCATE_FIRST  |  dim=128 ===
Dimension reduction: 0.00 sec
KNN   time: 5.57 sec   |   acc: 88.79%


=== TRUNCATE_FIRST  |  dim=144 ===
Dimension reduction: 0.00 sec
KNN   time: 5.94 sec   |   acc: 89.64%


=== TRUNCATE_FIRST  |  dim=160 ===
Dimension reduction: 0.00 sec
KNN   time: 6.72 sec   |   acc: 90.61%


=== TRUNCATE_FIRST  |  dim=176 ===
Dimension reduc

In [None]:
for k, v in trunc_first_knn_acc.items():
  print(f"{k}: {v},")

32: 39.5547,
48: 62.7126,
64: 78.4211,
80: 80.5263,
96: 84.9798,
112: 87.8543,
128: 88.7854,
144: 89.6356,
160: 90.6073,
176: 91.0526,
192: 91.5789,
208: 91.8623,
224: 92.2267,
240: 92.5911,
256: 92.7935,
288: 93.0769,
320: 92.7126,
352: 93.1579,
384: 93.4008,
416: 93.6032,
448: 93.6842,
480: 93.7247,
512: 93.8057,
640: 94.251,
768: 94.2105,
896: 94.4939,
1024: 94.4939,


In [None]:
for k, v in trunc_time.items():
  print(f"{k}: {v},")

32: 0.0,
48: 0.0,
64: 0.0,
80: 0.0,
96: 0.0,
112: 0.0,
128: 0.0,
144: 0.0,
160: 0.0,
176: 0.0,
192: 0.0,
208: 0.0,
224: 0.0,
240: 0.0,
256: 0.0,
288: 0.0,
320: 0.0,
352: 0.0,
384: 0.0,
416: 0.0,
448: 0.0,
480: 0.0,
512: 0.0,
640: 0.0,
768: 0.0,
896: 0.0,
1024: 0.0,


In [None]:
for k, v in knn_time.items():
  print(f"{k}: {v},")

32: 0.07,
48: 0.06,
64: 0.07,
80: 0.07,
96: 0.08,
112: 0.09,
128: 0.1,
144: 0.1,
160: 0.11,
176: 0.12,
192: 0.13,
208: 0.13,
224: 0.14,
240: 0.15,
256: 0.16,
288: 0.17,
320: 0.18,
352: 0.2,
384: 0.22,
416: 0.23,
448: 0.24,
480: 0.25,
512: 0.28,
640: 0.32,
768: 0.39,
896: 0.44,
1024: 0.51,


In [None]:
for k, v in trunc_and_knn_time.items():
  print(f"{k}: {v},")

32: 3.19,
48: 3.41,
64: 4.04,
80: 4.19,
96: 4.7,
112: 5.39,
128: 5.57,
144: 5.94,
160: 6.72,
176: 7.19,
192: 7.59,
208: 7.88,
224: 8.22,
240: 8.48,
256: 9.05,
288: 9.83,
320: 10.84,
352: 11.82,
384: 12.69,
416: 13.38,
448: 14.64,
480: 15.2,
512: 16.65,
640: 19.67,
768: 23.52,
896: 27.04,
1024: 30.98,


In [None]:
trunc_first_knn_acc = {
    32: 39.5547,
    48: 62.7126,
    64: 78.4211,
    80: 80.5263,
    96: 84.9798,
    112: 87.8543,
    128: 88.7854,
    144: 89.6356,
    160: 90.6073,
    176: 91.0526,
    192: 91.5789,
    208: 91.8623,
    224: 92.2267,
    240: 92.5911,
    256: 92.7935,
    288: 93.0769,
    320: 92.7126,
    352: 93.1579,
    384: 93.4008,
    416: 93.6032,
    448: 93.6842,
    480: 93.7247,
    512: 93.8057,
    640: 94.251,
    768: 94.2105,
    896: 94.4939,
    1024: 94.4939
}

rag_acc = {
    "Truncation and KNN Accuracy": trunc_first_knn_acc
}

In [None]:
trunc_and_knn_time = {
    32: 3.19,
    48: 3.41,
    64: 4.04,
    80: 4.19,
    96: 4.7,
    112: 5.39,
    128: 5.57,
    144: 5.94,
    160: 6.72,
    176: 7.19,
    192: 7.59,
    208: 7.88,
    224: 8.22,
    240: 8.48,
    256: 9.05,
    288: 9.83,
    320: 10.84,
    352: 11.82,
    384: 12.69,
    416: 13.38,
    448: 14.64,
    480: 15.2,
    512: 16.65,
    640: 19.67,
    768: 23.52,
    896: 27.04,
    1024: 30.98
}

rag_time = {
    "Truncation and KNN Time": trunc_and_knn_time
}

In [None]:
plot_time_dicts(rag_time, "Truncation Time - Keeping head/front")

In [None]:
plot_accuracy_dicts(rag_acc, "Truncation Accuracy - Keeping head/front")

## New Method

### Run

In [None]:
# Dimensions to sweep: steps of 8 below 256, steps of 16 below 512, steps of 128
# from 512 to 1024, plus the two large sizes 2048 and 3584.
dimArr = list(range(32, 256, 8))
print(dimArr)
dimArr.extend(range(256, 512, 16))
print(dimArr)
dimArr.extend(range(512, 1025, 128))
dimArr.extend([2048, 3584])
print(dimArr)

[32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]
[32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]
[32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 640, 768, 896, 1024, 2048, 3584]


In [None]:
def evaluate_retrieval(indices, df_dataset, contexts):
    """Fraction of questions whose top retrieved row matches the expected context.

    Args:
        indices: Retrieved row positions, either shape ``(Q,)`` or ``(Q, k)``
            (array or nested list). For 2-D input only column 0 is scored.
        df_dataset: DataFrame with a ``"text"`` column, indexed positionally.
        contexts: Expected context text for each question, aligned with the
            rows of ``indices``.

    Returns:
        ``correct / len(contexts)`` as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``contexts`` is empty.
    """
    indices = np.asarray(indices)
    if indices.ndim == 2:
        # Keep one row id per question. Flattening a (Q, k>1) matrix would yield
        # Q*k ids and misalign them with `contexts`; for (Q, 1) this equals ravel().
        # Assumes column 0 holds the best match (nearest neighbour first) — TODO confirm.
        indices = indices[:, 0]          # (Q,)

    # Read the column once instead of materialising a full row per lookup.
    text_col = df_dataset["text"]
    correct = 0
    for i, row_id in enumerate(indices):
        if text_col.iloc[row_id] == contexts[i]:
            correct += 1
    return correct / len(contexts)


In [None]:
from typing import List, Dict

In [None]:
# def evaluate_retrieval_2(row_ids: np.ndarray, df_dataset, contexts) -> float:
#     hits = sum(df_dataset.iloc[rid]["text"] == ctx
#                for rid, ctx in zip(row_ids, contexts))
#     return hits / len(contexts)

In [None]:
def evaluate_exact(indices, df_dataset, contexts):
    """Share of expected contexts whose retrieved row text matches exactly.

    ``indices`` and ``contexts`` are walked in lockstep; each index is looked up
    positionally in ``df_dataset`` and its ``"text"`` value is compared with the
    expected context. The match count is divided by ``len(contexts)``.
    """
    n_matches = 0
    for row_pos, expected_text in zip(indices, contexts):
        retrieved_text = df_dataset.iloc[row_pos]["text"]
        n_matches += retrieved_text == expected_text
    return n_matches / len(contexts)

In [None]:
def progressive_knn_sklearn(
        ds_embs: np.ndarray,
        qs_embs: np.ndarray,
        df_dataset, contexts,
        algorithm = "brute",
        start_dim: int      = 64,
        start_k: int        = 4,
        max_dim: int        = 512,
        step_factor: int    = 2,      # dim *= step_factor each loop
        step_k: int         = 2,      # how much k we keep, e.g. 2 -> k = k // step_k
        step_add: int       = None,   # add to dim
        verbose: bool       = True):
    """Coarse-to-fine nearest-neighbour retrieval on growing embedding prefixes.

    Stage 1 searches the whole dataset using only the first ``start_dim``
    dimensions and keeps ``start_k`` neighbours per question. Every following
    round widens the prefix (``dim * step_factor`` or ``dim + step_add``),
    shrinks ``k`` to ``max(1, k // step_k)`` and searches only the surviving
    rows. The widening loop stops *before* the prefix reaches ``max_dim``; a
    final 1-NN search on the first ``max_dim`` dimensions picks the answer.

    The candidate pool is shared: after each round the rows kept for all
    questions are merged into one set, and every question is searched against
    that merged set in the next round.

    Args:
        ds_embs: Dataset embeddings, one row per dataset entry.
        qs_embs: Question embeddings, one row per question.
        df_dataset: Passed to ``evaluate_exact`` together with the final row ids.
        contexts: Expected context per question, passed to ``evaluate_exact``.
        algorithm: Forwarded to ``NearestNeighbors(algorithm=...)``
            (presumably ``sklearn.neighbors.NearestNeighbors`` — imported elsewhere).
        start_dim: Prefix width of the initial global search (>= 1).
        start_k: Neighbours kept per question in the initial search (>= 1).
        max_dim: Prefix width of the final 1-NN search.
        step_factor: Multiplicative prefix growth (> 1). Pass ``None`` when
            using ``step_add``.
        step_k: Divisor applied to ``k`` each round (>= 1).
        step_add: Additive prefix growth (> 0). Exactly one of ``step_factor``
            and ``step_add`` must be set.
        verbose: Print the candidate-pool size after each round.

    Returns:
        dict with ``accuracy_pct`` (percentage, 4 decimals), ``final_pool``
        (rows left for the final search) and ``t_total_s`` (seconds spent in
        the search stages, 2 decimals).

    Raises:
        ValueError: on an invalid step configuration or non-positive
            ``start_dim`` / ``start_k`` / ``step_k``.
    """
    if (step_add is None) == (step_factor is None):
        raise ValueError("Choose either step_add or step factor")
    if step_add is None and step_factor <= 1:
        raise ValueError("step_factor must be > 1 when step_add is None")
    # Without the guards below the widening loop never terminates (the prefix
    # stops growing) or fails late with an unrelated error.
    if step_add is not None and step_add <= 0:
        raise ValueError("step_add must be > 0 when step_factor is None")
    if step_k < 1:
        raise ValueError("step_k must be >= 1")
    if start_dim < 1:
        raise ValueError("start_dim must be >= 1")
    if start_k < 1:
        raise ValueError("start_k must be >= 1")

    n_rows = ds_embs.shape[0]

    def build_index(mat):
        # Euclidean index; the search strategy is chosen by `algorithm`.
        return NearestNeighbors(n_neighbors=1, algorithm=algorithm, metric="euclidean").fit(mat)

    # --------------------------------------------------------
    # 1) initial global search on first start_dim dimensions
    t0 = time.perf_counter()
    index0 = build_index(ds_embs[:, :start_dim])
    # Never request more neighbours than there are rows in the index.
    _, I = index0.kneighbors(qs_embs[:, :start_dim], n_neighbors=min(start_k, n_rows))
    cand_set = np.unique(I.ravel())
    t_init = time.perf_counter() - t0
    if verbose:
        print(f"[init 0:{start_dim}] unique rows = {cand_set.size:,}")

    dim = start_dim
    k = start_k
    t_slices = 0.0

    # ----------------------------------------
    # 2) widen the prefix / shrink k until the next step would reach max_dim
    while dim < max_dim:
        if step_add is None:
            new_dim = min(dim * step_factor, max_dim)
        else:
            new_dim = min(dim + step_add, max_dim)

        # The max_dim prefix itself is handled by the final 1-NN stage.
        if new_dim >= max_dim:
            break

        dim = new_dim
        k = max(1, k // step_k)          # shrink pool size
        # The pool may already be smaller than k; clamp so kneighbors does not fail.
        k_query = min(k, cand_set.size)

        if verbose:
            print(f"[slice 0:{dim}] k={k_query}")

        t0 = time.perf_counter()
        # Index only the current candidate rows; a single indexing step avoids
        # copying their full-width embeddings first.
        idx_local = build_index(ds_embs[cand_set, :dim])

        # query all questions at once
        _, Iq = idx_local.kneighbors(qs_embs[:, :dim], n_neighbors=k_query)

        # map local → global row IDs, then merge across questions for next round
        cand_set = np.unique(cand_set[Iq].ravel())
        t_slices += time.perf_counter() - t0

        if verbose:
            print(f"           candidates → {cand_set.size:,}")

    # ------------------------------------------------
    # 3) final 1-NN on remaining rows (0:max_dim)
    t0 = time.perf_counter()
    final_idx = build_index(ds_embs[cand_set, :max_dim])
    _, I_final = final_idx.kneighbors(qs_embs[:, :max_dim], n_neighbors=1)
    t_final = time.perf_counter() - t0
    row_ids = cand_set[I_final.ravel()]

    # 4) evaluate
    acc_pct = round(100 * evaluate_exact(row_ids, df_dataset, contexts), 4)
    return {
        "accuracy_pct": acc_pct,
        "final_pool"  : int(cand_set.size),
        "t_total_s": round(t_init+t_slices+t_final, 2)
    }

---
### start_dim 64 (deprecated)

#### to 512

In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 1000,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 617,960
[slice 0:128] k=500
           candidates → 408,147
[slice 0:256] k=250
           candidates → 280,666
{'accuracy_pct': 93.5223, 'final_pool': 280666, 't_total_s': 28.76}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 500,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 460,413
[slice 0:128] k=250
           candidates → 264,009
[slice 0:256] k=125
           candidates → 167,144
{'accuracy_pct': 93.3198, 'final_pool': 167144, 't_total_s': 19.6}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 100,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 170,125
[slice 0:128] k=50
           candidates → 76,047
[slice 0:256] k=25
           candidates → 42,987
{'accuracy_pct': 91.8219, 'final_pool': 42987, 't_total_s': 8.83}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 50,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 98,719
[slice 0:128] k=25
           candidates → 41,913
[slice 0:256] k=12
           candidates → 22,649
{'accuracy_pct': 90.6478, 'final_pool': 22649, 't_total_s': 6.62}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 10,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 23,439
[slice 0:128] k=5
           candidates → 10,094
[slice 0:256] k=2
           candidates → 4,635
{'accuracy_pct': 87.166, 'final_pool': 4635, 't_total_s': 4.93}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 5,
          max_dim     = 512,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 12,043
[slice 0:128] k=2
           candidates → 4,577
[slice 0:256] k=1
           candidates → 2,429
{'accuracy_pct': 85.6275, 'final_pool': 2429, 't_total_s': 4.53}


#### to 256

In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 1000,
          max_dim     = 256,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 617,960
[slice 0:128] k=500
           candidates → 408,147
{'accuracy_pct': 92.6316, 'final_pool': 408147, 't_total_s': 21.96}


In [None]:
res = progressive_knn_sklearn(
          ds_embs     = dataset_embs,
          qs_embs     = question_embs,
          df_dataset  = df_dataset,
          contexts    = contexts,
          start_dim   = 64,
          start_k     = 500,
          max_dim     = 256,
          step_factor = 2,
          verbose     = True
)
print(res)

[init 0:64] unique rows = 460,413
[slice 0:128] k=250
           candidates → 264,009
{'accuracy_pct': 92.4696, 'final_pool': 264009, 't_total_s': 16.22}


In [None]:
# progressive_knn_sklearn run: start_dim=64, start_k=100, max_dim=256, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=64, start_k=100, max_dim=256, step_factor=2,
    verbose=True,
)
print(res)

[init 0:64] unique rows = 170,125
[slice 0:128] k=50
           candidates → 76,047
{'accuracy_pct': 91.2146, 'final_pool': 76047, 't_total_s': 7.59}


In [None]:
# progressive_knn_sklearn run: start_dim=64, start_k=50, max_dim=256, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=64, start_k=50, max_dim=256, step_factor=2,
    verbose=True,
)
print(res)

[init 0:64] unique rows = 98,719
[slice 0:128] k=25
           candidates → 41,913
{'accuracy_pct': 90.1619, 'final_pool': 41913, 't_total_s': 6.1}


In [None]:
# progressive_knn_sklearn run: start_dim=64, start_k=25, max_dim=256, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=64, start_k=25, max_dim=256, step_factor=2,
    verbose=True,
)
print(res)

[init 0:64] unique rows = 54,374
[slice 0:128] k=12
           candidates → 22,140
{'accuracy_pct': 89.5142, 'final_pool': 22140, 't_total_s': 5.34}


In [None]:
# progressive_knn_sklearn run: start_dim=64, start_k=10, max_dim=256, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=64, start_k=10, max_dim=256, step_factor=2,
    verbose=True,
)
print(res)

[init 0:64] unique rows = 23,439
[slice 0:128] k=5
           candidates → 10,094
{'accuracy_pct': 86.9231, 'final_pool': 10094, 't_total_s': 4.65}


---
### From 128 (deprecated)

#### to 512

In [None]:
# progressive_knn_sklearn run: start_dim=128, start_k=500, max_dim=512, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=128, start_k=500, max_dim=512, step_factor=2,
    verbose=True,
)
print(res)

[init 0:128] unique rows = 490,675
[slice 0:256] k=250
           candidates → 313,840
{'accuracy_pct': 93.7652, 'final_pool': 313840, 't_total_s': 24.78}


In [None]:
# progressive_knn_sklearn run: start_dim=128, start_k=100, max_dim=512, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=128, start_k=100, max_dim=512, step_factor=2,
    verbose=True,
)
print(res)

[init 0:128] unique rows = 178,776
[slice 0:256] k=50
           candidates → 91,088
{'accuracy_pct': 93.6032, 'final_pool': 91088, 't_total_s': 11.13}


In [None]:
# progressive_knn_sklearn run: start_dim=128, start_k=50, max_dim=512, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=128, start_k=50, max_dim=512, step_factor=2,
    verbose=True,
)
print(res)

[init 0:128] unique rows = 102,393
[slice 0:256] k=25
           candidates → 50,103
{'accuracy_pct': 93.4413, 'final_pool': 50103, 't_total_s': 8.77}


In [None]:
# progressive_knn_sklearn run: start_dim=128, start_k=25, max_dim=512, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=128, start_k=25, max_dim=512, step_factor=2,
    verbose=True,
)
print(res)

[init 0:128] unique rows = 55,793
[slice 0:256] k=12
           candidates → 26,072
{'accuracy_pct': 93.3198, 'final_pool': 26072, 't_total_s': 7.49}


In [None]:
# progressive_knn_sklearn run: start_dim=128, start_k=10, max_dim=512, step_factor=2.
res = progressive_knn_sklearn(
    ds_embs=dataset_embs, qs_embs=question_embs,
    df_dataset=df_dataset, contexts=contexts,
    start_dim=128, start_k=10, max_dim=512, step_factor=2,
    verbose=True,
)
print(res)

[init 0:128] unique rows = 23,700
[slice 0:256] k=5
           candidates → 11,519
{'accuracy_pct': 92.7935, 'final_pool': 11519, 't_total_s': 6.72}


### Time Measurement

#### KNN

##### Prep (Don't Run)

In [None]:
# Per-dimension result stores for the truncate-first + KNN timing run
# (four independent, initially empty dicts keyed by target dimension).
trunc_first_knn_acc, trunc_time, knn_time, trunc_and_knn_time = {}, {}, {}, {}

In [None]:
# Run the "truncate_first" reduction followed by KNN for every dimension in dimArr,
# recording accuracy, the two individual timings, and their sum per dimension.
for dim in dimArr:
    result = run_complete_experiment_with_time(
        df_dataset, dataset_embs, question_embs, contexts,
        method_name="truncate_first", target_dim=dim, do_knn=True,
    )
    trunc_first_knn_acc[dim] = result['knn_accuracy']
    trunc_time[dim], knn_time[dim] = result['dt_reduce'], result['dt_knn']
    trunc_and_knn_time[dim] = result['dt_reduce'] + result['dt_knn']


=== TRUNCATE_FIRST  |  dim=32 ===
Dimension reduction: 0.00 sec
KNN   time: 3.19 sec   |   acc: 39.55%


=== TRUNCATE_FIRST  |  dim=48 ===
Dimension reduction: 0.00 sec
KNN   time: 3.52 sec   |   acc: 62.71%


=== TRUNCATE_FIRST  |  dim=64 ===
Dimension reduction: 0.00 sec
KNN   time: 3.72 sec   |   acc: 78.42%


=== TRUNCATE_FIRST  |  dim=80 ===
Dimension reduction: 0.00 sec
KNN   time: 4.21 sec   |   acc: 80.53%


=== TRUNCATE_FIRST  |  dim=96 ===
Dimension reduction: 0.00 sec
KNN   time: 4.99 sec   |   acc: 84.98%


=== TRUNCATE_FIRST  |  dim=112 ===
Dimension reduction: 0.00 sec
KNN   time: 5.13 sec   |   acc: 87.85%


=== TRUNCATE_FIRST  |  dim=128 ===
Dimension reduction: 0.00 sec
KNN   time: 5.58 sec   |   acc: 88.79%


=== TRUNCATE_FIRST  |  dim=144 ===
Dimension reduction: 0.00 sec
KNN   time: 7.23 sec   |   acc: 89.64%


=== TRUNCATE_FIRST  |  dim=160 ===
Dimension reduction: 0.00 sec
KNN   time: 6.80 sec   |   acc: 90.61%


=== TRUNCATE_FIRST  |  dim=176 ===
Dimension reduc

In [None]:
# Dump accuracy per dimension as "dim: value," lines (ready to paste into a dict literal).
for dim_key, acc_val in trunc_first_knn_acc.items():
    print(f"{dim_key}: {acc_val},")

32: 39.5547,
48: 62.7126,
64: 78.4211,
80: 80.5263,
96: 84.9798,
112: 87.8543,
128: 88.7854,
144: 89.6356,
160: 90.6073,
176: 91.0526,
192: 91.5789,
208: 91.8623,
224: 92.2267,
240: 92.5911,
256: 92.7935,
288: 93.0769,
320: 92.7126,
352: 93.1579,
384: 93.4008,
416: 93.6032,
448: 93.6842,
480: 93.7247,
512: 93.8057,
640: 94.251,
768: 94.2105,
896: 94.4939,
1024: 94.4939,
2048: 94.8178,
3584: 95.0202,


In [None]:
# Dump total (reduction + KNN) time per dimension as "dim: value," lines.
for dim_key, secs in trunc_and_knn_time.items():
    print(f"{dim_key}: {secs},")

32: 3.19,
48: 3.52,
64: 3.72,
80: 4.21,
96: 4.99,
112: 5.13,
128: 5.58,
144: 7.23,
160: 6.8,
176: 7.39,
192: 7.96,
208: 7.85,
224: 8.58,
240: 8.97,
256: 9.32,
288: 11.21,
320: 10.84,
352: 12.74,
384: 12.72,
416: 14.56,
448: 15.49,
480: 15.28,
512: 17.29,
640: 19.43,
768: 23.11,
896: 26.95,
1024: 30.94,
2048: 61.4,
3584: 102.95,


##### Plots

In [None]:
# Hard-coded copy of the truncate-first KNN accuracies (percent) printed by the
# "Prep (Don't Run)" cells, keyed by embedding dimension, so the plots can be
# drawn without re-running the experiment.
trunc_first_knn_acc = dict([
    (32, 39.5547), (48, 62.7126), (64, 78.4211), (80, 80.5263),
    (96, 84.9798), (112, 87.8543), (128, 88.7854), (144, 89.6356),
    (160, 90.6073), (176, 91.0526), (192, 91.5789), (208, 91.8623),
    (224, 92.2267), (240, 92.5911), (256, 92.7935), (288, 93.0769),
    (320, 92.7126), (352, 93.1579), (384, 93.4008), (416, 93.6032),
    (448, 93.6842), (480, 93.7247), (512, 93.8057), (640, 94.251),
    (768, 94.2105), (896, 94.4939), (1024, 94.4939), (2048, 94.8178),
    (3584, 95.0202),
])

# Series label -> data mapping handed to the accuracy plot.
trunc_knn_acc_plot = {"Truncation and KNN Accuracy": trunc_first_knn_acc}

In [None]:
# Hard-coded copy of the total truncation + KNN times (seconds) printed by the
# "Prep (Don't Run)" cells, keyed by embedding dimension.
trunc_and_knn_time = dict([
    (32, 3.19), (48, 3.52), (64, 3.72), (80, 4.21),
    (96, 4.99), (112, 5.13), (128, 5.58), (144, 7.23),
    (160, 6.8), (176, 7.39), (192, 7.96), (208, 7.85),
    (224, 8.58), (240, 8.97), (256, 9.32), (288, 11.21),
    (320, 10.84), (352, 12.74), (384, 12.72), (416, 14.56),
    (448, 15.49), (480, 15.28), (512, 17.29), (640, 19.43),
    (768, 23.11), (896, 26.95), (1024, 30.94), (2048, 61.4),
    (3584, 102.95),
])

# Series label -> data mapping handed to the timing plot.
trunc_knn_time_plot = {"Truncation and KNN Time": trunc_and_knn_time}

In [None]:
# Side-by-side table of KNN accuracy and total (truncation + KNN) time per dimension.
# The time is looked up by dimension key instead of zipping the two dicts
# positionally, so a missing or re-ordered entry can never be printed against the
# wrong dimension (a missing time shows as "n/a" rather than silently ending the table).
print("Dim \tAccuracy \tTime")
for dim, acc in trunc_first_knn_acc.items():
    print(f"{dim}:\t{acc}  \t{trunc_and_knn_time.get(dim, 'n/a')}")

Dim 	Accuracy 	Time
32:	39.5547  	3.19
48:	62.7126  	3.52
64:	78.4211  	3.72
80:	80.5263  	4.21
96:	84.9798  	4.99
112:	87.8543  	5.13
128:	88.7854  	5.58
144:	89.6356  	7.23
160:	90.6073  	6.8
176:	91.0526  	7.39
192:	91.5789  	7.96
208:	91.8623  	7.85
224:	92.2267  	8.58
240:	92.5911  	8.97
256:	92.7935  	9.32
288:	93.0769  	11.21
320:	92.7126  	10.84
352:	93.1579  	12.74
384:	93.4008  	12.72
416:	93.6032  	14.56
448:	93.6842  	15.49
480:	93.7247  	15.28
512:	93.8057  	17.29
640:	94.251  	19.43
768:	94.2105  	23.11
896:	94.4939  	26.95
1024:	94.4939  	30.94
2048:	94.8178  	61.4
3584:	95.0202  	102.95


In [None]:
# Draw the "Truncation and KNN Time" series held in trunc_knn_time_plot under the given title.
# NOTE(review): plot_time_dicts is defined outside this section; its exact rendering is not visible here.
plot_time_dicts(trunc_knn_time_plot, "Truncation Time - Keeping head/front")

In [None]:
# Draw the "Truncation and KNN Accuracy" series held in trunc_knn_acc_plot under the given title.
# NOTE(review): plot_accuracy_dicts is defined outside this section; its exact rendering is not visible here.
plot_accuracy_dicts(trunc_knn_acc_plot, "Truncation Accuracy - Keeping head/front")

#### HNSW

##### Prep (Don't Run)

In [None]:
    # Reference only (nothing here executes): the result keys read by the loops in
    # this section. Presumably copied from the return statement of
    # run_complete_experiment_with_time — confirm against its definition.
    # return {
    #     "dt_reduce": round(dt_reduce_sec, 2),
    #     "knn_accuracy": knn_accuracy if do_knn else None,
    #     "dt_knn": round(dt_knn_sec, 2) if do_knn else None,
    #     "hnsw_accuracy": hnsw_accuracy if do_hnsw else None,
    #     "dt_hnsw_build": round(dt_hnsw_build_sec, 2) if do_hnsw else None,
    #     "dt_hnsw_search": round(dt_hnsw_search_sec, 2) if do_hnsw else None
    # }

In [None]:
# Per-dimension result stores for the truncate-first + HNSW timing run
# (five independent, initially empty dicts keyed by target dimension).
(
    trunc_first_hnsw_acc,
    trunc_time_hnsw,
    hnsw_build_time,
    hnsw_search_time,
    trunc_and_hnsw_total_time,
) = {}, {}, {}, {}, {}

In [None]:
# Run the "truncate_first" reduction followed by HNSW for every dimension in dimArr,
# recording accuracy, the reduction/build/search timings, and their total per dimension.
for dim in dimArr:
    result2 = run_complete_experiment_with_time(
        df_dataset, dataset_embs, question_embs, contexts,
        method_name="truncate_first", target_dim=dim, do_hnsw=True,
    )
    trunc_first_hnsw_acc[dim] = result2['hnsw_accuracy']
    trunc_time_hnsw[dim] = result2['dt_reduce']
    hnsw_build_time[dim], hnsw_search_time[dim] = (
        result2['dt_hnsw_build'],
        result2['dt_hnsw_search'],
    )
    # Same left-to-right addition order as reduce + build + search.
    trunc_and_hnsw_total_time[dim] = (
        result2['dt_reduce']
        + result2['dt_hnsw_build']
        + result2['dt_hnsw_search']
    )


=== TRUNCATE_FIRST  |  dim=32 ===
Dimension reduction: 0.00 sec
HNSW Build time: 268.2661211789996 sec   |   HNSW Search time: 0.28763810400050716 sec   |   acc: 35.02%


=== TRUNCATE_FIRST  |  dim=48 ===
Dimension reduction: 0.00 sec
HNSW Build time: 305.5120680299997 sec   |   HNSW Search time: 0.20620476899966889 sec   |   acc: 50.20%


=== TRUNCATE_FIRST  |  dim=64 ===
Dimension reduction: 0.00 sec
HNSW Build time: 343.6591589849995 sec   |   HNSW Search time: 0.20794910400036315 sec   |   acc: 62.51%


=== TRUNCATE_FIRST  |  dim=80 ===
Dimension reduction: 0.00 sec
HNSW Build time: 408.9508079970001 sec   |   HNSW Search time: 0.38849807399947167 sec   |   acc: 66.68%


=== TRUNCATE_FIRST  |  dim=96 ===
Dimension reduction: 0.00 sec
HNSW Build time: 417.24540662399977 sec   |   HNSW Search time: 0.3875908290001462 sec   |   acc: 73.40%


=== TRUNCATE_FIRST  |  dim=112 ===
Dimension reduction: 0.00 sec
HNSW Build time: 448.58684843299943 sec   |   HNSW Search time: 0.3991125400007

In [None]:
# Dump HNSW accuracy per dimension as "dim: value," lines (ready to paste into a dict literal).
for dim_key, acc_val in trunc_first_hnsw_acc.items():
    print(f"{dim_key}: {acc_val},")

In [None]:
# Dump the dimension-reduction time per dimension as "dim: value," lines.
for dim_key, secs in trunc_time_hnsw.items():
    print(f"{dim_key}: {secs},")

In [None]:
# Dump the HNSW build time per dimension as "dim: value," lines.
for dim_key, secs in hnsw_build_time.items():
    print(f"{dim_key}: {secs},")

In [None]:
# Dump the HNSW search time per dimension as "dim: value," lines.
for dim_key, secs in hnsw_search_time.items():
    print(f"{dim_key}: {secs},")

In [None]:
# Dump the total (reduction + build + search) time per dimension as "dim: value," lines.
for dim_key, secs in trunc_and_hnsw_total_time.items():
    print(f"{dim_key}: {secs},")

##### Plots

In [None]:
# HNSW accuracy (percent) per truncated dimension.
# TODO: still an empty placeholder — paste the values printed by the
# "Prep (Don't Run)" cells of this HNSW section before plotting.
trunc_first_hnsw_acc = {

}

# Series label -> data mapping (same shape as trunc_knn_acc_plot in the KNN section).
# Labelled HNSW, not KNN, because the series holds the HNSW accuracies.
rag_acc = {
    "Truncation and HNSW Accuracy": trunc_first_hnsw_acc
}

In [None]:
# HNSW build time per dimension. TODO: empty placeholder — no recorded values pasted in yet.
hnsw_build_time = dict()

In [None]:
# HNSW search time per dimension. TODO: empty placeholder — no recorded values pasted in yet.
hnsw_search_time = dict()

In [None]:
# Total truncation + HNSW time per dimension. TODO: empty placeholder — no recorded values pasted in yet.
trunc_and_hnsw_total_time = dict()

In [None]:
# Series label -> data mapping for the HNSW timing plot: build, search and total
# time, each keyed by dimension. ("Build" was misspelled "Buil" in the label.)
hnsw_time = {
    "HNSW Build Time": hnsw_build_time,
    "HNSW Search Time": hnsw_search_time,
    "HNSW Total Time": trunc_and_hnsw_total_time
}

In [None]:
# Draw the three HNSW timing series in hnsw_time under the title "HNSW Time".
# NOTE(review): the dicts feeding hnsw_time are assigned empty literals in the cells
# just above, so this figure has no data until the recorded HNSW results are pasted in.
plot_time_dicts(hnsw_time, "HNSW Time")

### New Method Testing

In [None]:
# {'accuracy_pct': 93.5223, 'final_pool': 280666, 't_total_s': 28.76}

#### Run 1

In [None]:
# Results of the start_k sweeps below: schedule name (e.g. "64_to_256") -> {start_k: result}.
res_step_factor = dict()

##### 64 to 256

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=64, max_dim=256,
# storing each run's result under res_step_factor["64_to_256"][start_k].
res_step_factor["64_to_256"] = {}
k = 4
while k <= 1024:
    res_step_factor["64_to_256"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=64, start_k=k, max_dim=256, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:64] unique rows = 9,692
[slice 0:128] k=2
           candidates → 4,476
[init 0:64] unique rows = 18,930
[slice 0:128] k=4
           candidates → 8,250
[init 0:64] unique rows = 36,311
[slice 0:128] k=8
           candidates → 15,326
[init 0:64] unique rows = 67,642
[slice 0:128] k=16
           candidates → 28,312
[init 0:64] unique rows = 120,604
[slice 0:128] k=32
           candidates → 51,922
[init 0:64] unique rows = 203,382
[slice 0:128] k=64
           candidates → 93,279
[init 0:64] unique rows = 320,055
[slice 0:128] k=128
           candidates → 162,082
[init 0:64] unique rows = 465,741
[slice 0:128] k=256
           candidates → 268,422
[init 0:64] unique rows = 623,356
[slice 0:128] k=512
           candidates → 413,792


##### 64 to 512

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=64, max_dim=512,
# storing each run's result under res_step_factor["64_to_512"][start_k].
res_step_factor["64_to_512"] = {}
k = 4
while k <= 1024:
    res_step_factor["64_to_512"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=64, start_k=k, max_dim=512, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:64] unique rows = 9,692
[slice 0:128] k=2
           candidates → 4,476
[slice 0:256] k=1
           candidates → 2,418
[init 0:64] unique rows = 18,930
[slice 0:128] k=4
           candidates → 8,250
[slice 0:256] k=2
           candidates → 4,535
[init 0:64] unique rows = 36,311
[slice 0:128] k=8
           candidates → 15,326
[slice 0:256] k=4
           candidates → 8,456
[init 0:64] unique rows = 67,642
[slice 0:128] k=16
           candidates → 28,312
[slice 0:256] k=8
           candidates → 15,614
[init 0:64] unique rows = 120,604
[slice 0:128] k=32
           candidates → 51,922
[slice 0:256] k=16
           candidates → 29,028
[init 0:64] unique rows = 203,382
[slice 0:128] k=64
           candidates → 93,279
[slice 0:256] k=32
           candidates → 53,344
[init 0:64] unique rows = 320,055
[slice 0:128] k=128
           candidates → 162,082
[slice 0:256] k=64
           candidates → 96,664
[init 0:64] unique rows = 465,741
[slice 0:128] k=256
           candidates →

##### 64 to 1024

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=64, max_dim=1024,
# storing each run's result under res_step_factor["64_to_1024"][start_k].
res_step_factor["64_to_1024"] = {}
k = 4
while k <= 1024:
    res_step_factor["64_to_1024"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=64, start_k=k, max_dim=1024, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:64] unique rows = 9,692
[slice 0:128] k=2
           candidates → 4,476
[slice 0:256] k=1
           candidates → 2,418
[slice 0:512] k=1
           candidates → 2,367
[init 0:64] unique rows = 18,930
[slice 0:128] k=4
           candidates → 8,250
[slice 0:256] k=2
           candidates → 4,535
[slice 0:512] k=1
           candidates → 2,432
[init 0:64] unique rows = 36,311
[slice 0:128] k=8
           candidates → 15,326
[slice 0:256] k=4
           candidates → 8,456
[slice 0:512] k=2
           candidates → 4,664
[init 0:64] unique rows = 67,642
[slice 0:128] k=16
           candidates → 28,312
[slice 0:256] k=8
           candidates → 15,614
[slice 0:512] k=4
           candidates → 8,874
[init 0:64] unique rows = 120,604
[slice 0:128] k=32
           candidates → 51,922
[slice 0:256] k=16
           candidates → 29,028
[slice 0:512] k=8
           candidates → 16,839
[init 0:64] unique rows = 203,382
[slice 0:128] k=64
           candidates → 93,279
[slice 0:256] k=32
   

##### 128 to 512

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=128, max_dim=512,
# storing each run's result under res_step_factor["128_to_512"][start_k].
res_step_factor["128_to_512"] = {}
k = 4
while k <= 1024:
    res_step_factor["128_to_512"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=128, start_k=k, max_dim=512, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:128] unique rows = 9,720
[slice 0:256] k=2
           candidates → 4,827
[init 0:128] unique rows = 19,102
[slice 0:256] k=4
           candidates → 9,330
[init 0:128] unique rows = 37,025
[slice 0:256] k=8
           candidates → 17,861
[init 0:128] unique rows = 69,610
[slice 0:256] k=16
           candidates → 33,649
[init 0:128] unique rows = 125,710
[slice 0:256] k=32
           candidates → 62,148
[init 0:128] unique rows = 214,488
[slice 0:256] k=64
           candidates → 111,979
[init 0:128] unique rows = 340,626
[slice 0:256] k=128
           candidates → 194,603
[init 0:128] unique rows = 496,394
[slice 0:256] k=256
           candidates → 318,803
[init 0:128] unique rows = 660,039
[slice 0:256] k=512
           candidates → 483,875


##### 128 to 1024

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=128, max_dim=1024,
# storing each run's result under res_step_factor["128_to_1024"][start_k].
res_step_factor["128_to_1024"] = {}
k = 4
while k <= 1024:
    res_step_factor["128_to_1024"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=128, start_k=k, max_dim=1024, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:128] unique rows = 9,720
[slice 0:256] k=2
           candidates → 4,827
[slice 0:512] k=1
           candidates → 2,468
[init 0:128] unique rows = 19,102
[slice 0:256] k=4
           candidates → 9,330
[slice 0:512] k=2
           candidates → 4,854
[init 0:128] unique rows = 37,025
[slice 0:256] k=8
           candidates → 17,861
[slice 0:512] k=4
           candidates → 9,461
[init 0:128] unique rows = 69,610
[slice 0:256] k=16
           candidates → 33,649
[slice 0:512] k=8
           candidates → 18,205
[init 0:128] unique rows = 125,710
[slice 0:256] k=32
           candidates → 62,148
[slice 0:512] k=16
           candidates → 34,673
[init 0:128] unique rows = 214,488
[slice 0:256] k=64
           candidates → 111,979
[slice 0:512] k=32
           candidates → 64,709
[init 0:128] unique rows = 340,626
[slice 0:256] k=128
           candidates → 194,603
[slice 0:512] k=64
           candidates → 118,135
[init 0:128] unique rows = 496,394
[slice 0:256] k=256
           ca

##### 256 to 512

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=256, max_dim=512,
# storing each run's result under res_step_factor["256_to_512"][start_k].
res_step_factor["256_to_512"] = {}
k = 4
while k <= 1024:
    res_step_factor["256_to_512"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=256, start_k=k, max_dim=512, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:256] unique rows = 9,789
[init 0:256] unique rows = 19,332
[init 0:256] unique rows = 37,779
[init 0:256] unique rows = 72,422
[init 0:256] unique rows = 133,960
[init 0:256] unique rows = 234,587
[init 0:256] unique rows = 380,079
[init 0:256] unique rows = 557,914
[init 0:256] unique rows = 733,024


##### 256 to 1024

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=256, max_dim=1024,
# storing each run's result under res_step_factor["256_to_1024"][start_k].
res_step_factor["256_to_1024"] = {}
k = 4
while k <= 1024:
    res_step_factor["256_to_1024"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=256, start_k=k, max_dim=1024, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:256] unique rows = 9,789
[slice 0:512] k=2
           candidates → 4,911
[init 0:256] unique rows = 19,332
[slice 0:512] k=4
           candidates → 9,717
[init 0:256] unique rows = 37,779
[slice 0:512] k=8
           candidates → 19,032
[init 0:256] unique rows = 72,422
[slice 0:512] k=16
           candidates → 36,843
[init 0:256] unique rows = 133,960
[slice 0:512] k=32
           candidates → 69,693
[init 0:256] unique rows = 234,587
[slice 0:512] k=64
           candidates → 128,404
[init 0:256] unique rows = 380,079
[slice 0:512] k=128
           candidates → 226,653
[init 0:256] unique rows = 557,914
[slice 0:512] k=256
           candidates → 373,977
[init 0:256] unique rows = 733,024
[slice 0:512] k=512
           candidates → 562,613


##### 512 to 1024

In [None]:
# Sweep start_k over the powers of two 4, 8, ..., 1024 with start_dim=512, max_dim=1024,
# storing each run's result under res_step_factor["512_to_1024"][start_k].
res_step_factor["512_to_1024"] = {}
k = 4
while k <= 1024:
    res_step_factor["512_to_1024"][k] = res = progressive_knn_sklearn(
        ds_embs=dataset_embs, qs_embs=question_embs,
        df_dataset=df_dataset, contexts=contexts,
        start_dim=512, start_k=k, max_dim=1024, step_factor=2,
        verbose=True,
    )
    k *= 2

[init 0:512] unique rows = 9,788
[init 0:512] unique rows = 19,406
[init 0:512] unique rows = 38,154
[init 0:512] unique rows = 73,712
[init 0:512] unique rows = 138,527
[init 0:512] unique rows = 247,929
[init 0:512] unique rows = 409,354
[init 0:512] unique rows = 606,799
[init 0:512] unique rows = 792,053


##### Check Results

In [None]:
# Baseline reference table: plain KNN accuracy and total time per dimension.
# The time is looked up by dimension key instead of zipping the two dicts
# positionally, so a missing or re-ordered entry can never be printed against the
# wrong dimension (a missing time shows as "n/a" rather than silently ending the table).
print("Dim \tAccuracy \tTime")
for dim, acc in trunc_first_knn_acc.items():
    print(f"{dim}:\t{acc}  \t{trunc_and_knn_time.get(dim, 'n/a')}")

Dim 	Accuracy 	Time
32:	39.5547  	3.19
48:	62.7126  	3.52
64:	78.4211  	3.72
80:	80.5263  	4.21
96:	84.9798  	4.99
112:	87.8543  	5.13
128:	88.7854  	5.58
144:	89.6356  	7.23
160:	90.6073  	6.8
176:	91.0526  	7.39
192:	91.5789  	7.96
208:	91.8623  	7.85
224:	92.2267  	8.58
240:	92.5911  	8.97
256:	92.7935  	9.32
288:	93.0769  	11.21
320:	92.7126  	10.84
352:	93.1579  	12.74
384:	93.4008  	12.72
416:	93.6032  	14.56
448:	93.6842  	15.49
480:	93.7247  	15.28
512:	93.8057  	17.29
640:	94.251  	19.43
768:	94.2105  	23.11
896:	94.4939  	26.95
1024:	94.4939  	30.94
2048:	94.8178  	61.4
3584:	95.0202  	102.95


Get all results

In [None]:
# Print every sweep result: a banner per schedule, then one line per start_k.
for schedule, runs in res_step_factor.items():
    print(f"========{schedule}========")
    for start_k, outcome in runs.items():
        print(f"{schedule}: {start_k}: {outcome}")

64_to_256: 4: {'accuracy_pct': 84.8178, 'final_pool': 4476, 't_total_s': 4.08}
64_to_256: 8: {'accuracy_pct': 86.6397, 'final_pool': 8250, 't_total_s': 4.22}
64_to_256: 16: {'accuracy_pct': 88.4211, 'final_pool': 15326, 't_total_s': 4.49}
64_to_256: 32: {'accuracy_pct': 89.7571, 'final_pool': 28312, 't_total_s': 5.07}
64_to_256: 64: {'accuracy_pct': 90.5263, 'final_pool': 51922, 't_total_s': 6.03}
64_to_256: 128: {'accuracy_pct': 91.5385, 'final_pool': 93279, 't_total_s': 7.95}
64_to_256: 256: {'accuracy_pct': 91.9838, 'final_pool': 162082, 't_total_s': 10.49}
64_to_256: 512: {'accuracy_pct': 92.4696, 'final_pool': 268422, 't_total_s': 14.46}
64_to_256: 1024: {'accuracy_pct': 92.6721, 'final_pool': 413792, 't_total_s': 20.07}
64_to_512: 4: {'accuracy_pct': 84.8178, 'final_pool': 2418, 't_total_s': 4.05}
64_to_512: 8: {'accuracy_pct': 86.8016, 'final_pool': 4535, 't_total_s': 4.25}
64_to_512: 16: {'accuracy_pct': 88.7854, 'final_pool': 8456, 't_total_s': 4.84}
64_to_512: 32: {'accuracy_

In [None]:
# trunc_first_knn_acc
# trunc_and_knn_time

Select the runs that reach at least 93% accuracy and are faster than plain KNN at dim 288 (the first dimension at which KNN accuracy reaches 93%).

In [None]:
# Collect the sweep runs with accuracy >= 93% whose total time beats the plain-KNN
# time recorded for dimension 288; echo each hit as it is found.
res_group_1 = []
for schedule, runs in res_step_factor.items():
    for start_k, outcome in runs.items():
        if not (outcome['accuracy_pct'] >= 93 and outcome['t_total_s'] < trunc_and_knn_time[288]):
            continue
        print(f"Dimension {schedule}: Start K {start_k}: {outcome}")
        res_group_1.append(
            {"d": schedule, "k": start_k, "acc": outcome['accuracy_pct'], "t": outcome['t_total_s']}
        )

Dimension 128_to_512: Start K 16: {'accuracy_pct': 93.1174, 'final_pool': 17861, 't_total_s': 6.78}
Dimension 128_to_512: Start K 32: {'accuracy_pct': 93.4008, 'final_pool': 33649, 't_total_s': 7.41}
Dimension 128_to_512: Start K 64: {'accuracy_pct': 93.4818, 'final_pool': 62148, 't_total_s': 9.15}
Dimension 128_to_1024: Start K 8: {'accuracy_pct': 93.2389, 'final_pool': 4854, 't_total_s': 6.55}
Dimension 128_to_1024: Start K 16: {'accuracy_pct': 93.6437, 'final_pool': 9461, 't_total_s': 6.99}
Dimension 128_to_1024: Start K 32: {'accuracy_pct': 93.9676, 'final_pool': 18205, 't_total_s': 8.39}
Dimension 128_to_1024: Start K 64: {'accuracy_pct': 94.0486, 'final_pool': 34673, 't_total_s': 10.41}
Dimension 256_to_512: Start K 4: {'accuracy_pct': 93.4818, 'final_pool': 9789, 't_total_s': 9.56}
Dimension 256_to_512: Start K 8: {'accuracy_pct': 93.6437, 'final_pool': 19332, 't_total_s': 9.87}
Dimension 256_to_512: Start K 16: {'accuracy_pct': 93.7247, 'final_pool': 37779, 't_total_s': 10.31}


In [None]:
# Fixed-width table of group 1 in insertion order.
print("{:<15} {:<8} {:<10} {:<10}".format('Dimensions', 'Start K', 'Acc', 'Time'))
for res in res_group_1:
    print("{:<15} {:<8} {:<10.4f} {:<10.2f}".format(res['d'], res['k'], res['acc'], res['t']))


Dimensions      Start K  Acc        Time      
128_to_512      16       93.1174    6.78      
128_to_512      32       93.4008    7.41      
128_to_512      64       93.4818    9.15      
128_to_1024     8        93.2389    6.55      
128_to_1024     16       93.6437    6.99      
128_to_1024     32       93.9676    8.39      
128_to_1024     64       94.0486    10.41     
256_to_512      4        93.4818    9.56      
256_to_512      8        93.6437    9.87      
256_to_512      16       93.7247    10.31     
256_to_1024     4        94.1296    9.69      
256_to_1024     8        94.3320    10.21     
256_to_1024     16       94.3320    10.98     


In [None]:
# Fixed-width table of group 1, highest accuracy first.
print("{:<15} {:<8} {:<10} {:<10}".format('Dimensions', 'Start K', 'Acc', 'Time'))
for res in sorted(res_group_1, key=lambda entry: entry['acc'], reverse=True):
    print("{:<15} {:<8} {:<10.4f} {:<10.2f}".format(res['d'], res['k'], res['acc'], res['t']))

Dimensions      Start K  Acc        Time      
256_to_1024     8        94.3320    10.21     
256_to_1024     16       94.3320    10.98     
256_to_1024     4        94.1296    9.69      
128_to_1024     64       94.0486    10.41     
128_to_1024     32       93.9676    8.39      
256_to_512      16       93.7247    10.31     
128_to_1024     16       93.6437    6.99      
256_to_512      8        93.6437    9.87      
128_to_512      64       93.4818    9.15      
256_to_512      4        93.4818    9.56      
128_to_512      32       93.4008    7.41      
128_to_1024     8        93.2389    6.55      
128_to_512      16       93.1174    6.78      


In [None]:
# Fixed-width table of group 1, fastest first.
print("{:<15} {:<8} {:<10} {:<10}".format('Dimensions', 'Start K', 'Acc', 'Time'))
for res in sorted(res_group_1, key=lambda entry: entry['t']):
    print("{:<15} {:<8} {:<10.4f} {:<10.2f}".format(res['d'], res['k'], res['acc'], res['t']))

Dimensions      Start K  Acc        Time      
128_to_1024     8        93.2389    6.55      
128_to_512      16       93.1174    6.78      
128_to_1024     16       93.6437    6.99      
128_to_512      32       93.4008    7.41      
128_to_1024     32       93.9676    8.39      
128_to_512      64       93.4818    9.15      
256_to_512      4        93.4818    9.56      
256_to_1024     4        94.1296    9.69      
256_to_512      8        93.6437    9.87      
256_to_1024     8        94.3320    10.21     
256_to_512      16       93.7247    10.31     
128_to_1024     64       94.0486    10.41     
256_to_1024     16       94.3320    10.98     


Select the runs that reach at least 93% accuracy and are faster than plain KNN at dim 1024.

In [None]:
# Collect the sweep runs with accuracy >= 93% whose total time beats the plain-KNN
# time recorded for dimension 1024; echo each hit as it is found.
res_group_2 = []
for schedule, runs in res_step_factor.items():
    for start_k, outcome in runs.items():
        if not (outcome['accuracy_pct'] >= 93 and outcome['t_total_s'] < trunc_and_knn_time[1024]):
            continue
        print(f"Dimension {schedule}: Start K {start_k}: {outcome}")
        res_group_2.append(
            {"d": schedule, "k": start_k, "acc": outcome['accuracy_pct'], "t": outcome['t_total_s']}
        )

Dimension 64_to_512: Start K 512: {'accuracy_pct': 93.3198, 'final_pool': 170318, 't_total_s': 18.51}
Dimension 64_to_512: Start K 1024: {'accuracy_pct': 93.5628, 'final_pool': 285443, 't_total_s': 28.02}
Dimension 64_to_1024: Start K 256: {'accuracy_pct': 93.2794, 'final_pool': 59285, 't_total_s': 15.21}
Dimension 64_to_1024: Start K 512: {'accuracy_pct': 93.9271, 'final_pool': 108948, 't_total_s': 22.6}
Dimension 128_to_512: Start K 16: {'accuracy_pct': 93.1174, 'final_pool': 17861, 't_total_s': 6.78}
Dimension 128_to_512: Start K 32: {'accuracy_pct': 93.4008, 'final_pool': 33649, 't_total_s': 7.41}
Dimension 128_to_512: Start K 64: {'accuracy_pct': 93.4818, 'final_pool': 62148, 't_total_s': 9.15}
Dimension 128_to_512: Start K 128: {'accuracy_pct': 93.6437, 'final_pool': 111979, 't_total_s': 11.84}
Dimension 128_to_512: Start K 256: {'accuracy_pct': 93.7652, 'final_pool': 194603, 't_total_s': 15.94}
Dimension 128_to_512: Start K 512: {'accuracy_pct': 93.7652, 'final_pool': 318803, 't

In [None]:
# Raw dump of group 2, one record per line.
for entry in res_group_2:
    print(entry)

{'d': '64_to_512', 'k': 512, 'acc': 93.3198, 't': 18.51}
{'d': '64_to_512', 'k': 1024, 'acc': 93.5628, 't': 28.02}
{'d': '64_to_1024', 'k': 256, 'acc': 93.2794, 't': 15.21}
{'d': '64_to_1024', 'k': 512, 'acc': 93.9271, 't': 22.6}
{'d': '128_to_512', 'k': 16, 'acc': 93.1174, 't': 6.78}
{'d': '128_to_512', 'k': 32, 'acc': 93.4008, 't': 7.41}
{'d': '128_to_512', 'k': 64, 'acc': 93.4818, 't': 9.15}
{'d': '128_to_512', 'k': 128, 'acc': 93.6437, 't': 11.84}
{'d': '128_to_512', 'k': 256, 'acc': 93.7652, 't': 15.94}
{'d': '128_to_512', 'k': 512, 'acc': 93.7652, 't': 21.78}
{'d': '128_to_512', 'k': 1024, 'acc': 93.7652, 't': 29.4}
{'d': '128_to_1024', 'k': 8, 'acc': 93.2389, 't': 6.55}
{'d': '128_to_1024', 'k': 16, 'acc': 93.6437, 't': 6.99}
{'d': '128_to_1024', 'k': 32, 'acc': 93.9676, 't': 8.39}
{'d': '128_to_1024', 'k': 64, 'acc': 94.0486, 't': 10.41}
{'d': '128_to_1024', 'k': 128, 'acc': 94.2915, 't': 14.3}
{'d': '128_to_1024', 'k': 256, 'acc': 94.413, 't': 20.14}
{'d': '128_to_1024', 'k': 

In [None]:
# Fixed-width table of group 2, highest accuracy first.
print("{:<15} {:<8} {:<10} {:<10}".format('Dimensions', 'Start K', 'Acc', 'Time'))
for res in sorted(res_group_2, key=lambda entry: entry['acc'], reverse=True):
    print("{:<15} {:<8} {:<10.4f} {:<10.2f}".format(res['d'], res['k'], res['acc'], res['t']))

Dimensions      Start K  Acc        Time      
512_to_1024     4        94.4939    16.95     
512_to_1024     8        94.4939    17.49     
512_to_1024     16       94.4939    17.94     
512_to_1024     32       94.4939    19.69     
512_to_1024     64       94.4939    22.14     
512_to_1024     128      94.4939    26.11     
256_to_1024     64       94.4534    15.05     
256_to_1024     128      94.4534    19.61     
256_to_1024     256      94.4534    26.70     
128_to_1024     256      94.4130    20.14     
128_to_1024     512      94.4130    29.49     
256_to_1024     32       94.4130    12.40     
256_to_1024     8        94.3320    10.21     
256_to_1024     16       94.3320    10.98     
128_to_1024     128      94.2915    14.30     
256_to_1024     4        94.1296    9.69      
128_to_1024     64       94.0486    10.41     
128_to_1024     32       93.9676    8.39      
64_to_1024      512      93.9271    22.60     
256_to_512      64       93.8057    12.43     
256_to_512   

In [None]:
# Fixed-width table of group 2, fastest first.
print("{:<15} {:<8} {:<10} {:<10}".format('Dimensions', 'Start K', 'Acc', 'Time'))
for res in sorted(res_group_2, key=lambda entry: entry['t']):
    print("{:<15} {:<8} {:<10.4f} {:<10.2f}".format(res['d'], res['k'], res['acc'], res['t']))

Dimensions      Start K  Acc        Time      
128_to_1024     8        93.2389    6.55      
128_to_512      16       93.1174    6.78      
128_to_1024     16       93.6437    6.99      
128_to_512      32       93.4008    7.41      
128_to_1024     32       93.9676    8.39      
128_to_512      64       93.4818    9.15      
256_to_512      4        93.4818    9.56      
256_to_1024     4        94.1296    9.69      
256_to_512      8        93.6437    9.87      
256_to_1024     8        94.3320    10.21     
256_to_512      16       93.7247    10.31     
128_to_1024     64       94.0486    10.41     
256_to_1024     16       94.3320    10.98     
256_to_512      32       93.7652    11.22     
128_to_512      128      93.6437    11.84     
256_to_1024     32       94.4130    12.40     
256_to_512      64       93.8057    12.43     
128_to_1024     128      94.2915    14.30     
256_to_512      128      93.8057    14.83     
256_to_1024     64       94.4534    15.05     
64_to_1024   

#### Run 2

##### KNN

In [None]:
# Run 2 baseline: one KNN experiment per target dimension in dimArr using the
# "truncate_first" reduction (presumably keeps the leading `dim` components — confirm
# against run_complete_experiment_with_time).
# Both dicts are keyed by the target dimension.
knn_acc_run_2, knn_time_run_2 = {}, {}

for dim in dimArr:
    result = run_complete_experiment_with_time(
        df_dataset, dataset_embs, question_embs, contexts,
        method_name="truncate_first", target_dim=dim, do_knn=True,
    )
    knn_acc_run_2[dim] = result['knn_accuracy']
    # Recorded time is the sum of the reduction step and the KNN step.
    knn_time_run_2[dim] = result['dt_reduce'] + result['dt_knn']


=== TRUNCATE_FIRST  |  dim=32 ===
Dimension reduction: 0.00 sec
KNN   time: 3.31 sec   |   acc: 39.55%


=== TRUNCATE_FIRST  |  dim=40 ===
Dimension reduction: 0.00 sec
KNN   time: 3.33 sec   |   acc: 50.65%


=== TRUNCATE_FIRST  |  dim=48 ===
Dimension reduction: 0.00 sec
KNN   time: 3.45 sec   |   acc: 62.71%


=== TRUNCATE_FIRST  |  dim=56 ===
Dimension reduction: 0.00 sec
KNN   time: 3.80 sec   |   acc: 71.78%


=== TRUNCATE_FIRST  |  dim=64 ===
Dimension reduction: 0.00 sec
KNN   time: 3.93 sec   |   acc: 78.42%


=== TRUNCATE_FIRST  |  dim=72 ===
Dimension reduction: 0.00 sec
KNN   time: 4.09 sec   |   acc: 76.88%


=== TRUNCATE_FIRST  |  dim=80 ===
Dimension reduction: 0.00 sec
KNN   time: 5.35 sec   |   acc: 80.53%


=== TRUNCATE_FIRST  |  dim=88 ===
Dimension reduction: 0.00 sec
KNN   time: 4.69 sec   |   acc: 83.16%


=== TRUNCATE_FIRST  |  dim=96 ===
Dimension reduction: 0.00 sec
KNN   time: 5.22 sec   |   acc: 84.98%


=== TRUNCATE_FIRST  |  dim=104 ===
Dimension reduction

In [None]:
# Tabulate Run 2 KNN accuracy and total time for every truncation dimension.
# The time is looked up by the same dimension key instead of zipping the two dicts
# positionally, so a row can never pair one dimension's accuracy with another
# dimension's time (a missing key now raises KeyError instead of silently misaligning).
print("Dim \tAccuracy \tTime")
for dim, acc in knn_acc_run_2.items():
    print(f"{dim}:\t{acc}  \t{knn_time_run_2[dim]}")

Dim 	Accuracy 	Time
32:	39.5547  	3.31
40:	50.6478  	3.33
48:	62.7126  	3.45
56:	71.7814  	3.8
64:	78.4211  	3.93
72:	76.8826  	4.09
80:	80.5263  	5.35
88:	83.1579  	4.69
96:	84.9798  	5.22
104:	86.5587  	5.48
112:	87.8543  	6.0
120:	88.3806  	5.45
128:	88.7854  	5.93
136:	89.5547  	6.06
144:	89.6356  	6.17
152:	90.081  	6.36
160:	90.6073  	6.64
168:	90.8502  	6.78
176:	91.0526  	7.2
184:	91.3765  	7.54
192:	91.5789  	7.29
200:	91.7409  	9.21
208:	91.8623  	7.89
216:	91.9838  	7.97
224:	92.2267  	8.27
232:	92.4291  	10.91
240:	92.5911  	9.18
248:	92.8745  	10.11
256:	92.7935  	10.29
272:	92.9555  	10.57
288:	93.0769  	11.84
304:	92.3887  	10.42
320:	92.7126  	10.97
336:	92.8745  	11.31
352:	93.1579  	14.22
368:	93.2794  	13.99
384:	93.4008  	12.84
400:	93.6032  	13.2
416:	93.6032  	14.23
432:	93.5223  	14.1
448:	93.6842  	14.98
464:	93.8057  	18.96
480:	93.7247  	15.5
496:	93.8057  	16.53
512:	93.8057  	17.99
640:	94.251  	20.11
768:	94.2105  	23.24
896:	94.4939  	26.62
1024:	94.4939  

In [None]:
# Wrap each Run 2 series in a single-entry {name: series} dict, the shape passed to the
# plotting helpers below (the key is presumably used as the series label — confirm).
knn_time_run_2_plot = {"KNN Time - Run 2": knn_time_run_2}
knn_acc_run_2_plot = {"KNN Accuracy - Run 2": knn_acc_run_2}

In [None]:
# Plot the Run 2 KNN time series (dimension -> reduction + KNN seconds).
# NOTE(review): the second argument looks like the figure title — confirm against plot_time_dicts.
plot_time_dicts(knn_time_run_2_plot, "Truncation/KNN Time - Keeping head/front Run 2")

In [None]:
# Plot the Run 2 KNN accuracy series (dimension -> accuracy in percent).
# NOTE(review): the second argument looks like the figure title — confirm against plot_accuracy_dicts.
plot_accuracy_dicts(knn_acc_run_2_plot, "Truncation/KNN Accuracy - Keeping head/front Run 2")

##### New (progressive KNN)

In [None]:
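# Disabled single-range sweep (256 -> 512 only, stored in res_step_factor).
# The grid sweep in the next cell covers this range as "256_to_512" with the same start_k values.
# k = 4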
# res_step_factor["256_to_512"] = {}
# while k <= 1024:
#     res = progressive_knn_sklearn(
#               ds_embs     = dataset_embs,
#               qs_embs     = question_embs,
#               df_dataset  = df_dataset,
#               contexts    = contexts,
#               start_dim   = 256,
#               start_k     = k,
#               max_dim     = 512,
#               step_factor = 2,
#               verbose     = True
#     )
#     res_step_factor["256_to_512"][k] = res
#     k *= 2

In [None]:
# Run 2 grid sweep of progressive KNN.
# For every (start_dim, max_dim) pair taken from new_dim_list with start_dim < max_dim,
# try start_k = 4, 8, 16, ..., 1024 (doubling) and store the returned summary under
# progressive_knn_result_run_2["<start_dim>_to_<max_dim>"][start_k].
new_dim_list = [64, 128, 256, 512, 1024, 2048, 3584]
progressive_knn_result_run_2 = {}

for startD in new_dim_list:
    for maxD in new_dim_list:
        if startD >= maxD:
            # Skip pairs where the dimension range would not grow.
            continue

        dimRange = f"{startD}_to_{maxD}"
        per_start_k = progressive_knn_result_run_2[dimRange] = {}

        k = 4
        while k <= 1024:
            res = progressive_knn_sklearn(
                ds_embs=dataset_embs,
                qs_embs=question_embs,
                df_dataset=df_dataset,
                contexts=contexts,
                start_dim=startD,
                start_k=k,
                max_dim=maxD,
                step_factor=2,
                verbose=True,
            )
            per_start_k[k] = res
            k *= 2


[init 0:64] unique rows = 9,692
[init 0:64] unique rows = 18,930
[init 0:64] unique rows = 36,311
[init 0:64] unique rows = 67,642
[init 0:64] unique rows = 120,604
[init 0:64] unique rows = 203,382
[init 0:64] unique rows = 320,055
[init 0:64] unique rows = 465,741
[init 0:64] unique rows = 623,356
[init 0:64] unique rows = 9,692
[slice 0:128] k=2
           candidates → 4,476
[init 0:64] unique rows = 18,930
[slice 0:128] k=4
           candidates → 8,250
[init 0:64] unique rows = 36,311
[slice 0:128] k=8
           candidates → 15,326
[init 0:64] unique rows = 67,642
[slice 0:128] k=16
           candidates → 28,312
[init 0:64] unique rows = 120,604
[slice 0:128] k=32
           candidates → 51,922
[init 0:64] unique rows = 203,382
[slice 0:128] k=64
           candidates → 93,279
[init 0:64] unique rows = 320,055
[slice 0:128] k=128
           candidates → 162,082
[init 0:64] unique rows = 465,741
[slice 0:128] k=256
           candidates → 268,422
[init 0:64] unique rows = 623,356

---
###### The original KNN reaches and stabilizes at 93% accuracy at dimension 352; below we find all cases that reach at least 93% accuracy AND run faster than the dimension-352 run

In [None]:
# Group 1: progressive-KNN configurations with accuracy >= 93% whose total time beats
# the plain KNN run at dimension 352 (knn_time_run_2[352]).
# Each hit is printed and stored as a flat row: d = dimension range, k = start k.
result_run_2_group_1 = []
for dim_range, per_start_k in progressive_knn_result_run_2.items():
    for start_k, stats in per_start_k.items():
        qualifies = stats['accuracy_pct'] >= 93 and stats['t_total_s'] < knn_time_run_2[352]
        if not qualifies:
            continue
        print(f"Dimension {dim_range}: Start K {start_k}: {stats}")
        result_run_2_group_1.append({
            "d": dim_range,
            "k": start_k,
            "acc": stats['accuracy_pct'],
            "t": stats['t_total_s'],
        })

Dimension 128_to_512: Start K 16: {'accuracy_pct': 93.1174, 'final_pool': 17861, 't_total_s': 6.8}
Dimension 128_to_512: Start K 32: {'accuracy_pct': 93.4008, 'final_pool': 33649, 't_total_s': 7.47}
Dimension 128_to_512: Start K 64: {'accuracy_pct': 93.4818, 'final_pool': 62148, 't_total_s': 8.8}
Dimension 128_to_512: Start K 128: {'accuracy_pct': 93.6437, 'final_pool': 111979, 't_total_s': 11.28}
Dimension 128_to_1024: Start K 8: {'accuracy_pct': 93.2389, 'final_pool': 4854, 't_total_s': 6.23}
Dimension 128_to_1024: Start K 16: {'accuracy_pct': 93.6437, 'final_pool': 9461, 't_total_s': 7.06}
Dimension 128_to_1024: Start K 32: {'accuracy_pct': 93.9676, 'final_pool': 18205, 't_total_s': 8.01}
Dimension 128_to_1024: Start K 64: {'accuracy_pct': 94.0486, 'final_pool': 34673, 't_total_s': 10.14}
Dimension 128_to_1024: Start K 128: {'accuracy_pct': 94.2915, 'final_pool': 64709, 't_total_s': 13.72}
Dimension 128_to_3584: Start K 8: {'accuracy_pct': 93.2389, 'final_pool': 2465, 't_total_s': 6

In [None]:
# Show the group 1 rows in the order they were collected.
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in result_run_2_group_1:
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")


Dimensions      Start K  Acc        Time      
128_to_512      16       93.1174    6.80      
128_to_512      32       93.4008    7.47      
128_to_512      64       93.4818    8.80      
128_to_512      128      93.6437    11.28     
128_to_1024     8        93.2389    6.23      
128_to_1024     16       93.6437    7.06      
128_to_1024     32       93.9676    8.01      
128_to_1024     64       94.0486    10.14     
128_to_1024     128      94.2915    13.72     
128_to_3584     8        93.2389    6.95      
128_to_3584     16       93.8866    7.63      
128_to_3584     32       94.2915    9.40      
128_to_3584     64       94.4939    12.78     
256_to_512      4        93.4818    9.48      
256_to_512      8        93.6437    9.56      
256_to_512      16       93.7247    10.05     
256_to_512      32       93.7652    10.85     
256_to_512      64       93.8057    12.42     
256_to_1024     4        94.1296    9.81      
256_to_1024     8        94.3320    10.15     
256_to_1024  

In [None]:
# Group 1 rows from highest to lowest accuracy (equal accuracies keep their list order).
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_1,
    key=lambda entry: entry['acc'],
    reverse=True,
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
256_to_3584     16       94.8178    12.11     
256_to_3584     8        94.7368    10.78     
128_to_3584     64       94.4939    12.78     
256_to_1024     32       94.4130    12.55     
256_to_1024     8        94.3320    10.15     
256_to_1024     16       94.3320    11.06     
128_to_1024     128      94.2915    13.72     
128_to_3584     32       94.2915    9.40      
256_to_1024     4        94.1296    9.81      
256_to_3584     4        94.1296    10.44     
128_to_1024     64       94.0486    10.14     
128_to_1024     32       93.9676    8.01      
128_to_3584     16       93.8866    7.63      
256_to_512      64       93.8057    12.42     
256_to_512      32       93.7652    10.85     
256_to_512      16       93.7247    10.05     
128_to_512      128      93.6437    11.28     
128_to_1024     16       93.6437    7.06      
256_to_512      8        93.6437    9.56      
128_to_512      64       93.4818    8.80      
256_to_512   

In [None]:
# Group 1 rows from fastest to slowest total time.
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_1,
    key=lambda entry: entry['t'],
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
128_to_1024     8        93.2389    6.23      
128_to_512      16       93.1174    6.80      
128_to_3584     8        93.2389    6.95      
128_to_1024     16       93.6437    7.06      
128_to_512      32       93.4008    7.47      
128_to_3584     16       93.8866    7.63      
128_to_1024     32       93.9676    8.01      
128_to_512      64       93.4818    8.80      
128_to_3584     32       94.2915    9.40      
256_to_512      4        93.4818    9.48      
256_to_512      8        93.6437    9.56      
256_to_1024     4        94.1296    9.81      
256_to_512      16       93.7247    10.05     
128_to_1024     64       94.0486    10.14     
256_to_1024     8        94.3320    10.15     
256_to_3584     4        94.1296    10.44     
256_to_3584     8        94.7368    10.78     
256_to_512      32       93.7652    10.85     
256_to_1024     16       94.3320    11.06     
128_to_512      128      93.6437    11.28     
256_to_3584  

In [None]:
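# Reference copy of the earlier `res_group_2` table (sorted by time, printed before Run 2),
# presumably kept here for comparison with the Run 2 timings above.
# Dimensions      Start K  Acc        Time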
# 128_to_1024     8        93.2389    6.55
# 128_to_512      16       93.1174    6.78
# 128_to_1024     16       93.6437    6.99
# 128_to_512      32       93.4008    7.41
# 128_to_1024     32       93.9676    8.39
# 128_to_512      64       93.4818    9.15
# 256_to_512      4        93.4818    9.56
# 256_to_1024     4        94.1296    9.69
# 256_to_512      8        93.6437    9.87
# 256_to_1024     8        94.3320    10.21
# 256_to_512      16       93.7247    10.31
# 128_to_1024     64       94.0486    10.41
# 256_to_1024     16       94.3320    10.98

---
###### Cases with accuracy >= 94% that run faster than the dimension-1024 KNN run

In [None]:
# Group 2: progressive-KNN configurations with accuracy >= 94% whose total time beats
# the plain KNN run at dimension 1024 (knn_time_run_2[1024]).
# Each hit is printed and stored as a flat row: d = dimension range, k = start k.
result_run_2_group_2 = []
for dim_range, per_start_k in progressive_knn_result_run_2.items():
    for start_k, stats in per_start_k.items():
        qualifies = stats['accuracy_pct'] >= 94 and stats['t_total_s'] < knn_time_run_2[1024]
        if not qualifies:
            continue
        print(f"Dimension {dim_range}: Start K {start_k}: {stats}")
        result_run_2_group_2.append({
            "d": dim_range,
            "k": start_k,
            "acc": stats['accuracy_pct'],
            "t": stats['t_total_s'],
        })

Dimension 64_to_3584: Start K 512: {'accuracy_pct': 94.332, 'final_pool': 36268, 't_total_s': 30.01}
Dimension 128_to_1024: Start K 64: {'accuracy_pct': 94.0486, 'final_pool': 34673, 't_total_s': 10.14}
Dimension 128_to_1024: Start K 128: {'accuracy_pct': 94.2915, 'final_pool': 64709, 't_total_s': 13.72}
Dimension 128_to_1024: Start K 256: {'accuracy_pct': 94.413, 'final_pool': 118135, 't_total_s': 19.38}
Dimension 128_to_1024: Start K 512: {'accuracy_pct': 94.413, 'final_pool': 207762, 't_total_s': 27.53}
Dimension 128_to_3584: Start K 32: {'accuracy_pct': 94.2915, 'final_pool': 4899, 't_total_s': 9.4}
Dimension 128_to_3584: Start K 64: {'accuracy_pct': 94.4939, 'final_pool': 9701, 't_total_s': 12.78}
Dimension 128_to_3584: Start K 128: {'accuracy_pct': 94.7368, 'final_pool': 19072, 't_total_s': 18.2}
Dimension 128_to_3584: Start K 256: {'accuracy_pct': 94.8988, 'final_pool': 37075, 't_total_s': 28.16}
Dimension 256_to_1024: Start K 4: {'accuracy_pct': 94.1296, 'final_pool': 4911, 't_

In [None]:
# Group 2 rows from highest to lowest accuracy (equal accuracies keep their list order).
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_2,
    key=lambda entry: entry['acc'],
    reverse=True,
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
512_to_3584     16       95.0202    20.46     
512_to_3584     32       95.0202    24.15     
512_to_3584     64       95.0202    30.13     
256_to_3584     64       94.9798    19.58     
256_to_3584     128      94.9798    28.06     
256_to_3584     32       94.9393    14.90     
512_to_3584     8        94.9393    18.72     
128_to_3584     256      94.8988    28.16     
512_to_3584     4        94.8583    17.73     
256_to_3584     16       94.8178    12.11     
128_to_3584     128      94.7368    18.20     
256_to_3584     8        94.7368    10.78     
128_to_3584     64       94.4939    12.78     
512_to_1024     4        94.4939    17.20     
512_to_1024     8        94.4939    17.34     
512_to_1024     16       94.4939    18.09     
512_to_1024     32       94.4939    19.42     
512_to_1024     64       94.4939    21.75     
512_to_1024     128      94.4939    25.51     
256_to_1024     64       94.4534    14.82     
256_to_1024  

In [None]:
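# Reference copy of the earlier `res_group_2` table (sorted by accuracy, printed before Run 2),
# presumably kept here for comparison with the Run 2 results above.
# Dimensions      Start K  Acc        Time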
# 512_to_1024     4        94.4939    16.95
# 512_to_1024     8        94.4939    17.49
# 512_to_1024     16       94.4939    17.94
# 512_to_1024     32       94.4939    19.69
# 512_to_1024     64       94.4939    22.14
# 512_to_1024     128      94.4939    26.11
# 256_to_1024     64       94.4534    15.05
# 256_to_1024     128      94.4534    19.61
# 256_to_1024     256      94.4534    26.70
# 128_to_1024     256      94.4130    20.14
# 128_to_1024     512      94.4130    29.49
# 256_to_1024     32       94.4130    12.40
# 256_to_1024     8        94.3320    10.21
# 256_to_1024     16       94.3320    10.98
# 128_to_1024     128      94.2915    14.30
# 256_to_1024     4        94.1296    9.69
# 128_to_1024     64       94.0486    10.41
# 128_to_1024     32       93.9676    8.39
# 64_to_1024      512      93.9271    22.60
# 256_to_512      64       93.8057    12.43
# 256_to_512      128      93.8057    14.83
# 256_to_512      256      93.8057    18.34
# 256_to_512      512      93.8057    23.05
# 256_to_512      1024     93.8057    27.81
# 128_to_512      256      93.7652    15.94
# 128_to_512      512      93.7652    21.78
# 128_to_512      1024     93.7652    29.40
# 256_to_512      32       93.7652    11.22
# 256_to_512      16       93.7247    10.31
# 128_to_512      128      93.6437    11.84
# 128_to_1024     16       93.6437    6.99
# 256_to_512      8        93.6437    9.87
# 64_to_512       1024     93.5628    28.02
# 128_to_512      64       93.4818    9.15
# 256_to_512      4        93.4818    9.56
# 128_to_512      32       93.4008    7.41
# 64_to_512       512      93.3198    18.51
# 64_to_1024      256      93.2794    15.21
# 128_to_1024     8        93.2389    6.55
# 128_to_512      16       93.1174    6.78

In [None]:
# Group 2 rows from fastest to slowest total time.
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_2,
    key=lambda entry: entry['t'],
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
128_to_3584     32       94.2915    9.40      
256_to_1024     4        94.1296    9.81      
128_to_1024     64       94.0486    10.14     
256_to_1024     8        94.3320    10.15     
256_to_3584     4        94.1296    10.44     
256_to_3584     8        94.7368    10.78     
256_to_1024     16       94.3320    11.06     
256_to_3584     16       94.8178    12.11     
256_to_1024     32       94.4130    12.55     
128_to_3584     64       94.4939    12.78     
128_to_1024     128      94.2915    13.72     
256_to_1024     64       94.4534    14.82     
256_to_3584     32       94.9393    14.90     
512_to_1024     4        94.4939    17.20     
512_to_1024     8        94.4939    17.34     
512_to_3584     4        94.8583    17.73     
512_to_1024     16       94.4939    18.09     
128_to_3584     128      94.7368    18.20     
512_to_3584     8        94.9393    18.72     
256_to_1024     128      94.4534    19.11     
128_to_1024  

---
###### Find cases that reach 95% accuracy and run faster than the dimension-3584 KNN run

In [None]:
# Group 3: progressive-KNN configurations with accuracy >= 95% whose total time beats
# the plain KNN run at dimension 3584 (knn_time_run_2[3584]).
# Each hit is printed and stored as a flat row: d = dimension range, k = start k.
result_run_2_group_3 = []
for dim_range, per_start_k in progressive_knn_result_run_2.items():
    for start_k, stats in per_start_k.items():
        qualifies = stats['accuracy_pct'] >= 95 and stats['t_total_s'] < knn_time_run_2[3584]
        if not qualifies:
            continue
        print(f"Dimension {dim_range}: Start K {start_k}: {stats}")
        result_run_2_group_3.append({
            "d": dim_range,
            "k": start_k,
            "acc": stats['accuracy_pct'],
            "t": stats['t_total_s'],
        })

Dimension 256_to_3584: Start K 512: {'accuracy_pct': 95.0202, 'final_pool': 135851, 't_total_s': 65.42}
Dimension 256_to_3584: Start K 1024: {'accuracy_pct': 95.0202, 'final_pool': 244130, 't_total_s': 100.39}
Dimension 512_to_3584: Start K 16: {'accuracy_pct': 95.0202, 'final_pool': 9794, 't_total_s': 20.46}
Dimension 512_to_3584: Start K 32: {'accuracy_pct': 95.0202, 'final_pool': 19411, 't_total_s': 24.15}
Dimension 512_to_3584: Start K 64: {'accuracy_pct': 95.0202, 'final_pool': 38068, 't_total_s': 30.13}
Dimension 512_to_3584: Start K 128: {'accuracy_pct': 95.0202, 'final_pool': 73433, 't_total_s': 41.66}
Dimension 512_to_3584: Start K 256: {'accuracy_pct': 95.0202, 'final_pool': 137857, 't_total_s': 63.27}
Dimension 512_to_3584: Start K 512: {'accuracy_pct': 95.0202, 'final_pool': 247841, 't_total_s': 92.55}
Dimension 1024_to_3584: Start K 8: {'accuracy_pct': 95.0202, 'final_pool': 9799, 't_total_s': 33.06}
Dimension 1024_to_3584: Start K 16: {'accuracy_pct': 95.0202, 'final_pool

In [None]:
# Group 3 rows from highest to lowest accuracy (equal accuracies keep their list order).
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_3,
    key=lambda entry: entry['acc'],
    reverse=True,
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
256_to_3584     512      95.0202    65.42     
256_to_3584     1024     95.0202    100.39    
512_to_3584     16       95.0202    20.46     
512_to_3584     32       95.0202    24.15     
512_to_3584     64       95.0202    30.13     
512_to_3584     128      95.0202    41.66     
512_to_3584     256      95.0202    63.27     
512_to_3584     512      95.0202    92.55     
1024_to_3584    8        95.0202    33.06     
1024_to_3584    16       95.0202    35.32     
1024_to_3584    32       95.0202    39.32     
1024_to_3584    64       95.0202    47.27     
1024_to_3584    128      95.0202    61.21     
1024_to_3584    256      95.0202    83.86     


In [None]:
# Group 3 rows from fastest to slowest total time.
print(f"{'Dimensions':<15} {'Start K':<8} {'Acc':<10} {'Time':<10}")
for row in sorted(
    result_run_2_group_3,
    key=lambda entry: entry['t'],
):
    print(f"{row['d']:<15} {row['k']:<8} {row['acc']:<10.4f} {row['t']:<10.2f}")

Dimensions      Start K  Acc        Time      
512_to_3584     16       95.0202    20.46     
512_to_3584     32       95.0202    24.15     
512_to_3584     64       95.0202    30.13     
1024_to_3584    8        95.0202    33.06     
1024_to_3584    16       95.0202    35.32     
1024_to_3584    32       95.0202    39.32     
512_to_3584     128      95.0202    41.66     
1024_to_3584    64       95.0202    47.27     
1024_to_3584    128      95.0202    61.21     
512_to_3584     256      95.0202    63.27     
256_to_3584     512      95.0202    65.42     
1024_to_3584    256      95.0202    83.86     
512_to_3584     512      95.0202    92.55     
256_to_3584     1024     95.0202    100.39    


---
All Run 2 progressive KNN results

In [None]:
# Dump every Run 2 progressive-KNN result, one line per (dimension range, start k) pair,
# in the order the sweep stored them.
for dim_range, per_start_k in progressive_knn_result_run_2.items():
    for start_k, stats in per_start_k.items():
        print(f"Dimension {dim_range}: Start K {start_k}: {stats}")

Dimension 64_to_128: Start K 4: {'accuracy_pct': 83.7247, 'final_pool': 9692, 't_total_s': 4.23}
Dimension 64_to_128: Start K 8: {'accuracy_pct': 85.2227, 'final_pool': 18930, 't_total_s': 4.08}
Dimension 64_to_128: Start K 16: {'accuracy_pct': 86.4372, 'final_pool': 36311, 't_total_s': 4.36}
Dimension 64_to_128: Start K 32: {'accuracy_pct': 87.247, 'final_pool': 67642, 't_total_s': 4.5}
Dimension 64_to_128: Start K 64: {'accuracy_pct': 87.8947, 'final_pool': 120604, 't_total_s': 5.22}
Dimension 64_to_128: Start K 128: {'accuracy_pct': 88.502, 'final_pool': 203382, 't_total_s': 6.17}
Dimension 64_to_128: Start K 256: {'accuracy_pct': 88.583, 'final_pool': 320055, 't_total_s': 7.78}
Dimension 64_to_128: Start K 512: {'accuracy_pct': 88.7449, 'final_pool': 465741, 't_total_s': 9.91}
Dimension 64_to_128: Start K 1024: {'accuracy_pct': 88.7449, 'final_pool': 623356, 't_total_s': 12.97}
Dimension 64_to_256: Start K 4: {'accuracy_pct': 84.8178, 'final_pool': 4476, 't_total_s': 4.06}
Dimensio