## Setup

In [None]:
# Colab-only helpers: `drive` mounts Google Drive, `userdata` is used further down to read the API key.
from google.colab import drive
from google.colab import userdata
# Mount Google Drive at /content/drive; the dataset paths used later point into it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openpyxl openai

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [None]:
import json
import pickle
import random
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from openai import OpenAI
from sklearn.neighbors import NearestNeighbors

In [None]:
# Shared OpenAI client; the API key is read from the Colab secret named 'openai_api'
# so it never appears in the notebook itself.
openai_client = OpenAI(api_key=userdata.get('openai_api'))

## Class

In [None]:
class Rag:
  """Embeds a document collection and retrieves the nearest document for each question.

  Attributes:
    search_algorithm: one of "hnsw", "knn" or "annoy".
    embedding_model: name of the embedding model; selecting it also sets ``dimension``.
    dimension: width of the embedding vectors (can be overridden with ``set_dimension``).
    model: document embeddings, one row per entry of ``documents``.
    query_model: question embeddings, one row per entry of ``questions``.
    documents: the document texts.
    questions: the question texts.
    ann_documents: set by ``build_graph``; the retrieved document for each query row.

  NOTE(review): the methods use the globals ``openai_client``, ``cohere_client`` and
  ``hnswlib``. Only ``openai_client`` is created in the visible part of the notebook;
  confirm the other two are provided before using a Cohere model or "hnsw".
  """

  # Accepted values for ``search_algorithm``.
  _SEARCH_ALGORITHMS = {"hnsw", "knn", "annoy"}

  # Accepted embedding models and the vector width each one selects.
  _EMBEDDING_DIMENSIONS = {
    "embed-multilingual-v3.0": 1024,
    "embed-multilingual-light-v3.0": 384,
    "embed-multilingual-v2.0": 768,
    "text-embedding-3-large": 3072,
  }

  # Accepted values for ``save_type``.
  _SAVE_TYPES = {"pickle", "json"}

  # The one model that is embedded through ``openai_client``; all others go through Cohere.
  _OPENAI_MODEL = "text-embedding-3-large"

  # Number of texts sent per embedding request.
  # NOTE(review): presumably chosen to stay within each provider's request limits - confirm.
  _OPENAI_BATCH_SIZE = 1000
  _COHERE_BATCH_SIZE = 96

  def __init__(self,
               search_algorithm = "hnsw",
               embedding_model = "embed-multilingual-v3.0",
               model: np.ndarray = None,
               query_model: np.ndarray = None,
               questions = None,
               documents = None):
    """Validate the configuration and store any pre-computed embeddings.

    Raises:
      ValueError: if ``search_algorithm`` or ``embedding_model`` is not supported.
    """
    # The setters perform the validation and derive ``dimension`` from the model name.
    self.set_search_algorithm(search_algorithm)
    self.set_embedding_model(embedding_model)

    self.model = model
    self.documents = documents
    self.query_model = query_model
    self.questions = questions

  def set_dimension(self, dimension: int):
    """Override the embedding width, e.g. when working with truncated vectors."""
    self.dimension = dimension

  def insert_model(self, model: np.ndarray):
    """Attach pre-computed document embeddings."""
    self.model = model

  def set_search_algorithm(self, search_algorithm: str):
    """Select the search backend; raises ValueError for an unsupported name."""
    if search_algorithm not in self._SEARCH_ALGORITHMS:
      raise ValueError(f"Invalid search algorithm. Must be one of {self._SEARCH_ALGORITHMS}.")
    self.search_algorithm = search_algorithm

  def set_embedding_model(self, embedding_model: str):
    """Select the embedding model and reset ``dimension`` to that model's width.

    Raises:
      ValueError: if ``embedding_model`` is not supported.
    """
    if embedding_model not in self._EMBEDDING_DIMENSIONS:
      raise ValueError(f"Invalid embedding model. Must be one of {set(self._EMBEDDING_DIMENSIONS)}.")
    self.embedding_model = embedding_model
    self.dimension = self._EMBEDDING_DIMENSIONS[embedding_model]

  def embed_doc(self,
                save_to_file: bool = False,
                save_type: str = "pickle",
                save_path: str = None):
    """Embed ``self.documents`` into ``self.model`` and optionally save the result.

    Args:
      save_to_file: when truthy, the embeddings are written to ``save_path``.
      save_type: "pickle" (the raw array) or "json" (a document -> vector mapping).
      save_path: destination file; required when ``save_to_file`` is truthy.

    Raises:
      ValueError: on an invalid ``save_type``, a missing ``save_path`` or missing documents.
    """
    # Validate with the same truthiness test that later decides whether to save, so a
    # bad save configuration is rejected before any embedding request is made.
    if save_to_file:
      if save_type not in self._SAVE_TYPES:
        raise ValueError(f"Invalid save type. Must be one of {self._SAVE_TYPES}.")
      if save_path is None:
        raise ValueError("Save path is not inserted.")

    if self.documents is None:
      raise ValueError("Documents are not inserted.")

    self.model = self.__embed_texts(self.documents, self.__embed_chunk_doc)

    if save_to_file:
      self.__write_embeddings(self.model, save_type, save_path)

  def __embed_texts(self, texts, cohere_embed):
    """Embed ``texts`` batch by batch and return a ``(len(texts), dimension)`` array.

    ``cohere_embed`` is the bound method used for one batch when the configured
    model is not the OpenAI one (document vs. query embedding).
    """
    vectors = np.zeros((len(texts), self.dimension))
    use_openai = self.embedding_model == self._OPENAI_MODEL
    batch_size = self._OPENAI_BATCH_SIZE if use_openai else self._COHERE_BATCH_SIZE

    for i in range(0, len(texts), batch_size):
      chunk = texts[i:i+batch_size]
      print(f"chunking index {i} to {i+batch_size}")
      if use_openai:
        response = self.__embed_chunk_openai(chunk)
        embeded_chunk = np.array([resp.embedding for resp in response.data])
      else:
        embeded_chunk = np.array(cohere_embed(chunk).float_)
      # The last batch may be shorter than batch_size, hence the length-based slice.
      vectors[i:i+len(embeded_chunk)] = embeded_chunk

    return vectors

  def __write_embeddings(self, vectors, save_type, save_path):
    """Write ``vectors`` to ``save_path`` as a pickle or as a JSON mapping."""
    if save_type == "pickle":
      with open(save_path, 'wb') as f:
        pickle.dump(vectors, f)
    elif save_type == "json":
      # Only the JSON format needs the documents (they become the keys), so the
      # mapping is built here rather than unconditionally.
      if self.documents is None:
        raise ValueError("Documents are not inserted.")
      # Identical document texts collapse into a single key; the last vector wins.
      data_dict = dict(zip(self.documents, vectors.tolist()))
      with open(save_path, 'w') as f:
        json.dump(data_dict, f)

  def __embed_chunk_doc(self, texts):
    """Embed one batch of documents through the global ``cohere_client``."""
    response = cohere_client.v2.embed(
      model=self.embedding_model,
      texts=texts,
      input_type='search_document',
      embedding_types=["float"],
    )
    return response.embeddings

  def __embed_chunk_openai(self, texts):
    """Embed one batch of texts through the global ``openai_client``; returns the raw response."""
    response = openai_client.embeddings.create(
      model=self.embedding_model,
      input=texts,
      encoding_format="float",
      dimensions=self.dimension
    )
    return response

  def export_model(self,
                   save_type: str = "pickle",
                   save_path: str = None):
    """Save the current document embeddings to ``save_path``.

    Args:
      save_type: "pickle" (the raw array) or "json" (a document -> vector mapping).
      save_path: destination file.

    Raises:
      ValueError: on an invalid ``save_type``, missing embeddings, a missing
        ``save_path``, or (JSON only) missing documents.
    """
    if save_type not in self._SAVE_TYPES:
      raise ValueError(f"Invalid save type. Must be one of {self._SAVE_TYPES}.")

    if self.model is None:
      raise ValueError("Embedded Model is not inserted.")

    if save_path is None:
      raise ValueError("Save path is not inserted.")

    self.__write_embeddings(self.model, save_type, save_path)

  def __embed_query(self,
                    questions):
    """Embed ``questions`` and store the result in ``self.query_model``."""
    if questions is None:
      raise ValueError("Questions are not inserted.")
    self.query_model = self.__embed_texts(questions, self.__embed_chunk_query)

  def __embed_chunk_query(self, texts):
    """Embed one batch of questions through the global ``cohere_client``."""
    response = cohere_client.v2.embed(
      texts=texts,
      model=self.embedding_model,
      input_type='search_query',
      embedding_types=["float"],
    )
    return response.embeddings

  def build_graph(self):
    """Run the configured nearest-neighbour search and fill ``self.ann_documents``.

    Questions are embedded first when ``self.query_model`` is not set.

    Raises:
      ValueError: if the document embeddings or the documents are missing.
      NotImplementedError: if the configured search algorithm has no implementation.
    """
    if self.model is None:
      raise ValueError("Embedded Model is not inserted.")

    if self.documents is None:
      raise ValueError("Documents are not inserted.")

    if self.search_algorithm == "hnsw":
      self.__find_hnsw()
    elif self.search_algorithm == "knn":
      self.__find_knn()
    else:
      # "annoy" is an accepted name but has no search routine; fail loudly rather
      # than returning without ever setting ``ann_documents``.
      raise NotImplementedError(f"Search algorithm '{self.search_algorithm}' is not implemented.")

  def __nearest_documents(self, indices):
    """Map an array of row indices to the corresponding documents (same shape)."""
    return np.vectorize(lambda x: self.documents[x])(indices)

  def __find_hnsw(self):
    """Nearest document per query using an ``hnswlib`` index with L2 distance."""
    hnsw = hnswlib.Index(space = 'l2', dim = self.dimension)
    hnsw.init_index(max_elements = len(self.model))
    hnsw.add_items(self.model)

    if self.query_model is None:
      self.__embed_query(self.questions)

    labels, _ = hnsw.knn_query(self.query_model, k = 1)

    self.ann_documents = self.__nearest_documents(labels)

  def __find_knn(self):
    """Nearest document per query using scikit-learn brute-force search (p=2)."""
    knn = NearestNeighbors(n_neighbors=1, algorithm='brute', p = 2)
    knn.fit(self.model)

    if self.query_model is None:
      self.__embed_query(self.questions)

    neighbors = knn.kneighbors(self.query_model, return_distance=False)
    self.ann_documents = self.__nearest_documents(neighbors)


## Embedding

In [None]:
# Embed the whole corpus with text-embedding-3-large and pickle the resulting array to Drive.
# NOTE(review): `data_texts` is only assigned in cells further down ("Data Preprocessing" /
# "Import Data"), so on a fresh kernel one of those cells has to be run before this one.
openai_3072 = Rag(search_algorithm = "hnsw", embedding_model = "text-embedding-3-large", documents = data_texts)
openai_3072.embed_doc(save_to_file=True, save_path="drive/MyDrive/dataset/openai_3072.pkl")

## Data Preprocessing

In [None]:
# Load the pickled document texts and query texts from Drive ...
data_texts = pd.read_pickle('drive/MyDrive/dataset/data_texts.pkl')
query_texts = pd.read_pickle('drive/MyDrive/dataset/query_texts.pkl')
# ... and merge the query texts into the document list so both form one corpus.
data_texts.extend(query_texts)

In [None]:
# Seed the global RNG so the sampled subset (and the replacement draws in the cells
# below, which use the same RNG) are reproducible across kernel restarts.
SAMPLE_SEED = 42
SAMPLE_SIZE = 1000
random.seed(SAMPLE_SEED)

# Draw SAMPLE_SIZE distinct positions' worth of documents from the corpus.
random_1000 = random.sample(data_texts, SAMPLE_SIZE)

In [None]:
# Substrings that mark a document as unwanted for the sample; the cells below replace
# every sampled document that contains at least one of them (case-sensitive match).
useless_patterns = [
    "This is a list of", "List of", "A list of", "This article presents", "a list of",
    "Events from the year", "listed below", "lists of", "Lists of", "listing", "Events in the year", "This article is about",
    "The following is a",
]

In [None]:
# Positions in the sample whose document contains at least one unwanted pattern.
useless_indices = []
for i, doc in enumerate(random_1000):
  if any(pattern in doc for pattern in useless_patterns):
    useless_indices.append(i)

In [None]:
# Keep swapping flagged documents for fresh random ones until none of the sampled
# documents contains an unwanted pattern.
while useless_indices:
  random_doc = random.sample(data_texts, len(useless_indices))

  # One replacement per flagged position, in order.
  for i, replacement in zip(useless_indices, random_doc):
    random_1000[i] = replacement

  # Re-scan the whole sample: a replacement may itself contain a pattern.
  useless_indices = [
      i for i, doc in enumerate(random_1000)
      if any(pattern in doc for pattern in useless_patterns)
  ]

In [None]:
# Persist the sampled documents, one per line.
# NOTE(review): a document containing a newline would span several lines in this file - confirm none do.
with open("drive/MyDrive/dataset/1000_docs.txt", 'w') as f:
  f.writelines(doc + "\n" for doc in random_1000)

In [None]:
# Instruction sent with every request: produce exactly one answerable question per
# document, or the literal reply 'no question' when that is not possible.
system_message = {"role": "developer", "content": "You are a question generator. I am going to provide you documents, and you need to generate one and only one question based on what I was provided. Make sure the question can be answered based on the document. You cannot generate question that cannot be answered by what I provided. Questions can be but not limited to Yes or no, or open answer question. If you cannot generate question based on what I provided, say 'no question'."}

# One chat completion per sampled document; questions[i] belongs to random_1000[i].
questions = []
for d in random_1000:
  response = openai_client.chat.completions.create(
      model="gpt-4o",
      messages=[system_message, {"role": "user", "content": d}],
  )
  questions.append(response.choices[0].message.content)

In [None]:
# Positions whose reply is the 'no question' sentinel. The reply is normalised
# (surrounding whitespace, a trailing period and letter case are ignored) so that
# variants such as "No Question." or "no question \n" are caught as well; a missing
# reply (None) is treated the same way.
no_q_ind = [
    i for i, q in enumerate(questions)
    if q is None or q.strip().rstrip(".").strip().lower() == "no question"
]

In [None]:
# Replace every document that produced no question and ask again, until each sampled
# document has a question. Only the replaced positions are sent back to the model:
# regenerating all questions on each pass would repeat every request and could
# overwrite questions that were already fine.
while no_q_ind:
  random_doc = random.sample(data_texts, len(no_q_ind))

  for i, replacement in zip(no_q_ind, random_doc):
    random_1000[i] = replacement
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "developer", "content": "You are a question generator. I am going to provide you documents, and you need to generate one and only one question based on what I was provided. Make sure the question can be answered based on the document. You cannot generate question that cannot be answered by what I provided. Questions can be but not limited to Yes or no, or open answer question. If you cannot generate question based on what I provided, say 'no question'."},
            {"role": "user", "content": replacement}
        ]
    )
    questions[i] = response.choices[0].message.content

  # Re-check every position; the comparison ignores surrounding whitespace, a trailing
  # period and letter case, and treats a missing reply (None) as 'no question'.
  no_q_ind = [
      i for i, q in enumerate(questions)
      if q is None or q.strip().rstrip(".").strip().lower() == "no question"
  ]

In [None]:
# Persist the final documents, one per line; line i pairs with line i of the questions file.
with open("drive/MyDrive/dataset/filtered_docs.txt", 'w') as f:
  f.writelines(doc + "\n" for doc in random_1000)

In [None]:
# Persist the generated questions, one per line, in the same order as the documents.
with open("drive/MyDrive/dataset/filtered_questions.txt", 'w') as f:
  f.writelines(question + "\n" for question in questions)

## Import Data

In [None]:
# Ground-truth document per question: one document per line, surrounding whitespace removed.
with open("drive/MyDrive/dataset/filtered_docs.txt", "r") as f:
  truth_docs = [line.strip() for line in f]

In [None]:
# Questions, one per line, surrounding whitespace removed; index i pairs with truth_docs[i].
with open("drive/MyDrive/dataset/filtered_questions.txt", 'r') as f:
  total_questions = [line.strip() for line in f]

In [None]:
# Rebuild the search corpus: the pickled document texts followed by the pickled query texts.
data_texts = pd.read_pickle('drive/MyDrive/dataset/data_texts.pkl')
query_texts = pd.read_pickle('drive/MyDrive/dataset/query_texts.pkl')
data_texts.extend(query_texts)

In [None]:
# Pre-computed question embeddings (sliced column-wise in the benchmarks below).
# NOTE(review): unpickling executes code from the file - only load files you created yourself.
query_model = pd.read_pickle('drive/MyDrive/dataset/openai_questions_3072.pkl')

In [None]:
# Pre-computed document embeddings written by the "Embedding" cell (sliced column-wise below).
# NOTE(review): unpickling executes code from the file - only load files you created yourself.
model_openai_3072 = pd.read_pickle("drive/MyDrive/dataset/openai_3072.pkl")

## KNN + Truncation

In [None]:
# Embedding widths to benchmark: the three largest sizes, then every multiple of 8
# from 512 down to 32, and finally 16.
openai_truncate_dims = [3072, 2048, 1024, *range(512, 31, -8), 16]

# Filled by the benchmark loop: one median time and one accuracy per width.
openai_knn_times, openai_knn_accuracies = [], []

In [None]:
# For every width: run the brute-force KNN search 10 times on the leading `dim`
# columns, record the median wall-clock time, and measure accuracy once (first run).
for dim in openai_truncate_dims:
  # The full width uses the arrays as loaded; narrower widths keep the leading columns.
  if dim == 3072:
    t_model, qt_model = model_openai_3072, query_model
  else:
    t_model, qt_model = model_openai_3072[:, :dim], query_model[:, :dim]

  inner_times = []
  for i in range(10):
    openai_truncate_knn = Rag(search_algorithm = "knn", embedding_model = "text-embedding-3-large", documents = data_texts, model=t_model, query_model=qt_model, questions=total_questions)
    openai_truncate_knn.set_dimension(dim)

    # Only the search itself is timed.
    time_start = time.perf_counter()
    openai_truncate_knn.build_graph()
    time_taken = time.perf_counter() - time_start
    inner_times.append(time_taken)

    if i == 0:
      # Accuracy: share of questions whose retrieved document equals the ground truth.
      retrieved = openai_truncate_knn.ann_documents
      matches = sum(1 for k in range(len(retrieved)) if retrieved[k] == truth_docs[k])
      accuracy = matches/len(retrieved)
      openai_knn_accuracies.append(accuracy)

  median_time = np.median(inner_times)
  openai_knn_times.append(median_time)
  print(f"Dimension: {dim}, Accuracy: {accuracy}, Time: {median_time}")

# Printed in assignment form so the results can be pasted back into a cell.
print("\n\n")
print(f"openai_knn_times = {openai_knn_times}")
print(f"openai_knn_accuracies = {openai_knn_accuracies}")

Dimension: 3072, Accuracy: 0.9445344129554656, Time: 80.30796138800008
Dimension: 2048, Accuracy: 0.9417004048582996, Time: 59.85726520599974
Dimension: 1024, Accuracy: 0.9384615384615385, Time: 30.848723479
Dimension: 512, Accuracy: 0.9340080971659919, Time: 16.823100542500015
Dimension: 504, Accuracy: 0.934412955465587, Time: 16.382559360000187
Dimension: 496, Accuracy: 0.9336032388663967, Time: 16.378384182499985
Dimension: 488, Accuracy: 0.9352226720647774, Time: 16.152041923500065
Dimension: 480, Accuracy: 0.9356275303643725, Time: 15.839321727999959
Dimension: 472, Accuracy: 0.9323886639676113, Time: 15.562328585499927
Dimension: 464, Accuracy: 0.9323886639676113, Time: 15.397715194500051
Dimension: 456, Accuracy: 0.9327935222672065, Time: 15.034426902999712
Dimension: 448, Accuracy: 0.9323886639676113, Time: 14.966193356499844
Dimension: 440, Accuracy: 0.9336032388663967, Time: 14.80337167000016
Dimension: 432, Accuracy: 0.9315789473684211, Time: 14.512032969500524
Dimension: 42

## KNN Data

In [None]:
# Results of the KNN truncation benchmark, pasted from that cell's printed output so the
# analysis below can run without repeating the benchmark. Entry i of each list belongs to
# openai_truncate_dims[i]: times are median seconds, accuracies are fractions in [0, 1].
openai_knn_times = [np.float64(80.30796138800008), np.float64(59.85726520599974), np.float64(30.848723479), np.float64(16.823100542500015), np.float64(16.382559360000187), np.float64(16.378384182499985), np.float64(16.152041923500065), np.float64(15.839321727999959), np.float64(15.562328585499927), np.float64(15.397715194500051), np.float64(15.034426902999712), np.float64(14.966193356499844), np.float64(14.80337167000016), np.float64(14.512032969500524), np.float64(14.364354464500138), np.float64(14.180842715999916), np.float64(13.95462186550003), np.float64(13.740425381000023), np.float64(13.579998712999895), np.float64(13.135071033500026), np.float64(13.004008940000404), np.float64(12.774348460500278), np.float64(12.666377661500064), np.float64(12.486957361499663), np.float64(12.224990507000257), np.float64(11.902284980999866), np.float64(11.85173079849983), np.float64(11.539623063499675), np.float64(11.495052441000098), np.float64(11.087097823000022), np.float64(10.880427696500192), np.float64(10.60308332049999), np.float64(10.5226856889999), np.float64(10.270327214499957), np.float64(10.009250661000351), np.float64(9.681865367999308), np.float64(9.463787915000012), np.float64(9.203758073500012), np.float64(9.041721822), np.float64(8.936536589500065), np.float64(8.689080166000167), np.float64(8.46391806299971), np.float64(8.149454071499349), np.float64(7.9914161019996754), np.float64(7.712699044499459), np.float64(7.463763314500284), np.float64(7.177186990499649), np.float64(6.926390914500189), np.float64(6.714913484498538), np.float64(6.491792287500175), np.float64(6.279732289999629), np.float64(6.117997556000773), np.float64(5.814920502500172), np.float64(5.6076803494997876), np.float64(5.359256483000536), np.float64(5.171743945999879), np.float64(4.985221910499604), np.float64(4.722418594999908), np.float64(4.449626024001191), np.float64(4.156216848500662), np.float64(3.921978135500467), np.float64(3.689453785999831), np.float64(3.4487271829993915), 
np.float64(3.29000839299988), np.float64(2.8166315785001643)]
openai_knn_accuracies = [0.9445344129554656, 0.9417004048582996, 0.9384615384615385, 0.9340080971659919, 0.934412955465587, 0.9336032388663967, 0.9352226720647774, 0.9356275303643725, 0.9323886639676113, 0.9323886639676113, 0.9327935222672065, 0.9323886639676113, 0.9336032388663967, 0.9315789473684211, 0.9331983805668016, 0.9303643724696357, 0.9327935222672065, 0.9340080971659919, 0.9323886639676113, 0.9323886639676113, 0.9323886639676113, 0.9315789473684211, 0.9295546558704454, 0.9299595141700405, 0.9311740890688259, 0.9283400809716599, 0.9295546558704454, 0.9295546558704454, 0.9279352226720647, 0.9275303643724696, 0.9263157894736842, 0.9267206477732793, 0.9251012145748988, 0.9234817813765183, 0.9214574898785425, 0.9202429149797571, 0.9186234817813765, 0.9190283400809717, 0.9198380566801619, 0.917004048582996, 0.9121457489878543, 0.9101214574898785, 0.908502024291498, 0.9040485829959514, 0.9032388663967611, 0.8975708502024291, 0.9, 0.8955465587044534, 0.8959514170040486, 0.8939271255060729, 0.8854251012145749, 0.8817813765182186, 0.8704453441295547, 0.8591093117408907, 0.848582995951417, 0.8364372469635628, 0.8149797570850202, 0.7882591093117409, 0.7534412955465587, 0.7072874493927126, 0.6680161943319838, 0.5785425101214575, 0.454251012145749, 0.2935222672064777, 0.03319838056680162]

In [None]:
# Convert accuracies to percentages and round both series to 4 decimals, updating the
# existing list objects in place.
# NOTE: not idempotent - running this cell twice multiplies the accuracies by 100 again;
# re-run the "KNN Data" cell first to reset them.
openai_knn_accuracies[:] = [round(acc * 100, 4) for acc in openai_knn_accuracies]
openai_knn_times[:] = [round(seconds, 4) for seconds in openai_knn_times]

In [None]:
# Display the rounded accuracies (percent), one entry per truncation width.
openai_knn_accuracies

[94.4534,
 94.17,
 93.8462,
 93.4008,
 93.4413,
 93.3603,
 93.5223,
 93.5628,
 93.2389,
 93.2389,
 93.2794,
 93.2389,
 93.3603,
 93.1579,
 93.3198,
 93.0364,
 93.2794,
 93.4008,
 93.2389,
 93.2389,
 93.2389,
 93.1579,
 92.9555,
 92.996,
 93.1174,
 92.834,
 92.9555,
 92.9555,
 92.7935,
 92.753,
 92.6316,
 92.6721,
 92.5101,
 92.3482,
 92.1457,
 92.0243,
 91.8623,
 91.9028,
 91.9838,
 91.7004,
 91.2146,
 91.0121,
 90.8502,
 90.4049,
 90.3239,
 89.7571,
 90.0,
 89.5547,
 89.5951,
 89.3927,
 88.5425,
 88.1781,
 87.0445,
 85.9109,
 84.8583,
 83.6437,
 81.498,
 78.8259,
 75.3441,
 70.7287,
 66.8016,
 57.8543,
 45.4251,
 29.3522,
 3.3198]

In [None]:
# Display the rounded median search times (seconds), one entry per truncation width.
openai_knn_times

[np.float64(80.308),
 np.float64(59.8573),
 np.float64(30.8487),
 np.float64(16.8231),
 np.float64(16.3826),
 np.float64(16.3784),
 np.float64(16.152),
 np.float64(15.8393),
 np.float64(15.5623),
 np.float64(15.3977),
 np.float64(15.0344),
 np.float64(14.9662),
 np.float64(14.8034),
 np.float64(14.512),
 np.float64(14.3644),
 np.float64(14.1808),
 np.float64(13.9546),
 np.float64(13.7404),
 np.float64(13.58),
 np.float64(13.1351),
 np.float64(13.004),
 np.float64(12.7743),
 np.float64(12.6664),
 np.float64(12.487),
 np.float64(12.225),
 np.float64(11.9023),
 np.float64(11.8517),
 np.float64(11.5396),
 np.float64(11.4951),
 np.float64(11.0871),
 np.float64(10.8804),
 np.float64(10.6031),
 np.float64(10.5227),
 np.float64(10.2703),
 np.float64(10.0093),
 np.float64(9.6819),
 np.float64(9.4638),
 np.float64(9.2038),
 np.float64(9.0417),
 np.float64(8.9365),
 np.float64(8.6891),
 np.float64(8.4639),
 np.float64(8.1495),
 np.float64(7.9914),
 np.float64(7.7127),
 np.float64(7.4638),
 np.flo

In [None]:
# Column name -> values; the insertion order fixes the column order of the table.
data = dict([
    ('Dimension', openai_truncate_dims),
    ('Accuracy (%)', openai_knn_accuracies),
    ('Time (s)', openai_knn_times),
])

# One row per truncation width.
df = pd.DataFrame(data=data)
df


Unnamed: 0,Dimension,Accuracy (%),Time (s)
0,3072,94.4534,80.3080
1,2048,94.1700,59.8573
2,1024,93.8462,30.8487
3,512,93.4008,16.8231
4,504,93.4413,16.3826
...,...,...,...
60,56,66.8016,3.9220
61,48,57.8543,3.6895
62,40,45.4251,3.4487
63,32,29.3522,3.2900


In [None]:
# Export the results table to Drive as an Excel sheet, without the row index column.
df.to_excel("drive/MyDrive/dataset/regular.xlsx", index=False)

## Points

In [None]:
def progressive_knn_sklearn(
        model: np.ndarray,
        query_model: np.ndarray,
        documents,
        truth_docs,
        algorithm = "brute",
        start_dim: int      = 64,
        start_k: int        = 1000,
        max_dim: int        = 512,
        step_factor: int    = 2,      # dim *= step_factor each loop
        step_k: int         = 2,
        verbose: bool       = True):
    """Coarse-to-fine nearest-neighbour search over growing prefixes of the embeddings.

    The search starts on the first ``start_dim`` columns and keeps the ``start_k``
    nearest rows of every query. The union of those rows is then re-searched on a
    prefix ``step_factor`` times wider while the per-query pool shrinks by
    ``step_k``; this repeats while the prefix is narrower than ``max_dim``. A final
    1-NN search on the first ``max_dim`` columns of the surviving rows picks the
    answer for each query.

    Args:
        model: document embeddings, one row per entry of ``documents``.
        query_model: query embeddings, one row per entry of ``truth_docs``.
        documents: texts indexed by the row numbers of ``model``.
        truth_docs: expected document for each query row.
        algorithm: ``algorithm`` argument passed to ``NearestNeighbors``.
        start_dim: prefix width of the initial search.
        start_k: neighbours kept per query in the initial search.
        max_dim: prefix width of the final search.
        step_factor: multiplier applied to the prefix width each round; must be > 1.
        step_k: divisor applied to the pool size each round; must be >= 1.
        verbose: print the candidate counts of every round.

    Returns:
        dict with ``accuracy_pct`` (percent, 4 decimals), ``final_pool`` (number of
        candidate rows in the final search) and ``t_total_s`` (summed search time in
        seconds, 2 decimals).

    Raises:
        ValueError: if ``step_factor``, ``step_k`` or ``start_k`` is out of range.
    """
    # A factor of 1 or less never widens the prefix, so the loop would never end.
    if step_factor <= 1:
        raise ValueError("step_factor must be greater than 1.")
    # step_k is used as a divisor.
    if step_k < 1:
        raise ValueError("step_k must be at least 1.")
    if start_k < 1:
        raise ValueError("start_k must be at least 1.")

    def build_index(mat):
        # n_neighbors here is only a default; every query below passes its own value.
        return NearestNeighbors(n_neighbors=1, algorithm=algorithm, p=2).fit(mat)

    # --------------------------------------------------------
    # 1) initial global search on first start_dim dimensions
    t0 = time.perf_counter()
    index0 = build_index(model[:, :start_dim])
    # kneighbors rejects n_neighbors larger than the number of indexed rows.
    I = index0.kneighbors(query_model[:, :start_dim],
                          n_neighbors=min(start_k, len(model)),
                          return_distance=False)
    cand_set = np.unique(I.ravel())
    t_init = time.perf_counter() - t0
    if verbose:
        print(f"[init 0:{start_dim}] unique rows = {cand_set.size:,}")

    dim = start_dim
    k = start_k
    t_slices = 0.0

    # ----------------------------------------
    # 2) intermediate rounds on prefixes strictly narrower than max_dim
    while dim < max_dim:
        dim = min(dim * step_factor, max_dim)
        if dim >= max_dim:
            break

        k = max(1, k // step_k)          # shrink pool size

        if verbose:
            print(f"[slice 0:{dim}] k={k}")

        t0 = time.perf_counter()
        # build index on current candidate rows (rows and columns selected in one step)
        idx_local = build_index(model[cand_set, :dim])

        # query all questions at once; the pool cannot exceed the candidate count
        Iq = idx_local.kneighbors(query_model[:, :dim],
                                  n_neighbors=min(k, cand_set.size),
                                  return_distance=False)

        # map local -> global row IDs and keep their union for the next round
        cand_set = np.unique(cand_set[Iq].ravel())
        t_slices += time.perf_counter() - t0

        if verbose:
            print(f"           candidates → {cand_set.size:,}")

    # ------------------------------------------------
    # 3) final 1-NN on remaining rows (0:max_dim)
    t0 = time.perf_counter()
    final_idx = build_index(model[cand_set, :max_dim])
    I_final = final_idx.kneighbors(query_model[:, :max_dim], n_neighbors=1, return_distance=False)
    t_final = time.perf_counter() - t0
    row_ids = cand_set[I_final.ravel()]

    # 4) evaluate
    acc_pct = round(100 * evaluate_exact(row_ids, documents, truth_docs), 4)
    return {
        "accuracy_pct": acc_pct,
        "final_pool"  : int(cand_set.size),
        "t_total_s": round(t_init+t_slices+t_final, 2)
    }

def evaluate_exact(indices, documents, truth_docs):
  """Return the fraction of queries whose retrieved document equals the expected one.

  Args:
    indices: retrieved row number for each query (any array-like; it is flattened).
    documents: texts addressed by the values in ``indices``.
    truth_docs: expected document for each query, in the same order as ``indices``.

  Returns:
    Matches divided by the number of queries, or 0.0 when there are no queries.
  """
  row_ids = np.asarray(indices).ravel()
  # No queries: report 0.0 rather than dividing by zero.
  if row_ids.size == 0:
    return 0.0

  matches = sum(
      1 for position, row_id in enumerate(row_ids)
      if documents[int(row_id)] == truth_docs[position]
  )
  return matches/row_ids.size

### Step K = 2

In [None]:
# Parameter grid for the progressive search with the pool halved each round (step_k = 2).
min_ds = [64, 128, 256, 512, 1024, 2048]
max_ds = [128, 256, 512, 1024, 2048, 3072]
start_ks = [1024, 512, 256, 128, 64, 32, 16, 8, 4]

# One entry per (start k, min dimension, max dimension) combination, in loop order.
times = []
accuracies = []

for k in start_ks:
  for min_d in min_ds:
    # Only ranges that actually grow (min < max) are benchmarked.
    for max_d in (d for d in max_ds if d > min_d):
      inner_times = []
      inner_accuracies = 0
      # 10 repetitions; the median time is recorded.
      for i in range(10):
        res = progressive_knn_sklearn(
            model=model_openai_3072, query_model=query_model,
            documents=data_texts, truth_docs=truth_docs,
            start_dim=min_d, start_k=k, max_dim=max_d,
            step_factor=2, step_k=2, verbose=False,
        )
        inner_times.append(res["t_total_s"])

      median_time = np.median(inner_times)
      # Accuracy is taken from the last repetition.
      accuracy = res["accuracy_pct"]
      times.append(median_time)
      accuracies.append(accuracy)
      print(f"Min Dimension: {min_d}, Max Dimension: {max_d}, Start k: {k}, accuracy: {accuracy}, Time: {median_time}")

print("\n\n")
print(f"times = {times}")
print(f"accuracies = {accuracies}")

Min Dimension: 64, Max Dimension: 128, Start k: 256, accuracy: 87.8543, Time: 9.84
Min Dimension: 64, Max Dimension: 256, Start k: 256, accuracy: 91.2551, Time: 13.445
Min Dimension: 64, Max Dimension: 512, Start k: 256, accuracy: 92.4291, Time: 16.19
Min Dimension: 64, Max Dimension: 1024, Start k: 256, accuracy: 92.915, Time: 18.825000000000003
Min Dimension: 64, Max Dimension: 2048, Start k: 256, accuracy: 93.1174, Time: 21.07
Min Dimension: 64, Max Dimension: 3072, Start k: 256, accuracy: 93.3603, Time: 23.085
Min Dimension: 128, Max Dimension: 256, Start k: 256, accuracy: 92.0243, Time: 13.8
Min Dimension: 128, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 19.615000000000002
Min Dimension: 128, Max Dimension: 1024, Start k: 256, accuracy: 93.8057, Time: 24.520000000000003
Min Dimension: 128, Max Dimension: 2048, Start k: 256, accuracy: 94.1296, Time: 29.66
Min Dimension: 128, Max Dimension: 3072, Start k: 256, accuracy: 94.413, Time: 32.17
Min Dimension: 256, Max Dime

### Step K = 4

In [None]:
# Parameter grid for the progressive search with the pool divided by 4 each round.
min_ds = [64, 128, 256, 512, 1024, 2048]
max_ds = [128, 256, 512, 1024, 2048, 3072]
start_ks = [128, 64, 32, 16, 8, 4]

# One entry per (start k, min dimension, max dimension) combination, in loop order.
sk4_times = []
sk4_accuracies = []

for k in start_ks:
  for min_d in min_ds:
    for max_d in max_ds:
      # Only ranges that actually grow (min < max) are benchmarked.
      if min_d >= max_d:
        continue
      inner_times = []
      # 10 repetitions; the median time is recorded.
      for _ in range(10):
        res = progressive_knn_sklearn(
                  model       = model_openai_3072,
                  query_model = query_model,
                  documents   = data_texts,
                  truth_docs  = truth_docs,
                  start_dim   = min_d,
                  start_k     = k,
                  max_dim     = max_d,
                  step_factor = 2,
                  step_k      = 4,
                  verbose     = False,
        )
        inner_times.append(res["t_total_s"])
      median_time = np.median(inner_times)
      sk4_times.append(median_time)
      # Accuracy is taken from the last repetition.
      accuracy = res["accuracy_pct"]
      sk4_accuracies.append(accuracy)
      print(f"Min Dimension: {min_d}, Max Dimension: {max_d}, Start k: {k}, accuracy: {accuracy}, Time: {median_time}")

# Printed in assignment form under the variables' real names, so pasting the output
# back into a cell does not overwrite the step_k = 2 results.
print("\n\n")
print(f"sk4_times = {sk4_times}")
print(f"sk4_accuracies = {sk4_accuracies}")

Min Dimension: 64, Max Dimension: 128, Start k: 128, accuracy: 87.4494, Time: 7.5649999999999995
Min Dimension: 64, Max Dimension: 256, Start k: 128, accuracy: 90.6883, Time: 8.684999999999999
Min Dimension: 64, Max Dimension: 512, Start k: 128, accuracy: 91.8219, Time: 9.265
Min Dimension: 64, Max Dimension: 1024, Start k: 128, accuracy: 92.1457, Time: 9.355
Min Dimension: 64, Max Dimension: 2048, Start k: 128, accuracy: 92.1457, Time: 9.585
Min Dimension: 64, Max Dimension: 3072, Start k: 128, accuracy: 92.1457, Time: 9.879999999999999
Min Dimension: 128, Max Dimension: 256, Start k: 128, accuracy: 92.0243, Time: 10.469999999999999
Min Dimension: 128, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 12.2
Min Dimension: 128, Max Dimension: 1024, Start k: 128, accuracy: 93.8057, Time: 13.085
Min Dimension: 128, Max Dimension: 2048, Start k: 128, accuracy: 94.0081, Time: 13.379999999999999
Min Dimension: 128, Max Dimension: 3072, Start k: 128, accuracy: 94.0081, Time: 13.97
Mi

### Step K = 8

In [None]:
# Parameter grid for the progressive search with the pool divided by 8 each round.
min_ds = [64, 128, 256, 512, 1024, 2048]
max_ds = [128, 256, 512, 1024, 2048, 3072]
start_ks = [4]

# One entry per (start k, min dimension, max dimension) combination, in loop order.
sk8_times = []
sk8_accuracies = []

for k in start_ks:
  for min_d in min_ds:
    for max_d in max_ds:
      # Only ranges that actually grow (min < max) are benchmarked.
      if min_d >= max_d:
        continue
      inner_times = []
      # 10 repetitions; the median time is recorded.
      for _ in range(10):
        res = progressive_knn_sklearn(
                  model       = model_openai_3072,
                  query_model = query_model,
                  documents   = data_texts,
                  truth_docs  = truth_docs,
                  start_dim   = min_d,
                  start_k     = k,
                  max_dim     = max_d,
                  step_factor = 2,
                  step_k      = 8,
                  verbose     = False,
        )
        inner_times.append(res["t_total_s"])
      median_time = np.median(inner_times)
      sk8_times.append(median_time)
      # Accuracy is taken from the last repetition.
      accuracy = res["accuracy_pct"]
      sk8_accuracies.append(accuracy)
      print(f"Min Dimension: {min_d}, Max Dimension: {max_d}, Start k: {k}, accuracy: {accuracy}, Time: {median_time}")

# Printed in assignment form under the variables' real names, so pasting the output
# back into a cell does not overwrite the step_k = 2 results.
print("\n\n")
print(f"sk8_times = {sk8_times}")
print(f"sk8_accuracies = {sk8_accuracies}")

Min Dimension: 64, Max Dimension: 128, Start k: 4, accuracy: 80.1619, Time: 4.525
Min Dimension: 64, Max Dimension: 256, Start k: 4, accuracy: 80.2024, Time: 4.4350000000000005
Min Dimension: 64, Max Dimension: 512, Start k: 4, accuracy: 80.2024, Time: 4.475
Min Dimension: 64, Max Dimension: 1024, Start k: 4, accuracy: 80.2024, Time: 4.625
Min Dimension: 64, Max Dimension: 2048, Start k: 4, accuracy: 80.2024, Time: 4.82
Min Dimension: 64, Max Dimension: 3072, Start k: 4, accuracy: 80.2024, Time: 5.13
Min Dimension: 128, Max Dimension: 256, Start k: 4, accuracy: 91.336, Time: 6.48
Min Dimension: 128, Max Dimension: 512, Start k: 4, accuracy: 91.336, Time: 6.529999999999999
Min Dimension: 128, Max Dimension: 1024, Start k: 4, accuracy: 91.336, Time: 6.6
Min Dimension: 128, Max Dimension: 2048, Start k: 4, accuracy: 91.336, Time: 6.875
Min Dimension: 128, Max Dimension: 3072, Start k: 4, accuracy: 91.336, Time: 7.13
Min Dimension: 256, Max Dimension: 512, Start k: 4, accuracy: 93.1984, Ti

## Data Extraction

In [None]:
long_text = """
Min Dimension: 64, Max Dimension: 128, Start k: 1024, accuracy: 88.1377, Time: 16.115
Min Dimension: 64, Max Dimension: 256, Start k: 1024, accuracy: 91.9433, Time: 25.33
Min Dimension: 64, Max Dimension: 512, Start k: 1024, accuracy: 93.2794, Time: 34.345
Min Dimension: 64, Max Dimension: 1024, Start k: 1024, accuracy: 93.7247, Time: 42.735
Min Dimension: 64, Max Dimension: 2048, Start k: 1024, accuracy: 94.0486, Time: 51.845
Min Dimension: 64, Max Dimension: 3072, Start k: 1024, accuracy: 94.332, Time: 58.685
Min Dimension: 128, Max Dimension: 256, Start k: 1024, accuracy: 92.0243, Time: 21.58
Min Dimension: 128, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 36.965
Min Dimension: 128, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 51.415
Min Dimension: 128, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 68.00999999999999
Min Dimension: 128, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 80.2
Min Dimension: 256, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 31.855
Min Dimension: 256, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 56.69
Min Dimension: 256, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 85.38
Min Dimension: 256, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 110.28
Min Dimension: 512, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 53.32
Min Dimension: 512, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 96.87
Min Dimension: 512, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 135.19
Min Dimension: 1024, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 91.67
Min Dimension: 1024, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 152.32
Min Dimension: 2048, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 139.3
Min Dimension: 64, Max Dimension: 128, Start k: 512, accuracy: 88.0972, Time: 12.495000000000001
Min Dimension: 64, Max Dimension: 256, Start k: 512, accuracy: 91.6599, Time: 18.325
Min Dimension: 64, Max Dimension: 512, Start k: 512, accuracy: 92.996, Time: 23.4
Min Dimension: 64, Max Dimension: 1024, Start k: 512, accuracy: 93.4413, Time: 28.305
Min Dimension: 64, Max Dimension: 2048, Start k: 512, accuracy: 93.7247, Time: 32.985
Min Dimension: 64, Max Dimension: 3072, Start k: 512, accuracy: 93.9676, Time: 36.28
Min Dimension: 128, Max Dimension: 256, Start k: 512, accuracy: 92.0243, Time: 16.97
Min Dimension: 128, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 26.58
Min Dimension: 128, Max Dimension: 1024, Start k: 512, accuracy: 93.8057, Time: 35.545
Min Dimension: 128, Max Dimension: 2048, Start k: 512, accuracy: 94.1296, Time: 44.655
Min Dimension: 128, Max Dimension: 3072, Start k: 512, accuracy: 94.413, Time: 51.68
Min Dimension: 256, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 26.5
Min Dimension: 256, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 43.459999999999994
Min Dimension: 256, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 60.275
Min Dimension: 256, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 71.46000000000001
Min Dimension: 512, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 43.165
Min Dimension: 512, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 72.41
Min Dimension: 512, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 96.61
Min Dimension: 1024, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 78.44
Min Dimension: 1024, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 116.235
Min Dimension: 2048, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 120.41
Min Dimension: 64, Max Dimension: 128, Start k: 256, accuracy: 87.8543, Time: 9.84
Min Dimension: 64, Max Dimension: 256, Start k: 256, accuracy: 91.2551, Time: 13.445
Min Dimension: 64, Max Dimension: 512, Start k: 256, accuracy: 92.4291, Time: 16.19
Min Dimension: 64, Max Dimension: 1024, Start k: 256, accuracy: 92.915, Time: 18.825000000000003
Min Dimension: 64, Max Dimension: 2048, Start k: 256, accuracy: 93.1174, Time: 21.07
Min Dimension: 64, Max Dimension: 3072, Start k: 256, accuracy: 93.3603, Time: 23.085
Min Dimension: 128, Max Dimension: 256, Start k: 256, accuracy: 92.0243, Time: 13.8
Min Dimension: 128, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 19.615000000000002
Min Dimension: 128, Max Dimension: 1024, Start k: 256, accuracy: 93.8057, Time: 24.520000000000003
Min Dimension: 128, Max Dimension: 2048, Start k: 256, accuracy: 94.1296, Time: 29.66
Min Dimension: 128, Max Dimension: 3072, Start k: 256, accuracy: 94.413, Time: 32.17
Min Dimension: 256, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 20.305
Min Dimension: 256, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 29.655
Min Dimension: 256, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 38.95
Min Dimension: 256, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 46.03
Min Dimension: 512, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 35.05
Min Dimension: 512, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 52.58
Min Dimension: 512, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 65.49000000000001
Min Dimension: 1024, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 61.83
Min Dimension: 1024, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 83.625
Min Dimension: 2048, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 98.435
Min Dimension: 64, Max Dimension: 128, Start k: 128, accuracy: 87.4494, Time: 7.470000000000001
Min Dimension: 64, Max Dimension: 256, Start k: 128, accuracy: 90.6883, Time: 9.45
Min Dimension: 64, Max Dimension: 512, Start k: 128, accuracy: 91.9028, Time: 10.955
Min Dimension: 64, Max Dimension: 1024, Start k: 128, accuracy: 92.2672, Time: 12.370000000000001
Min Dimension: 64, Max Dimension: 2048, Start k: 128, accuracy: 92.5101, Time: 13.69
Min Dimension: 64, Max Dimension: 3072, Start k: 128, accuracy: 92.753, Time: 14.67
Min Dimension: 128, Max Dimension: 256, Start k: 128, accuracy: 92.0243, Time: 10.5
Min Dimension: 128, Max Dimension: 512, Start k: 128, accuracy: 93.4413, Time: 13.6
Min Dimension: 128, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 16.47
Min Dimension: 128, Max Dimension: 2048, Start k: 128, accuracy: 94.0891, Time: 18.950000000000003
Min Dimension: 128, Max Dimension: 3072, Start k: 128, accuracy: 94.332, Time: 21.265
Min Dimension: 256, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 16.64
Min Dimension: 256, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 21.98
Min Dimension: 256, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 27.015
Min Dimension: 256, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 29.775
Min Dimension: 512, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 27.035
Min Dimension: 512, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 36.675
Min Dimension: 512, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 44.1
Min Dimension: 1024, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 50.065
Min Dimension: 1024, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 62.66
Min Dimension: 2048, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 83.365
Min Dimension: 64, Max Dimension: 128, Start k: 64, accuracy: 87.004, Time: 6.1850000000000005
Min Dimension: 64, Max Dimension: 256, Start k: 64, accuracy: 89.8381, Time: 7.36
Min Dimension: 64, Max Dimension: 512, Start k: 64, accuracy: 90.8502, Time: 8.120000000000001
Min Dimension: 64, Max Dimension: 1024, Start k: 64, accuracy: 91.2146, Time: 8.94
Min Dimension: 64, Max Dimension: 2048, Start k: 64, accuracy: 91.498, Time: 9.715
Min Dimension: 64, Max Dimension: 3072, Start k: 64, accuracy: 91.6599, Time: 10.344999999999999
Min Dimension: 128, Max Dimension: 256, Start k: 64, accuracy: 91.9838, Time: 9.004999999999999
Min Dimension: 128, Max Dimension: 512, Start k: 64, accuracy: 93.2794, Time: 10.76
Min Dimension: 128, Max Dimension: 1024, Start k: 64, accuracy: 93.6842, Time: 11.785
Min Dimension: 128, Max Dimension: 2048, Start k: 64, accuracy: 93.9271, Time: 13.16
Min Dimension: 128, Max Dimension: 3072, Start k: 64, accuracy: 94.17, Time: 14.04
Min Dimension: 256, Max Dimension: 512, Start k: 64, accuracy: 93.4008, Time: 13.29
Min Dimension: 256, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 16.215
Min Dimension: 256, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 18.715
Min Dimension: 256, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 20.5
Min Dimension: 512, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 22.61
Min Dimension: 512, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 27.81
Min Dimension: 512, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 31.255
Min Dimension: 1024, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 40.480000000000004
Min Dimension: 1024, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 48.34
Min Dimension: 2048, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 74.445
Min Dimension: 64, Max Dimension: 128, Start k: 32, accuracy: 86.3968, Time: 5.4399999999999995
Min Dimension: 64, Max Dimension: 256, Start k: 32, accuracy: 89.0283, Time: 6.025
Min Dimension: 64, Max Dimension: 512, Start k: 32, accuracy: 89.8785, Time: 6.46
Min Dimension: 64, Max Dimension: 1024, Start k: 32, accuracy: 90.2024, Time: 6.67
Min Dimension: 64, Max Dimension: 2048, Start k: 32, accuracy: 90.4049, Time: 7.07
Min Dimension: 64, Max Dimension: 3072, Start k: 32, accuracy: 90.4049, Time: 7.3149999999999995
Min Dimension: 128, Max Dimension: 256, Start k: 32, accuracy: 91.9838, Time: 7.46
Min Dimension: 128, Max Dimension: 512, Start k: 32, accuracy: 93.2389, Time: 8.425
Min Dimension: 128, Max Dimension: 1024, Start k: 32, accuracy: 93.6437, Time: 9.2
Min Dimension: 128, Max Dimension: 2048, Start k: 32, accuracy: 93.8462, Time: 9.99
Min Dimension: 128, Max Dimension: 3072, Start k: 32, accuracy: 94.0486, Time: 10.524999999999999
Min Dimension: 256, Max Dimension: 512, Start k: 32, accuracy: 93.4008, Time: 11.985
Min Dimension: 256, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 13.525
Min Dimension: 256, Max Dimension: 2048, Start k: 32, accuracy: 94.1296, Time: 14.89
Min Dimension: 256, Max Dimension: 3072, Start k: 32, accuracy: 94.413, Time: 15.9
Min Dimension: 512, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 19.965
Min Dimension: 512, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 22.66
Min Dimension: 512, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 24.325
Min Dimension: 1024, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 36.175
Min Dimension: 1024, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 39.565
Min Dimension: 2048, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 66.34
Min Dimension: 64, Max Dimension: 128, Start k: 16, accuracy: 84.8178, Time: 4.725
Min Dimension: 64, Max Dimension: 256, Start k: 16, accuracy: 86.8826, Time: 5.145
Min Dimension: 64, Max Dimension: 512, Start k: 16, accuracy: 87.7328, Time: 5.415
Min Dimension: 64, Max Dimension: 1024, Start k: 16, accuracy: 87.9352, Time: 5.635
Min Dimension: 64, Max Dimension: 2048, Start k: 16, accuracy: 87.9352, Time: 5.795
Min Dimension: 64, Max Dimension: 3072, Start k: 16, accuracy: 87.9352, Time: 6.055
Min Dimension: 128, Max Dimension: 256, Start k: 16, accuracy: 91.9028, Time: 6.800000000000001
Min Dimension: 128, Max Dimension: 512, Start k: 16, accuracy: 92.996, Time: 7.3149999999999995
Min Dimension: 128, Max Dimension: 1024, Start k: 16, accuracy: 93.4413, Time: 7.705
Min Dimension: 128, Max Dimension: 2048, Start k: 16, accuracy: 93.6032, Time: 8.18
Min Dimension: 128, Max Dimension: 3072, Start k: 16, accuracy: 93.6032, Time: 8.425
Min Dimension: 256, Max Dimension: 512, Start k: 16, accuracy: 93.4008, Time: 10.735
Min Dimension: 256, Max Dimension: 1024, Start k: 16, accuracy: 93.8462, Time: 11.545
Min Dimension: 256, Max Dimension: 2048, Start k: 16, accuracy: 94.0891, Time: 12.114999999999998
Min Dimension: 256, Max Dimension: 3072, Start k: 16, accuracy: 94.3725, Time: 12.66
Min Dimension: 512, Max Dimension: 1024, Start k: 16, accuracy: 93.8462, Time: 19.06
Min Dimension: 512, Max Dimension: 2048, Start k: 16, accuracy: 94.1296, Time: 20.65
Min Dimension: 512, Max Dimension: 3072, Start k: 16, accuracy: 94.413, Time: 21.32
Min Dimension: 1024, Max Dimension: 2048, Start k: 16, accuracy: 94.17, Time: 34.625
Min Dimension: 1024, Max Dimension: 3072, Start k: 16, accuracy: 94.4534, Time: 36.540000000000006
Min Dimension: 2048, Max Dimension: 3072, Start k: 16, accuracy: 94.4534, Time: 64.85499999999999
Min Dimension: 64, Max Dimension: 128, Start k: 8, accuracy: 83.0364, Time: 4.539999999999999
Min Dimension: 64, Max Dimension: 256, Start k: 8, accuracy: 84.5749, Time: 4.7
Min Dimension: 64, Max Dimension: 512, Start k: 8, accuracy: 85.3036, Time: 4.8
Min Dimension: 64, Max Dimension: 1024, Start k: 8, accuracy: 85.3036, Time: 5.05
Min Dimension: 64, Max Dimension: 2048, Start k: 8, accuracy: 85.3036, Time: 5.295
Min Dimension: 64, Max Dimension: 3072, Start k: 8, accuracy: 85.3036, Time: 5.41
Min Dimension: 128, Max Dimension: 256, Start k: 8, accuracy: 91.7814, Time: 6.48
Min Dimension: 128, Max Dimension: 512, Start k: 8, accuracy: 92.7126, Time: 6.74
Min Dimension: 128, Max Dimension: 1024, Start k: 8, accuracy: 93.0364, Time: 6.9
Min Dimension: 128, Max Dimension: 2048, Start k: 8, accuracy: 93.0364, Time: 7.23
Min Dimension: 128, Max Dimension: 3072, Start k: 8, accuracy: 93.0364, Time: 7.445
Min Dimension: 256, Max Dimension: 512, Start k: 8, accuracy: 93.2794, Time: 10.215
Min Dimension: 256, Max Dimension: 1024, Start k: 8, accuracy: 93.6842, Time: 10.809999999999999
Min Dimension: 256, Max Dimension: 2048, Start k: 8, accuracy: 93.8866, Time: 11.195
Min Dimension: 256, Max Dimension: 3072, Start k: 8, accuracy: 93.8866, Time: 11.5
Min Dimension: 512, Max Dimension: 1024, Start k: 8, accuracy: 93.8462, Time: 18.174999999999997
Min Dimension: 512, Max Dimension: 2048, Start k: 8, accuracy: 94.1296, Time: 19.245
Min Dimension: 512, Max Dimension: 3072, Start k: 8, accuracy: 94.413, Time: 19.729999999999997
Min Dimension: 1024, Max Dimension: 2048, Start k: 8, accuracy: 94.17, Time: 33.17
Min Dimension: 1024, Max Dimension: 3072, Start k: 8, accuracy: 94.4534, Time: 34.295
Min Dimension: 2048, Max Dimension: 3072, Start k: 8, accuracy: 94.4534, Time: 61.370000000000005
Min Dimension: 64, Max Dimension: 128, Start k: 4, accuracy: 80.1619, Time: 4.3100000000000005
Min Dimension: 64, Max Dimension: 256, Start k: 4, accuracy: 81.0931, Time: 4.395
Min Dimension: 64, Max Dimension: 512, Start k: 4, accuracy: 81.0526, Time: 4.5
Min Dimension: 64, Max Dimension: 1024, Start k: 4, accuracy: 81.0526, Time: 4.615
Min Dimension: 64, Max Dimension: 2048, Start k: 4, accuracy: 81.0526, Time: 4.76
Min Dimension: 64, Max Dimension: 3072, Start k: 4, accuracy: 81.0526, Time: 5.0649999999999995
Min Dimension: 128, Max Dimension: 256, Start k: 4, accuracy: 91.336, Time: 6.26
Min Dimension: 128, Max Dimension: 512, Start k: 4, accuracy: 92.1053, Time: 6.38
Min Dimension: 128, Max Dimension: 1024, Start k: 4, accuracy: 92.1053, Time: 6.43
Min Dimension: 128, Max Dimension: 2048, Start k: 4, accuracy: 92.1053, Time: 6.710000000000001
Min Dimension: 128, Max Dimension: 3072, Start k: 4, accuracy: 92.1053, Time: 7.0
Min Dimension: 256, Max Dimension: 512, Start k: 4, accuracy: 93.1984, Time: 9.965
Min Dimension: 256, Max Dimension: 1024, Start k: 4, accuracy: 93.6032, Time: 10.175
Min Dimension: 256, Max Dimension: 2048, Start k: 4, accuracy: 93.6032, Time: 10.58
Min Dimension: 256, Max Dimension: 3072, Start k: 4, accuracy: 93.6032, Time: 10.97
Min Dimension: 512, Max Dimension: 1024, Start k: 4, accuracy: 93.8462, Time: 17.645
Min Dimension: 512, Max Dimension: 2048, Start k: 4, accuracy: 94.0891, Time: 18.240000000000002
Min Dimension: 512, Max Dimension: 3072, Start k: 4, accuracy: 94.0891, Time: 18.485
Min Dimension: 1024, Max Dimension: 2048, Start k: 4, accuracy: 94.17, Time: 32.364999999999995
Min Dimension: 1024, Max Dimension: 3072, Start k: 4, accuracy: 94.413, Time: 32.925
Min Dimension: 2048, Max Dimension: 3072, Start k: 4, accuracy: 94.4534, Time: 62.315
"""

In [None]:
def _parse_result_log(text):
  """Parse benchmark log text into five aligned column lists.

  Each record is expected to look like
  ``Min Dimension: 64, Max Dimension: 128, Start k: 1024, accuracy: 88.1377, Time: 16.115``.
  All five fields are captured by a single regex, so the returned lists
  always describe the same records in the same order. Scanning the text once
  per field could silently shift a column if one record were malformed.

  Args:
    text: Log text containing zero or more records.

  Returns:
    Tuple ``(min_dims, max_dims, start_k, accuracies, times)`` of equal-length
    lists. Dimensions and start k are ints; accuracies and times are floats
    rounded to 4 decimals.

  Raises:
    ValueError: If the text contains a record start (``Min Dimension:``) that
      does not match the full record pattern.
  """
  record_pattern = re.compile(
      r'Min Dimension: (\d+), Max Dimension: (\d+), Start k: (\d+), '
      r'accuracy: ([\d.]+), Time: ([\d.]+)'
  )
  records = record_pattern.findall(text)

  # Every record starts with "Min Dimension:", so a count mismatch means at
  # least one record is truncated or malformed.
  expected_records = text.count('Min Dimension:')
  if len(records) != expected_records:
    raise ValueError(
        f"Parsed {len(records)} complete records but found "
        f"{expected_records} record starts; the log contains malformed lines."
    )

  min_dims = [int(rec[0]) for rec in records]
  max_dims = [int(rec[1]) for rec in records]
  start_k = [int(rec[2]) for rec in records]
  # Rounding strips float noise present in the log (e.g. 68.00999999999999).
  accuracies = [round(float(rec[3]), 4) for rec in records]
  times = [round(float(rec[4]), 4) for rec in records]
  return min_dims, max_dims, start_k, accuracies, times


(new_method_min_dims,
 new_method_max_dims,
 new_method_start_k,
 new_method_accuracies,
 new_method_times) = _parse_result_log(long_text)

# Printed as Python literals so the values can be pasted into other cells.
print("new_method_accuracies =", new_method_accuracies)
print("new_method_times =", new_method_times)

new_method_accuracies = [88.1377, 91.9433, 93.2794, 93.7247, 94.0486, 94.332, 92.0243, 93.4008, 93.8462, 94.17, 94.4534, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 88.0972, 91.6599, 92.996, 93.4413, 93.7247, 93.9676, 92.0243, 93.4008, 93.8057, 94.1296, 94.413, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.8543, 91.2551, 92.4291, 92.915, 93.1174, 93.3603, 92.0243, 93.4008, 93.8057, 94.1296, 94.413, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.4494, 90.6883, 91.9028, 92.2672, 92.5101, 92.753, 92.0243, 93.4413, 93.8462, 94.0891, 94.332, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.004, 89.8381, 90.8502, 91.2146, 91.498, 91.6599, 91.9838, 93.2794, 93.6842, 93.9271, 94.17, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 86.3968, 89.0283, 89.8785, 90.2024, 90.4049, 90.4049, 91.9838, 93.2389, 9

In [None]:
# Pair each column label with its parsed list; dict order defines column order.
data = dict(zip(
    ('Min Dimension', 'Max Dimension', 'Start k', 'accuracy', 'Time'),
    (new_method_min_dims,
     new_method_max_dims,
     new_method_start_k,
     new_method_accuracies,
     new_method_times),
))

# One row per parsed configuration; the bare `df` renders the table.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Min Dimension,Max Dimension,Start k,accuracy,Time
0,64,128,1024,88.1377,16.115
1,64,256,1024,91.9433,25.330
2,64,512,1024,93.2794,34.345
3,64,1024,1024,93.7247,42.735
4,64,2048,1024,94.0486,51.845
...,...,...,...,...,...
184,512,2048,4,94.0891,18.240
185,512,3072,4,94.0891,18.485
186,1024,2048,4,94.1700,32.365
187,1024,3072,4,94.4130,32.925


In [None]:
# Save the DataFrame currently bound to `df` as an Excel file, omitting the
# row index. The path is relative to the current working directory.
df.to_excel(
    excel_writer="drive/MyDrive/dataset/progressive.xlsx",
    index=False,
)

In [None]:
# Acceptance thresholds keyed by Max Dimension.
#   min_accuracy / max_time: a configuration is rejected when its accuracy is
#     below min_accuracy or its time is above max_time.
#   fast_accuracy / fast_time (optional): a configuration that reaches
#     fast_accuracy within fast_time is accepted before the rule above is
#     evaluated.
# NOTE(review): the time ceilings (80.308, 59.8573, 30.8487, 16.8231) look
# like measured baseline timings for each dimension; confirm their source.
_ACCEPTANCE_RULES = {
    3072: {'min_accuracy': 94, 'max_time': 80.308,
           'fast_accuracy': 93.5, 'fast_time': 10},
    2048: {'min_accuracy': 94, 'max_time': 59.8573,
           'fast_accuracy': 93, 'fast_time': 10},
    1024: {'min_accuracy': 93, 'max_time': 30.8487},
    512: {'min_accuracy': 93, 'max_time': 16.8231},
}
# Applied to every other Max Dimension value.
_DEFAULT_ACCEPTANCE_RULE = {'min_accuracy': 93, 'max_time': 8}


def _is_valid_config(max_dim, accuracy, elapsed):
  """Return True when a configuration passes the rule for its Max Dimension.

  Args:
    max_dim: The configuration's Max Dimension; selects the rule.
    accuracy: Measured accuracy of the configuration.
    elapsed: Measured time of the configuration.

  Returns:
    True if the configuration is accepted, False otherwise.
  """
  rule = _ACCEPTANCE_RULES.get(max_dim, _DEFAULT_ACCEPTANCE_RULE)
  if ('fast_accuracy' in rule
      and accuracy >= rule['fast_accuracy']
      and elapsed <= rule['fast_time']):
    return True
  return not (accuracy < rule['min_accuracy'] or elapsed > rule['max_time'])


valid_accuracies, valid_times, valid_max_dims, valid_min_dims, valid_start_k = (
    [], [], [], [], [])
invalid_accuracies, invalid_times, invalid_max_dims, invalid_min_dims, invalid_start_k = (
    [], [], [], [], [])

# Column order here must match the field order of each record built below.
_valid_columns = (valid_accuracies, valid_times, valid_max_dims,
                  valid_min_dims, valid_start_k)
_invalid_columns = (invalid_accuracies, invalid_times, invalid_max_dims,
                    invalid_min_dims, invalid_start_k)

# Walk the parsed results in their original order so both partitions keep
# the ordering of the source log.
for _record in zip(new_method_accuracies, new_method_times, new_method_max_dims,
                   new_method_min_dims, new_method_start_k):
  _accuracy, _elapsed, _max_dim = _record[0], _record[1], _record[2]
  _target = (_valid_columns
             if _is_valid_config(_max_dim, _accuracy, _elapsed)
             else _invalid_columns)
  for _column, _value in zip(_target, _record):
    _column.append(_value)

In [None]:
# Pair each column label with the list of accepted configurations.
data = dict(zip(
    ('Min Dimension', 'Max Dimension', 'Start k', 'accuracy', 'Time'),
    (valid_min_dims,
     valid_max_dims,
     valid_start_k,
     valid_accuracies,
     valid_times),
))

# Rebinds `df` to the table of accepted configurations and displays it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Min Dimension,Max Dimension,Start k,accuracy,Time
0,64,2048,1024,94.0486,51.845
1,64,3072,1024,94.3320,58.685
2,128,3072,1024,94.4534,80.200
3,64,1024,512,93.4413,28.305
4,128,2048,512,94.1296,44.655
...,...,...,...,...,...
81,512,2048,4,94.0891,18.240
82,512,3072,4,94.0891,18.485
83,1024,2048,4,94.1700,32.365
84,1024,3072,4,94.4130,32.925


In [None]:
# Save the DataFrame currently bound to `df` as an Excel file, omitting the
# row index. The path is relative to the current working directory.
df.to_excel(
    excel_writer="drive/MyDrive/dataset/progressive_valid.xlsx",
    index=False,
)

In [None]:
# Pair each column label with the list of rejected configurations.
data = dict(zip(
    ('Min Dimension', 'Max Dimension', 'Start k', 'accuracy', 'Time'),
    (invalid_min_dims,
     invalid_max_dims,
     invalid_start_k,
     invalid_accuracies,
     invalid_times),
))

# Rebinds `df` to the table of rejected configurations and displays it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Min Dimension,Max Dimension,Start k,accuracy,Time
0,64,128,1024,88.1377,16.115
1,64,256,1024,91.9433,25.330
2,64,512,1024,93.2794,34.345
3,64,1024,1024,93.7247,42.735
4,128,256,1024,92.0243,21.580
...,...,...,...,...,...
98,128,1024,4,92.1053,6.430
99,128,2048,4,92.1053,6.710
100,128,3072,4,92.1053,7.000
101,256,2048,4,93.6032,10.580


In [None]:
# Save the DataFrame currently bound to `df` as an Excel file, omitting the
# row index. The path is relative to the current working directory.
df.to_excel(
    excel_writer="drive/MyDrive/dataset/progressive_invalid.xlsx",
    index=False,
)

In [None]:
sk4_long_text = """
Min Dimension: 64, Max Dimension: 128, Start k: 1024, accuracy: 88.1377, Time: 15.905000000000001
Min Dimension: 64, Max Dimension: 256, Start k: 1024, accuracy: 91.9433, Time: 22.32
Min Dimension: 64, Max Dimension: 512, Start k: 1024, accuracy: 93.2794, Time: 25.685
Min Dimension: 64, Max Dimension: 1024, Start k: 1024, accuracy: 93.7247, Time: 28.189999999999998
Min Dimension: 64, Max Dimension: 2048, Start k: 1024, accuracy: 94.0486, Time: 29.23
Min Dimension: 64, Max Dimension: 3072, Start k: 1024, accuracy: 94.0486, Time: 29.395000000000003
Min Dimension: 128, Max Dimension: 256, Start k: 1024, accuracy: 92.0243, Time: 21.155
Min Dimension: 128, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 31.380000000000003
Min Dimension: 128, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 36.425
Min Dimension: 128, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 39.285
Min Dimension: 128, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 40.205
Min Dimension: 256, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 31.455
Min Dimension: 256, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 48.155
Min Dimension: 256, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 57.815
Min Dimension: 256, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 61.465
Min Dimension: 512, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 51.504999999999995
Min Dimension: 512, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 80.805
Min Dimension: 512, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 93.84
Min Dimension: 1024, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 91.475
Min Dimension: 1024, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 131.77499999999998
Min Dimension: 2048, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 139.905
Min Dimension: 64, Max Dimension: 128, Start k: 512, accuracy: 88.0972, Time: 12.735
Min Dimension: 64, Max Dimension: 256, Start k: 512, accuracy: 91.6599, Time: 16.92
Min Dimension: 64, Max Dimension: 512, Start k: 512, accuracy: 92.996, Time: 18.87
Min Dimension: 64, Max Dimension: 1024, Start k: 512, accuracy: 93.4413, Time: 19.75
Min Dimension: 64, Max Dimension: 2048, Start k: 512, accuracy: 93.6437, Time: 19.965
Min Dimension: 64, Max Dimension: 3072, Start k: 512, accuracy: 93.6437, Time: 20.270000000000003
Min Dimension: 128, Max Dimension: 256, Start k: 512, accuracy: 92.0243, Time: 17.490000000000002
Min Dimension: 128, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 23.695
Min Dimension: 128, Max Dimension: 1024, Start k: 512, accuracy: 93.8057, Time: 27.134999999999998
Min Dimension: 128, Max Dimension: 2048, Start k: 512, accuracy: 94.1296, Time: 28.58
Min Dimension: 128, Max Dimension: 3072, Start k: 512, accuracy: 94.3725, Time: 28.66
Min Dimension: 256, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 25.740000000000002
Min Dimension: 256, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 35.66
Min Dimension: 256, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 41.474999999999994
Min Dimension: 256, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 42.69
Min Dimension: 512, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 42.955
Min Dimension: 512, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 61.980000000000004
Min Dimension: 512, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 67.68
Min Dimension: 1024, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 75.555
Min Dimension: 1024, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 98.60499999999999
Min Dimension: 2048, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 120.63
Min Dimension: 64, Max Dimension: 128, Start k: 256, accuracy: 87.8543, Time: 10.004999999999999
Min Dimension: 64, Max Dimension: 256, Start k: 256, accuracy: 91.2551, Time: 11.809999999999999
Min Dimension: 64, Max Dimension: 512, Start k: 256, accuracy: 92.3482, Time: 12.65
Min Dimension: 64, Max Dimension: 1024, Start k: 256, accuracy: 92.834, Time: 13.15
Min Dimension: 64, Max Dimension: 2048, Start k: 256, accuracy: 92.834, Time: 13.325
Min Dimension: 64, Max Dimension: 3072, Start k: 256, accuracy: 92.834, Time: 13.565000000000001
Min Dimension: 128, Max Dimension: 256, Start k: 256, accuracy: 92.0243, Time: 13.235
Min Dimension: 128, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 16.585
Min Dimension: 128, Max Dimension: 1024, Start k: 256, accuracy: 93.8057, Time: 18.21
Min Dimension: 128, Max Dimension: 2048, Start k: 256, accuracy: 94.1296, Time: 18.895
Min Dimension: 128, Max Dimension: 3072, Start k: 256, accuracy: 94.1296, Time: 19.215
Min Dimension: 256, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 20.465
Min Dimension: 256, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 25.795
Min Dimension: 256, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 28.36
Min Dimension: 256, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 30.11
Min Dimension: 512, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 35.165
Min Dimension: 512, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 45.115
Min Dimension: 512, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 48.474999999999994
Min Dimension: 1024, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 61.754999999999995
Min Dimension: 1024, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 73.69
Min Dimension: 2048, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 97.63
Min Dimension: 64, Max Dimension: 128, Start k: 128, accuracy: 87.4494, Time: 7.5649999999999995
Min Dimension: 64, Max Dimension: 256, Start k: 128, accuracy: 90.6883, Time: 8.684999999999999
Min Dimension: 64, Max Dimension: 512, Start k: 128, accuracy: 91.8219, Time: 9.265
Min Dimension: 64, Max Dimension: 1024, Start k: 128, accuracy: 92.1457, Time: 9.355
Min Dimension: 64, Max Dimension: 2048, Start k: 128, accuracy: 92.1457, Time: 9.585
Min Dimension: 64, Max Dimension: 3072, Start k: 128, accuracy: 92.1457, Time: 9.879999999999999
Min Dimension: 128, Max Dimension: 256, Start k: 128, accuracy: 92.0243, Time: 10.469999999999999
Min Dimension: 128, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 12.2
Min Dimension: 128, Max Dimension: 1024, Start k: 128, accuracy: 93.8057, Time: 13.085
Min Dimension: 128, Max Dimension: 2048, Start k: 128, accuracy: 94.0081, Time: 13.379999999999999
Min Dimension: 128, Max Dimension: 3072, Start k: 128, accuracy: 94.0081, Time: 13.97
Min Dimension: 256, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 16.23
Min Dimension: 256, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 19.23
Min Dimension: 256, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 21.245
Min Dimension: 256, Max Dimension: 3072, Start k: 128, accuracy: 94.413, Time: 21.44
Min Dimension: 512, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 27.5
Min Dimension: 512, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 32.575
Min Dimension: 512, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 34.67
Min Dimension: 1024, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 49.260000000000005
Min Dimension: 1024, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 55.974999999999994
Min Dimension: 2048, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 83.47999999999999
Min Dimension: 64, Max Dimension: 128, Start k: 64, accuracy: 87.004, Time: 6.12
Min Dimension: 64, Max Dimension: 256, Start k: 64, accuracy: 89.8381, Time: 6.715
Min Dimension: 64, Max Dimension: 512, Start k: 64, accuracy: 90.8097, Time: 7.015000000000001
Min Dimension: 64, Max Dimension: 1024, Start k: 64, accuracy: 90.8097, Time: 7.1
Min Dimension: 64, Max Dimension: 2048, Start k: 64, accuracy: 90.8097, Time: 7.3
Min Dimension: 64, Max Dimension: 3072, Start k: 64, accuracy: 90.8097, Time: 7.575
Min Dimension: 128, Max Dimension: 256, Start k: 64, accuracy: 91.9838, Time: 8.524999999999999
Min Dimension: 128, Max Dimension: 512, Start k: 64, accuracy: 93.2794, Time: 9.47
Min Dimension: 128, Max Dimension: 1024, Start k: 64, accuracy: 93.6842, Time: 9.96
Min Dimension: 128, Max Dimension: 2048, Start k: 64, accuracy: 93.6842, Time: 10.15
Min Dimension: 128, Max Dimension: 3072, Start k: 64, accuracy: 93.6842, Time: 10.375
Min Dimension: 256, Max Dimension: 512, Start k: 64, accuracy: 93.4008, Time: 13.225000000000001
Min Dimension: 256, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 14.805
Min Dimension: 256, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 15.53
Min Dimension: 256, Max Dimension: 3072, Start k: 64, accuracy: 94.17, Time: 15.785
Min Dimension: 512, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 22.565
Min Dimension: 512, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 25.22
Min Dimension: 512, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 26.16
Min Dimension: 1024, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 40.834999999999994
Min Dimension: 1024, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 44.285
Min Dimension: 2048, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 71.86
Min Dimension: 64, Max Dimension: 128, Start k: 32, accuracy: 86.3968, Time: 5.17
Min Dimension: 64, Max Dimension: 256, Start k: 32, accuracy: 89.0283, Time: 5.5
Min Dimension: 64, Max Dimension: 512, Start k: 32, accuracy: 89.8785, Time: 5.715
Min Dimension: 64, Max Dimension: 1024, Start k: 32, accuracy: 89.8785, Time: 5.824999999999999
Min Dimension: 64, Max Dimension: 2048, Start k: 32, accuracy: 89.8785, Time: 6.0649999999999995
Min Dimension: 64, Max Dimension: 3072, Start k: 32, accuracy: 89.8785, Time: 6.345000000000001
Min Dimension: 128, Max Dimension: 256, Start k: 32, accuracy: 91.9838, Time: 7.405
Min Dimension: 128, Max Dimension: 512, Start k: 32, accuracy: 93.1984, Time: 7.92
Min Dimension: 128, Max Dimension: 1024, Start k: 32, accuracy: 93.5628, Time: 8.155000000000001
Min Dimension: 128, Max Dimension: 2048, Start k: 32, accuracy: 93.5628, Time: 8.36
Min Dimension: 128, Max Dimension: 3072, Start k: 32, accuracy: 93.5628, Time: 8.645
Min Dimension: 256, Max Dimension: 512, Start k: 32, accuracy: 93.4008, Time: 11.625
Min Dimension: 256, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 12.35
Min Dimension: 256, Max Dimension: 2048, Start k: 32, accuracy: 94.0486, Time: 12.615
Min Dimension: 256, Max Dimension: 3072, Start k: 32, accuracy: 94.0486, Time: 13.02
Min Dimension: 512, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 20.405
Min Dimension: 512, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 21.845
Min Dimension: 512, Max Dimension: 3072, Start k: 32, accuracy: 94.413, Time: 22.345
Min Dimension: 1024, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 37.085
Min Dimension: 1024, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 37.795
Min Dimension: 2048, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 66.24000000000001
Min Dimension: 64, Max Dimension: 128, Start k: 16, accuracy: 84.8178, Time: 4.720000000000001
Min Dimension: 64, Max Dimension: 256, Start k: 16, accuracy: 86.8826, Time: 4.875
Min Dimension: 64, Max Dimension: 512, Start k: 16, accuracy: 86.8826, Time: 4.96
Min Dimension: 64, Max Dimension: 1024, Start k: 16, accuracy: 86.8826, Time: 5.09
Min Dimension: 64, Max Dimension: 2048, Start k: 16, accuracy: 86.8826, Time: 5.365
Min Dimension: 64, Max Dimension: 3072, Start k: 16, accuracy: 86.8826, Time: 5.609999999999999
Min Dimension: 128, Max Dimension: 256, Start k: 16, accuracy: 91.9028, Time: 6.805
Min Dimension: 128, Max Dimension: 512, Start k: 16, accuracy: 92.915, Time: 7.02
Min Dimension: 128, Max Dimension: 1024, Start k: 16, accuracy: 92.915, Time: 7.175
Min Dimension: 128, Max Dimension: 2048, Start k: 16, accuracy: 92.915, Time: 7.369999999999999
Min Dimension: 128, Max Dimension: 3072, Start k: 16, accuracy: 92.915, Time: 7.605
Min Dimension: 256, Max Dimension: 512, Start k: 16, accuracy: 93.4008, Time: 10.65
Min Dimension: 256, Max Dimension: 1024, Start k: 16, accuracy: 93.8462, Time: 11.215
Min Dimension: 256, Max Dimension: 2048, Start k: 16, accuracy: 93.8462, Time: 11.56
Min Dimension: 256, Max Dimension: 3072, Start k: 16, accuracy: 93.8462, Time: 11.805
Min Dimension: 512, Max Dimension: 1024, Start k: 16, accuracy: 93.8462, Time: 18.555
Min Dimension: 512, Max Dimension: 2048, Start k: 16, accuracy: 94.1296, Time: 19.405
Min Dimension: 512, Max Dimension: 3072, Start k: 16, accuracy: 94.1296, Time: 20.19
Min Dimension: 1024, Max Dimension: 2048, Start k: 16, accuracy: 94.17, Time: 34.455
Min Dimension: 1024, Max Dimension: 3072, Start k: 16, accuracy: 94.4534, Time: 35.425
Min Dimension: 2048, Max Dimension: 3072, Start k: 16, accuracy: 94.4534, Time: 65.38499999999999
Min Dimension: 64, Max Dimension: 128, Start k: 8, accuracy: 83.0364, Time: 4.51
Min Dimension: 64, Max Dimension: 256, Start k: 8, accuracy: 84.3725, Time: 4.595
Min Dimension: 64, Max Dimension: 512, Start k: 8, accuracy: 84.3725, Time: 4.76
Min Dimension: 64, Max Dimension: 1024, Start k: 8, accuracy: 84.3725, Time: 4.824999999999999
Min Dimension: 64, Max Dimension: 2048, Start k: 8, accuracy: 84.3725, Time: 5.1
Min Dimension: 64, Max Dimension: 3072, Start k: 8, accuracy: 84.3725, Time: 5.324999999999999
Min Dimension: 128, Max Dimension: 256, Start k: 8, accuracy: 91.7814, Time: 6.63
Min Dimension: 128, Max Dimension: 512, Start k: 8, accuracy: 92.7126, Time: 6.755
Min Dimension: 128, Max Dimension: 1024, Start k: 8, accuracy: 92.7126, Time: 6.8100000000000005
Min Dimension: 128, Max Dimension: 2048, Start k: 8, accuracy: 92.7126, Time: 6.98
Min Dimension: 128, Max Dimension: 3072, Start k: 8, accuracy: 92.7126, Time: 7.28
Min Dimension: 256, Max Dimension: 512, Start k: 8, accuracy: 93.2794, Time: 10.36
Min Dimension: 256, Max Dimension: 1024, Start k: 8, accuracy: 93.6437, Time: 10.51
Min Dimension: 256, Max Dimension: 2048, Start k: 8, accuracy: 93.6437, Time: 10.73
Min Dimension: 256, Max Dimension: 3072, Start k: 8, accuracy: 93.6437, Time: 11.015
Min Dimension: 512, Max Dimension: 1024, Start k: 8, accuracy: 93.8462, Time: 17.950000000000003
Min Dimension: 512, Max Dimension: 2048, Start k: 8, accuracy: 94.0891, Time: 18.380000000000003
Min Dimension: 512, Max Dimension: 3072, Start k: 8, accuracy: 94.0891, Time: 18.675
Min Dimension: 1024, Max Dimension: 2048, Start k: 8, accuracy: 94.17, Time: 32.769999999999996
Min Dimension: 1024, Max Dimension: 3072, Start k: 8, accuracy: 94.413, Time: 33.59
Min Dimension: 2048, Max Dimension: 3072, Start k: 8, accuracy: 94.4534, Time: 63.120000000000005
Min Dimension: 64, Max Dimension: 128, Start k: 4, accuracy: 80.1619, Time: 4.41
Min Dimension: 64, Max Dimension: 256, Start k: 4, accuracy: 80.2024, Time: 4.43
Min Dimension: 64, Max Dimension: 512, Start k: 4, accuracy: 80.2024, Time: 4.475
Min Dimension: 64, Max Dimension: 1024, Start k: 4, accuracy: 80.2024, Time: 4.79
Min Dimension: 64, Max Dimension: 2048, Start k: 4, accuracy: 80.2024, Time: 4.99
Min Dimension: 64, Max Dimension: 3072, Start k: 4, accuracy: 80.2024, Time: 5.305
Min Dimension: 128, Max Dimension: 256, Start k: 4, accuracy: 91.336, Time: 6.585
Min Dimension: 128, Max Dimension: 512, Start k: 4, accuracy: 91.336, Time: 6.625
Min Dimension: 128, Max Dimension: 1024, Start k: 4, accuracy: 91.336, Time: 6.82
Min Dimension: 128, Max Dimension: 2048, Start k: 4, accuracy: 91.336, Time: 7.07
Min Dimension: 128, Max Dimension: 3072, Start k: 4, accuracy: 91.336, Time: 7.23
Min Dimension: 256, Max Dimension: 512, Start k: 4, accuracy: 93.1984, Time: 10.274999999999999
Min Dimension: 256, Max Dimension: 1024, Start k: 4, accuracy: 93.1984, Time: 10.145
Min Dimension: 256, Max Dimension: 2048, Start k: 4, accuracy: 93.1984, Time: 10.295
Min Dimension: 256, Max Dimension: 3072, Start k: 4, accuracy: 93.1984, Time: 10.56
Min Dimension: 512, Max Dimension: 1024, Start k: 4, accuracy: 93.8462, Time: 17.355
Min Dimension: 512, Max Dimension: 2048, Start k: 4, accuracy: 93.8462, Time: 17.605
Min Dimension: 512, Max Dimension: 3072, Start k: 4, accuracy: 93.8462, Time: 17.895000000000003
Min Dimension: 1024, Max Dimension: 2048, Start k: 4, accuracy: 94.17, Time: 31.59
Min Dimension: 1024, Max Dimension: 3072, Start k: 4, accuracy: 94.17, Time: 31.775
Min Dimension: 2048, Max Dimension: 3072, Start k: 4, accuracy: 94.4534, Time: 60.585
"""

In [None]:
# Extract the five per-run fields from the pasted sk4 benchmark log (the series
# labelled 'Step K = 4' in the plotting cells). Each pattern is scanned over the
# whole text on its own, so the resulting lists stay index-aligned only while
# every log line carries all five fields.
# Accuracy and time are rounded to 4 decimals, which trims float noise such as
# 49.260000000000005 in the recorded timings.
sk4_accuracies = [
    round(float(x), 4)
    for x in re.findall(r'accuracy: ([\d.]+)', sk4_long_text)
]
sk4_times = [
    round(float(x), 4)
    for x in re.findall(r'Time: ([\d.]+)', sk4_long_text)
]
sk4_min_dims = list(map(int, re.findall(r'Min Dimension: (\d+),', sk4_long_text)))
sk4_max_dims = list(map(int, re.findall(r'Max Dimension: (\d+),', sk4_long_text)))
sk4_start_k = list(map(int, re.findall(r'Start k: (\d+),', sk4_long_text)))

# Echo the two numeric series in a form that reads like an assignment.
print("sk4_accuracies =", sk4_accuracies)
print("sk4_times =", sk4_times)

sk4_accuracies = [88.1377, 91.9433, 93.2794, 93.7247, 94.0486, 94.0486, 92.0243, 93.4008, 93.8462, 94.17, 94.4534, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 88.0972, 91.6599, 92.996, 93.4413, 93.6437, 93.6437, 92.0243, 93.4008, 93.8057, 94.1296, 94.3725, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.8543, 91.2551, 92.3482, 92.834, 92.834, 92.834, 92.0243, 93.4008, 93.8057, 94.1296, 94.1296, 93.4008, 93.8462, 94.17, 94.4534, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.4494, 90.6883, 91.8219, 92.1457, 92.1457, 92.1457, 92.0243, 93.4008, 93.8057, 94.0081, 94.0081, 93.4008, 93.8462, 94.17, 94.413, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.004, 89.8381, 90.8097, 90.8097, 90.8097, 90.8097, 91.9838, 93.2794, 93.6842, 93.6842, 93.6842, 93.4008, 93.8462, 94.17, 94.17, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 86.3968, 89.0283, 89.8785, 89.8785, 89.8785, 89.8785, 91.9838, 93.1984, 93.56

In [None]:
sk8_long_text = """

min_ds = [64, 128, 256, 512, 1024, 2048]
max_ds = [128, 256, 512, 1024, 2048, 3072]
start_ks = [1024, 512, 256, 128, 64, 32, 16, 8, 4]

sk8_times = []
sk8_accuracies = []

for k in start_ks:
  for min_d in min_ds:
    for max_d in max_ds:

Min Dimension: 64, Max Dimension: 128, Start k: 1024, accuracy: 88.1377, Time: 16.14
Min Dimension: 64, Max Dimension: 256, Start k: 1024, accuracy: 91.9433, Time: 20.525
Min Dimension: 64, Max Dimension: 512, Start k: 1024, accuracy: 93.2794, Time: 21.42
Min Dimension: 64, Max Dimension: 1024, Start k: 1024, accuracy: 93.6842, Time: 21.525
Min Dimension: 64, Max Dimension: 2048, Start k: 1024, accuracy: 93.6842, Time: 21.795
Min Dimension: 64, Max Dimension: 3072, Start k: 1024, accuracy: 93.6842, Time: 22.055
Min Dimension: 128, Max Dimension: 256, Start k: 1024, accuracy: 92.0243, Time: 21.32
Min Dimension: 128, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 27.925
Min Dimension: 128, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 29.509999999999998
Min Dimension: 128, Max Dimension: 2048, Start k: 1024, accuracy: 94.0891, Time: 30.215
Min Dimension: 128, Max Dimension: 3072, Start k: 1024, accuracy: 94.0891, Time: 30.325
Min Dimension: 256, Max Dimension: 512, Start k: 1024, accuracy: 93.4008, Time: 32.54
Min Dimension: 256, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 42.505
Min Dimension: 256, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 45.7
Min Dimension: 256, Max Dimension: 3072, Start k: 1024, accuracy: 94.413, Time: 45.75
Min Dimension: 512, Max Dimension: 1024, Start k: 1024, accuracy: 93.8462, Time: 52.254999999999995
Min Dimension: 512, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 70.12
Min Dimension: 512, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 73.625
Min Dimension: 1024, Max Dimension: 2048, Start k: 1024, accuracy: 94.17, Time: 92.32
Min Dimension: 1024, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 116.355
Min Dimension: 2048, Max Dimension: 3072, Start k: 1024, accuracy: 94.4534, Time: 141.14499999999998
Min Dimension: 64, Max Dimension: 128, Start k: 512, accuracy: 88.0972, Time: 12.61
Min Dimension: 64, Max Dimension: 256, Start k: 512, accuracy: 91.6599, Time: 14.93
Min Dimension: 64, Max Dimension: 512, Start k: 512, accuracy: 92.834, Time: 15.725000000000001
Min Dimension: 64, Max Dimension: 1024, Start k: 512, accuracy: 92.834, Time: 15.7
Min Dimension: 64, Max Dimension: 2048, Start k: 512, accuracy: 92.834, Time: 15.915
Min Dimension: 64, Max Dimension: 3072, Start k: 512, accuracy: 92.834, Time: 16.259999999999998
Min Dimension: 128, Max Dimension: 256, Start k: 512, accuracy: 92.0243, Time: 17.415
Min Dimension: 128, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 20.835
Min Dimension: 128, Max Dimension: 1024, Start k: 512, accuracy: 93.8057, Time: 22.064999999999998
Min Dimension: 128, Max Dimension: 2048, Start k: 512, accuracy: 93.8057, Time: 22.1
Min Dimension: 128, Max Dimension: 3072, Start k: 512, accuracy: 93.8057, Time: 22.240000000000002
Min Dimension: 256, Max Dimension: 512, Start k: 512, accuracy: 93.4008, Time: 26.259999999999998
Min Dimension: 256, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 31.82
Min Dimension: 256, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 33.26
Min Dimension: 256, Max Dimension: 3072, Start k: 512, accuracy: 94.17, Time: 34.07
Min Dimension: 512, Max Dimension: 1024, Start k: 512, accuracy: 93.8462, Time: 43.485
Min Dimension: 512, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 53.45
Min Dimension: 512, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 55.17
Min Dimension: 1024, Max Dimension: 2048, Start k: 512, accuracy: 94.17, Time: 76.945
Min Dimension: 1024, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 89.585
Min Dimension: 2048, Max Dimension: 3072, Start k: 512, accuracy: 94.4534, Time: 119.87
Min Dimension: 64, Max Dimension: 128, Start k: 256, accuracy: 87.8543, Time: 9.725
Min Dimension: 64, Max Dimension: 256, Start k: 256, accuracy: 91.2551, Time: 10.995000000000001
Min Dimension: 64, Max Dimension: 512, Start k: 256, accuracy: 92.3077, Time: 11.355
Min Dimension: 64, Max Dimension: 1024, Start k: 256, accuracy: 92.3077, Time: 11.52
Min Dimension: 64, Max Dimension: 2048, Start k: 256, accuracy: 92.3077, Time: 11.72
Min Dimension: 64, Max Dimension: 3072, Start k: 256, accuracy: 92.3077, Time: 12.115
Min Dimension: 128, Max Dimension: 256, Start k: 256, accuracy: 92.0243, Time: 13.605
Min Dimension: 128, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 15.46
Min Dimension: 128, Max Dimension: 1024, Start k: 256, accuracy: 93.8057, Time: 16.22
Min Dimension: 128, Max Dimension: 2048, Start k: 256, accuracy: 93.8057, Time: 16.345
Min Dimension: 128, Max Dimension: 3072, Start k: 256, accuracy: 93.8057, Time: 16.425
Min Dimension: 256, Max Dimension: 512, Start k: 256, accuracy: 93.4008, Time: 20.75
Min Dimension: 256, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 24.244999999999997
Min Dimension: 256, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 24.59
Min Dimension: 256, Max Dimension: 3072, Start k: 256, accuracy: 94.17, Time: 24.814999999999998
Min Dimension: 512, Max Dimension: 1024, Start k: 256, accuracy: 93.8462, Time: 34.44
Min Dimension: 512, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 39.64
Min Dimension: 512, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 40.785
Min Dimension: 1024, Max Dimension: 2048, Start k: 256, accuracy: 94.17, Time: 61.28
Min Dimension: 1024, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 67.97999999999999
Min Dimension: 2048, Max Dimension: 3072, Start k: 256, accuracy: 94.4534, Time: 99.275
Min Dimension: 64, Max Dimension: 128, Start k: 128, accuracy: 87.4494, Time: 7.72
Min Dimension: 64, Max Dimension: 256, Start k: 128, accuracy: 90.6478, Time: 8.42
Min Dimension: 64, Max Dimension: 512, Start k: 128, accuracy: 91.8219, Time: 8.49
Min Dimension: 64, Max Dimension: 1024, Start k: 128, accuracy: 91.8219, Time: 8.695
Min Dimension: 64, Max Dimension: 2048, Start k: 128, accuracy: 91.8219, Time: 8.83
Min Dimension: 64, Max Dimension: 3072, Start k: 128, accuracy: 91.8219, Time: 9.190000000000001
Min Dimension: 128, Max Dimension: 256, Start k: 128, accuracy: 92.0243, Time: 10.765
Min Dimension: 128, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 11.675
Min Dimension: 128, Max Dimension: 1024, Start k: 128, accuracy: 93.7652, Time: 11.95
Min Dimension: 128, Max Dimension: 2048, Start k: 128, accuracy: 93.7652, Time: 12.11
Min Dimension: 128, Max Dimension: 3072, Start k: 128, accuracy: 93.7652, Time: 12.24
Min Dimension: 256, Max Dimension: 512, Start k: 128, accuracy: 93.4008, Time: 16.32
Min Dimension: 256, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 17.884999999999998
Min Dimension: 256, Max Dimension: 2048, Start k: 128, accuracy: 94.0891, Time: 18.18
Min Dimension: 256, Max Dimension: 3072, Start k: 128, accuracy: 94.0891, Time: 18.6
Min Dimension: 512, Max Dimension: 1024, Start k: 128, accuracy: 93.8462, Time: 27.41
Min Dimension: 512, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 30.035
Min Dimension: 512, Max Dimension: 3072, Start k: 128, accuracy: 94.413, Time: 31.770000000000003
Min Dimension: 1024, Max Dimension: 2048, Start k: 128, accuracy: 94.17, Time: 49.254999999999995
Min Dimension: 1024, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 52.675
Min Dimension: 2048, Max Dimension: 3072, Start k: 128, accuracy: 94.4534, Time: 84.15
Min Dimension: 64, Max Dimension: 128, Start k: 64, accuracy: 87.004, Time: 6.32
Min Dimension: 64, Max Dimension: 256, Start k: 64, accuracy: 89.8785, Time: 6.585
Min Dimension: 64, Max Dimension: 512, Start k: 64, accuracy: 89.8785, Time: 6.73
Min Dimension: 64, Max Dimension: 1024, Start k: 64, accuracy: 89.8785, Time: 6.82
Min Dimension: 64, Max Dimension: 2048, Start k: 64, accuracy: 89.8785, Time: 7.055
Min Dimension: 64, Max Dimension: 3072, Start k: 64, accuracy: 89.8785, Time: 7.365
Min Dimension: 128, Max Dimension: 256, Start k: 64, accuracy: 91.9838, Time: 8.725
Min Dimension: 128, Max Dimension: 512, Start k: 64, accuracy: 93.1984, Time: 9.14
Min Dimension: 128, Max Dimension: 1024, Start k: 64, accuracy: 93.1984, Time: 9.295
Min Dimension: 128, Max Dimension: 2048, Start k: 64, accuracy: 93.1984, Time: 9.71
Min Dimension: 128, Max Dimension: 3072, Start k: 64, accuracy: 93.1984, Time: 9.975000000000001
Min Dimension: 256, Max Dimension: 512, Start k: 64, accuracy: 93.4008, Time: 13.524999999999999
Min Dimension: 256, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 14.545
Min Dimension: 256, Max Dimension: 2048, Start k: 64, accuracy: 93.8462, Time: 14.56
Min Dimension: 256, Max Dimension: 3072, Start k: 64, accuracy: 93.8462, Time: 14.88
Min Dimension: 512, Max Dimension: 1024, Start k: 64, accuracy: 93.8462, Time: 23.165
Min Dimension: 512, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 24.700000000000003
Min Dimension: 512, Max Dimension: 3072, Start k: 64, accuracy: 94.17, Time: 24.91
Min Dimension: 1024, Max Dimension: 2048, Start k: 64, accuracy: 94.17, Time: 41.55
Min Dimension: 1024, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 43.22
Min Dimension: 2048, Max Dimension: 3072, Start k: 64, accuracy: 94.4534, Time: 73.945
Min Dimension: 64, Max Dimension: 128, Start k: 32, accuracy: 86.3968, Time: 5.38
Min Dimension: 64, Max Dimension: 256, Start k: 32, accuracy: 88.8664, Time: 5.52
Min Dimension: 64, Max Dimension: 512, Start k: 32, accuracy: 88.8664, Time: 5.58
Min Dimension: 64, Max Dimension: 1024, Start k: 32, accuracy: 88.8664, Time: 5.705
Min Dimension: 64, Max Dimension: 2048, Start k: 32, accuracy: 88.8664, Time: 5.98
Min Dimension: 64, Max Dimension: 3072, Start k: 32, accuracy: 88.8664, Time: 6.375
Min Dimension: 128, Max Dimension: 256, Start k: 32, accuracy: 91.9838, Time: 7.6850000000000005
Min Dimension: 128, Max Dimension: 512, Start k: 32, accuracy: 93.1174, Time: 7.975
Min Dimension: 128, Max Dimension: 1024, Start k: 32, accuracy: 93.1174, Time: 8.01
Min Dimension: 128, Max Dimension: 2048, Start k: 32, accuracy: 93.1174, Time: 8.35
Min Dimension: 128, Max Dimension: 3072, Start k: 32, accuracy: 93.1174, Time: 8.55
Min Dimension: 256, Max Dimension: 512, Start k: 32, accuracy: 93.4008, Time: 12.045
Min Dimension: 256, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 12.48
Min Dimension: 256, Max Dimension: 2048, Start k: 32, accuracy: 93.8462, Time: 12.57
Min Dimension: 256, Max Dimension: 3072, Start k: 32, accuracy: 93.8462, Time: 12.879999999999999
Min Dimension: 512, Max Dimension: 1024, Start k: 32, accuracy: 93.8462, Time: 20.21
Min Dimension: 512, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 20.979999999999997
Min Dimension: 512, Max Dimension: 3072, Start k: 32, accuracy: 94.17, Time: 21.310000000000002
Min Dimension: 1024, Max Dimension: 2048, Start k: 32, accuracy: 94.17, Time: 36.385000000000005
Min Dimension: 1024, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 37.36
Min Dimension: 2048, Max Dimension: 3072, Start k: 32, accuracy: 94.4534, Time: 66.61500000000001
Min Dimension: 64, Max Dimension: 128, Start k: 16, accuracy: 84.8178, Time: 4.775
Min Dimension: 64, Max Dimension: 256, Start k: 16, accuracy: 86.6397, Time: 4.890000000000001
Min Dimension: 64, Max Dimension: 512, Start k: 16, accuracy: 86.6397, Time: 4.970000000000001
Min Dimension: 64, Max Dimension: 1024, Start k: 16, accuracy: 86.6397, Time: 5.08
Min Dimension: 64, Max Dimension: 2048, Start k: 16, accuracy: 86.6397, Time: 5.295
Min Dimension: 64, Max Dimension: 3072, Start k: 16, accuracy: 86.6397, Time: 5.635
Min Dimension: 128, Max Dimension: 256, Start k: 16, accuracy: 91.9028, Time: 6.904999999999999
Min Dimension: 128, Max Dimension: 512, Start k: 16, accuracy: 92.9555, Time: 7.08
Min Dimension: 128, Max Dimension: 1024, Start k: 16, accuracy: 92.9555, Time: 7.244999999999999
Min Dimension: 128, Max Dimension: 2048, Start k: 16, accuracy: 92.9555, Time: 7.42
Min Dimension: 128, Max Dimension: 3072, Start k: 16, accuracy: 92.9555, Time: 7.68
Min Dimension: 256, Max Dimension: 512, Start k: 16, accuracy: 93.4008, Time: 10.815000000000001
Min Dimension: 256, Max Dimension: 1024, Start k: 16, accuracy: 93.8057, Time: 11.075
Min Dimension: 256, Max Dimension: 2048, Start k: 16, accuracy: 93.8057, Time: 11.3
Min Dimension: 256, Max Dimension: 3072, Start k: 16, accuracy: 93.8057, Time: 11.6
Min Dimension: 512, Max Dimension: 1024, Start k: 16, accuracy: 93.8462, Time: 18.634999999999998
Min Dimension: 512, Max Dimension: 2048, Start k: 16, accuracy: 94.0891, Time: 18.975
Min Dimension: 512, Max Dimension: 3072, Start k: 16, accuracy: 94.0891, Time: 19.225
Min Dimension: 1024, Max Dimension: 2048, Start k: 16, accuracy: 94.17, Time: 33.94
Min Dimension: 1024, Max Dimension: 3072, Start k: 16, accuracy: 94.413, Time: 34.55
Min Dimension: 2048, Max Dimension: 3072, Start k: 16, accuracy: 94.4534, Time: 64.00999999999999
Min Dimension: 64, Max Dimension: 128, Start k: 8, accuracy: 83.0364, Time: 4.615
Min Dimension: 64, Max Dimension: 256, Start k: 8, accuracy: 83.0364, Time: 4.66
Min Dimension: 64, Max Dimension: 512, Start k: 8, accuracy: 83.0364, Time: 4.675000000000001
Min Dimension: 64, Max Dimension: 1024, Start k: 8, accuracy: 83.0364, Time: 4.85
Min Dimension: 64, Max Dimension: 2048, Start k: 8, accuracy: 83.0364, Time: 5.08
Min Dimension: 64, Max Dimension: 3072, Start k: 8, accuracy: 83.0364, Time: 5.295
Min Dimension: 128, Max Dimension: 256, Start k: 8, accuracy: 91.7814, Time: 6.65
Min Dimension: 128, Max Dimension: 512, Start k: 8, accuracy: 91.7814, Time: 6.72
Min Dimension: 128, Max Dimension: 1024, Start k: 8, accuracy: 91.7814, Time: 6.779999999999999
Min Dimension: 128, Max Dimension: 2048, Start k: 8, accuracy: 91.7814, Time: 6.9350000000000005
Min Dimension: 128, Max Dimension: 3072, Start k: 8, accuracy: 91.7814, Time: 7.285
Min Dimension: 256, Max Dimension: 512, Start k: 8, accuracy: 93.2794, Time: 10.355
Min Dimension: 256, Max Dimension: 1024, Start k: 8, accuracy: 93.2794, Time: 10.504999999999999
Min Dimension: 256, Max Dimension: 2048, Start k: 8, accuracy: 93.2794, Time: 10.745000000000001
Min Dimension: 256, Max Dimension: 3072, Start k: 8, accuracy: 93.2794, Time: 11.015
Min Dimension: 512, Max Dimension: 1024, Start k: 8, accuracy: 93.8462, Time: 17.925
Min Dimension: 512, Max Dimension: 2048, Start k: 8, accuracy: 93.8462, Time: 18.18
Min Dimension: 512, Max Dimension: 3072, Start k: 8, accuracy: 93.8462, Time: 18.585
Min Dimension: 1024, Max Dimension: 2048, Start k: 8, accuracy: 94.17, Time: 32.855000000000004
Min Dimension: 1024, Max Dimension: 3072, Start k: 8, accuracy: 94.17, Time: 33.025000000000006
Min Dimension: 2048, Max Dimension: 3072, Start k: 8, accuracy: 94.4534, Time: 63.144999999999996
Min Dimension: 64, Max Dimension: 128, Start k: 4, accuracy: 80.1619, Time: 4.525
Min Dimension: 64, Max Dimension: 256, Start k: 4, accuracy: 80.2024, Time: 4.4350000000000005
Min Dimension: 64, Max Dimension: 512, Start k: 4, accuracy: 80.2024, Time: 4.475
Min Dimension: 64, Max Dimension: 1024, Start k: 4, accuracy: 80.2024, Time: 4.625
Min Dimension: 64, Max Dimension: 2048, Start k: 4, accuracy: 80.2024, Time: 4.82
Min Dimension: 64, Max Dimension: 3072, Start k: 4, accuracy: 80.2024, Time: 5.13
Min Dimension: 128, Max Dimension: 256, Start k: 4, accuracy: 91.336, Time: 6.48
Min Dimension: 128, Max Dimension: 512, Start k: 4, accuracy: 91.336, Time: 6.529999999999999
Min Dimension: 128, Max Dimension: 1024, Start k: 4, accuracy: 91.336, Time: 6.6
Min Dimension: 128, Max Dimension: 2048, Start k: 4, accuracy: 91.336, Time: 6.875
Min Dimension: 128, Max Dimension: 3072, Start k: 4, accuracy: 91.336, Time: 7.13
Min Dimension: 256, Max Dimension: 512, Start k: 4, accuracy: 93.1984, Time: 10.085
Min Dimension: 256, Max Dimension: 1024, Start k: 4, accuracy: 93.1984, Time: 10.405000000000001
Min Dimension: 256, Max Dimension: 2048, Start k: 4, accuracy: 93.1984, Time: 10.64
Min Dimension: 256, Max Dimension: 3072, Start k: 4, accuracy: 93.1984, Time: 10.785
Min Dimension: 512, Max Dimension: 1024, Start k: 4, accuracy: 93.8462, Time: 17.72
Min Dimension: 512, Max Dimension: 2048, Start k: 4, accuracy: 93.8462, Time: 17.990000000000002
Min Dimension: 512, Max Dimension: 3072, Start k: 4, accuracy: 93.8462, Time: 18.634999999999998
Min Dimension: 1024, Max Dimension: 2048, Start k: 4, accuracy: 94.17, Time: 31.42
Min Dimension: 1024, Max Dimension: 3072, Start k: 4, accuracy: 94.17, Time: 32.745
Min Dimension: 2048, Max Dimension: 3072, Start k: 4, accuracy: 94.4534, Time: 61.870000000000005
"""

In [None]:
# Extract the five per-run fields from the pasted sk8 benchmark log (the series
# labelled 'Step K = 8' in the plotting cells). Each pattern is scanned over the
# whole text on its own, so the resulting lists stay index-aligned only while
# every log line carries all five fields.
# Accuracy and time are rounded to 4 decimals, which trims float noise such as
# 141.14499999999998 in the recorded timings.
sk8_accuracies = [
    round(float(x), 4)
    for x in re.findall(r'accuracy: ([\d.]+)', sk8_long_text)
]
sk8_times = [
    round(float(x), 4)
    for x in re.findall(r'Time: ([\d.]+)', sk8_long_text)
]
sk8_min_dims = list(map(int, re.findall(r'Min Dimension: (\d+),', sk8_long_text)))
sk8_max_dims = list(map(int, re.findall(r'Max Dimension: (\d+),', sk8_long_text)))
sk8_start_k = list(map(int, re.findall(r'Start k: (\d+),', sk8_long_text)))

# Echo the two numeric series in a form that reads like an assignment.
print("sk8_accuracies =", sk8_accuracies)
print("sk8_times =", sk8_times)

sk8_accuracies = [88.1377, 91.9433, 93.2794, 93.6842, 93.6842, 93.6842, 92.0243, 93.4008, 93.8462, 94.0891, 94.0891, 93.4008, 93.8462, 94.17, 94.413, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 88.0972, 91.6599, 92.834, 92.834, 92.834, 92.834, 92.0243, 93.4008, 93.8057, 93.8057, 93.8057, 93.4008, 93.8462, 94.17, 94.17, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.8543, 91.2551, 92.3077, 92.3077, 92.3077, 92.3077, 92.0243, 93.4008, 93.8057, 93.8057, 93.8057, 93.4008, 93.8462, 94.17, 94.17, 93.8462, 94.17, 94.4534, 94.17, 94.4534, 94.4534, 87.4494, 90.6478, 91.8219, 91.8219, 91.8219, 91.8219, 92.0243, 93.4008, 93.7652, 93.7652, 93.7652, 93.4008, 93.8462, 94.0891, 94.0891, 93.8462, 94.17, 94.413, 94.17, 94.4534, 94.4534, 87.004, 89.8785, 89.8785, 89.8785, 89.8785, 89.8785, 91.9838, 93.1984, 93.1984, 93.1984, 93.1984, 93.4008, 93.8462, 93.8462, 93.8462, 93.8462, 94.17, 94.17, 94.17, 94.4534, 94.4534, 86.3968, 88.8664, 88.8664, 88.8664, 88.8664, 88.8664, 91.9838, 93.1174, 93.1

## Graph

In [None]:
# Accuracy-vs-time figure: the truncation KNN results drawn as a connected
# curve, overlaid with the "Step K = 2" progressive runs as square markers,
# split into a green valid_* series and a red invalid_* series.
# NOTE(review): what makes a run "valid" or "invalid" is decided where the
# valid_* / invalid_* lists are built, which is not part of this cell.
#
# Disabled variants (not drawn by this cell):
#   * one undivided 'Step K = 2' trace over the new_method_* lists;
#   * 'Step K = 4' and 'Step K = 8' traces over the sk4_* / sk8_* lists.
# Each follows the trace_valid recipe below with the default marker colour and
# the matching "Step k = N" hover prefix; to show them, the figure would be
# built from [trace_openai, trace_new_method, trace_sk4, trace_sk8] instead.

# Shared figure chrome. Width is left unset (an explicit width=1000 is disabled).
layout = go.Layout(
    title='Accuracy vs Time (Truncation KNN VS Progressive KNN)',
    xaxis={'title': 'Time (s)'},
    yaxis={'title': 'Accuracy (%)'},
    height=800,
)

# Baseline curve: one point per entry of openai_truncate_dims, joined by
# straight segments.
trace_openai = go.Scatter(
    x=openai_knn_times,
    y=openai_knn_accuracies,
    mode='lines+markers',
    marker={'symbol': 'circle', 'size': 10},
    name='OpenAI KNN',
    text=[
        f"OpenAI KNN<br>Dimension: {dim}<br>Result: {res}%<br>Time: {t}"
        for dim, res, t in zip(openai_truncate_dims, openai_knn_accuracies, openai_knn_times)
    ],
    hoverinfo='text',
    line={'shape': 'linear'},
)

# Progressive runs from the valid_* lists; hover shows the full configuration.
trace_valid = go.Scatter(
    x=valid_times,
    y=valid_accuracies,
    mode='markers',
    name='Step K = 2 (Valid Case)',
    text=[
        f"Step k = 2<br>Start Dimension: {s_dim}<br>End Dimension: {e_dim}<br>Start k: {k}<br>Result: {res}%<br>Time: {t}"
        for s_dim, e_dim, k, res, t in zip(
            valid_min_dims, valid_max_dims, valid_start_k, valid_accuracies, valid_times
        )
    ],
    hoverinfo='text',
    marker={'symbol': 'square', 'color': 'green'},
)

# Progressive runs from the invalid_* lists, same hover format, drawn in red.
trace_invalid = go.Scatter(
    x=invalid_times,
    y=invalid_accuracies,
    mode='markers',
    name='Step K = 2 (Invalid Case)',
    text=[
        f"Step k = 2<br>Start Dimension: {s_dim}<br>End Dimension: {e_dim}<br>Start k: {k}<br>Result: {res}%<br>Time: {t}"
        for s_dim, e_dim, k, res, t in zip(
            invalid_min_dims, invalid_max_dims, invalid_start_k, invalid_accuracies, invalid_times
        )
    ],
    hoverinfo='text',
    marker={'symbol': 'square', 'color': 'red'},
)

fig = go.Figure(
    data=[trace_openai, trace_valid, trace_invalid],
    layout=layout,
)
fig.show()

In [None]:
# Accuracy-vs-time figure with two series: the truncation KNN results as a
# connected curve ('Regular KNN') and every progressive run from the
# new_method_* lists as square markers ('Progressive KNN').
# NOTE(review): the legend labels differ from the hover prefixes ("OpenAI KNN"
# and "Step k = 2") -- confirm whether that is intentional.
#
# Disabled variants (not drawn by this cell):
#   * the progressive runs split into 'Step K = 2 (Valid Case)' (green) and
#     'Step K = 2 (Invalid Case)' (red) traces over the valid_* / invalid_*
#     lists, plotted via [trace_openai, trace_valid, trace_invalid];
#   * 'Step K = 4' and 'Step K = 8' traces over the sk4_* / sk8_* lists.
# Each follows the trace_new_method recipe below with the matching
# "Step k = N" hover prefix.

# Shared figure chrome. Width is left unset (an explicit width=1000 is disabled).
layout = go.Layout(
    title='Accuracy vs Time (Truncation KNN VS Progressive KNN)',
    xaxis={'title': 'Time (s)'},
    yaxis={'title': 'Accuracy (%)'},
    height=800,
)

# Baseline curve: one point per entry of openai_truncate_dims, joined by
# straight segments.
trace_openai = go.Scatter(
    x=openai_knn_times,
    y=openai_knn_accuracies,
    mode='lines+markers',
    marker={'symbol': 'circle', 'size': 10},
    name='Regular KNN',
    text=[
        f"OpenAI KNN<br>Dimension: {dim}<br>Result: {res}%<br>Time: {t}"
        for dim, res, t in zip(openai_truncate_dims, openai_knn_accuracies, openai_knn_times)
    ],
    hoverinfo='text',
    line={'shape': 'linear'},
)

# All progressive runs in one series; hover shows the full configuration.
trace_new_method = go.Scatter(
    x=new_method_times,
    y=new_method_accuracies,
    mode='markers',
    name='Progressive KNN',
    text=[
        f"Step k = 2<br>Start Dimension: {s_dim}<br>End Dimension: {e_dim}<br>Start k: {k}<br>Result: {res}%<br>Time: {t}"
        for s_dim, e_dim, k, res, t in zip(
            new_method_min_dims,
            new_method_max_dims,
            new_method_start_k,
            new_method_accuracies,
            new_method_times,
        )
    ],
    hoverinfo='text',
    marker={'symbol': 'square'},
)

fig = go.Figure(
    data=[trace_openai, trace_new_method],
    layout=layout,
)
fig.show()