# Index Tests
Tests on the creation use data with the following composition:
- test1: 20 3-dimensional points
- test2: 30 3-dimensional points
- test3: 100 4-dimensional points

## Import Vantage Point Tree Classes

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install import-ipynb

Collecting import-ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=db75b50806731ad382e126a8c32ba4c3851e4d47901fa8fed7acaa8a02a74842
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3


## Initialize Notebook

In [None]:
import numpy as np
import os
from scipy.spatial import distance as d
import time
import math
import random
import json

TEST_PATH = '/content/gdrive/MyDrive/test-folder'

FINE_TUNED_ID = '/content/gdrive/MyDrive/[MIRCV]FoodWebSearch/deployment/ft_id.npy'
FINE_TUNED_FEATURES = '/content/gdrive/MyDrive/[MIRCV]FoodWebSearch/deployment/ft_features.npy'

FEATURES_PATH_TEST_1 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-test-1.npy"
FEATURES_NAMES_TEST_1 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-names-test-1.npy"

FEATURES_PATH_TEST_2 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-test-2.npy"
FEATURES_NAMES_TEST_2 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-names-test-2.npy"

FEATURES_PATH_TEST_3 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-test-3.npy"
FEATURES_NAMES_TEST_3 = "/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/features-names-test-3.npy"

## Classes

In [None]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, None):
            return ""
        return json.JSONEncoder.default(self, obj)

In [None]:
class Node:

  def __init__(self, id, is_leaf, **kwargs):
    self.parent = kwargs.get("parent", None)
    self.id = id
    self.is_leaf = is_leaf
    self.pivot = kwargs.get("pivot", None)
    self.median = kwargs.get("median", -1)
    if self.is_leaf:
      self.objects = kwargs.get("objects", [])
      self.file_path_s_1, self.file_path_s_2 = "", ""
    else:
      self.right = kwargs.get("right", None)
      self.left = kwargs.get("left", None)

  def set_parameters(self, pivot, median):
    self.pivot = pivot
    self.median = median

  def add_children(self, left, right):
    self.left = left
    self.right = right

  def add_objects(self, s_1, s_2):
    self.objects_left = s_1
    self.objects_right = s_2

  def save_leaf_objects_on_disk(self, file_path, s_1, s_2):
    self.file_path_s_1 = file_path + "_subset_1.npy"
    self.file_path_s_2 = file_path + "_subset_2.npy"
    np.save(self.file_path_s_1, np.array(s_1, dtype=object))
    np.save(self.file_path_s_2, np.array(s_2, dtype=object))

  def load_objects_from_disk(self, left=True, right=True):
    if left and not right:
      result = np.load(self.file_path_s_1, allow_pickle=True)
      return result
    if right and not left:
      result = np.load(self.file_path_s_1, allow_pickle=True)
      return result
    s_1 = np.load(self.file_path_s_1, allow_pickle=True)
    s_2 = np.load(self.file_path_s_2, allow_pickle=True)
    result = np.concatenate((s_1, s_2))
    return result

  def get_node_name(self):
    return self.id

In [None]:
class VP_Tree:

  def __init__(self, index_name, height, disk_mode=True, leaves_path=None, distance=None):
    """by default the tree is built with euclidean distance and the leaves are saved on disk, 
    use_similarity=True allows you to use the cosine similarity
    use disk_mode=False if you want to keep all the tree in memory(not suggested for huge data)"""
    self.root = None
    self.index_name = index_name
    self.height = height #to review
    self.disk_mode = disk_mode
    self.leaves_path = leaves_path
    self.distance = distance #the distance function used (euclidean by default)
    self.distance_computed = 0
    self.file_accessed = 0
    self.file_created = 0

  def create_vptree(self, names_path, features_path):
    start = time.time()
    data = VP_Tree.read_data(names_path, features_path)
    n = len(data)
    print("Number of data:", n)
    max_height = math.floor(math.log(n,2)-1)
    print("The max height of the tree is:", max_height)
    if self.height > max_height: self.height = max_height
    self.distance_computed = 0
    #take 1 pivot randomly and set pivot as root
    self.root, s_1, s_2 = self.partition_by_median(data)
    print("Tree is building")
    self.create_tree_level(self.root, s_1, s_2, 1)
    end = time.time()
    print("Building of the tree completed in:", end-start, "s")
  
  def create_tree_level(self, node, s_1, s_2, iteration):
      is_leaf = iteration + 1 >= self.height
      left_node, s_1_left, s_2_left = self.partition_by_median(s_1, parent=node,is_left=True, is_leaf=is_leaf)
      right_node, s_1_right, s_2_right = self.partition_by_median(s_2, parent=node,is_left=False, is_leaf=is_leaf)
      node.add_children(right_node, left_node)
      if iteration + 1 < self.height:
        self.create_tree_level(left_node, s_1_left, s_2_left, iteration + 1)
        self.create_tree_level(right_node, s_1_right, s_2_right, iteration + 1)
      else:
        if self.disk_mode:
          left_path = self.get_leaves_path(left_node.get_node_name())
          right_path = self.get_leaves_path(right_node.get_node_name())
          left_node.save_leaf_objects_on_disk(left_path, s_1_left, s_2_left)
          right_node.save_leaf_objects_on_disk(right_path, s_1_right, s_2_right)
        else:
          left_node.add_objects(s_1_left, s_2_left)
          right_node.add_objects(s_1_right, s_2_right)

  def partition_by_median(self, data, parent=None,is_left=False,is_leaf=False):
    pivot_index = random.choice(range(len(data)))
    pivot = data[pivot_index]
    del data[pivot_index]
    #compute all the distances
    distances = np.array([self.compute_distance(pivot[1],element[1]) for element in data])
    #sort the distances
    zipped_data_distances = sorted(zip(data, distances), key= lambda x:x[1])
    ordered_data, distances = zip(*zipped_data_distances)
    median = np.median(distances)
    #get the median
    s_1 = [element for element, distance in zipped_data_distances if distance <= median]
    s_2 = [element for element, distance in zipped_data_distances if distance >= median]
    #update node
    if parent == None:
      node = Node(id="0", is_leaf=is_leaf, pivot=pivot, median=median)
    else:
      node_id = parent.id + str(0 if is_left else 1)
      node = Node(node_id, is_leaf=is_leaf, pivot=pivot, median=median)
    return node, s_1, s_2

  def save_vptree(file_path, tree):
    if not os.path.exists(file_path): os.mkdir(file_path)
    file = os.path.join(file_path, tree.index_name + '.json')
    if os.path.exists(file):
      os.remove(file)
    with open(file, 'a') as json_file:
      index_json = {"index": tree.index_name, "nodes":[], 
                    "height":tree.height, "distance": None}
      VP_Tree.save_node(tree.root, index_json)
      vp_tree_json = json.dumps(index_json, cls=NumpyEncoder)
      json_file.write(vp_tree_json)
      print("File saved correctly in:", file)
    return file
  
  def save_node(node, index_json):
    if node.is_leaf:
        row_json={"is_leaf":True, 
                    "id":node.id,
                    "pivot" : node.pivot,
                    "median":node.median, 
                    "left_file":node.file_path_s_1, 
                    "right_file":node.file_path_s_2}
        index_json["nodes"].append(row_json)
    else:
        row_json={"is_leaf":False,
                  "id":node.id, 
                  "pivot": node.pivot,
                  "median": node.median,
                  "right_child":node.right.id,
                  "left_child":node.left.id}
        index_json["nodes"].append(row_json)
        VP_Tree.save_node(node.left, index_json)
        VP_Tree.save_node(node.right,index_json)
    return

  def load_vptree(path):
    if not os.path.exists:
      print("the path do not exist")
      return None
    entry_list=[]
    with open(path,'r', encoding='utf-8') as f:
      json_tree = json.load(f)
      entry_list=json_tree["nodes"]
    root_node=VP_Tree.parse_node('0',entry_list)
    index_name = json_tree["index"]
    height = json_tree["height"]
    distance = json_tree.get("distance", None)
    vp_tree = VP_Tree(index_name=index_name,height=height,leaves_path=path, 
                      distance=None)
    vp_tree.root = root_node
    print("Tree loaded correctly")
    return vp_tree

  def parse_node(id, nodes):
    node_json = None
    for element in nodes:
      if element["id"]==id:
        node_json = element
    node=Node(id=node_json["id"], is_leaf=node_json["is_leaf"], 
              pivot=node_json["pivot"], median=node_json["median"])
    if (node.is_leaf):
      node.file_path_s_1=node_json["left_file"]
      node.file_path_s_2=node_json["right_file"]
    else:
      right=VP_Tree.parse_node(node_json["right_child"],nodes)
      left=VP_Tree.parse_node(node_json["left_child"],nodes)
      node.add_children(left, right)
    return node

  def knn_search(self, k, query):
    start = time.time()
    nn = [None for i in range(k)]
    d_nn = [math.inf for i in range(k)]
    self.distance_computed = 0
    self.file_accessed = 0
    nn, d_nn = self.search_subtree(self.root, nn, d_nn, k, query)
    end = time.time()
    print("Query answered in", end-start, " s")
    return self.reorder_list_on_distances(nn, d_nn, desc=False)

  def search_subtree(self, node, nn, d_nn, k, query):
    pivot, median = node.pivot, node.median
    distance = self.compute_distance(pivot[1], query)
    if distance < d_nn[0]:
      d_nn[0] = distance
      nn[0] = pivot
      nn, d_nn = self.reorder_list_on_distances(nn, d_nn)
    if node.is_leaf:
      return self.search_in_leaf(node, nn, d_nn, k, query)
    if distance - d_nn[0] <= median:
      nn, d_nn = self.search_subtree(node.left, nn, d_nn, k, query)
    if distance + d_nn[0] >= median:
      nn, d_nn = self.search_subtree(node.right, nn, d_nn, k, query)
    return nn, d_nn

  def search_in_leaf(self, node, nn, d_nn, k, query):
    objects = []
    distance_pivot = self.compute_distance(node.pivot[1], query)
    left, right = False, False
    if self.disk_mode:
      if distance_pivot - d_nn[0] <= node.median: 
        left = True
        self.file_accessed = self.file_accessed + 1
      if distance_pivot + d_nn[0] >= node.median: 
        right = True
        self.file_accessed = self.file_accessed + 1
      objects = node.load_objects_from_disk(left=left, right=right)
    else:
      objects = node.objects_left + node.objects_right
    for obj in objects:
      distance = self.compute_distance(obj[1], query)
      if distance < d_nn[0]:
        nn[0] = obj
        d_nn[0] = distance
        nn, d_nn = self.reorder_list_on_distances(nn, d_nn)
    return nn, d_nn

  def reorder_list_on_distances(self, nn, d_nn, desc=True):
      zipped = sorted(zip(nn, d_nn), key= lambda x:x[1], reverse=desc)
      nn, d_nn = zip(*zipped)
      return list(nn), list(d_nn)

  def print_tree(node, level, disk_mode=True):
    indentation = "\n" + str(level * "\t")
    response = "id: " + node.id + " " + str(node.pivot)
    if node.is_leaf:
      if disk_mode: 
        response += indentation + str(node.file_path_s_1)
        response += indentation + str(node.file_path_s_2)
      else:
        response += indentation + str(node.objects_left)
        response += indentation + str(node.objects_right)
      return response
    response += indentation + VP_Tree.print_tree(node=node.right, level=level+1, disk_mode=disk_mode)
    response += indentation + VP_Tree.print_tree(node=node.left, level=level+1, disk_mode=disk_mode)
    return response

  def get_leaves_path(self, file_name):
    if not self.leaves_path is None:
      directory = os.path.join(self.leaves_path, self.index_name)
    else: directory = os.path.join(LEAF_FOLDER, self.index_name)
    if not os.path.exists(directory):
      os.mkdir(directory)
      print("directory created", directory)
    leaves_directory = os.path.join(directory, "leaves_"+ str(self.height))
    if not os.path.exists(leaves_directory):
      os.mkdir(leaves_directory)
    return os.path.join(leaves_directory, file_name)

  def compute_distance(self, a, b):
    self.distance_computed = self.distance_computed + 1
    if self.distance == None:
      return d.euclidean(a,b)
    return self.distance(a,b)

  def read_data(file_path_names, file_path_features):
    names = np.load(file_path_names)
    features = np.load(file_path_features)
    return [(name, feature) for name, feature in zip(names, features)]

## Partition By Median Tests

In [None]:
# Partition By Median Tests 1
data = [("image0", np.array([0,2,1])) , ("image1",np.array([2,3,6])), ("image2",np.array([5,3,2])),
          ("image3",np.array([5,6,4])), ("image4",np.array([5,16,1])), ("image5",np.array([2,6,2])), ("image6",np.array([1,3,1]))]

vantage_point_tree = VP_Tree("Index_Test", 5)
node, s_1, s_2 = vantage_point_tree.partition_by_median(data)

print("Node:", node.pivot)
print("Median:", node.median)
print("Set 1:", s_1)
print("Set 2:", s_2)

Node: ('image0', array([0, 2, 1]))
Median: 5.336688998879147
Set 1: [('image6', array([1, 3, 1])), ('image5', array([2, 6, 2])), ('image2', array([5, 3, 2]))]
Set 2: [('image1', array([2, 3, 6])), ('image3', array([5, 6, 4])), ('image4', array([ 5, 16,  1]))]


In [None]:
# Partition By Median Tests 2
data = [("img_1",np.array([0,0])), ("img_2",np.array([0,1])), ("img_3",np.array([2,2])), ("img_4",np.array([3,3]))]

vantage_point_tree = VP_Tree("Index_Test", 5)

node, s_1, s_2 = vantage_point_tree.partition_by_median(data)

print("Pivot:", node.pivot)
print("Median:", node.median)
print("Set 1:", s_1)
print("Set 2:", s_2)

Pivot: ('img_1', array([0, 0]))
Median: 2.8284271247461903
Set 1: [('img_2', array([0, 1])), ('img_3', array([2, 2]))]
Set 2: [('img_3', array([2, 2])), ('img_4', array([3, 3]))]


## Creation Tests with euclidean distance

In [None]:
# Create Tree Test 1
vantage_point_tree = VP_Tree("Index_Test_1",4)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_1,FEATURES_PATH_TEST_1)
result = VP_Tree.print_tree(vantage_point_tree.root, level=1)

print("Result:\n" + result + "\n")

index_name = vantage_point_tree.index_name

path = VP_Tree.save_vptree(os.path.join(TEST_PATH, index_name),vantage_point_tree)

print("Saved on:", path)

tree_loaded = VP_Tree.load_vptree(path)

print("\nTree Loaded\n")
result = VP_Tree.print_tree(tree_loaded.root, level=1)
print(result)

Number of data: 21
The max height of the tree is: 3
Tree is building


NameError: ignored

In [None]:
# Create Tree Test 2
vantage_point_tree = VP_Tree("Index_Test_2",4)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_2, FEATURES_PATH_TEST_2)
result = VP_Tree.print_tree(vantage_point_tree.root, level=1)
print("Result:\n" + result + "\n")

path = VP_Tree.save_vptree(os.path.join(TEST_PATH, index_name),vantage_point_tree)

print("Saved on:", path)

tree_loaded = VP_Tree.load_vptree(path)

print("\nTree Loaded\n")
result = VP_Tree.print_tree(tree_loaded.root, level=1)
print(result)

In [None]:
# Create Tree Test 3
vantage_point_tree = VP_Tree("Index_Test_3",6)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_3, FEATURES_PATH_TEST_3)
result = VP_Tree.print_tree(vantage_point_tree.root, level=1)
print("Result:\n" + result + "\n")

path = VP_Tree.save_vptree(os.path.join(TEST_PATH, index_name),vantage_point_tree)

print("Saved on:", path)

tree_loaded = VP_Tree.load_vptree(path)

print("\nTree Loaded\n")
result = VP_Tree.print_tree(tree_loaded.root, level=1)
print(result)

In [None]:
# Create Tree Test 4
vantage_point_tree = VP_Tree("Index_Test_4",4, disk_mode=True,leaves_path=TEST_PATH)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_1, FEATURES_PATH_TEST_1)
result = VP_Tree.print_tree(vantage_point_tree.root, level=1, disk_mode=True)

print("Result:\n" + result + "\n")

path = VP_Tree.save_vptree(os.path.join(TEST_PATH, index_name),vantage_point_tree)

print("Saved on:", path)

tree_loaded = VP_Tree.load_vptree(path)

print("\nTree Loaded\n")
result = VP_Tree.print_tree(tree_loaded.root, level=1)
print(result)

## Creation Tests with Cosine Similarity

## Index Fine Tuned (Hamming Distance)

In [None]:
def hamming(a,b):
  return np.count_nonzero(a != b)
  #return d.hamming(a,b)

# creation
if __name__ == '__main__':
  vantage_point_tree = VP_Tree("index_fine_tuned_hamming",height=10, 
                               disk_mode=True, leaves_path=TEST_PATH,
                               distance=hamming)
  vantage_point_tree.create_vptree(FINE_TUNED_ID, FINE_TUNED_FEATURES)

  index_name = vantage_point_tree.index_name
  dest_folder = os.path.join(TEST_PATH, index_name)
  print("Destination Folder: ", dest_folder)


  VP_Tree.save_vptree(dest_folder,vantage_point_tree)

Number of data: 126000
The max height of the tree is: 15
Tree is building
Building of the tree completed in: 44.90977764129639 s
Destination Folder:  /content/gdrive/MyDrive/test-folder/index_fine_tuned_hamming
File saved correctly in: /content/gdrive/MyDrive/test-folder/index_fine_tuned_hamming/index_fine_tuned_hamming.json


In [None]:
query = [random.randint(-2,2) for _ in range(256)]
print("Query length: ", len(query))
results, distances = vantage_point_tree.knn_search(10, vantage_point_tree.root.pivot[1])
print([element for element, _ in results])
print(distances)
print("Distances Computed: ", vantage_point_tree.distance_computed)
print("File Accessed: ", vantage_point_tree.file_accessed)

Query length:  256
Query answered in 9.486578702926636  s
['im8481.jpg', 'im3103.jpg', 'im935.jpg', 'im21649.jpg', 'im21649.jpg', 'im123.jpg', 'im123.jpg', 'im9931.jpg', 'im15786.jpg', 'im15786.jpg']
[0, 219, 220, 220, 220, 220, 220, 221, 221, 221]
Distances Computed:  673855
File Accessed:  1024


# Index Fine Tuning (Cosine Similarity)

In [None]:
def inv_cosine_similarity(a,b):
  return 1-np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

# creation
if __name__ == '__main__':
  vantage_point_tree_cosine = VP_Tree("index_fine_tuned_hamming",height=10, 
                               disk_mode=True, leaves_path=TEST_PATH,
                               distance=inv_cosine_similarity)
  vantage_point_tree_cosine.create_vptree(FINE_TUNED_ID, FINE_TUNED_FEATURES)

  index_name = vantage_point_tree_cosine.index_name
  dest_folder = os.path.join(TEST_PATH, index_name)
  print("Destination Folder: ", dest_folder)


  VP_Tree.save_vptree(dest_folder,vantage_point_tree_cosine)

Number of data: 126000
The max height of the tree is: 15
Tree is building
Building of the tree completed in: 34.03710865974426 s
Destination Folder:  /content/gdrive/MyDrive/test-folder/index_fine_tuned_hamming
File saved correctly in: /content/gdrive/MyDrive/test-folder/index_fine_tuned_hamming/index_fine_tuned_hamming.json


In [None]:
query = [random.randint(-2,2) for _ in range(256)]
print("Query length:", len(query))
results, distances = vantage_point_tree_cosine.knn_search(10, query)
print([element for element, _ in results])
print(distances)
print("Distances Computed: ", vantage_point_tree_cosine.distance_computed)
print("File Accessed: ", vantage_point_tree_cosine.file_accessed)

Query length: 256
Query answered in 14.846802473068237  s
['1444279.jpg', '3806312.jpg', '1262677.jpg', '979955.jpg', '677596.jpg', '3008582.jpg', '2525911.jpg', '2867015.jpg', '2958902.jpg', '2508157.jpg']
[0.8309591780466887, 0.8323519443522625, 0.8406768575545924, 0.8485221218616431, 0.8490953071667069, 0.8494656790827948, 0.8497474177553309, 0.851173617751247, 0.8513416205962392, 0.851387406132734]
Distances Computed:  120935
File Accessed:  971


#Manhattan Distance

In [None]:
def manhattan(a,b):
  return d.cityblock(a,b)

# creation
if __name__ == '__main__':
  vantage_point_tree_man = VP_Tree("index_fine_tuned_hamming",height=10, 
                               disk_mode=True, leaves_path=TEST_PATH,
                               distance=manhattan)
  vantage_point_tree_man.create_vptree(FINE_TUNED_ID, FINE_TUNED_FEATURES)

  index_name = vantage_point_tree_man.index_name
  dest_folder = os.path.join(TEST_PATH, index_name)
  print("Destination Folder: ", dest_folder)


  #VP_Tree.save_vptree(dest_folder,vantage_point_tree_man)

Number of data: 126000
The max height of the tree is: 15
Tree is building
Building of the tree completed in: 24.674943208694458 s
Destination Folder:  /content/gdrive/MyDrive/test-folder/index_fine_tuned_hamming


In [None]:
query = [random.randint(-2,2) for _ in range(256)]
print("Query length:", len(query))
results, distances = vantage_point_tree_cosine.knn_search(10, query)
print([element for element, _ in results])
print(distances)
print("Distances Computed: ", vantage_point_tree_cosine.distance_computed)
print("File Accessed: ", vantage_point_tree_cosine.file_accessed)

Query length: 256
Query answered in 14.295985698699951  s
['837724.jpg', 'im22351.jpg', 'im15164.jpg', 'im907.jpg', 'im20059.jpg', 'im7573.jpg', 'im19740.jpg', '3662475.jpg', '3555823.jpg', 'im24357.jpg']
[0.9006141772667191, 0.9152262032233672, 0.9196521317769965, 0.920218191045073, 0.9213471425716077, 0.9216129093927734, 0.9225966323776394, 0.9229664063954939, 0.9235919061737508, 0.9240897378679845]
Distances Computed:  117706
File Accessed:  945


#Prova

In [None]:
if __name__ == '__main__':
  dest_folder = os.path.join(INDEX_DIR, 
                             "index_fine_tuned_hamming/index_fine_tuned_hamming.json")
  vantage_point_tree = VP_Tree.load_vptree(dest_folder)
  id, query = vantage_point_tree.root.pivot[0], vantage_point_tree.root.pivot[1]
  print("Query id:", id)
  start = time.time()
  ids, d_nn = vantage_point_tree.knn_search(k=10, query=query)
  end = time.time()
  print("Results:", [element[0] for element in ids])
  print("Distances:", d_nn)
  print("Distance Computed:", vantage_point_tree.distance_computed)
  print("File Accessed:", vantage_point_tree.file_accessed)
  ids_str = [id[0] for id in ids]
  zipped = zip(ids_str,d_nn)
  #display_results(zipped)

Tree loaded correctly
Query id: 1037387.jpg
Query answered in 0.02215862274169922  s
Results: ['1037387.jpg', '3001416.jpg', '3254904.jpg', '2895543.jpg', '3526250.jpg', '3466578.jpg', '2346400.jpg', '3168158.jpg', '3311886.jpg', '3833803.jpg']
Distances: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Distance Computed: 829
File Accessed: 1


# Create Tree Test

In [None]:
# Create Tree Test 1
vantage_point_tree = VP_Tree("Index_Test_1_sim",4, disk_mode=True, use_similarity=True)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_1,FEATURES_PATH_TEST_1)
result = VP_Tree.print_tree(vantage_point_tree.root, level=1)

print("Result:\n" + result + "\n")

index_name = vantage_point_tree.index_name

path = VP_Tree.save_vptree(os.path.join(TEST_PATH, index_name),vantage_point_tree)

print("Saved on:", path)

tree_loaded = VP_Tree.load_vptree(path)

print("\nTree Loaded\n")
result = VP_Tree.print_tree(tree_loaded.root, level=1)
print(result)

Number of data: 21
The max height of the tree is: 3
Tree is building
Building of the tree completed in: 3.6347827911376953 s
Result:
id: 0 ('img20', array([10, 31, 10]))
	id: 00 ('img0', array([0, 2, 1]))
		id: 000 ('img7', array([ 5, 23,  4]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/000_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/000_subset_2.npy
		id: 001 ('img4', array([ 5, 16,  1]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/001_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/001_subset_2.npy
	id: 01 ('img3', array([5, 6, 4]))
		id: 010 ('img10', array([77, 61, 46]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/010_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_1_sim/leaves_3/010_subset_2.npy
		i

## Searching Tests

### Tree with Euclidean Distance

In [None]:
# k-NN search Test 1
vantage_point_tree = VP_Tree("Index_Test_knn_1",4)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_1, FEATURES_PATH_TEST_1)
print(VP_Tree.print_tree(vantage_point_tree.root,1))

query = [0,1,1]
k = 3

print("Query:", query, "\n")

nn, d_nn = vantage_point_tree.knn_search(k, query)

print("\nk-NN search on the tree")
print("Points:", nn)
print("Distances:", d_nn)
print("Distance Computed:", vantage_point_tree.distance_computed)
print("File Accessed:", vantage_point_tree.file_accessed)

print("\n\nSequential Scan on the array")
data = np.load(FEATURES_PATH_TEST_1)
distances = [d.euclidean(point, query) for point in data]
zipped = sorted(zip(data, distances), key=lambda x:x[1])
for element in zipped[0:k]:
  print(element)

Number of data: 21
The max height of the tree is: 3
Tree is building
Building of the tree completed in: 0.06978678703308105 s
id: 0 ('img16', array([ 1, 30, 60]))
	id: 00 ('img13', array([ 2,  3, 10]))
		id: 000 ('img18', array([ 5, 23,  1]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/000_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/000_subset_2.npy
		id: 001 ('img17', array([25,  6, 41]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/001_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/001_subset_2.npy
	id: 01 ('img5', array([2, 6, 2]))
		id: 010 ('img6', array([1, 3, 1]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/010_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_knn_1/leaves_3/010_subset_2.npy
		id: 01

In [None]:
# k-NN search Test 2
vantage_point_tree = VP_Tree("Index_Test_3",4)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_2, FEATURES_PATH_TEST_2)
print(VP_Tree.print_tree(vantage_point_tree.root,1))

query = [0,1,1]
k = 3

nn, d_nn = vantage_point_tree.knn_search(k, query)

print("Query:", query)

print("k-NN search on the tree")
print("Points:", nn)
print("Distances:", d_nn)
print("Distance Computed:", vantage_point_tree.distance_computed)
print("File Accessed:", vantage_point_tree.file_accessed)

print("\n\nSequential Scan on the array")
data = np.load(FEATURES_PATH_TEST_2)
distances = [d.euclidean(point, query) for point in data]
zipped = sorted(zip(data, distances), key=lambda x:x[1])
for element in zipped[0:k]:
  print(element)

Number of data: 30
The max height of the tree is: 3
Tree is building
Building of the tree completed in: 6.901199102401733 s
id: 0 ('img_26', array([49., 83., 24.]))
	id: 00 ('img_16', array([56., 97., 40.]))
		id: 000 ('img_23', array([40., 97., 35.]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/000_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/000_subset_2.npy
		id: 001 ('img_1', array([60.21, 24.  , 26.  ]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/001_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/001_subset_2.npy
	id: 01 ('img_8', array([98., 27., 22.]))
		id: 010 ('img_29', array([43., 34., 61.]))
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/010_subset_1.npy
			/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_3/010_subset_2.np

In [None]:
# k-NN search Test 3
vantage_point_tree = VP_Tree("Index_Test_3",4)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_3, FEATURES_PATH_TEST_3)
print(VP_Tree.print_tree(vantage_point_tree.root,1))

query = [2,17,26,31]
k = 6

nn, d_nn = vantage_point_tree.knn_search(k, query)

print("Query:", query)

print("k-NN search on the tree")
print("Points:", nn)
print("Distances:", d_nn)
print("Distance Computed:", vantage_point_tree.distance_computed)
print("File Accessed:", vantage_point_tree.file_accessed)

print("\n\nSequential Scan on the array")
data = np.load(FEATURES_PATH_TEST_3)
distances = [d.euclidean(point, query) for point in data]
zipped = sorted(zip(data, distances), key=lambda x:x[1])
for element in zipped[0:k]:
  print(element)

Number of data: 100
The max height of the tree is: 5
Tree is building
Building of the tree completed in: 0.12288379669189453 s
id: 0 ('img_28', array([86, 64, 36, 58]))
	id: 00 ('img_74', array([48, 53, 26, 39]))
		id: 000 ('img_33', array([45, 51, 30, 44]))
			id: 0000 ('img_64', array([80, 68, 43, 41]))
				/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_4/0000_subset_1.npy
				/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_4/0000_subset_2.npy
			id: 0001 ('img_31', array([82, 28, 35, 29]))
				/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_4/0001_subset_1.npy
				/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_4/0001_subset_2.npy
		id: 001 ('img_13', array([51, 84, 68, 72]))
			id: 0010 ('img_60', array([44, 36, 38, 92]))
				/content/gdrive/My Drive/[MIRCV]FoodWebSearch/antonio-tests/Index_Test_3/leaves_4/0010_subset_1.npy
				/content/gdrive/My Drive/[M

In [None]:
#k-NN search Test 4
vantage_point_tree = VP_Tree("Index_Test_3",4, disk_mode=True)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_3, FEATURES_PATH_TEST_3)

query = [0,1,1,2]
k = 6

nn, d_nn = vantage_point_tree.knn_search(k, query)

print("Query:", query)

print("k-NN search on the tree")
print("Points:", nn)
print("Distances:", d_nn)

print("\n\nSequential Scan on the array")
data = np.load(FEATURES_PATH_TEST_3)
distances = [d.euclidean(point, query) for point in data]
zipped = sorted(zip(data, distances), key=lambda x:x[1])
for element in zipped[0:k]:
  print(element)

Number of data: 100
The max height of the tree is: 5
Tree is building
Building of the tree completed in: 0.11896729469299316 s
Query answered in 0.026964664459228516  s
Query: [0, 1, 1, 2]
k-NN search on the tree
Points: [array(['img_95', array([14, 26, 10, 15])], dtype=object), array(['img_75', array([ 2, 17, 25, 31])], dtype=object), array(['img_45', array([11, 29, 18, 31])], dtype=object), array(['img_20', array([20, 28, 34, 12])], dtype=object), array(['img_20', array([20, 28, 34, 12])], dtype=object), array(['img_80', array([10, 31, 30, 25])], dtype=object)]
Distances: [32.72613634390714, 40.95119045888654, 45.110974274559844, 48.14561246884289, 48.14561246884289, 48.68264577855234]


Sequential Scan on the array
(array([14, 26, 10, 15]), 32.72613634390714)
(array([ 2, 17, 25, 31]), 40.95119045888654)
(array([11, 29, 18, 31]), 45.110974274559844)
(array([20, 28, 34, 12]), 48.14561246884289)
(array([10, 31, 30, 25]), 48.68264577855234)
(array([33, 13, 16, 34]), 49.8196748283246)


### Tree with Cosine Similarity

In [None]:
# k-NN search Test 1 - Similarity
vantage_point_tree = VP_Tree("Index_Test_1_sim",4, use_similarity=True)
vantage_point_tree.create_vptree(FEATURES_NAMES_TEST_1, FEATURES_PATH_TEST_1)

query = [0,2,1]
k = 3

print("Query:", query, "\n")

nn, d_nn = vantage_point_tree.knn_search(k, query)

print("\nk-NN search on the tree")
print("Points:", nn)
print("Distances:", d_nn)

print("\n\nSequential Scan on the array")
data = np.load(FEATURES_PATH_TEST_1)
distances = [1-np.dot(point, query)/(np.linalg.norm(point)*np.linalg.norm(query)) for point in data]
zipped = sorted(zip(data, distances), key=lambda x:x[1])
for element in zipped[0:k]:
  print(element)

NameError: ignored