In [1]:
#! pip install pymilvus

In [2]:
from sentence_transformers import SentenceTransformer
import glob
from PIL import Image

import numpy as np
import time
from pathlib import Path

from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

In [3]:
connections.connect("default", host="34.168.23.74", port="19530")

In [4]:
collection_name = "fsdl_cosine"

In [5]:
has = utility.has_collection(collection_name)
print(f"Does the photo embedding collection exist in milvus: {has}")

Does the photo embedding collection exist in milvus: True


In [6]:
dim = 512

In [7]:
if not has:
    fields = [
        FieldSchema(name="img_name", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]

    schema = CollectionSchema(fields, "FDSL project embedding database")
    fsdl_collection = Collection(collection_name, schema, consistency_level="Strong")
else:
    fsdl_collection = Collection(collection_name)

In [8]:
fsdl_collection

<Collection>:
-------------
<name>: fsdl_cosine
<partitions>: [{"name": "_default", "collection_name": "fsdl_cosine", "description": ""}]
<description>: FDSL project embedding database
<schema>: {
  auto_id: False
  description: FDSL project embedding database
  fields: [{
    name: img_name
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 512}
  }]
}

In [9]:
img_names = list(glob.glob('/home/arylwen/github/fsdl_project_docker-img/semantic_search/ml_api/api/static/img/*.jpg'))
len(img_names)

403210

In [10]:
#num_images = 10
num_images = 4096
#num_images = len(img_names)
chunk_size = 3072

In [11]:
import torch
img_model = SentenceTransformer('clip-ViT-B-32')

def encode_images_cosine(img_model, img_names):
    print("Images:", len(img_names))
    img_emb=torch.empty(0)
    #split in chunks
    chunks = [img_names[x:x+chunk_size] for x in range(0, len(img_names), chunk_size)]
    #list comprehension reaches the limit of open files; TODO - how to do this in parallel
    for chunk in chunks:
        emb_tensor = img_model.encode([Image.open(filepath) for filepath in chunk],
                                        batch_size=128, convert_to_tensor=True, 
                                        show_progress_bar=True)
        print(emb_tensor.shape)
        img_emb = torch.cat((img_emb, emb_tensor),0)
    #print(img_emb)
    print(img_emb.shape)
    #place all points on the unit sphere
    img_emb = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
    return img_emb, img_names

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [12]:
img_emb, img_names = encode_images_cosine(img_model, img_names[:num_images])

Images: 4096


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

torch.Size([3072, 512])


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

torch.Size([1024, 512])
torch.Size([4096, 512])


In [13]:
img_emb_np = img_emb.detach().numpy()

In [14]:
np.save("../data/abo-cosine.npy", img_emb_np)

In [15]:
fnames = {idx: i.split("/")[-1].split(".")[0] for idx, i in enumerate(img_names[:num_images])}
len(fnames)

4096

In [16]:
#list(fnames.keys())

In [17]:
#fnames

In [18]:
#fnames.values()

In [19]:
entities = [
    list(fnames.values()),
    img_emb_np
]
#entities

[['7426229b',
  '5246e002',
  '67a581b7',
  '36f01b41',
  '3f527ee3',
  'faa2f842',
  '023b739e',
  'f57bcfd5',
  '639d9982',
  '9064d02e',
  '35c46162',
  '3585f94b',
  '6875115b',
  'a04095dc',
  '6ffa219a',
  'ad839d72',
  '6700d8a8',
  '3eb8ab95',
  '556c80dd',
  '5d4b8612',
  '84d62d14',
  '363feda9',
  'bcd7170a',
  'ccc854b0',
  'f3a375c4',
  'ce8b81d1',
  'e68ae3da',
  'bb75e2a0',
  '0e2bd2a7',
  'f983f443',
  '58cc7969',
  'e3545864',
  '0126db7b',
  '0006c55d',
  '8134e0b5',
  '22cab603',
  'a96e933b',
  '2e3ab99c',
  '2ec99359',
  'e0ae1f54',
  '33042887',
  '4641b630',
  '23ab4289',
  '98925f50',
  '6b9ff87e',
  'c9d85fb0',
  '7b8cf26d',
  '2196d29a',
  '78828b5a',
  '410eb5c6',
  '355e6c44',
  '35f7aa70',
  '29a9bf1a',
  'e93be97b',
  '2ce70687',
  'a88baf05',
  'f091f56a',
  'd0be817d',
  '45669e33',
  '062c6cc4',
  '540a5562',
  '80116fcf',
  '4a02b256',
  '3d5c77dc',
  'eb332007',
  '6f16409a',
  'dab1bade',
  '5517b459',
  '0ac7074b',
  '008aedce',
  'aa91c8b9',
  'f92

In [20]:
len(entities[1])

4096

In [21]:
insert_result = fsdl_collection.insert(entities)
print(f"Number of entities in the fsdl collection: {fsdl_collection.num_entities}")

Number of entities in the fsdl collection: 8192


In [22]:
#create an innner product metric for cosine similarity
#inner product metric measures orientation
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "IP",
    "params": {"nlist": 128}
}
fsdl_collection.create_index("embeddings", index)

Status(code=0, message='')

In [23]:
fsdl_collection.load()
fsdl_collection

<Collection>:
-------------
<name>: fsdl_cosine
<partitions>: [{"name": "_default", "collection_name": "fsdl_cosine", "description": ""}]
<description>: FDSL project embedding database
<schema>: {
  auto_id: False
  description: FDSL project embedding database
  fields: [{
    name: img_name
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 512}
  }]
}

In [24]:
len(entities[-1][-2:])

2

In [25]:
vectors_to_search = entities[-1][-2:]
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 10},
}

start_time = time.time()
result = fsdl_collection.search(vectors_to_search, "embeddings", 
                                search_params, limit=3, output_fields=["img_name"])
end_time = time.time()

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, img_name: {hit.entity.get('img_name')}")
print(search_latency_fmt.format(end_time-start_time))

hit: (distance: 1.0, id: 88aad2c1), img_name: 88aad2c1
hit: (distance: 0.8719464540481567, id: ce7b4545), img_name: ce7b4545
hit: (distance: 0.861721932888031, id: ef9286da), img_name: ef9286da
hit: (distance: 0.9999998807907104, id: 6f03bc7b), img_name: 6f03bc7b
hit: (distance: 0.8344776034355164, id: dc90f943), img_name: dc90f943
hit: (distance: 0.814020037651062, id: 43bd06ef), img_name: 43bd06ef
search latency = 0.4802s


In [34]:
import pandas as pd
df = pd.DataFrame(['https://storage.googleapis.com/fsdl_images/semsearch/88aad2c1.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/ce7b4545.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/ef9286da.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/6f03bc7b.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/dc90f943.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/43bd06ef.jpg'])
df.columns=["images"]

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

df.style.format(make_clickable)

Unnamed: 0,images
0,https://storage.googleapis.com/fsdl_images/semsearch/88aad2c1.jpg
1,https://storage.googleapis.com/fsdl_images/semsearch/ce7b4545.jpg
2,https://storage.googleapis.com/fsdl_images/semsearch/ef9286da.jpg
3,https://storage.googleapis.com/fsdl_images/semsearch/6f03bc7b.jpg
4,https://storage.googleapis.com/fsdl_images/semsearch/dc90f943.jpg
5,https://storage.googleapis.com/fsdl_images/semsearch/43bd06ef.jpg


In [35]:
from IPython.display import Image, HTML

def path_to_image_html(path):
    '''
     This function essentially convert the image url to 
     '<img src="'+ path + '"/>' format. And one can put any
     formatting adjustments to control the height, aspect ratio, size etc.
     within as in the below example. 
    '''

    return '<img src="'+ path + '" style=max-height:124px;"/>'

HTML(df.to_html(escape=False ,formatters=dict(images=path_to_image_html)))

Unnamed: 0,images
0,
1,
2,
3,
4,
5,


In [36]:
import semsearch
from semsearch.encoding import clip_encoding


ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [46]:
query_text =  "black shoes"
query_embed = clip_encoding.get_text_embeddings(query_text).detach().numpy()
len(query_embed[0])

512

In [47]:
vectors_to_search = [query_embed[0]]
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 10},
}

start_time = time.time()
result = fsdl_collection.search(vectors_to_search, "embeddings", 
                                search_params, limit=3, output_fields=["img_name"])
end_time = time.time()

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, img_name: {hit.entity.get('img_name')}")
print(search_latency_fmt.format(end_time-start_time))

hit: (distance: 0.30680763721466064, id: 269892fb), img_name: 269892fb
hit: (distance: 0.2969932556152344, id: a564aec1), img_name: a564aec1
hit: (distance: 0.29586440324783325, id: 340dd50c), img_name: 340dd50c
search latency = 0.4192s


In [48]:
df = pd.DataFrame(['https://storage.googleapis.com/fsdl_images/semsearch/269892fb.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/a564aec1.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/340dd50c.jpg',
                  ])
df.columns=["images"]

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

df.style.format(make_clickable)

Unnamed: 0,images
0,https://storage.googleapis.com/fsdl_images/semsearch/269892fb.jpg
1,https://storage.googleapis.com/fsdl_images/semsearch/a564aec1.jpg
2,https://storage.googleapis.com/fsdl_images/semsearch/340dd50c.jpg


In [49]:
HTML(df.to_html(escape=False ,formatters=dict(images=path_to_image_html)))

Unnamed: 0,images
0,
1,
2,
