In [1]:
from transformers import AutoImageProcessor, AutoModel


# Hub identifier of the checkpoint. It is recorded here for reference; the
# loading calls below read from a local directory instead.
model_ckpt = "nateraw/vit-base-beans"

# The image processor and the backbone both come from the same local folder
# (presumably a saved copy of `model_ckpt` — TODO confirm).
local_checkpoint_dir = 'vit_base_beans/'
processor = AutoImageProcessor.from_pretrained(local_checkpoint_dir)
model = AutoModel.from_pretrained(local_checkpoint_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

In [3]:
import os

def get_image_files(folder_path, extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):
    """Return the paths of the image files located directly inside a folder.

    Args:
        folder_path: Directory to scan (not searched recursively).
        extensions: Iterable of file-name suffixes that count as images.
            Matching is case-insensitive.

    Returns:
        A list of ``os.path.join(folder_path, name)`` strings, sorted by file
        name. The paths are only as absolute as ``folder_path`` itself.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    suffixes = tuple(ext.lower() for ext in extensions)

    image_files = []
    # os.listdir gives no ordering guarantee; sorting makes the result (and any
    # position-based ids derived from it) reproducible across runs and platforms.
    for name in sorted(os.listdir(folder_path)):
        if not name.lower().endswith(suffixes):
            continue
        full_path = os.path.join(folder_path, name)
        # Skip directories that merely happen to have an image-like name.
        if os.path.isfile(full_path):
            image_files.append(full_path)

    return image_files

# Folder that holds the images to be embedded.
folder_path = 'random/'
Value = get_image_files(folder_path)

# Echo every discovered path, one per line, as a quick sanity check.
for discovered_path in Value:
    print(discovered_path)


random/1.jpg
random/10.jpg
random/11.jpg
random/12.jpg
random/2.jpg
random/3.jpg
random/4.jpg
random/5.jpg
random/6.jpg
random/7.jpeg
random/8.jpg
random/9.jpg


In [4]:
from PIL import Image
import numpy as np

# Example function to load and preprocess images
def load_image(file_path, target_size=(224, 224)):
    """Load an image file as a fixed-size, three-channel numpy array.

    Args:
        file_path: Path of the image file to open.
        target_size: Size passed to ``PIL.Image.Image.resize``.

    Returns:
        The resized image as a numpy array with three (RGB) channels in the
        last axis. Pixel values are left unscaled; no normalisation is applied.
    """
    # The context manager releases the file handle as soon as decoding is done.
    with Image.open(file_path) as source:
        # Grayscale, palette (GIF/PNG) and RGBA files would otherwise produce
        # arrays with a different number of axes/channels, which cannot be
        # stacked into one batch with the RGB images.
        rgb_image = source.convert("RGB")
        resized = rgb_image.resize(target_size)
    return np.array(resized)

In [5]:
# Decode every discovered file in the same order as `Value`, so that list
# positions keep lining up with the file paths.
Images = [load_image(str(image_path)) for image_path in Value]

In [6]:
def extract_embeddings(model: torch.nn.Module, Img, processor=None):
    """Compute one embedding vector per image.

    Args:
        model: Module exposing a ``device`` attribute that, when called with a
            ``pixel_values`` keyword, returns an object with a
            ``last_hidden_state`` tensor.
        Img: Sequence of images. When ``processor`` is omitted, every image
            must be an array-like in channels-last layout with identical
            dimensions so that the batch can be stacked.
        processor: Optional image processor, called as
            ``processor(images=..., return_tensors="pt")``; its
            ``"pixel_values"`` entry is fed to the model. Pass the processor
            that belongs to the checkpoint so that inputs are rescaled and
            normalised the way the model expects. When omitted, the raw pixel
            values are passed through unchanged (the original behaviour).

    Returns:
        ``{"embeddings": tensor}`` where the tensor lives on the CPU and row
        ``i`` is position 0 of ``last_hidden_state`` for ``Img[i]``.
    """
    device = model.device

    if processor is not None:
        pixel_values = processor(images=list(Img), return_tensors="pt")["pixel_values"]
    else:
        # Channels-last -> channels-first, then stack into a single batch.
        pixel_values = torch.stack(
            [torch.as_tensor(image).permute(2, 0, 1) for image in Img]
        )

    new_batch = {"pixel_values": pixel_values.to(device)}
    # Inference only: no gradients are needed.
    with torch.no_grad():
        embeddings = model(**new_batch).last_hidden_state[:, 0].cpu()
    return {"embeddings": embeddings}

In [7]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Despite its name, `extract_fn` holds the result dictionary, not a function.
extract_fn = extract_embeddings(model=model.to(device), Img=Images)

In [8]:
# Inspect the dimensions of the embedding tensor computed above.
extract_fn['embeddings'].shape

torch.Size([12, 768])

In [9]:
# Check which container type holds the embeddings before they are converted for insertion.
type(extract_fn['embeddings'])

torch.Tensor

In [17]:
!pip install pymilvus

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pymilvus
  Downloading pymilvus-2.4.0-1-py3-none-any.whl.metadata (4.5 kB)
Collecting grpcio<=1.60.0,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.60.0-cp39-cp39-win_amd64.whl.metadata (4.2 kB)
Collecting protobuf>=3.20.0 (from pymilvus)
  Downloading protobuf-5.26.1-cp39-cp39-win_amd64.whl.metadata (592 bytes)
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.9.0-cp39-cp39-win_amd64.whl.metadata (8.9 kB)
Collecting minio>=7.0.0 (from pymilvus)
  Downloading minio-7.2.5-py3-none-any.whl.metadata (6.4 kB)
Collecting pyarrow>=12.0.0 (from pymilvus)
  Downloading pyarrow-15.0.2-cp39-cp39-win_amd64.whl.metadata (3.1 kB)
Collecting azure-storage-blob (from pymilvus)
  Downloading azure_storage_blob-12.19.1-py3-none-any.whl.metadata (26 kB)
Collecting marshmallow>=3.0.0 (from environs<=9

In [10]:
# Import the PyMilvus classes and helpers used in the cells below:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

In [11]:
# Register the "default" connection alias against a Milvus server expected at
# localhost:19530.
connections.connect(
    alias="default",
    host="localhost",
    port="19530",
)

In [22]:
# Field definitions for the collection schema: a caller-supplied integer
# primary key, the source file path, and the 768-dimensional embedding vector.
primary_key_field = FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False)
file_path_field = FieldSchema(name="file_path", dtype=DataType.VARCHAR, max_length=200)
embedding_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=768)

fields = [primary_key_field, file_path_field, embedding_field]



In [14]:
# Wrap the field definitions in a schema, then bind `hello_milvus` to the
# collection named "BDT" using that schema. `fields` must already be defined
# when this cell runs.
schema = CollectionSchema(
    fields=fields,
    description="Image query based Image retrieval system",
)
hello_milvus = Collection(name="BDT", schema=schema)

NameError: name 'fields' is not defined

In [15]:
# Re-bind `hello_milvus` to the collection named "BDT" without passing a schema
# (presumably this relies on the collection already existing on the server — verify).
hello_milvus = Collection(name="BDT")

In [31]:
# Column-wise payload, one inner list per schema field and in the same order:
# primary keys (list positions), file paths, embedding vectors.
primary_keys = list(range(len(Value)))
file_paths = list(Value)
embedding_rows = [row.numpy() for row in extract_fn['embeddings']]

entities = [primary_keys, file_paths, embedding_rows]

In [36]:
# Index settings for the vector field: IVF_FLAT index type, L2 metric, nlist of 128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

# Build the index on the "embeddings" field; the returned status is shown as
# the cell output.
hello_milvus.create_index(field_name="embeddings", index_params=index)

Status(code=0, message=)

In [37]:
# Send the column-wise `entities` lists to the collection; the returned
# mutation summary is displayed as the cell output.
hello_milvus.insert(entities)

(insert count: 12, delete count: 0, upsert count: 0, timestamp: 448965910330605571, success count: 12, err count: 0)

In [38]:
# Flush the collection after inserting (presumably so the new rows are
# persisted before the collection is loaded — confirm against the PyMilvus docs).
hello_milvus.flush() 

In [39]:
# Load the collection ahead of the search below (presumably required by
# Milvus before a collection can be queried — confirm against the PyMilvus docs).
hello_milvus.load()

In [17]:
# Use the stored embedding at row 7 as the query vector.
vectors_to_search = extract_fn['embeddings'][7].numpy()
search_params = dict(
    metric_type="L2",
    params=dict(nprobe=10),
)

# Ask for the 3 closest entries and have each hit carry its file path.
result = hello_milvus.search(
    data=[vectors_to_search],
    anns_field="embeddings",
    param=search_params,
    limit=3,
    output_fields=["file_path"],
)
result

['["id: 7, distance: 0.0, entity: {\'file_path\': \'random/5.jpg\'}", "id: 4, distance: 0.8913886547088623, entity: {\'file_path\': \'random/2.jpg\'}", "id: 8, distance: 1.9012119770050049, entity: {\'file_path\': \'random/6.jpg\'}"]']

In [52]:
# result[0] holds the hits for the single query vector; print them one per line.
for hit in result[0]:
    print(hit)

id: 5, distance: 0.0, entity: {'file_path': 'random/3.jpg'}
id: 0, distance: 3.152235746383667, entity: {'file_path': 'random/1.jpg'}
id: 8, distance: 3.4347569942474365, entity: {'file_path': 'random/6.jpg'}


In [54]:
# Drop the collection by the name it was created under. "BDT" is the collection
# name; `hello_milvus` is only the Python variable holding its handle, so
# passing 'hello_milvus' here would leave the "BDT" collection on the server.
utility.drop_collection('BDT')