# LanceDB Myntra Fashion Search Engine

[View in Colab](https://colab.research.google.com/drive/17CNo2rkbFYaIYcS5_ABd-fimBDWBi0C7?usp=sharing)

## Preliminaries

In [1]:
%%capture

# !pip install lancedb
# !pip install open_clip_torch

In [2]:
import os
import pandas as pd
from PIL import Image
from pathlib import Path
from random import sample

import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import EmbeddingFunctionRegistry

from typing import Any

## Download Data Instructions
- For this project you need to download the [Myntra Fashion Product Dataset](https://www.kaggle.com/datasets/hiteshsuthar101/myntra-fashion-product-dataset) from Kaggle.
- Create a folder named `input` within `session_1`.
- Unzip the downloaded data and move it into the `input` folder.
- The final directory structure should look like this:
```text
Week9
  |-session_1
  |     |-input
  |         |-Fashion Dataset.csv
  |         |-Images
  |             |-Images
  |                |-0.jpg
  |                |-2.jpg
  |                .
  |                .
  |                .
  |-session_2
        |-....
```

In [3]:
import pandas as pd

# Load the product metadata CSV from the `input` folder (path is relative to the
# notebook's working directory); the first CSV column becomes the DataFrame index.
df = pd.read_csv("input/Fashion Dataset.csv", index_col=0)

# Preview the first rows; as the last expression in the cell it is rendered as output.
df.head()

Unnamed: 0,p_id,name,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,17048614.0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,16524740.0,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,16331376.0,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,14709966.0,Nayo Women Red Floral Printed Kurta With Trous...,3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,11056154.0,AHIKA Women Black & Green Printed Straight Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size..."


## Embedding Model

In [4]:
def register_model(model_name: str) -> Any:
    """
    Look up an embedding function by name and instantiate it.

    The name is resolved through the registry instance returned by
    ``EmbeddingFunctionRegistry.get_instance()``; the entry found there is
    then instantiated with ``create()``.

    Args:
        model_name (str): Registry key of the embedding function, e.g. "open-clip".

    Returns:
        model: The embedding function instance produced by ``create()``.

    Usage:
    >>> model = register_model("open-clip")
    """
    embedding_factory = EmbeddingFunctionRegistry.get_instance().get(model_name)
    return embedding_factory.create()

## Schema

In [5]:
# Instantiate the "open-clip" embedding function from LanceDB's registry.
# The Myntra schema uses it to declare both its vector and its source field.
clip = register_model("open-clip")


class Myntra(LanceModel):
    """
    LanceDB table schema for Myntra product images.

    Attributes:
        vector (Vector): The vector representation of the item. Its dimension is
            taken from ``clip.ndims()`` and it is declared via ``clip.VectorField()``.
        image_uri (str): The URI (here: file path) of the item's image. Declared via
            ``clip.SourceField()``, which marks it as the input of the embedding function.
        image: Read-only property that opens ``image_uri`` with Pillow.
    """

    vector: Vector(clip.ndims()) = clip.VectorField()
    image_uri: str = clip.SourceField()

    @property
    def image(self):
        """Open the file at ``image_uri`` with Pillow and return the image object.

        The caller owns the returned image and is responsible for closing it.
        """
        return Image.open(self.image_uri)


# Function to map schema name to schema class
def get_schema_by_name(schema_name: str) -> Any:
    """
    Resolve a schema name to its schema class.

    Args:
        schema_name (str): Name of the schema to look up. Only "Myntra" is known.

    Returns:
        object: The schema class registered under ``schema_name``, or None when
        the name is not known.

    Usage:
    >>> schema = get_schema_by_name("Myntra")
    """
    known_schemas = dict(Myntra=Myntra)
    try:
        return known_schemas[schema_name]
    except KeyError:
        # Unknown names are not an error; callers receive None.
        return None

## Creating a Table

In [6]:
def create_table(
    database: str,
    table_name: str,
    data_path: str,
    mode: str = "create",  # "create", "overwrite"
    schema: Any = Myntra,
    sample_size: int = 100,
) -> None:
    """
    Create a table in the specified vector database and add image data to it.

    If the table already exists and ``mode`` is not "overwrite", nothing is
    changed. Otherwise the ``*.jpg`` files directly inside ``data_path`` are
    listed (non-recursively), a random sample of them is taken, and their paths
    are inserted as the ``image_uri`` column of a new table.

    Args:
        database (str): The path/URI of the LanceDB database to connect to.
        table_name (str): The name of the table to create.
        data_path (str): The directory that directly contains the ``.jpg`` images.
        mode (str): The mode for creating the table. Defaults to "create".
        schema (Schema, optional): The schema to use for the table. Defaults to Myntra.
        sample_size (int, optional): The number of images to sample from the data.
            Defaults to 100. If fewer images are available, all of them are used.

    Returns:
        None

    Raises:
        ValueError: If ``sample_size`` is negative or no ``.jpg`` image is found
            in ``data_path``. Raised before any existing table is dropped.

    Usage:
    >>> create_table(database="lancedb_myntra", table_name="fashion", data_path="input")
    """

    # Connect to the lancedb database
    db = lancedb.connect(database)

    # Keep an existing table untouched unless the caller asked to overwrite it
    if table_name in db and mode != "overwrite":
        print(f"Table {table_name} already exists in the database")
        return

    print(f"Creating table {table_name} in the database")

    # Collect and validate the input BEFORE touching the database, so that a
    # wrong data_path cannot destroy an existing table and then fail.
    image_dir = Path(data_path).expanduser()
    uris = [str(f) for f in image_dir.glob("*.jpg")]
    print(f"Found {len(uris)} images in {image_dir}")

    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")
    if not uris:
        raise ValueError(f"No .jpg images found in {image_dir}")

    # Sample sample_size images from the data.
    # Increase this value for more accurate results but
    # it will take more time to process embeddings.
    # random.sample raises if asked for more items than exist, so clamp.
    uris = sample(uris, min(sample_size, len(uris)))

    # Only reachable with mode == "overwrite": remove the old table first
    if table_name in db:
        db.drop_table(table_name)

    # Create the table with the given schema
    table = db.create_table(table_name, schema=schema, mode=mode)

    # Add the data to the table
    print(f"Adding {len(uris)} images to the table")
    table.add(pd.DataFrame({"image_uri": uris}))
    print(f"Added {len(uris)} images to the table")

In [14]:
# The data_path should refer to the folder that directly contains the .jpg images
# (create_table lists "*.jpg" in that folder only, not in subfolders).
# mode="overwrite" rebuilds the table on every run of this cell;
# sample_size=1000 inserts a random sample of 1000 images.

create_table(
    database="lancedb_myntra",
    table_name="fashion",
    data_path="input/Images/Images",
    mode="overwrite",
    sample_size=1000,
)

Creating table fashion in the database
Found 14481 images in input/Images/Images
Adding 1000 images to the table


[90m[[0m2025-06-20T20:20:55Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/ishandutta/Documents/code/GenAIEngineering-Cohort1/Week9/session_1/lancedb_myntra/fashion.lance, it will be created
100%|██████████| 64/64 [00:01<00:00, 51.07it/s]
100%|██████████| 64/64 [00:01<00:00, 51.51it/s]
100%|██████████| 64/64 [00:01<00:00, 49.89it/s]
100%|██████████| 64/64 [00:01<00:00, 52.77it/s]
100%|██████████| 64/64 [00:01<00:00, 45.77it/s]
100%|██████████| 64/64 [00:01<00:00, 52.56it/s]
100%|██████████| 64/64 [00:01<00:00, 51.17it/s]
100%|██████████| 64/64 [00:01<00:00, 49.81it/s]
100%|██████████| 64/64 [00:01<00:00, 51.99it/s]
100%|██████████| 64/64 [00:01<00:00, 49.87it/s]
100%|██████████| 64/64 [00:01<00:00, 51.85it/s]
100%|██████████| 64/64 [00:01<00:00, 52.88it/s]
100%|██████████| 64/64 [00:01<00:00, 51.26it/s]
100%|██████████| 64/64 [00:01<00:00, 52.97it/s]
100%|██████████| 64/64 [00:01<00:00, 51.81it/s]
100%|██████████| 40/40 [00:00<00:00, 51.47it/s]

Added 1000 images to the table





## Vector Search

In [8]:
def run_vector_search(
    database: str,
    table_name: str,
    schema: Any,
    search_query: Any,
    limit: int = 6,
    output_folder: str = "output",
) -> None:
    """
    Run a vector search on a LanceDB table and save the matching images.

    The search can be performed on either text or image data. A string query
    ending in '.jpg' or '.png' is treated as the path of a query image and is
    opened with Pillow; any other string is used as a text query. A non-string
    query (e.g. an already opened Pillow image) is passed to the search as is.

    Up to ``limit`` results are written to ``output_folder`` as
    ``image_0.jpg``, ``image_1.jpg``, ... Files already present in
    ``output_folder`` are deleted first; subdirectories are left untouched.

    Args:
        database (str): The path to the database.
        table_name (str): The name of the table.
        schema (Schema): The schema to use for converting search results to Pydantic
            models. Each result must expose an ``image`` attribute (a Pillow image).
        search_query (Any): The search query, can be text or image.
        limit (int, optional): The maximum number of results to return. Defaults to 6.
        output_folder (str, optional): The folder to save the output images. Defaults to "output".

    Returns:
        None

    Usage:
    >>> run_vector_search(database="lancedb_myntra", table_name="fashion", schema=Myntra, search_query="Black Kurta")

    """

    # Start from an empty output folder: create it if missing, otherwise remove
    # the files left over from a previous search. Only files (and symlinks) are
    # removed; os.remove cannot delete directories.
    if os.path.isdir(output_folder):
        for entry in os.listdir(output_folder):
            entry_path = os.path.join(output_folder, entry)
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.remove(entry_path)
    else:
        os.makedirs(output_folder)

    # Connect to the lancedb database
    db = lancedb.connect(database)

    # Open the table
    table = db.open_table(table_name)

    # Decide between image and text search from the query's type rather than
    # from the text of an AttributeError, which only matched JPEG image objects.
    opened_query_image = None
    if isinstance(search_query, str):
        if search_query.endswith((".jpg", ".png")):
            opened_query_image = Image.open(search_query)
            search_query = opened_query_image
    else:
        print(
            "Running via Streamlit, search query is already an array so skipping opening image using Pillow"
        )

    # Perform the vector search and retrieve the results
    try:
        rs = table.search(search_query).limit(limit).to_pydantic(schema)
    finally:
        # Close only the image opened here; a caller-supplied image stays open.
        if opened_query_image is not None:
            opened_query_image.close()

    # Save the images to the output folder. Iterate over the actual results:
    # the table may hold fewer than `limit` rows.
    for i, result in enumerate(rs):
        image_path = os.path.join(output_folder, f"image_{i}.jpg")
        # The context manager closes the underlying file handle after saving
        with result.image as img:
            img.save(image_path, "JPEG")

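After the search completes, the matching images are saved in the `output` folder as `image_0.jpg`, `image_1.jpg`, and so on. Note that any files already in that folder are deleted at the start of each search.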

## Text Search

Run a text-to-image search against the database. The results are saved in the `Week9/session_1/output` folder.

In [26]:
# Text query: the string does not end in ".jpg" or ".png", so it is searched as text.
# The top 3 matches are written to the "output" folder.
run_vector_search(
    database="lancedb_myntra",
    table_name="fashion",
    schema=Myntra,
    search_query="polka dot black",
    limit=3,
    output_folder="output",
)

# sample search queries
# polka dot
# black saree
# pink kurta
# red dress
# pink skirt
# polka dot black top 


## Image Search

Run an image-to-image search against the database. The results are saved in the `Week9/session_1/output` folder.

In [27]:
# Image query: the string ends in ".jpg", so the file is opened with Pillow and
# used as the query image. The top 3 matches are written to the "output" folder.
run_vector_search(
    database="lancedb_myntra",
    table_name="fashion",
    schema=Myntra,
    search_query="input/Images/Images/0.jpg",
    limit=3,
    output_folder="output",
)

---