In [2]:
import os
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv
import torch
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from sentence_transformers import SentenceTransformer



  from tqdm.autonotebook import tqdm, trange





In [3]:
import chromadb
from chromadb.utils import embedding_functions

In [197]:

# Populate the process environment from a local .env file (if one exists) so
# that credentials never have to be written into the notebook itself.
load_dotenv()

# Both lookups yield None when the corresponding variable is not defined.
# NOTE(review): the Hugging Face value is stored in a *_TOKEN variable but is
# read from the HUGGINGFACEHUB_API_KEY entry - confirm that is the key name
# actually used in the .env file.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_KEY")

In [198]:

# --- Notebook configuration -------------------------------------------------
# Filesystem locations (relative to the current working directory).
DATA_DIR = "./data"
CHROMA_DATA_PATH = "./embeddings_database"

# Name of the Chroma collection that holds the caption embeddings.
COLLECTION_NAME = "SearchEngine"

# Checkpoint used for text embeddings.
EMBED_MODEL = "google-bert/bert-base-uncased"

# Embedding function attached to the collection.
# NOTE(review): records are later inserted with embeddings computed manually by
# get_bert_embedding (first-token hidden state). Presumably this function pools
# differently, so text embedded through it may not be comparable - verify.
EMBED_FUNCTION = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBED_MODEL)

In [199]:
# Chroma client whose data is persisted under CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [200]:

# The client is persistent, so the collection survives kernel restarts.
# create_collection() fails when the name already exists, which made this cell
# break on every re-run; get_or_create_collection() reuses the stored
# collection instead and only creates it (with cosine distance) the first time.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=EMBED_FUNCTION,
    metadata={"hnsw:space": "cosine"},
)

In [4]:

# Load the clothing dataset; per the preview below, each row carries an
# `image` dict (with the encoded picture under "bytes") and a `text` caption.
df = pd.read_parquet("Clothes.parquet")

# Preview the first five rows.
df.head(5)

Unnamed: 0,image,text
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Mulberry Blue Zipped Backpack leather bag. a b...
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Balenciaga Black Track leather low trainers. a...
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Gucci Brown Leather vest. a black leather vest.
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Balenciaga Beige Wool pull. a sweater with bla...
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,Mulberry Multicolour Cloth travel bag. a blue ...


In [202]:

import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
    !git clone https://github.com/salesforce/BLIP
    %cd BLIP

In [206]:
from transformers import BlipForConditionalGeneration, BlipProcessor

# Single source of truth for the captioning checkpoint loaded below.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# `processor` prepares images for the model and decodes generated token ids;
# `model` is the BLIP network that produces the caption tokens.
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)




In [207]:
from transformers import AutoTokenizer, BertTokenizer, BertModel  # Import all necessary classes

# Use the same repository id spelling ("Salesforce/...") as the BLIP
# processor/model load so every component refers to one checkpoint name.
blip_tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")

# Tokenizer and encoder must come from the same BERT checkpoint, so the id is
# written once and shared by both loads.
_bert_checkpoint = "google-bert/bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(_bert_checkpoint)
bert_model = BertModel.from_pretrained(_bert_checkpoint)


In [208]:

def generate_caption(image_bytes):
    """Generate a text caption for an encoded image using the BLIP model.

    Args:
        image_bytes: Encoded image content (the dataset stores JPEG bytes)
            accepted by ``tf.io.decode_image``.

    Returns:
        The first decoded caption, with special tokens removed.
    """
    # Always decode to a rank-3 RGB tensor: with the defaults, grayscale or
    # RGBA inputs keep 1 or 4 channels and animated GIFs come back as a rank-4
    # tensor, none of which is the plain RGB image the processor is given here
    # for ordinary JPEGs. Regular RGB images decode exactly as before.
    image = tf.io.decode_image(image_bytes, channels=3, expand_animations=False)
    image_np = image.numpy()

    inputs = processor(images=image_np, return_tensors="pt")
    outputs = model.generate(**inputs)
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    return caption



In [211]:
def get_bert_embedding(text):
    """Embed ``text`` with BERT and return the result as plain Python lists.

    The text is tokenized (with padding and truncation enabled), run through
    ``bert_model`` without gradient tracking, and the hidden state at the first
    token position is taken as the embedding. The leading dimension is squeezed
    away when it has size one, so a single string yields a flat list of floats.
    """
    tokens = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        model_output = bert_model(**tokens)

    # Position 0 along the sequence axis is the first token of every sequence.
    first_token_state = model_output.last_hidden_state[:, 0]
    return first_token_state.squeeze(0).tolist()


In [252]:
    # Caption every image with BLIP, embed the captions with BERT and store
    # the results in the Chroma collection.
    metadatas = []
    new_documents = []  # embeddings of the BLIP-generated captions (stored in Chroma)
    old_documents = []  # embeddings of the dataset's original captions (kept in memory only)
    ids = []
    captions = []

    # Walk the two columns positionally. The previous `df[col][i]` lookup was
    # label-based and only worked while the frame had a default 0..n-1 index.
    for i, (image_dict, image_cap) in enumerate(zip(df['image'], df['text'])):
        image_bytes = image_dict["bytes"]
        caption = generate_caption(image_bytes)
        new_document = get_bert_embedding(caption)
        old_document = get_bert_embedding(image_cap)

        metadata = {
            'original_caption': image_cap,
            'generated_caption': caption,
        }
        metadatas.append(metadata)
        new_documents.append(new_document)
        old_documents.append(old_document)
        ids.append(f"id_{i}")
        captions.append(caption)

    # upsert() makes the cell safe to re-run: records whose id is already in
    # the persistent collection are overwritten with the fresh values instead
    # of being rejected or left stale. Nothing is sent for an empty frame.
    if ids:
        collection.upsert(
            ids=ids,
            embeddings=new_documents,
            metadatas=metadatas,
            documents=captions,
        )
    

In [253]:
# Sanity check: display how many records the collection currently holds.
collection.count()

990