In [None]:
!python -m spacy download en_core_web_sm
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
!pip install openai==0.27.7

In [None]:
# Install dependencies
!pip install -U sentence-transformers gradio


In [None]:
# !pip install pandas==2.2.2 numpy<2.1.0

In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import torch
import gradio as gr
import re

In [None]:
# !sudo apt purge python3-numpy

In [None]:

from sentence_transformers import SentenceTransformer,util


In [None]:
# Load the product dataset and build the text that is embedded and searched.
import ast
import re


def _join_description(value):
    """Turn a stringified Python list (e.g. "['a', 'b']") into "a, b".

    Non-string values (such as NaN for a missing description) are returned
    unchanged. Strings that are not a valid Python literal are kept as they
    are instead of aborting the whole load.
    """
    if not isinstance(value, str):
        return value
    try:
        # literal_eval only accepts literals, so a crafted CSV cell cannot run
        # arbitrary code the way eval() would.
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value
    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(item) for item in parsed)
    return str(parsed)


df = pd.read_csv("./merged_output.csv")

# Clean and combine text fields
df["product_disc"] = df["product_disc"].apply(_join_description)

# Missing names/descriptions become "" so that a single NaN does not turn the
# whole combined string into NaN (which would then be embedded as the text "nan").
product_text = df["product"].fillna("").astype(str).str.strip()
description_text = df["product_disc"].fillna("").astype(str).str.strip()
df["combined"] = "product: " + product_text + "; product_disc: " + description_text

# Remove punctuation & lowercase
df["combined"] = (
    df["combined"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True).str.lower()
)

# The search cell reads row["product_url"] as the image source, so the
# placeholder must be stored under that same column name when it is missing.
if "product_url" not in df.columns:
    df["product_url"] = (
        "https://via.placeholder.com/200x150.png?text=No+Image"  # Placeholder
    )

df.head()

In [None]:
# Show the cleaned, combined text of the row labelled 0
df.loc[0, "combined"]

In [None]:
# Inspect the prepared DataFrame, including the new `combined` column
df

In [None]:
# Sentence-embedding model; the search function reuses `embedder` for queries
embedder = SentenceTransformer("all-mpnet-base-v2")

# Embed every combined product text in a single call
corpus_texts = df["combined"].tolist()
embeddings = embedder.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_tensor=True,
)

# Keep one plain Python list per row so the vectors fit in a DataFrame column
df["embedding"] = embeddings.cpu().tolist()

In [None]:
def _bm25_tokens(text):
    """Lowercase `text` and split it on whitespace."""
    return text.lower().split()


# Token lists consumed by the BM25 index
df["bm25_tokenized"] = df["combined"].map(_bm25_tokens)

In [None]:
# Inspect the frame again now that `embedding` and `bm25_tokenized` were added
df

In [None]:
!pip install rank_bm25

In [None]:
import torch
import numpy as np
from sentence_transformers import util
from rank_bm25 import BM25Okapi

def _min_max_scale(scores):
    """Rescale a 1-D score tensor to [0, 1]; all-equal scores map to zeros."""
    if scores.numel() == 0:
        return scores
    low, high = scores.min(), scores.max()
    span = high - low
    if span <= 0:
        return torch.zeros_like(scores)
    return (scores - low) / span


def _render_card(product, url, score):
    """Build one HTML result card, escaping dataset text before embedding it."""
    from html import escape

    # A missing URL (e.g. NaN from the CSV) has no .replace(); use a placeholder.
    if not isinstance(url, str) or not url:
        url = "https://via.placeholder.com/200x150.png?text=No+Image"
    # Enhance image quality in URL
    url = url.replace("tr:w-100", "tr:w-400")
    return f"""
        <div style="display:inline-block; margin:10px; width:200px; border:1px solid #ccc; border-radius:10px; padding:10px">
            <img src="{escape(url, quote=True)}" style="width:100%; height:150px; object-fit:cover; border-radius:10px 10px 0 0" />
            <h4>{escape(str(product))}</h4>
            <p style='font-size:14px'>Score: {float(score):.2f}</p>
        </div>
        """


def search_with_html(query, n=7, semantic_weight=0.7, lexical_weight=0.3):
    """Rank products against `query` with a hybrid score and return HTML cards.

    The score is ``semantic_weight * cosine + lexical_weight * bm25``. BM25
    scores are min-max scaled to [0, 1] first, because raw BM25 values are
    unbounded and would otherwise swamp the cosine similarity regardless of
    the weights.

    Args:
        query: Free-text search string.
        n: Maximum number of results; clamped to the number of rows in `df`.
        semantic_weight: Weight of the embedding cosine similarity.
        lexical_weight: Weight of the scaled BM25 score.

    Returns:
        An HTML string: a flex container holding one card per result, or a
        short hint when the query is blank.
    """
    if not isinstance(query, str) or not query.strip():
        return "<p>Please enter a search query.</p>"

    # torch.topk raises when k exceeds the number of candidates.
    k = max(0, min(int(n), len(df)))
    if k == 0:
        return "<div style='display:flex; flex-wrap:wrap'></div>"

    # Select device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Semantic part: cosine similarity between the query and every row.
    query_embedding = (
        embedder.encode(query, convert_to_tensor=True).to(device).to(torch.float32)
    )
    corpus_embeddings = torch.tensor(
        np.array(df["embedding"].tolist()), dtype=torch.float32
    ).to(device)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # Lexical part. The query gets the same cleaning as the corpus text
    # (punctuation stripped, lowercased) so tokens such as "shampoo," match.
    # NOTE: the BM25 index and the corpus tensor are rebuilt on every call.
    bm25 = BM25Okapi(df["bm25_tokenized"].tolist())
    query_tokens = re.sub(r"[^a-zA-Z0-9\s]", "", query).lower().split()
    bm25_scores = torch.tensor(
        bm25.get_scores(query_tokens), dtype=torch.float32
    ).to(device)
    bm25_scores = _min_max_scale(bm25_scores)

    # Combine scores
    final_score = (semantic_weight * cos_scores) + (lexical_weight * bm25_scores)

    # Get top results
    top_results = torch.topk(final_score, k=k)
    top_rows = df.iloc[top_results.indices.cpu().numpy()]
    top_scores = top_results.values.cpu().numpy()

    # Build HTML result cards
    cards = "".join(
        _render_card(product, url, score)
        for product, url, score in zip(
            top_rows["product"], top_rows["product_url"], top_scores
        )
    )
    return f"<div style='display:flex; flex-wrap:wrap'>{cards}</div>"

In [None]:
# Input and output widgets for the search UI
query_box = gr.Textbox(label="Enter your search query")
results_panel = gr.HTML(label="Top Matching Products")

# Wire the search function to the widgets
demo = gr.Interface(
    fn=search_with_html,
    inputs=query_box,
    outputs=results_panel,
    title="Semantic Product Search",
    description="Search through products using semantic understanding.",
)

# Start the app with a shareable link and debug mode enabled
demo.launch(debug=True, share=True)

In [None]:
# Example search query (presumably for the demo above): Laundry Cleaning Products