In [None]:
!pip install pandas numpy
!pip install chromadb langchain langchain-community
!pip install peft


#### Finetuning embedding model using LoRA and triplet loss
    - Prepare Data - Triplet Datasets
        - Anchor/Query: Product to find similar items for
        - Positive: Represents a similar product
        - Negative: Represents a dissimilar product
    - Choose Model : Capable of producing high quality vector representations
        - Siamese Networks(Twin Networks): These networks have two branches that process inputs (anchor and positive/negative) and output embeddings that are compared using a loss function.
        - Sentence Transformers: Designed for semantic similarity tasks, these models can learn embeddings for text(product descriptions, titles)
    - Train with triplet loss function

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the product catalogue sample from the CSV next to the notebook and preview the first rows.
df_master = pd.read_csv(filepath_or_buffer="sample_20k.csv")
df_master.head(n=5)

Unnamed: 0,item_id,item_name,product_type,country,enhanced_product_desc,image_path,image_caption,complete_product_description
0,B07TG4V6BV,Amazon Brand - Solimo Designer Black and White...,CELLULAR_PHONE_CASE,IN,"Given Product description: , No Warranty, bran...",5f/5f39a379.jpg,a black and white phone case with a design on it,a black and white phone case with a design on ...
1,B07T2K5MY1,Amazon Brand - Solimo Designer Galaxy 3D Print...,CELLULAR_PHONE_CASE,IN,"Given Product description: , No Warranty, bran...",4a/4ab3ead6.jpg,a phone case with an orange swirl design,a phone case with an orange swirl design Given...
2,B0854LLTNR,Amazon Brand - Solimo Designer Daddy's Girl an...,CELLULAR_PHONE_CASE,IN,"Given Product description: , Extreme precision...",cd/cd678bbf.jpg,a black and white phone case with the words da...,a black and white phone case with the words da...
3,B07TGBPM1H,Amazon Brand - Solimo Designer Kiss-ing Couple...,CELLULAR_PHONE_CASE,IN,"Given Product description: , None, brand: Amaz...",d4/d47d521f.jpg,a couple is silhouetted against each other pho...,a couple is silhouetted against each other pho...
4,B077VJDKLV,"find. Women’s Flat Mule Sandals, Black, 8 UK",SANDAL,GB,"Given Product description: , Leather look uppe...",8e/8e2b2da0.jpg,a pair of black sandals on a white background,a pair of black sandals on a white background ...


In [3]:
# Optional down-sampling for quick experiments (disabled: the whole frame is used below).
# SAMPLE_SIZE = 500
# df_tune_sample = df_master.sample(SAMPLE_SIZE)

# Work on a copy so df_master itself is never modified by later cells.
df_tune_sample = df_master.copy()

# Show which product types are present and how many distinct ones there are.
product_types = df_tune_sample['product_type']
print(product_types.unique())
print(product_types.nunique())

['CELLULAR_PHONE_CASE' 'SANDAL' 'SHOES' 'OUTDOOR_LIVING'
 'HEALTH_PERSONAL_CARE' 'GPS_OR_NAVIGATION_ACCESSORY' 'GROCERY' 'HERB'
 'BEAUTY' 'CHAIR' 'NUTRITIONAL_SUPPLEMENT' 'KITCHEN' 'JANITORIAL_SUPPLY'
 'ABIS_DRUGSTORE' 'EARRING' 'HOME_BED_AND_BATH' 'BACKPACK' 'LAMP' 'HOME'
 'HANDBAG' 'HARDWARE_HANDLE' 'SAUTE_FRY_PAN' 'FINENECKLACEBRACELETANKLET'
 'HOME_LIGHTING_AND_LAMPS' 'NECKLACE' 'TABLE' 'PET_SUPPLIES'
 'ELECTRIC_FAN' 'SCISSORS' 'WATCH' 'COFFEE' 'WRITING_BOARD' 'SNACK_MIX'
 'SKIN_FOUNDATION_CONCEALER' 'SOFA' 'HEADBOARD' 'AUTO_ACCESSORY'
 'DISHWASHER_DETERGENT' 'OFFICE_PRODUCTS' 'SLOW_COOKER'
 'HOME_FURNITURE_AND_DECOR' 'FURNITURE' 'SCREEN_PROTECTOR' 'RUG'
 'NUT_BUTTER' 'PILLOW' 'BOOT' 'VITAMIN' 'CHARGING_ADAPTER'
 'WIRELESS_ACCESSORY' 'LIGHT_BULB' 'HAT' 'MASCARA' 'AV_FURNITURE' 'CLOCK'
 'WASTE_BAG' 'SKIN_CLEANING_AGENT' 'ACCESSORY' 'SUITCASE' 'WALL_ART'
 'SUNSCREEN' 'ELECTRONIC_ADAPTER' 'BABY_BOTTLE' 'BABY_PRODUCT'
 'MILK_SUBSTITUTE' 'BED_FRAME' 'DRINKING_CUP' 'WALLET' 'SHELF'
 'COM

In [6]:
## 1. Data Preparation: Load and format triplet dataset

## 2. Model Selection

## 3. Training with Triplet Loss
#     - Define Loss Function (e.g., TripletMarginLoss)
#     - Training Loop
#     - Calculate Loss
#     - Backpropagate
#     - Update Weights

## 4. Evaluation and Fine-Tuning

#### 1. Data Preparation: Load and format triplet dataset

In [4]:
## 1. Data Preparation: Load and format triplet dataset

import random

# Fixed seed so the same triplets are produced on every Restart & Run All.
TRIPLET_SEED = 42


def build_triplets(df, seed=TRIPLET_SEED):
    """Build one (anchor, positive, negative) text triplet per categorised row of ``df``.

    - anchor:   the row's ``item_name``
    - positive: ``complete_product_description`` of another row with the same
      ``product_type`` (the row itself only when it is the sole member of its category)
    - negative: ``complete_product_description`` of a row from a different
      ``product_type``; the category is drawn uniformly first, then a row inside it

    Args:
        df: DataFrame with ``product_type``, ``item_name`` and
            ``complete_product_description`` columns.
        seed: Seed for the private random generator used for all sampling.

    Returns:
        List of ``(anchor_text, positive_text, negative_text)`` tuples.

    Raises:
        ValueError: If ``df`` has fewer than two product types, because no
            negative can be drawn in that case.
    """
    rng = random.Random(seed)

    # Materialise each category's rows once instead of filtering the whole
    # DataFrame again for every anchor.
    records_by_category = {
        category: group.to_dict("records")
        for category, group in df.groupby("product_type")
    }
    categories = list(records_by_category)
    if len(categories) < 2:
        raise ValueError("At least two product types are required to sample negatives.")

    built = []
    for position, category in enumerate(categories):
        members = records_by_category[category]
        # Every category except the current one is a valid source of negatives.
        other_categories = categories[:position] + categories[position + 1:]

        for anchor_idx, anchor in enumerate(members):
            # Select a positive sample from the same category, skipping the anchor itself.
            if len(members) > 1:
                pick = rng.randrange(len(members) - 1)
                positive = members[pick + 1 if pick >= anchor_idx else pick]
            else:
                # Single-member category: fall back to the anchor so the row still yields a triplet.
                positive = anchor

            # Select a negative sample from a different category.
            negative = rng.choice(records_by_category[rng.choice(other_categories)])

            built.append((
                anchor["item_name"],
                positive["complete_product_description"],
                negative["complete_product_description"],
            ))
    return built


# Category groups - ensure each has enough samples.
category_groups = df_tune_sample.groupby('product_type')

# Create triplet samples
triplets = build_triplets(df_tune_sample)



In [5]:
# Sanity check: how many triplets the previous cell produced.
len(triplets)

20000

#### 2. Load pre-trained model

In [6]:
# Load Pre-Trained Model

from transformers import AutoModel, AutoTokenizer
import torch
from peft import get_peft_model, LoraConfig, TaskType

# Checkpoint identifier shared by the encoder and its tokenizer.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
base_model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)


  from .autonotebook import tqdm as notebook_tqdm


#### 3. Apply LoRA configuration

In [7]:
# LoRA fine-tunes a small set of added adapter weights instead of the full model.
# No target modules are listed, so PEFT's default selection for this architecture is used.
config = LoraConfig(
    r=8,                                    # Low-rank dimension
    lora_alpha=32,                          # Scaling factor
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,  # We will extract embeddings
    inference_mode=False,
)

# Wrap the base model with the adapters and report how many parameters will be trained.
peft_model = get_peft_model(base_model, config)
peft_model.print_trainable_parameters()

trainable params: 73,728 || all params: 22,786,944 || trainable%: 0.3236


#### 4. Create Triplet Loss Dataset
Tokenize anchor, positive and negative samples.

In [8]:
from torch.utils.data import Dataset, DataLoader

class ProductTripletDataset(Dataset):
    """Dataset of tokenized (anchor, positive, negative) text triplets.

    Args:
        triplets: Sequence of ``(anchor_text, positive_text, negative_text)`` tuples.
        tokenizer: Callable tokenizer; it is invoked with ``padding="max_length"``,
            ``truncation=True``, ``return_tensors="pt"`` and ``max_length``.
        max_length: Fixed token length every text is padded/truncated to.
    """

    def __init__(self, triplets, tokenizer, max_length=128):
        self.triplets = triplets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.triplets)

    def _encode(self, text):
        """Tokenize one text and return a plain dict of 1-D tensors."""
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )
        # The tokenizer adds a leading batch axis of size 1 for a single text.
        # Drop exactly that axis so default collation yields (batch, seq_len)
        # and consumers do not need a shape-dependent squeeze().
        return {key: value.squeeze(0) for key, value in encoded.items()}

    def __getitem__(self, idx):
        """Return the tokenized triplet at ``idx`` as a dict with keys anchor/positive/negative."""
        anchor, positive, negative = self.triplets[idx]
        return {
            "anchor": self._encode(anchor),
            "positive": self._encode(positive),
            "negative": self._encode(negative),
        }

# Wrap the triplets in the dataset and serve them in shuffled mini-batches of 16.
train_dataset = ProductTripletDataset(triplets=triplets, tokenizer=tokenizer)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

#### 5. Define Training Loop with Triplet Loss
Triplet loss ensures that anchor-positive pairs are closer than anchor-negative pairs.

In [9]:
def get_device():
    """Pick the torch device to run on, print it, and return it.

    MPS is checked first, then CUDA; CPU is used when neither is available.
    """
    if torch.backends.mps.is_available():
        backend = "mps"
    elif torch.cuda.is_available():
        backend = "cuda"
    else:
        backend = "cpu"

    device = torch.device(backend)
    print("Device: ", device)
    return device

In [10]:
import torch.optim as optim
from torch.nn import TripletMarginLoss

# Training hyper-parameters gathered in one place.
NUM_EPOCHS = 5
LEARNING_RATE = 5e-5
TRIPLET_MARGIN = 0.5  # Margin controls separation


def _to_model_inputs(encoded, device):
    """Flatten each tensor to (batch, seq_len) and move it to ``device`` as int64.

    The batch axis is always kept. A bare ``squeeze()`` would also drop it when a
    batch happens to contain a single row, which breaks the forward pass.
    """
    return {
        key: value.reshape(value.shape[0], -1).to(device, dtype=torch.long)
        for key, value in encoded.items()
    }


device = get_device()
peft_model.to(device)
# Ensure model weights are in float32 on MPS
if device.type == "mps":
    peft_model = peft_model.to(torch.float32)

# Hand the optimizer only the parameters that are actually trained.
trainable_params = [param for param in peft_model.parameters() if param.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=LEARNING_RATE)
loss_fn = TripletMarginLoss(margin=TRIPLET_MARGIN)

# Training Loop
for epoch in range(NUM_EPOCHS):
    peft_model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        anchor_inputs = _to_model_inputs(batch["anchor"], device)
        positive_inputs = _to_model_inputs(batch["positive"], device)
        negative_inputs = _to_model_inputs(batch["negative"], device)

        # Get embeddings
        # NOTE(review): pooler_output is used as the sentence embedding here and in the
        # inference function; confirm it is the intended pooling for this checkpoint.
        anchor_embed = peft_model(**anchor_inputs).pooler_output
        positive_embed = peft_model(**positive_inputs).pooler_output
        negative_embed = peft_model(**negative_inputs).pooler_output

        # Compute loss
        loss = loss_fn(anchor_embed, positive_embed, negative_embed)

        # Backpropagation
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}")


Device:  mps
Epoch 1, Loss: 0.07783677551150323
Epoch 2, Loss: 0.0347711225271225
Epoch 3, Loss: 0.027800049084424974
Epoch 4, Loss: 0.02388069538474083
Epoch 5, Loss: 0.0220460361123085


#### 6. Save Fine-tuned Model

In [12]:
# Save fine-tuned model
# All three calls write into the same directory, which the loading cell reads back.
# NOTE(review): the second call saves through `peft_model.base_model` (the PEFT-wrapped
# model), not the original `base_model` object — verify that the weights it writes
# reload cleanly with AutoModel.from_pretrained (no "newly initialized" warnings).
peft_model.save_pretrained("fine_tuned_lora_tripletloss")
peft_model.base_model.save_pretrained("fine_tuned_lora_tripletloss")
tokenizer.save_pretrained("fine_tuned_lora_tripletloss")

('fine_tuned_lora_tripletloss/tokenizer_config.json',
 'fine_tuned_lora_tripletloss/special_tokens_map.json',
 'fine_tuned_lora_tripletloss/vocab.txt',
 'fine_tuned_lora_tripletloss/added_tokens.json',
 'fine_tuned_lora_tripletloss/tokenizer.json')

#### 7. Load Fine-tuned Model

In [13]:
# Load the model
from transformers import AutoModel, AutoTokenizer
import torch
from peft import PeftModel

saved_model_path = "fine_tuned_lora_tripletloss"
base_model = AutoModel.from_pretrained(saved_model_path)
# Load the PEFT model (adapter) on top of the base model
peft_model = PeftModel.from_pretrained(base_model, saved_model_path)

# Move the model to the appropriate device
# (`device` comes from the training section; run get_device() first on a fresh kernel.)
peft_model = peft_model.to(device)

# Ensure model weights are in float32 on MPS
if device.type == "mps":
    peft_model = peft_model.to(torch.float32)

# The model is only used for inference from here on: switch every module,
# including dropout layers, to eval mode so embeddings are deterministic.
peft_model.eval()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

# Ensure tokenizer is prepared correctly
# Set the padding token if not available
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token (if available)
    if tokenizer.pad_token is None:
        # NOTE(review): if this ever adds a brand-new token, the model's embedding
        # matrix may need resizing to match the tokenizer — confirm before relying on it.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # Add a new pad token if eos_token is also missing

print("PEFT Model and Tokenizer Loaded Successfully!")

PEFT Model and Tokenizer Loaded Successfully!


#### 8. Create Custom Function and Class for Embeddings

In [14]:
from langchain_core.embeddings import Embeddings

#### Compute Embeddings for all products
def custom_embedding_function(text):
    """Compute embedding using fine-tuned LoRA model.

    Uses the module-level ``tokenizer``, ``peft_model`` and ``device``.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first sequence in the batch, as a list of floats.

    Raises:
        ValueError: If the model output has neither a populated ``pooler_output``
            nor a ``last_hidden_state``.
    """
    tokenized = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=128)

    # Move tokenized input to the same device as the model
    tokenized = {key: value.to(device, dtype=torch.long) for key, value in tokenized.items()}

    with torch.no_grad():
        # Forward pass through the model (on the correct device)
        outputs = peft_model(**tokenized)

    # An output object can expose `pooler_output` as an attribute that is None,
    # so test the value rather than mere attribute presence.
    pooled = getattr(outputs, "pooler_output", None)
    if pooled is None:
        hidden = getattr(outputs, "last_hidden_state", None)
        if hidden is None:
            raise ValueError(f"Unexpected output structure: {type(outputs)}")

        # Mean pooling over real tokens only; padded positions are excluded via the mask.
        mask = tokenized.get("attention_mask")
        if mask is None:
            pooled = hidden.mean(dim=1)
        else:
            weights = mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    return pooled[0].cpu().tolist()

# --- Custom Embedding Class ---
class LoRAEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``custom_embedding_function``."""

    def embed_documents(self, texts):
        """Embed each text in ``texts`` in order and return the list of vectors."""
        print(f"Number of texts to embed: {len(texts)}")  # Debug: Check number of texts
        vectors = [custom_embedding_function(doc_text) for doc_text in texts]
        print(f"Number of embeddings created: {len(vectors)}")  # Debug: Check number of embeddings
        return vectors

    def embed_query(self, text):
        """Embed a single query with the same function used for documents."""
        return custom_embedding_function(text)

#### 9. Generate Embeddings, Save to ChromaDB

In [15]:
# Convert DataFrame rows into embeddings and store them
from langchain.vectorstores import Chroma

PERSIST_DIRECTORY = "chromadb_vectorstore_lora"

# One text and one metadata dict per product row, in the same row order.
texts = df_tune_sample["complete_product_description"].tolist()  # Product description
metadatas = [
    {
        "id": str(item_id),
        "name": name,
        "category": category,
        "country": country,
        "image_path": image_path,
    }
    for item_id, name, category, country, image_path in zip(
        df_tune_sample["item_id"],
        df_tune_sample["item_name"],
        df_tune_sample["product_type"],
        df_tune_sample["country"],
        df_tune_sample["image_path"],
    )
]

print(f"Length of texts before Chroma: {len(texts)}") #debug

# Initialize your custom embedding class and hand it to Chroma;
# the store calls it to embed every text it is given.
embeddings = LoRAEmbeddings()
vectorstore = Chroma(embedding_function=embeddings, persist_directory=PERSIST_DIRECTORY)

# Embed and store all texts. The embedding object is already bound to the store,
# so it is not passed again here.
# NOTE(review): no ids are supplied, so re-running this cell presumably appends
# duplicates to the persisted collection — confirm and clear the directory if so.
vectorstore.add_texts(texts=texts, metadatas=metadatas)

print(f"✅ Embeddings saved successfully from DataFrame to {PERSIST_DIRECTORY}!")


Length of texts before Chroma: 20000


  vectorstore = Chroma(embedding_function=embeddings, persist_directory=PERSIST_DIRECTORY)  # Pass embeddings to Chroma


Number of texts to embed: 20000
Number of embeddings created: 20000
✅ Embeddings saved successfully from DataFrame to chromadb_vectorstore_lora!


#### 10: Load Vector Store, Retrieve Top-k Similar results

In [27]:
# Reopen the persisted collection, using the same embedding class for query vectors.
embeddings = LoRAEmbeddings()
vectorstore = Chroma(embedding_function=embeddings, persist_directory=PERSIST_DIRECTORY)

In [29]:
# Example with k similar results
def retrieve_custom_k(query_text, k=5):
    """Return the ``k`` documents in ``vectorstore`` most similar to ``query_text``.

    Args:
        query_text: Natural-language query to embed and search with.
        k: Number of documents to return (default 5).

    Returns:
        The list of matching documents returned by the vector store.
    """
    # Pass the caller's k through instead of a hard-coded 5.
    return vectorstore.similarity_search(query_text, k=k)

query = "Find me a good phone case for IPhone"
results = retrieve_custom_k(query, k=5)

# Print each hit with its 1-based rank, its text and its metadata, or say that nothing matched.
if not results:
    print("No results found.")
else:
    for rank, doc in enumerate(results, start=1):
        print(f"Document {rank}: {doc.page_content}")
        print(f"Metadata: {doc.metadata}")
        print("-" * 20)

Document 1: a colorful phone case for the iphone Given Product description: , No Warranty, brand: Amazon Brand - Solimo, weight: 50.0, color: Others, height: 2200.0, width: 1879.0, model year: , shape: , style: , material: Canvas, product_type: CELLULAR_PHONE_CASE
Metadata: {'category': 'CELLULAR_PHONE_CASE', 'country': 'IN', 'id': 'B07TG4LCWZ', 'image_path': '1f/1fcde4f7.jpg', 'name': 'Amazon Brand - Solimo Designer Multicolor Bin 3D Printed Hard Back Case Mobile Cover for Micromax Canvas Juice 3Plus Q394'}
--------------------
Document 2: a black and gold phone case for the iphone Given Product description: , None, brand: Amazon Brand - Solimo, weight: 50.0, color: Multicolor, height: 2200.0, width: 1879.0, model year: , shape: , style: , material: Silicone, product_type: CELLULAR_PHONE_CASE
Metadata: {'category': 'CELLULAR_PHONE_CASE', 'country': 'IN', 'id': 'B0853WV73C', 'image_path': '2e/2e068dd0.jpg', 'name': 'Amazon Brand - Solimo Designer Golden Butterfly Pattern UV Printed Sof

In [21]:
import os
from dotenv import load_dotenv

# Read variables from a .env file into the process environment.
load_dotenv()

# Stop early with a clear message when the key is missing or empty.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key in (None, ""):
    raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")

#### 11. Generate Natural Language Response using LLM

In [32]:
from langchain.chat_models import ChatOpenAI

# Chat model used to phrase the recommendations (an open-source model could be substituted).
llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")

def generate_natural_language_response(query, products):
    """Ask the module-level ``llm`` to describe retrieved products for a customer query.

    Args:
        query: The customer's original search text.
        products: The retrieved products; they are interpolated into the prompt as-is.

    Returns:
        The text content of the model's reply.
    """
    prompt = f"""
    A customer is looking for a product based on this query: "{query}"
    Here are the recommended products:
    {products}
    
    Generate a natural language response listing the products in a friendly tone.
    Include details like the product category and country where it is made.
    Embed the image_path from the metadata, if present, in the response at the end.
    Format the results as a list.
    """

    # `invoke` replaces the deprecated `predict`; a chat model returns a message
    # object whose text is in `.content`.
    response = llm.invoke(prompt)
    return response.content

# End-to-end demo: retrieve products for a query, then let the LLM phrase the answer.
query = "Recommend me men's sneaker in black color"
recommended_products = retrieve_custom_k(query_text=query)

response_text = generate_natural_language_response(query=query, products=recommended_products)
print(response_text)

Sure, I found some great options for you! Here are some cool black sneakers for men:

1. "Amazon Brand - Inkast Denim Co. Men's Sneakers" is the first one on the list. This product falls under the category of 'Shoes' and is manufactured in India. You can check the image here: [0b/0b13da52.jpg](image_path).

2. "Concept 3 by Skechers Men's Xavien Lace-up Sneaker, BBK, 12 Medium US" also seems to be a great choice. This pair is a product of the USA and belongs to the 'Shoes' category. The product image can be found here: [4b/4b005ffc.jpg](image_path).

Please note that the other products listed are either not designed for men or not in black color. Let me know if you need more options or details about these products!
