## Importing Libraries

In [None]:
import pandas as pd 
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import ast
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from google import genai
from llm_generation import *

In [152]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [153]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nadyinky/sephora-products-and-skincare-reviews")

print("Path to dataset files:", path)

Path to dataset files: /Users/zainabshakruwala/.cache/kagglehub/datasets/nadyinky/sephora-products-and-skincare-reviews/versions/2


In [154]:
products = pd.read_csv(path + "/product_info.csv")
#reviews = pd.read_csv(path + "/reviews_0-250.csv")




## Preprocessing Data

In [155]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                6863 non-null   object 
 8   variation_type      7050 non-null   object 
 9   variation_value     6896 non-null   object 
 10  variation_desc      1250 non-null   object 
 11  ingredients         7549 non-null   object 
 12  price_usd           8494 non-null   float64
 13  value_price_usd     451 non-null    float64
 14  sale_price_usd      270 non-null    float64
 15  limited_edition     8494 non-null   int64  
 16  new   

In [156]:
products['rating'] = products['rating'].round(1)
products['rating'].head()

0   3.60
1   4.20
2   4.20
3   4.50
4   3.20
Name: rating, dtype: float64

In [157]:
products['on_sale'] = products['sale_price_usd'].apply(lambda x: 0 if pd.isna(x) else 1)

In [158]:
def clean_list(field_value):
    try:
        if isinstance(field_value, str) and field_value.startswith('['):
            items = ast.literal_eval(field_value)
            if isinstance(items, list):
                return ', '.join(str(i) for i in items)
            return str(field_value)
    except:
        return str(field_value)[:200]

In [159]:
def prepare_document(products):
    documents= []
    for idx, row in products.iterrows():
        variation_info = (
            f"{row['variation_value']} - {row['variation_desc']}"
            if pd.notna(row.get('variation_type'))
            else "standard size only"
        )
        ingredients = clean_list(row.get('ingredients', 'Not specified'))
        description = clean_list(row.get('highlights', ''))
        doc_text = f"""
Product Name: {row['product_name']}
Brand Name: {row["brand_name"]}
Ingredients: {ingredients}
On Sale: {'Yes' if row['on_sale'] == 1 else 'No'}
Reviews: {row.get('reviews', 0)} reviews
New: {'Yes' if row['new'] == 1 else 'No'}
Limited Edition: {'Yes' if row['limited_edition'] == 1 else 'No'}
Product Description: {description}
Variations Available: {variation_info}""".strip()
        metadata = {
            "product_id": row['product_id'],
            "brand_id": row['brand_id'],
            "rating" : row['rating'],
            "category" : row['primary_category'],
            "product_name": row['product_name'],
            "brand_name": row['brand_name'],
            "price_usd": row['sale_price_usd'] if row['on_sale'] == 1 else row['price_usd']
           
        }
        documents.append({
            "text": doc_text,
            "metadata": metadata
        })
    return documents

In [160]:
documents = prepare_document(products)

In [161]:
documents[0]

{'text': 'Product Name: Fragrance Discovery Set\nBrand Name: 19-69\nIngredients: Capri Eau de Parfum:, Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Linalool, Benzyl Salicylate, Ethylhexyl Methoxycinnamate, Butyl Methoxydibenzoylmethane, Ethylhexyl Salicylate, Benzl Benzoate, Citral, Geraniol, Eugenol, Benzyl Alcohol, Farnesol, Citronellol, Isoeugenol., Invisible Post Eau de Parfum:, Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Ethylhexyl Methoxycinnamate, Ethylhexyl Salicylate, Butyl Methoxydibenzoylmethane, Benzyl Benzoate, Citral, Coumarin, Limonene, Alpha-Isomethyl Ionone, Linalool., Kashbah Eau de Parfum:, Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Coumarin, Ethylhexyl Methoxycinnamate, Butyl Methoxydibenzoylmethane, Ethylhexyl Salicylate, D-Limonene, Eugenol, Linalool, Citronellol, Geraniol, Cinnamal, Citral., L‚ÄôAir Barbes Eau de Parfum:, Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Benzyl Salicylate, D-Limonene, Ethylhexyl Methoxycinnam

In [162]:
texts = [doc['text'] for doc in documents]
metadata_list = [doc['metadata'] for doc in documents]

## Making Embeddings

In [163]:


embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


In [164]:
print("Creating Embeddings...")

embeddings = embedding_model.encode(
    texts,
    show_progress_bar=True,
    batch_size=23
)
print(f"‚úÖ Created {len(embeddings)} embeddings")
print(f"Embedding dimension: {embeddings.shape[1]}")

Creating Embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 370/370 [02:08<00:00,  2.88it/s]

‚úÖ Created 8494 embeddings
Embedding dimension: 384





## Vector Store

In [165]:

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(embeddings.astype('float32'))
print(f"FAISS index created with {index.ntotal} vectors")

faiss.write_index(index, "sephora_faiss.index")

# Save documents and metadata for retrieval
with open('sephora_documents.pkl', 'wb') as f:
    pickle.dump({
        'documents': documents,  # Full structure with text + metadata
        'texts': texts,          # Just texts (useful for display)
        'embedding_model_name': 'all-MiniLM-L6-v2'  # Track which model you used
    }, f)

print("‚úÖ Saved FAISS index and documents")

FAISS index created with 8494 vectors
‚úÖ Saved FAISS index and documents


## Getting Embeddings

In [166]:
def load_rag_system():
    """Load FAISS index and documents"""
    
    # Load FAISS index
    index = faiss.read_index("sephora_faiss.index")
    
    # Load documents
    with open('sephora_documents.pkl', 'rb') as f:
        data = pickle.load(f)
    
    # Load embedding model
    embedding_model = SentenceTransformer(
        data.get('embedding_model_name', 'all-MiniLM-L6-v2')
    )
    
    return index, data['documents'], embedding_model

# Load system
index, documents, embedding_model = load_rag_system()
print(f"‚úÖ Loaded {len(documents)} documents")

‚úÖ Loaded 8494 documents


In [181]:
def search_products(query, top_k=5):
    """Search for relevant products"""
    
    # Encode query
    query_embedding = embedding_model.encode([query])
    
    # Search in FAISS
    distances, indices = index.search(
        query_embedding.astype('float32'), 
        top_k
    )
    
    # Get results with full document structure
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        doc = documents[idx]  # This already has text + metadata
        results.append({
            'rank': i + 1,
            'text': doc['text'],
            'metadata': doc['metadata'],
            'distance': float(dist),
            'similarity_score': 1 / (1 + dist)  # Convert distance to similarity
        })
    
    return results

# Test it
results = search_products("Lip balm with SPF", top_k=3)

for r in results:
    print(f"\n{'='*60}")
    print(f"Rank {r['rank']} | Similarity: {r['similarity_score']:.3f}")
    print(f"Product Name: {r['metadata']['product_name']}")
    print(f"Brand Name:{r['metadata']['brand_name']}")
    print(f"Category: {r['metadata']['category']}")
    print(f"\nText Preview:")
    print(r['text'][:200] + "...")
  


Rank 1 | Similarity: 0.606
Product Name: Intense Therapy Lip Balm SPF 25
Brand Name:Jack Black
Category: Skincare

Text Preview:
Product Name: Intense Therapy Lip Balm SPF 25
Brand Name: Jack Black
Ingredients: Beeswax, Butyrospermum Parkii (Shea) Butter, C 18-38 Alkyl Hydroxystearol Stearate, Camellia Sinenesis (Green Tea) Lea...

Rank 2 | Similarity: 0.495
Product Name: Full Spectrum 360¬∫ Refreshing Water Cream Organic Face Sunscreen SPF 50
Brand Name:COOLA
Category: Skincare

Text Preview:
Product Name: Full Spectrum 360¬∫ Refreshing Water Cream Organic Face Sunscreen SPF 50
Brand Name: COOLA
Ingredients: None
On Sale: No
Reviews: 267.0 reviews
New: No
Limited Edition: No
Product Descrip...

Rank 3 | Similarity: 0.492
Product Name: Mini BB Blur Tinted Moisturizer Broad Spectrum SPF 30
Brand Name:tarte
Category: Makeup

Text Preview:
Product Name: Mini BB Blur Tinted Moisturizer Broad Spectrum SPF 30
Brand Name: tarte
Ingredients: Cyclopentasiloxane, Isododecane, Mica, Polysilicone

In [182]:
results = [r for r in results if r['similarity_score']>0.5]

In [183]:
results

[{'rank': 1,
  'text': 'Product Name: Intense Therapy Lip Balm SPF 25\nBrand Name: Jack Black\nIngredients: Beeswax, Butyrospermum Parkii (Shea) Butter, C 18-38 Alkyl Hydroxystearol Stearate, Camellia Sinenesis (Green Tea) Leaf Extract, Cyclohexasiloxane, Cyclopentasiloxane, Flavor, Lanolin, Microcrystalline Wax, Ozokerite, Persea Gratissima (Avocado) Oil, Polyglyceryl 3 Beeswax, Theobroma Cacao (Cocoa) Seed Butter, Tocopheryl Acetate.\nOn Sale: No\nReviews: 3595.0 reviews\nNew: No\nLimited Edition: No\nProduct Description: Without Parabens, Alcohol Free, SPF, Gluten Free\nVariations Available: Original - nan',
  'metadata': {'product_id': 'P12573',
   'brand_id': 3297,
   'rating': 4.7,
   'category': 'Skincare',
   'product_name': 'Intense Therapy Lip Balm SPF 25',
   'brand_name': 'Jack Black',
   'price_usd': 10.0},
  'distance': 0.6492201089859009,
  'similarity_score': 0.6063472028696619}]

## Connect To a LLM

In [65]:
# Testing Connection



In [66]:


# The client gets the API key from the environment variable `GEMINI_API_KEY`.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash", contents="Explain how AI works in a few words"
)
print(response.text)



AI learns patterns from data to make predictions or decisions.



In [124]:
def generate_response(query, retrieved_docs):
    """Generate natural language response using LLM"""
    
    # Create context from retrieved documents
    context = "\n\n".join([
        f"Product {i+1}:\n{doc}" 
        for i, doc in enumerate(retrieved_docs)
    ])
    
    # Create prompt
    prompt = f"""You are a helpful Sephora beauty advisor. Based on the following products, 
answer the customer's question naturally and recommend the best options.

Customer Question: {query}

Available Products:
{context}

Reply directly to the customer question.Provide a helpful recommendation with reasoning. Mention specific product names, prices, and key features."""
    
    
    client = genai.Client()
    response = client.models.generate_content(
    model="gemini-2.0-flash", contents=prompt
)
    return response.text

In [68]:
for i, doc in enumerate(results):
    print(doc['metadata'])

{'product_id': 'P453226', 'brand_id': 5987, 'category': 'Skincare', 'product_name': 'Bright-Eyed 100% Mineral Eye Cream SPF 40 PA+++', 'brand_name': 'Supergoop!'}
{'product_id': 'P395723', 'brand_id': 7091, 'category': 'Skincare', 'product_name': 'Classic Body Organic Sunscreen Spray SPF 50 Fragrance-Free', 'brand_name': 'COOLA'}
{'product_id': 'P468640', 'brand_id': 7091, 'category': 'Skincare', 'product_name': 'Full Spectrum 360¬∫ Refreshing Water Cream Organic Face Sunscreen SPF 50', 'brand_name': 'COOLA'}


In [31]:
query = "A lip balm with SPF Protection"
response = generate_response(query, results)

In [32]:
response

"Hi there! I can certainly help you find a lip balm with SPF protection.\n\nBased on what we have available, I highly recommend the **Jack Black Intense Therapy Lip Balm SPF 25** for $10. It has a fantastic rating of 4.68/5 with over 3500 reviews, which speaks to its popularity and effectiveness. This balm is formulated with moisturizing ingredients like shea butter, avocado oil, and cocoa seed butter to keep your lips soft and hydrated, while the SPF 25 provides essential sun protection. Plus, it's free of parabens and alcohol.\n"

## Build a RAG Chatnot

In [125]:
def rag_chatbot(query, top_k=3):
    """Complete RAG pipeline"""
    
    print(f"üîç Searching for: {query}")
    
    # Step 1: Retrieve relevant documents
    retrieved_docs = search_products(query, top_k=top_k)
    
    print(f"‚úÖ Found {len(retrieved_docs)} relevant products")
    
    # Step 2: Generate response
    response = generate_response(query, retrieved_docs)
    
    return {
        'query': query,
        'response': response,
        'retrieved_products': [
            {
                'name': doc['metadata']['product_name'],
                'brand': doc['metadata']['brand_name'],
                'price': doc['metadata']['price_usd'],
                'rating': doc['metadata']['rating']
            }
            for doc in retrieved_docs
        ]
    }

# Test it!
result = rag_chatbot("I need an affordable moisturizer for oily skin")
print("\nüìù Response:")
print(result['response'])
print("\nüõçÔ∏è Products considered:")
for p in result['retrieved_products']:
    print(f"- {p['name']} by {p['brand']} (${p['price']})| Rating: {p['rating']}/5")

üîç Searching for: I need an affordable moisturizer for oily skin
‚úÖ Found 3 relevant products

üìù Response:
Okay, I can help with that! Finding the right moisturizer for oily skin can be tricky, but there are definitely affordable options. Based on what we have, I'd recommend the **Peace Out Oil-Absorbing Pore Treatment Strips** for $19. While technically pore strips, it contains ingredients like DMAE Bitartrate and Vitamin A, perfect for oily skin and fighting pores!


üõçÔ∏è Products considered:
- Out of Trouble 10 Minute Mask to Rescue Problem Skin by Origins ($32.0)| Rating: 4.397/5
- Perfect Cleansing Oil by Shiseido ($35.0)| Rating: 4.4514/5
- Oil-Absorbing Pore Treatment Strips by Peace Out ($19.0)| Rating: 3.975/5
