# Amazon review Chatbot

### Import libraries 

In [None]:
import os
import pickle
import re

import faiss
import numpy as np
import pandas as pd
import torch as torch
import torchtext.vocab as vocab
from groq import Groq

# transformers
# requests


In [None]:
# NOTE(review): no package names follow the command, so nothing is installed as written —
# TODO list the required packages (ideally with pinned versions) or remove this cell.
! pip install 

### Configuration

In [23]:
# Locations of the persisted retrieval artifacts (relative paths).
FAISS_INDEX_PATH = "../embeddings/faiss_index.bin"  # FAISS index written/read via faiss.write_index / faiss.read_index
CHUNKS_DATA_PATH = "../embeddings/chunks_data.pkl"  # pickled list of text chunks
METADATA_PATH = "../embeddings/metadata.pkl"  # pickled list of per-chunk metadata dicts

### Read data

In [22]:
# Load the preprocessed review table from the Excel export.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved index, and
# list-like columns such as product_id appear to be stored as strings — confirm before filtering on them.
processedDF = pd.read_excel("../data/processedDF.xlsx")
# Preview the first rows as the cell output.
processedDF.head()

Unnamed: 0,product_id,product_name,review_count,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,review_id,product_count,user_name,review_title,review_content,img_link,product_link,sentiment
0,['B07F1P8KNV'],wayona nylon braided usb type c ft fast charge...,1,computer accessory accessory peripheral cable ...,325.0,1099.0,0.7,4.2,10576.0,usb c usb data charging cable fully compatible...,"['AEXK37TSBFHSP2TYE63YPKETWQ7Q,AEKMVX2VDNNX4ZF...","('R10365HEDURWI9,R5RP542IMC4OI,RX2HFWXTTQDTS,R...",1,"['Sunil Funde,Biju Abraham Thomas,Samir,Rahul ...",nice product good quality braided cable vfm go...,sturdy packing good product used brand cable i...,['https://m.media-amazon.com/images/W/WEBP_402...,['https://www.amazon.in/Wayona-Charger-Samsung...,4
1,['B07GVGTSLN'],wayona usb type c fast charger cable fast char...,2,computer accessory accessory peripheral cable ...,325.0,1299.0,0.75,4.2,10576.0,fast charge data sync fast charge data transfe...,"['AEXK37TSBFHSP2TYE63YPKETWQ7Q,AEKMVX2VDNNX4ZF...","('R10365HEDURWI9,R5RP542IMC4OI,RX2HFWXTTQDTS,R...",1,"['Sunil Funde,Biju Abraham Thomas,Samir,Rahul ...",nice product good quality braided cable vfm go...,sturdy packing good product used brand cable i...,['https://m.media-amazon.com/images/W/WEBP_402...,['https://www.amazon.in/Wayona-Cable-Braided-C...,4
2,['B08QJJCY2Q'],tizum mouse pad computer mouse mat anti slip r...,1,computer accessory accessory peripheral keyboa...,169.0,299.0,0.43,4.4,5176.0,inch x inch mm x mm x mm size mouse pad ideal ...,"['AG7XUAMM5BZSSPCBAQJ3YGYSIPXA,AGS6JTKZGW3L2TC...","('R10758I9J937X1,R2QT07V4QXKIFY,R2BLT775YXVSXH...",1,"['Divya Nancy,Jay Mishra,Md Kalim,Saktyy,Venka...",nice product size big small liked ok really go...,nice product want use mouse gaming stuff life ...,['https://m.media-amazon.com/images/W/WEBP_402...,['https://www.amazon.in/Notebook-MacBook-Compu...,4
3,"['B01F25X6RQ', 'B01F262EUU']",samsung ehs ehs avfwecinu hand free wired ear ...,1,electronics headphone earbuds accessory headph...,724.0,749.0,0.025,4.2,31539.0,ear volume control design canal phone headphon...,"['AFTS5BKDRY7Y23B27UVBE2V6TOHA,AHRIDJXYEBQS7MX...","('R10FUJSCR3VYHY,R2Y8B5LQ5HLACQ,R3BC8GS9GGMBTI...",2,"['Ranit Barman,Ravi Singh,Karan Rai,Amazon Cus...",work well long good product good product good ...,sound quality excellent price three button how...,['https://m.media-amazon.com/images/I/31FzYVC6...,['https://www.amazon.in/Samsung-Original-EHS64...,3
4,['B09C6H53KH'],duracell type c micro braided sync charge cabl...,1,computer accessory accessory peripheral cable ...,368.0,699.0,0.47,4.2,387.0,tangle free durable tough braiding sync charge...,"['AG7TJLDLH3HOUPRBUFW6KNUEGO4A,AHTSVFP4GVBBXB6...","('R10G3GXLZIE38O,R806LMS8MHN8Y,R10XDKD7Z4R4WL,...",1,"['Sreejith ks,chetan w.,ASR,vajreshwari,Raghav...",superb reviewing month use good braided cable ...,superb using car month quality product describ...,['https://m.media-amazon.com/images/W/WEBP_402...,['https://www.amazon.in/DURACELL-Type-C-Micro-...,5


## Prepare combined documents

In [23]:
def build_document(row):
    """Render one review row as a single " | "-separated text document.

    The document starts with a product section (name, category, prices,
    rating), optionally includes the product description when it is present
    and not blank, and ends with a review section (user, sentiment, title,
    content).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            column names used below.

    Returns:
        str: The assembled document.
    """
    # Product information section
    sections = [
        "PRODUCT INFO:",
        f"Name: {row['product_name']}",
        f"Category: {row['category']}",
        f"Price: {row['discounted_price']} (Actual: {row['actual_price']}, Discount: {row['discount_percentage']})",
        f"Overall Rating: {row['rating']}/5 from {row['rating_count']} reviews",
    ]

    # Product description: skipped when missing (NaN/None) or whitespace-only
    description_present = pd.notna(row['about_product']) and str(row['about_product']).strip()
    if description_present:
        sections += [f"Description: {row['about_product']}"]

    # Review section
    sections += [
        "REVIEW:",
        f"User: {row['user_name']} | Sentiment: {row['sentiment']}",
        f"Title: {row['review_title']}",
        f"Content: {row['review_content']}",
    ]

    return " | ".join(str(section) for section in sections)


## Chunking, embedding, and storing in FAISS

In [24]:
# ---------------------------------------------
# Chunking
# ---------------------------------------------
def chunk_text(text, max_words=80):
    """Lazily split ``text`` into consecutive chunks of at most ``max_words`` words.

    The text is converted with ``str`` and split on whitespace; every chunk
    is the space-joined run of the next ``max_words`` words. Splitting is
    purely by word count, so a chunk may start or end in the middle of a
    document section.
    """
    tokens = str(text).split()
    yield from (
        " ".join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    )

def create_and_save_faiss_index(processedDF):
    """Build a FAISS L2 index over GloVe-embedded review chunks and persist it.

    Every row of ``processedDF`` is rendered with ``build_document``, split
    with ``chunk_text`` and embedded as the mean of the 100-dimensional GloVe
    ("6B") vectors of its known words. The embeddings are added to a
    ``faiss.IndexFlatL2``; the index is written to ``FAISS_INDEX_PATH`` and
    the chunks and their metadata are pickled to ``CHUNKS_DATA_PATH`` and
    ``METADATA_PATH``. Missing parent directories of those paths are created.

    Args:
        processedDF: DataFrame holding the columns read by ``build_document``
            plus ``product_id`` and ``review_id``.

    Returns:
        tuple: ``(index, chunks, meta)`` where ``chunks[i]`` and ``meta[i]``
        describe the i-th vector in ``index``.

    Raises:
        ValueError: If no chunks are produced (e.g. ``processedDF`` is empty).
    """
    print("Creating FAISS index from scratch...")

    # Columns copied into every chunk's metadata entry (order preserved).
    meta_columns = (
        "product_id", "product_name", "rating", "sentiment", "category",
        "user_name", "review_id", "discounted_price", "actual_price",
    )

    chunks, meta = [], []
    for _, row in processedDF.iterrows():
        # Read the row once; each chunk still gets its own dict so entries are not aliased.
        row_meta = {column: row[column] for column in meta_columns}
        for chunk in chunk_text(build_document(row)):
            chunks.append(chunk)
            meta.append(dict(row_meta))

    print(f"Created {len(chunks)} chunks from {len(processedDF)} reviews")

    # torch.stack([]) would fail with an opaque error further down; fail early and clearly.
    if not chunks:
        raise ValueError("No chunks were produced from processedDF; cannot build a FAISS index.")

    # ---------------------------------------------
    # GloVe embeddings
    # ---------------------------------------------
    embedding_dim = 100
    glove = vocab.GloVe(name="6B", dim=embedding_dim)

    def text_to_vector(text, embeddings, dim=100):
        """Mean of the embeddings of the known words in ``text`` (zeros if none)."""
        words = re.findall(r'\w+', str(text).lower())
        vectors = [embeddings[word] for word in words if word in embeddings.stoi]
        if len(vectors) == 0:
            return torch.zeros(dim)
        return torch.mean(torch.stack(vectors), dim=0)

    # Create embeddings for chunks
    chunk_embeddings = torch.stack(
        [text_to_vector(ch, glove, embedding_dim) for ch in chunks]
    ).float()
    chunk_embeddings_np = chunk_embeddings.detach().cpu().numpy().astype("float32")

    # ---------------------------------------------
    # Store in FAISS
    # ---------------------------------------------
    dimension = chunk_embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(chunk_embeddings_np)

    # Save everything to disk
    print("Saving FAISS index and data to disk...")

    # The target directory may not exist yet on a fresh checkout.
    for path in (FAISS_INDEX_PATH, CHUNKS_DATA_PATH, METADATA_PATH):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Save FAISS index
    faiss.write_index(index, FAISS_INDEX_PATH)

    # Save chunks and metadata
    with open(CHUNKS_DATA_PATH, 'wb') as f:
        pickle.dump(chunks, f)

    with open(METADATA_PATH, 'wb') as f:
        pickle.dump(meta, f)

    print(f"Saved {len(chunks)} review chunks with metadata to local files")
    print(f"FAISS index: {FAISS_INDEX_PATH}")
    print(f"Chunks data: {CHUNKS_DATA_PATH}")
    print(f"Metadata: {METADATA_PATH}")

    return index, chunks, meta

## Load FAISS index

In [25]:
def load_faiss_index():
    """Load the persisted FAISS index, text chunks and metadata from disk.

    Reads ``FAISS_INDEX_PATH`` with ``faiss.read_index`` and unpickles
    ``CHUNKS_DATA_PATH`` and ``METADATA_PATH``, then checks that the three
    artifacts describe the same number of entries.

    Returns:
        tuple: ``(index, chunks, meta)``.

    Raises:
        FileNotFoundError: If any of the three files is missing.
        ValueError: If the index size, chunk count and metadata count differ
            (the files come from different builds or are truncated).
    """
    print("Loading FAISS index and data from disk...")

    required_paths = [FAISS_INDEX_PATH, CHUNKS_DATA_PATH, METADATA_PATH]
    missing = [path for path in required_paths if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            "Saved FAISS files not found. Please run create_and_save_faiss_index first."
            f" Missing: {', '.join(missing)}"
        )

    # Load FAISS index
    index = faiss.read_index(FAISS_INDEX_PATH)

    # Load chunks and metadata
    # SECURITY: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
    with open(CHUNKS_DATA_PATH, 'rb') as f:
        chunks = pickle.load(f)

    with open(METADATA_PATH, 'rb') as f:
        meta = pickle.load(f)

    # search_reviews uses FAISS row ids to index both lists, so all three sizes must agree.
    if not (index.ntotal == len(chunks) == len(meta)):
        raise ValueError(
            "Saved FAISS artifacts are inconsistent: "
            f"index has {index.ntotal} vectors, {len(chunks)} chunks, {len(meta)} metadata entries. "
            "Please rebuild with create_and_save_faiss_index."
        )

    print(f"Loaded {len(chunks)} review chunks with metadata from local files")

    return index, chunks, meta

# Module-level handles read by search_reviews; they stay None until an index is created or loaded.
# NOTE(review): re-running this cell resets them to None, discarding an already loaded index.
index = None
chunks = None
meta = None

In [26]:
# Load the persisted index when all three artifacts exist; otherwise explain how to build one.
if not all(map(os.path.exists, (FAISS_INDEX_PATH, CHUNKS_DATA_PATH, METADATA_PATH))):
    print("No existing FAISS index found.")
    print("To create a new index, call: index, chunks, meta = create_and_save_faiss_index(processedDF)")
else:
    try:
        index, chunks, meta = load_faiss_index()
        print("Successfully loaded existing FAISS index")
    except Exception as load_error:
        # Best effort: report the failure and leave index/chunks/meta untouched.
        print(f"Error loading existing index: {load_error}")
        print("Please ensure you have processedDF available to create a new index")

# 100-dimensional GloVe ("6B") vectors used to embed search queries.
glove = vocab.GloVe(name="6B", dim=100)

def text_to_vector(text, embeddings, dim=100):
    """Embed ``text`` as the mean vector of its known words (used for search).

    The text is lower-cased and tokenised with ``\\w+``; tokens that are not
    in ``embeddings.stoi`` are ignored. When no token is known, a zero
    vector of length ``dim`` is returned.
    """
    known_vectors = []
    for token in re.findall(r'\w+', str(text).lower()):
        if token not in embeddings.stoi:
            continue
        known_vectors.append(embeddings[token])

    if not known_vectors:
        return torch.zeros(dim)
    return torch.stack(known_vectors).mean(dim=0)

Loading FAISS index and data from disk...
Loaded 5061 review chunks with metadata from local files
Successfully loaded existing FAISS index


## Search function

In [27]:
def search_reviews(query, k=5, product_id=None, category=None, min_rating=None, sentiment=None):
    """Return up to ``k`` review chunks nearest to ``query``, optionally filtered.

    The query is embedded with ``text_to_vector`` and matched against the
    module-level FAISS ``index``. Filters are applied to the metadata of
    each hit, in nearest-first order.

    Args:
        query: Free-text search query.
        k: Maximum number of results; ``k <= 0`` returns an empty list.
        product_id: Keep hits whose stored ``product_id`` contains this value
            (the stored value may be the text of a list of ids).
        category: Keep hits whose category contains this text, ignoring case.
        min_rating: Keep hits whose rating is a number >= this value.
        sentiment: Keep hits whose sentiment equals this value; numbers and
            numeric strings compare by value, other text ignoring case.

    Returns:
        list[tuple]: ``(chunk_text, metadata_dict)`` pairs, nearest first.

    Raises:
        ValueError: If the index, chunks or metadata have not been loaded.
    """
    if index is None or chunks is None or meta is None:
        raise ValueError("FAISS index not loaded. Please create or load the index first.")

    if k <= 0 or len(chunks) == 0:
        return []

    def _normalize_label(value):
        """Make 4, 4.0 and "4" comparable; fall back to lower-cased text."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return str(value).lower()

    query_vector = text_to_vector(query, glove, 100)
    if hasattr(query_vector, "detach"):
        # torch tensor -> numpy, mirroring how the chunk embeddings were converted
        query_vector = query_vector.detach().cpu().numpy()
    q_emb = np.asarray(query_vector, dtype="float32").reshape(1, -1)

    # Filters run after retrieval, so a small candidate pool could hide every matching
    # chunk. With any filter active, rank the whole index; otherwise 3*k is plenty.
    has_filters = any([product_id, category, min_rating, sentiment])
    candidate_k = len(chunks) if has_filters else min(k * 3, len(chunks))
    _, neighbor_ids = index.search(q_emb, candidate_k)

    wanted_sentiment = _normalize_label(sentiment) if sentiment else None

    results = []
    for raw_idx in neighbor_ids[0]:
        idx = int(raw_idx)
        # FAISS pads with -1 when fewer than candidate_k neighbours exist.
        if idx < 0 or idx >= len(chunks):
            continue

        entry = meta[idx]

        # Apply filters
        if product_id and str(product_id) not in str(entry["product_id"]):
            continue
        if category and category.lower() not in str(entry["category"]).lower():
            continue
        if min_rating:
            try:
                rating = float(entry.get("rating", 0))
            except (TypeError, ValueError):
                continue
            # Written as "not >=" so that a NaN rating is rejected as well.
            if not rating >= min_rating:
                continue
        if sentiment and _normalize_label(entry.get("sentiment", "")) != wanted_sentiment:
            continue

        results.append((chunks[idx], entry))
        if len(results) >= k:
            break

    return results

## Groq Llama 3.3 Answer function

In [None]:
# Read the Groq API key from the GROQ_API_KEY environment variable instead of keeping a
# key in the notebook, where it would be committed and shared together with the outputs.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def answer_query(query, product_id=None, category=None, min_rating=None, sentiment=None, 
                query_type="general", require_citations=True):
    """Answer a question from retrieved review chunks using the Groq chat API.

    Up to 8 chunks are fetched with ``search_reviews`` (forwarding the
    filters), formatted as numbered reviews, and sent to
    ``llama-3.3-70b-versatile``. The scenario text chosen by ``query_type``
    is sent once, as the system message; the user message carries the
    question, the review context and the answering instructions.

    Args:
        query: The user's question.
        product_id, category, min_rating, sentiment: Filters passed through
            to ``search_reviews``.
        query_type: One of "comparison", "recommendation", "summary" or
            "general"; unknown values fall back to "general".
        require_citations: When true the model is told to reference reviews
            by number, otherwise to give a synthesized answer.

    Returns:
        str: The model's answer, a fixed message when nothing was retrieved,
        or ``"Error generating response: ..."`` if the API call fails.
    """
    retrieved = search_reviews(query, k=8, product_id=product_id, 
                             category=category, min_rating=min_rating, sentiment=sentiment)
    
    if not retrieved:
        return "No relevant reviews found matching your criteria."
    
    # Build structured context
    context_parts = []
    for i, (chunk, metadata) in enumerate(retrieved):
        context_parts.append(f"REVIEW {i+1}:")
        context_parts.append(f"Content: {chunk}")
        context_parts.append(f"Metadata: Product: {metadata['product_name']}, Rating: {metadata['rating']}, Sentiment: {metadata['sentiment']}")
        context_parts.append("---")
    
    context = "\n".join(context_parts)
    
    # Scenario-based system prompts
    scenario_prompts = {
        "comparison": """
You are comparing products based on Amazon reviews. Analyze similarities, differences, strengths, and weaknesses.
Focus on: features, quality, value for money, user satisfaction, and common complaints.
Provide a balanced comparison with specific examples from reviews.
        """,
        "recommendation": """
You are providing product recommendations. Consider: overall ratings, sentiment analysis, specific features mentioned,
price-value ratio, and frequency of positive/negative comments. Highlight best options for different user needs.
        """,
        "summary": """
You are summarizing product reviews. Extract key themes: most praised features, common complaints, overall satisfaction,
and any recurring patterns. Provide a concise yet comprehensive summary.
        """,
        "general": """
You are a helpful assistant analyzing Amazon reviews. Answer questions accurately based only on the provided context.
Be specific, factual, and reference actual review content when possible.
        """
    }
    
    # Strip the indentation/newline padding that the triple-quoted literals carry.
    system_prompt = scenario_prompts.get(query_type, scenario_prompts["general"]).strip()
    
    if require_citations:
        citation_rule = "Include specific review references (e.g., 'Review 3 mentions...')"
    else:
        citation_rule = "Provide a synthesized answer"
    
    # The scenario text goes only in the system message below; repeating it here would
    # send the same instructions twice and spend context tokens for nothing.
    prompt = f"""
USER QUESTION: {query}

AVAILABLE REVIEW CONTEXT:
{context}

INSTRUCTIONS:
1. Answer based ONLY on the provided review context
2. Be specific and reference actual review content when possible
3. If multiple reviews mention similar points, note the consensus
4. For conflicting information, acknowledge both perspectives
5. If information is insufficient, clearly state what cannot be determined
6. Consider ratings, sentiment, and specific user experiences
7. {citation_rule}
8. Focus on what users actually experienced and reported

ANSWER:
"""
    
    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=1000
        )

        return response.choices[0].message.content
        
    except Exception as e:
        # Deliberate best effort: callers print the result, so report the failure as text.
        return f"Error generating response: {str(e)}"

# Additional specialized functions for common scenarios
def compare_products(product_names, aspect=None):
    """Ask ``answer_query`` for a comparison of the named products.

    Args:
        product_names: Iterable of product name strings, joined with ", ".
        aspect: Optional aspect to focus the comparison on.

    Returns:
        Whatever ``answer_query`` returns for the "comparison" query type.
    """
    joined_names = ', '.join(product_names)
    focus_clause = f" focusing on {aspect}" if aspect else ""
    return answer_query(f"Compare products {joined_names}{focus_clause}", query_type="comparison")

def get_recommendations(category=None, budget_range=None, min_rating=4.0):
    """Ask ``answer_query`` for product recommendations.

    Args:
        category: Optional category; added to the query text and forwarded
            to ``answer_query`` as a filter.
        budget_range: Optional budget description. It is only inserted into
            the query text; no price filter is applied here.
        min_rating: Minimum rating forwarded to ``answer_query``.

    Returns:
        Whatever ``answer_query`` returns for the "recommendation" query type.
    """
    query = "Recommend the best products"
    if budget_range:
        # Fixed: this previously referenced the undefined name `budup_range` (NameError).
        query += f" within budget {budget_range}"
    if category:
        query += f" in {category} category"
    
    return answer_query(query, category=category, min_rating=min_rating, query_type="recommendation")

def summarize_product_reviews(product_name):
    """Ask ``answer_query`` for a summary of the reviews of ``product_name``.

    The product name is only placed in the query text; no product filter is
    passed to ``answer_query``.
    """
    summary_request = "Summarize the reviews and overall user experience for {}".format(product_name)
    return answer_query(summary_request, query_type="summary")

### Example usage

In [29]:
# First-time setup only: build and persist the index from processedDF (run this once).
#
# index, chunks, meta = create_and_save_faiss_index(processedDF)

# On every later run the index is loaded automatically by the earlier cell.
if index is None:
    print("Please create the FAISS index first by calling:")
    print("index, chunks, meta = create_and_save_faiss_index(processedDF)")
else:
    print("FAISS index loaded successfully!")
    print("Testing with sample query...")
    result = answer_query("What do users say about battery life?")
    print(result)

FAISS index loaded successfully!
Testing with sample query...
To answer the question about what users say about battery life, we can look at the reviews that mention battery life.

Review 3 mentions that the product (Instacuppa Milk Frother) stopped working after four months, and even with a new battery, it worked for only a month. The user found the product to be "great" and "easy to use" but had issues with its longevity and battery life.

Review 5, which is about the Instacuppa Portable Blender, states that the "battery life is acceptable" and that the user needs to "charge every 2-3 uses." The user also mentions that the motor runs for a full cycle, but there's no clear indication of when the battery is getting low, as it "simply stops" at the last second.

There is no other mention of battery life in the other reviews. 

Therefore, based on the provided review context, the consensus on battery life is limited to these two reviews. Review 3 expresses dissatisfaction with the batter

In [30]:
def _show_example(title, question, rule_after=True, **filters):
    """Print a title, the answer to ``question`` and optionally a separator rule."""
    print(title)
    print(answer_query(question, **filters))
    if rule_after:
        print("\n" + "="*50 + "\n")


# General question
_show_example("1. General question about battery life:", "What do users say about battery life?")

# More specific questions
_show_example("2. Question about product quality:", "How is the build quality and durability?")

# Question with filters
_show_example(
    "3. Question about high-rated products:",
    "What features do users love most?",
    rule_after=False,
    min_rating=4.5,
)

1. General question about battery life:
To answer the question about what users say about battery life, we can look at the reviews that mention this aspect.

Review 3 mentions that the product (Instacuppa Milk Frother) "stopped working" after four months, and even with a new battery, it worked for only a month. However, this review does not specifically discuss battery life in terms of usage time but rather the longevity of the product and its battery.

Review 5, which is about the Instacuppa Portable Blender, mentions that the "battery life is acceptable" and that the user needs to "charge every" use, as the "motor run full time." The review also notes that there's no clear indication when the battery is getting low, as it "simply stop last second."

There is no detailed discussion about battery life in the other reviews, except for general comments about product performance and satisfaction.

In conclusion, based on the provided reviews, there is limited information about battery lif

## Gemma-3 Answer function

In [16]:
# Load the Gemma-3 270M checkpoint and wrap it in a text-generation pipeline.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m")
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m")

# answer_query_from_df calls `gemma_pipeline`, which was never defined, so every call
# ended in "Error generating response: name 'gemma_pipeline' is not defined".
gemma_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
def answer_query_from_df(query, df, product_id=None, category=None, query_type="general", require_citations=True, max_reviews=5):
    """
    Answer questions about products directly from a processed DataFrame using gemma-3.

    The frame is optionally narrowed by product id and category, the first
    ``max_reviews`` remaining rows are formatted as context, and the prompt is
    passed to the module-level ``gemma_pipeline``.

    Args:
        query: The user's question.
        df: DataFrame with ``product_id``, ``product_name``, ``category``,
            ``rating``, ``sentiment`` and ``review_content`` columns.
        product_id: Keep rows whose ``product_id`` text contains this value
            (the column may hold the text of a list of ids).
        category: Keep rows whose category contains this text, ignoring case
            (the same rule ``search_reviews`` uses).
        query_type: "comparison", "recommendation", "summary" or "general";
            unknown values fall back to "general".
        require_citations: Accepted for signature parity with ``answer_query``;
            currently not used when building the prompt.
        max_reviews: Number of leading rows to include in the context.

    Returns:
        str: The generated answer, "No matching products found." when the
        filters leave no rows, or "Error generating response: ..." on failure.
    """
    # Filter dataframe if product_id or category is given. Substring matching is used
    # because exact equality never matches list-like ids or long category strings.
    filtered_df = df
    if product_id:
        id_mask = filtered_df['product_id'].astype(str).str.contains(str(product_id), regex=False)
        filtered_df = filtered_df[id_mask]
    if category:
        category_mask = (
            filtered_df['category'].astype(str).str.lower()
            .str.contains(str(category).lower(), regex=False)
        )
        filtered_df = filtered_df[category_mask]
    
    if filtered_df.empty:
        return "No matching products found."
    
    # Take only top reviews to avoid overloading the model
    reviews_to_use = filtered_df.head(max_reviews)
    
    # Build context string
    context = "".join(
        f"Product: {row['product_name']}\n"
        f"Category: {row['category']}\n"
        f"Rating: {row['rating']}, Sentiment: {row['sentiment']}\n"
        f"Review: {row['review_content']}\n"
        "-----\n"
        for _, row in reviews_to_use.iterrows()
    )
    
    # Short scenario instructions
    scenario_prompts = {
        "comparison": "Compare the products using the information from reviews. Highlight strengths, weaknesses, and differences.",
        "recommendation": "Recommend the best product(s) using the provided reviews and ratings.",
        "summary": "Summarize the key points from the reviews, including pros, cons, and overall sentiment.",
        "general": "Answer the user's question clearly using the review data."
    }
    
    instruction = scenario_prompts.get(query_type, scenario_prompts["general"])
    
    # Build prompt for Gemma-3
    answer_marker = "Answer in natural language:"
    prompt = f"{instruction}\n\nUSER QUESTION: {query}\n\nREVIEW CONTEXT:\n{context}\n{answer_marker}"
    
    try:
        # NOTE(review): temperature may have no effect unless sampling is enabled
        # (do_sample=True) — confirm the intended decoding mode.
        response = gemma_pipeline(
            prompt,
            max_new_tokens=300,
            temperature=0.2,
            repetition_penalty=2.0
        )
        generated_text = response[0]["generated_text"]
        # Extract only answer part. Prefer cutting the echoed prompt off the front: looking
        # for the marker alone can hit a copy of it inside the question or a review.
        if generated_text.startswith(prompt):
            answer = generated_text[len(prompt):]
        else:
            answer_start = generated_text.find(answer_marker)
            if answer_start != -1:
                answer = generated_text[answer_start + len(answer_marker):]
            else:
                answer = generated_text
        return answer.strip()
    except Exception as e:
        # Deliberate best effort: report the failure as text instead of raising.
        return f"Error generating response: {str(e)}"



In [33]:
# Example usage
# NOTE(review): this calls answer_query (the Groq-backed function), not the
# answer_query_from_df function defined in this Gemma-3 section — confirm which
# one this example is meant to exercise.
comparison_result = answer_query(
    "Compare iPhone 14 and Samsung Galaxy S23 focusing on battery life, camera, and performance.",
    query_type="comparison",
    require_citations=True
)
print("Comparison Result:\n", comparison_result)

Comparison Result:
 * The difference between the two phones is that they have different cameras with a wide angle lens (iPhone) or an ultra-wide telephoto zoom lenses for portraits of people at home/in public places such as restaurants etc., while their main advantage lies mainly within photography due to its high resolution sensor technology which allows you take photos without having any background noise from other objects around them like cars lights…  The phone has better screen quality than it does compared... Read more »


In this article we will compare Apple's new iPhones vs Huawei’S latest flagship smartphones - A comparison study by Google Research shows how each company stacks up against one another when evaluating smartphone specs.....Read More ›



Google Pixel XL review | Price & Specs – $598 / £607 USD …read_more >




Apple iPad Pro Review — Is It Worth Buying? [Review] -- This post contains affiliate links; if your purchase helps support our work! If so please consider