In [None]:
#Set up your environment
!pip install -q pandas numpy tqdm \
  transformers datasets peft trl accelerate bitsandbytes sentencepiece \
  sentence-transformers faiss-cpu

**2.Data Understanding and Preparation:**

In [None]:
#Upload Data File
# Keep the mapping returned by files.upload() in a variable so the raw file
# contents are not echoed as the cell output; display only the uploaded names.
from google.colab import files
uploaded = files.upload()
list(uploaded)

In [None]:
import pandas as pd
import numpy as np
# Load the raw Flipkart product sample from the working directory.
df = pd.read_csv('flipkart_com-ecommerce_sample.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the available columns before choosing the ones to keep.
df.columns

In [None]:
#Select ONLY useful columns
# .copy() makes `df` an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[['product_name','description','product_category_tree','brand','retail_price','discounted_price','pid']].copy()

In [None]:
# Column dtypes and non-null counts of the selected columns.
df.info()

In [None]:
# Share of rows whose brand is missing.
# isnull (not notnull): the mean of the boolean mask is the missing fraction;
# notnull would measure the share of rows that DO have a brand.
Brand_Missing_Value_Proportion = df['brand'].isnull().mean()
Brand_Missing_Value_Proportion

In [None]:
#Filling missing values:
# The placeholders end up verbatim in the product documents, so they are spelled
# correctly and consistently. One fillna(mapping) call returns a new frame
# instead of assigning column by column.
# Note: filling the price columns with text turns them into object columns.
df = df.fillna({
    'brand': 'Brand Not Available',
    'description': 'Description Not Available',
    'retail_price': 'Retail Price Not Available',
    'discounted_price': 'Discounted Price Not Available',
})

In [None]:
# Drop rows without a name or description and renumber the remaining rows.
# reset_index returns a new frame, so its result must be assigned to take effect.
df = df.dropna(subset=['description','product_name']).reset_index(drop=True)
# Strip HTML tags from the descriptions.
df['description'] = df['description'].str.replace(r'<.*?>', "", regex=True)

In [None]:
#Create Product Documents
def create_product_doc(row):
  """Render one product row as the labelled text block used for embedding.

  `row` is indexed by column name; every field is interpolated as-is, one
  "Label:value" line per field, each indented by two spaces.
  """
  labelled_columns = (
      ("Product Name", "product_name"),
      ("Brand", "brand"),
      ("Category", "product_category_tree"),
      ("Retail Price", "retail_price"),
      ("Discounted Price", "discounted_price"),
      ("Description", "description"),
      ("Product ID", "pid"),
  )
  lines = "".join(f"  {label}:{row[column]}\n" for label, column in labelled_columns)
  # Leading newline and trailing two-space indent mirror the triple-quoted layout.
  return "\n" + lines + "  "


In [None]:
# Build the text document of every product and preview the first one.
product_documents = df.apply(create_product_doc, axis=1)
df['product Document'] = product_documents
print(product_documents.iloc[0])

In [None]:
#Reduce dataset size
# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without replacement).
df = df.sample(n=min(500, len(df)), random_state=42)

In [None]:
#Save cleaned data
# Written without the index column, so the sampled row labels are not stored.
df.to_csv('clean_flipkart_products.csv',index=False)

In [None]:
#Load the cleaned product data
import pandas as pd
import numpy as np
# Reload the saved sample so the following sections start from the file on disk.
df=pd.read_csv('clean_flipkart_products.csv')
df.head()

**3.Embeddings & Indexing:**

In [None]:
#Load the Embedding Model
from sentence_transformers import SentenceTransformer
# NOTE: the global name `model` is reassigned to the causal language model in the
# LLM-integration cells further down; from then on it no longer refers to this embedder.
model=SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
#metadata management:
import re


def _compile_product_keywords(keywords):
    """Build a regex that matches any keyword as a whole word (optional plural).

    A match may not be glued to other letters, so "ram" is found in "2 gb ram"
    but not inside "frame" or "ceramic". Digits may touch the keyword, so
    "16gb" and "3000mah" still match.
    """
    alternatives = "|".join(re.escape(keyword) for keyword in keywords)
    return re.compile(rf"(?<![a-z])(?:{alternatives})(?:e?s)?(?![a-z])")


# Spec words that indicate a smartphone, and accessory words that veto that label.
_SMARTPHONE_SPECS = _compile_product_keywords(
    ["gb", "ram", "camera", "mah", "android", "ios"])
_SMARTPHONE_EXCLUSIONS = _compile_product_keywords(
    ["headset", "earphone", "charger", "usb", "fan", "lcd", "screen", "battery replacement"])

# (label, patterns) pairs checked in order; every pattern of a rule must match.
_PRODUCT_TYPE_RULES = [
    ("feature_phone", [_compile_product_keywords(["feature phone", "keypad phone", "basic phone"])]),
    ("lcd_spare", [_compile_product_keywords(["lcd", "display", "touch screen", "digitizer"])]),
    ("battery_spare", [_compile_product_keywords(["battery"]), _compile_product_keywords(["mah"])]),
    ("audio_accessory", [_compile_product_keywords(["headset", "earphone", "earbud", "headphone"])]),
    ("fan", [_compile_product_keywords(["usb fan", "cooling fan"])]),
    ("shoe", [_compile_product_keywords(["shoe", "boot", "sneaker"])]),
    ("slipper", [_compile_product_keywords(["slipper", "sandal"])]),
    ("watch", [_compile_product_keywords(["watch"])]),
]


def refine_product_type(row):
    """Classify a product row into a coarse product type by keyword rules.

    Args:
        row: mapping / Series with "product_name" and "description" entries.
            Non-string values (e.g. NaN) are ignored instead of raising.

    Returns:
        One of "smartphone", "feature_phone", "lcd_spare", "battery_spare",
        "audio_accessory", "fan", "shoe", "slipper", "watch" or "other".
    """
    fields = (row["product_name"], row["description"])
    text = " ".join(field for field in fields if isinstance(field, str)).lower()

    # SMARTPHONES: a spec keyword is present and no accessory keyword vetoes it.
    if _SMARTPHONE_SPECS.search(text) and not _SMARTPHONE_EXCLUSIONS.search(text):
        return "smartphone"

    # Remaining types: the first rule whose patterns all match wins.
    for label, patterns in _PRODUCT_TYPE_RULES:
        if all(pattern.search(text) for pattern in patterns):
            return label

    # Rows labelled "other" are kept; the filter that used to follow the final
    # return statement could never run and has been removed.
    return "other"

In [None]:
# Label every product row with the type returned by refine_product_type.
df["product_type"] = df.apply(refine_product_type, axis=1)

In [None]:
# Number of products per assigned type.
df["product_type"].value_counts()

In [None]:
#Create Product-Type DataFrames
# The labels must match the strings returned by refine_product_type. Footwear is
# labelled "shoe" there, so filtering on "footwear" always gave an empty frame.
smartphone_df= df[df["product_type"] == "smartphone"]
feature_phone_df= df[df["product_type"] == "feature_phone"]
lcd_spare_df= df[df["product_type"] == "lcd_spare"]
battery_spare_df= df[df["product_type"] == "battery_spare"]
audio_accessory_df=df[df["product_type"]=="audio_accessory"]
fan_df=df[df["product_type"]=="fan"]
shoe_df= df[df["product_type"] == "shoe"]
slipper_df= df[df["product_type"] == "slipper"]
watch_df= df[df["product_type"] == "watch"]


In [None]:
#Chunking
from collections import defaultdict

def chunk_text(text, chunk_size, overlap):
    """Split `text` into overlapping chunks of whitespace-separated tokens.

    Args:
        text: the text to split.
        chunk_size: maximum number of tokens per chunk (must be positive).
        overlap: number of tokens shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunk strings; empty for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size or overlap is out of range. An overlap larger
            than chunk_size previously returned an empty list, dropping the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Stop once a chunk reaches the end of the text; a further window would
        # contain only tokens already covered by this chunk.
        if start + chunk_size >= len(tokens):
            break
    return chunks

# Per product type: the chunk texts and, index-aligned with them, the name of
# the product each chunk came from.
type_chunks = defaultdict(list)
type_chunk_meta = defaultdict(list)

product_fields = zip(df["product_type"], df["product Document"], df["product_name"])
for ptype, doc, product_name in product_fields:
    for piece in chunk_text(doc, 400, 50):
        type_chunks[ptype].append(piece)
        type_chunk_meta[ptype].append(product_name)   # for debugging

In [None]:
#Generate embeddings per product type
# One embedding matrix per product type, row-aligned with type_chunks[ptype].
type_embedding = {
    ptype: model.encode(type_texts, show_progress_bar=True)
    for ptype, type_texts in type_chunks.items()
}

In [None]:
#Intra-product-type similarity (are chunks of one product type close to each other?)
from sklearn.metrics.pairwise import cosine_similarity

# The product type under inspection; the names below no longer say "phone"
# because the vectors inspected here are the "watch" ones.
intra_ptype = 'watch'
intra_vecs = type_embedding[intra_ptype]

# Pick up to 5 random chunks: sampling without replacement fails when the type
# has fewer chunks than requested. The seeded generator keeps the printed
# matrix reproducible across runs.
rng = np.random.default_rng(42)
idx = rng.choice(len(intra_vecs), size=min(5, len(intra_vecs)), replace=False)
sample = intra_vecs[idx]

print(cosine_similarity(sample, sample))

“We evaluated embedding coherence using intra-type cosine similarity and refined the product taxonomy until each FAISS index represented a semantically tight domain.”

In [None]:
#Inter-product-type similarity (are embeddings of different product types far apart?)
# Compares the first "smartphone" chunk embedding with the first "watch" chunk embedding.
phone_vec = type_embedding["smartphone"][0]
# NOTE: despite its name, fan_vec holds a "watch" embedding, not a fan one.
fan_vec   = type_embedding["watch"][0]

print(cosine_similarity([phone_vec],[fan_vec]))

In [None]:
#Chunk relevance test
# Rank the smartphone chunks against a sample query and preview the five best.
q = model.encode("mobile phone with good battery")

scores = cosine_similarity([q], type_embedding["smartphone"])
# Ascending argsort reversed, so the highest-scoring chunks come first.
top = np.argsort(scores[0])[::-1][:5]

smartphone_chunks = type_chunks["smartphone"]
for chunk_pos in top:
    print(smartphone_chunks[chunk_pos][:150])

In [None]:
#Build FAISS index per product type
import faiss
# Maps each product type to its own index; vectors are added in the same order
# as the rows of type_embedding[ptype].
type_indexs={}
for ptype,emb in type_embedding.items():
  # The index is sized to the embedding width (second axis of the matrix).
  dim=emb.shape[1]
  index=faiss.IndexFlatL2(dim)
  index.add(emb)
  type_indexs[ptype]=index

In [None]:
# Shows the embedding matrix left in `emb` by the last iteration of the index-building loop.
emb

In [None]:
#Route queries using product type
import re


def _compile_query_keywords(keywords):
    """Build a regex that matches any keyword as a whole word (optional plural).

    A match may not be glued to other letters: "fan" is not found inside
    "fancy" and "ram" is not found inside "frame". Digits may touch the
    keyword, so "8gb" still matches "gb".
    """
    alternatives = "|".join(re.escape(keyword) for keyword in keywords)
    return re.compile(rf"(?<![a-z])(?:{alternatives})(?:e?s)?(?![a-z])")


_ROUTE_FEATURE_PHONE = _compile_query_keywords(["feature phone", "keypad phone"])
# Spec words plus generic phone words, so that plain queries such as
# "mobile phone with good battery" or "iphone smartphone" reach the phone index.
_ROUTE_PHONE_TERMS = _compile_query_keywords(
    ["gb", "ram", "camera", "mah", "android", "ios",
     "smartphone", "iphone", "mobile", "phone"])
# Accessory / spare-part words that veto the smartphone route.
_ROUTE_PHONE_EXCLUSIONS = _compile_query_keywords(
    ["headset", "earphone", "charger", "usb", "fan", "lcd", "screen",
     "battery replacement", "phone battery"])

# (label, pattern) pairs checked in order after the phone rules.
_ROUTE_RULES = [
    ("lcd_spare", _compile_query_keywords(["lcd", "screen", "display", "touch"])),
    ("battery_spare", _compile_query_keywords(["battery replacement", "phone battery"])),
    ("audio_accessory", _compile_query_keywords(["headset", "earphone", "earbud", "headphone"])),
    ("fan", _compile_query_keywords(["usb fan", "cooling fan", "fan"])),
    ("shoe", _compile_query_keywords(["shoe", "boot", "sneaker"])),
    ("slipper", _compile_query_keywords(["slipper", "sandal"])),
    ("watch", _compile_query_keywords(["watch"])),
]


def route_by_type(query):
    """Map a free-text user query to the product type whose index should be searched.

    Args:
        query: the user's question or search phrase.

    Returns:
        One of "smartphone", "feature_phone", "lcd_spare", "battery_spare",
        "audio_accessory", "fan", "shoe", "slipper", "watch" or "other".
    """
    q = query.lower()

    # Feature phones first: their phrases contain the generic word "phone".
    if _ROUTE_FEATURE_PHONE.search(q):
        return "feature_phone"

    if _ROUTE_PHONE_TERMS.search(q) and not _ROUTE_PHONE_EXCLUSIONS.search(q):
        return "smartphone"

    for label, pattern in _ROUTE_RULES:
        if pattern.search(q):
            return label

    return "other"

**4.Retrieval Design (RAG)**

In [None]:
#Build smart retrieval
# Remember the sentence-embedding model when this cell runs: later cells rebind
# the global name `model` to the causal language model, which cannot embed queries.
try:
    _RETRIEVAL_EMBEDDER = model
except NameError:
    _RETRIEVAL_EMBEDDER = None


def smart_search(query, k=5, embedder=None):
    """Return up to `k` product chunks most similar to `query`.

    The query is routed to a product type with route_by_type and searched only
    in that type's FAISS index.

    Args:
        query: free-text user query.
        k: maximum number of chunks to return.
        embedder: object with an `encode(text)` method; defaults to the
            embedding model that was bound to `model` when this cell ran.

    Returns:
        List of chunk strings, best match first. Empty when the routed product
        type has no index or no chunks.

    Raises:
        RuntimeError: if no usable embedding model is available.
    """
    encoder = embedder if embedder is not None else _RETRIEVAL_EMBEDDER
    if encoder is None or not hasattr(encoder, "encode"):
        raise RuntimeError(
            "No embedding model available: run the embedding-model cell before "
            "this one, or pass `embedder` explicitly."
        )

    ptype = route_by_type(query)
    index = type_indexs.get(ptype)
    chunks = type_chunks.get(ptype)
    if index is None or not chunks:
        return []

    # Never request more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = np.asarray(encoder.encode(query), dtype="float32").reshape(1, -1)
    _, idxs = index.search(q_emb, k)
    # FAISS reports missing neighbours as -1, which would otherwise silently
    # select the last chunk through negative indexing.
    return [chunks[i] for i in idxs[0] if i >= 0]

In [None]:
#Validate
# Smoke test: print the first 120 characters of every chunk retrieved for a
# handful of sample queries.
queries = [
  "mobile phone with good battery",
  "iphone smartphone",
  "usb cooling fan",
  "women shoe",
  "analog watch for women"
]

for sample_query in queries:
    print("\nQUERY:", sample_query)
    for retrieved in smart_search(sample_query):
        preview = retrieved[:120]
        print("-", preview)

Inference: “Product catalogue text is converted into dense vector embeddings using a sentence transformer. These embeddings are indexed using FAISS for fast semantic similarity search. At query time, the user question is embedded and matched against product vectors to retrieve relevant product information.”

“Before integrating the LLM, we evaluated embedding quality using cosine similarity and intra-type vs inter-type comparisons. We validated FAISS retrieval using manual Top-K inspection and Recall@K to ensure accurate, type-safe retrieval.”
Recall@5 = hits / total_queries
Embeddings understand meaning, FAISS proves retrieval, LLM only speaks.

**5.LLM Integration**

#Prepare Fine-Tuning Dataset (Style Only)

Create Q&A examples that show:

How answers should look

How prices are described

How comparisons are explained

In [None]:
# Style-only fine-tuning examples: each prompt ends with "Assistant:" and the
# response shows the tone the assistant should use.
# The responses had been saved with mis-decoded characters (apostrophes,
# bullets, the rupee sign, emoji); they are restored here so the model is not
# trained on garbled text.
examples=[
    {"prompt":"User: Tell me about a smartphone with good battery\nAssistant:",
 "response":"Sure! Here are some smartphones with long-lasting battery performance. I’ll show you the best options based on your budget and preferences."},
    {"prompt":"User: Suggest a good phone for students\nAssistant:",
 "response":"Great choice! Student-friendly phones should have good battery life, decent performance, and affordable pricing. Let me find the best options for you."},
    {"prompt":"User: Compare two smartphones\nAssistant:",
 "response":"Here’s a simple comparison to help you choose:\n\n• Battery life\n• Camera quality\n• Performance\n• Storage\n\nLet me know which matters most to you!"},
    {"prompt":"User: Best phone under 15000\nAssistant:",
 "response":"Sure! I’ll find the best smartphones under ₹15,000 with good battery, camera, and performance. Please tell me if you prefer any brand."},
    {"prompt":"User: What is the warranty of Samsung Galaxy M34?\nAssistant:",
 "response":"I’ll check the product details to give you the correct warranty information. If it’s not available, I’ll let you know honestly."},
    {"prompt":"User: Is this phone waterproof?\nAssistant:",
 "response":"Let me verify the product specifications. If waterproofing information is not listed, I won’t assume it."},
    {"prompt":"User: I want a phone for photography\nAssistant:",
 "response":"Great! Camera quality is important. I’ll recommend phones with the best camera features. Do you have a budget range?"},
    {"prompt":"User: I want shoes for daily use\nAssistant:",
 "response":"Nice! Comfortable shoes for daily use should have good cushioning and durability. I’ll suggest suitable options for you."},
    {"prompt":"User: Hi\nAssistant:",
 "response":"Hello! 👋 I’m your Flipkart product assistant. How can I help you today?"},
    {"prompt":"User: Thanks\nAssistant:",
 "response":"You’re welcome! 😊 Let me know if you need help choosing the perfect product."},
]

In [None]:
import json

# One JSON object per line; ensure_ascii=False writes non-ASCII characters
# as-is instead of \u escapes.
jsonl_lines = [json.dumps(ex, ensure_ascii=False) + '\n' for ex in examples]
with open('fine_tune_dataset.jsonl', 'w', encoding='utf-8') as f:
  f.writelines(jsonl_lines)

We use JSON Lines (one JSON object per line) because it clearly separates input and output for supervised fine-tuning, making it easy for the LLM to learn the prompt-response mapping in a structured, machine-readable format.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Quantization options are passed through BitsAndBytesConfig; handing
# load_in_8bit directly to from_pretrained is deprecated in transformers.
# NOTE: this rebinds the global name `model`, which until this cell referred to
# the sentence-embedding model.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)


**6. Fine-Tuning (LoRA)** - [LoRA: Low-Rank Adaptation]

In [None]:
#Configure LoRA
from peft import LoraConfig # PEFT (Parameter-Efficient Fine-Tuning)

# Adapters are attached to the modules named q_proj and v_proj only.
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ['q_proj', 'v_proj'],
    "lora_dropout": 0.05,
    "bias": 'none',
    "task_type": 'CAUSAL_LM',
}
lora_config = LoraConfig(**lora_settings)

In [None]:
#Load Fine-Tuning Dataset
from datasets import load_dataset

def format_for_sft(example):
    """Add a "text" field: the prompt immediately followed by the response.

    The example mapping is modified in place and returned.
    """
    prompt_text = example["prompt"]
    response_text = example["response"]
    example["text"] = f"{prompt_text}{response_text}"
    return example

# Read the JSONL examples as the "train" split and add the combined "text" field.
raw_examples = load_dataset('json', data_files='fine_tune_dataset.jsonl', split='train')
dataset = raw_examples.map(format_for_sft)

In [None]:
#Pre-tokenize your dataset manually
def tokenize_fn(example):
    """Tokenize the "text" field, truncating and padding to 512 tokens."""
    text = example["text"]
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

# batched=True hands tokenize_fn a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)


In [None]:
#Fine-tuning:
import os
# Set before the trainer libraries are imported; report_to="none" below also
# turns off logging integrations.
os.environ["WANDB_DISABLED"] = "true"

from trl import SFTTrainer
from transformers import TrainingArguments

# Checkpoints go to ./results; 3 epochs with a per-device batch size of 4,
# logging every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    # bf16 mixed precision (this was fp16=True before).
    # NOTE(review): bf16 needs hardware support — confirm on the target GPU.
    bf16=True,
    report_to="none"
)

# The LoRA configuration is passed to the trainer together with the base model
# and the pre-tokenized dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=lora_config,
    args=training_args
)

trainer.train()

In [None]:
#Save Adapter
# The trained model and the tokenizer files are written to the same directory.
adapter_dir = "flipkart-style-lora"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

We save the fine-tuned LoRA adapters and tokenizer so the model can be reloaded later without storing the full base model.

In [None]:
#Inference (RAG + Fine-Tuned LLM)
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Quantization options are passed through BitsAndBytesConfig; handing
# load_in_4bit directly to from_pretrained is deprecated in transformers.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto"
)

# Attach the saved LoRA adapter to the freshly loaded base model.
# NOTE: `model` now names the adapter-wrapped language model.
model = PeftModel.from_pretrained(base, "flipkart-style-lora")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

model.eval()

In [None]:
def answer_question(query):
    """Answer a product question from retrieved catalogue chunks (RAG).

    Retrieves chunks with smart_search, places them in a prompt that restricts
    the model to that context, and samples a completion from the language model.

    Args:
        query: the user's question.

    Returns:
        The generated answer text only; the prompt is not echoed back.
    """
    retrieved_chunks = smart_search(query)
    # Joined outside the f-string: a backslash inside an f-string expression is
    # a SyntaxError before Python 3.12.
    context = "\n".join(retrieved_chunks) or "(no product information found)"

    # No indentation inside the prompt, and it ends with "Assistant:" exactly
    # like the prompts used for fine-tuning.
    prompt = (
        "You are a Flipkart Product Assistant.\n"
        "Answer ONLY using the context below.\n"
        "If not found, say you don’t know.\n"
        "\n"
        f"Context:\n{context}\n"
        "\n"
        f"User: {query}\n"
        "Assistant:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.3,
            top_p=0.9,
            do_sample=True
        )

    # generate() returns the prompt tokens followed by the continuation; keep
    # only the continuation so callers receive just the answer.
    prompt_length = inputs["input_ids"].shape[1]
    answer_ids = output[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

**Prompt Template:** “This prompt combines instruction tuning with RAG by injecting retrieved knowledge into the context and explicitly restricting the model to answer only from that context, which significantly reduces hallucinations.”


**This is a hybrid RAG + PEFT (Parameter-Efficient Fine-Tuning) architecture.**

“We are planning to use OpenAI Whisper to transcribe Tamil speech into text.
The transcribed Tamil text is translated to English using Hugging Face Transformers (opus-mt-ta-en) so it can be processed against English product data.
After generating the response, we translate it back to Tamil using opus-mt-en-ta.
Finally, we convert the Tamil text into speech using gTTS (Google Text-to-Speech) for multilingual voice output.”

**9. Deployment**

**9.1 Fast API:**

***9.1.1. Install API dependencies***

In [None]:
pip install nest-asyncio

***9.1.3.Create FastAPI app***

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
# Application object that the route decorators in the following cells attach to.
app=FastAPI(title="GenAI Product Chatbot")

***9.1.4.Define request / response schema***

In [None]:
class ChatRequest(BaseModel):
    """Request body of the /chat endpoint."""

    question: str


class ChatResponse(BaseModel):
    """Response body of the /chat endpoint."""

    answer: str

***9.1.5.Create /chat endpoint***

In [None]:
@app.post('/chat',response_model=ChatResponse)
def chat(request:ChatRequest):
  """Answer the question in the request body and return it as {'answer': ...}."""
  # answer_question performs both retrieval and generation. The helpers that
  # were called here before (retrieve_context, generate_answer) are not defined
  # in this notebook, so every request failed with a NameError.
  answer = answer_question(request.question)
  return {'answer':answer}

***9.1.6. Run FastAPI from notebook***

In [None]:
import threading
import uvicorn

def run_api():
    """Serve `app` with uvicorn on 0.0.0.0:8000 (blocks the calling thread)."""
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)

# Run the server in a daemon thread so the notebook stays usable.
thread = threading.Thread(daemon=True, target=run_api)
thread.start()


In [None]:
# Open a Cloudflare tunnel to the local server on port 8000.
# NOTE(review): no visible cell downloads ./cloudflared-linux-amd64 or makes it
# executable — confirm the binary exists before running this cell.
!./cloudflared-linux-amd64 tunnel --url http://localhost:8000

FastAPI defines the API; Uvicorn runs it.
Cloudflare Tunnel is a secure way to expose your local application to the internet without opening any ports.
