In [51]:
## Step 1 Download the Dataset from the GIthub ['https://raw.githubusercontent.com/DataScience75/Top_mentor_projects_Datasets/refs/heads/main/customer_reviews.csv']

## Step 2: Create an API Key in Groq and Save in the Notepad

### Installation  - Create requirement.txt file

In [52]:
#! pip install faiss-cpu groq
#!pip install python-dotenv




In [53]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score
from groq import Groq
from dotenv import load_dotenv
import os
import re

In [54]:
client = Groq(api_key='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')

In [55]:
## Load the dataset and read some rows

df = pd.read_csv('https://raw.githubusercontent.com/DataScience75/Top_mentor_projects_Datasets/refs/heads/main/customer_reviews.csv')
df.head()


Unnamed: 0,review_id,product_id,review_text,rating
0,1,P101,Excellent battery life and superb camera quali...,5
1,2,P101,"The phone lasts two days on a single charge, g...",5
2,3,P101,Battery is good but the screen is average.,4
3,4,P101,Amazing camera but the design feels outdated.,4
4,5,P101,Very reliable battery performance.,5


In [56]:
### preprocessing of the Data

def preprocess_text(text):
  text = re.sub(r"\s+", " ", text) #remove the extra spaces
  text = re.sub(r"[^a-zA-Z0-9\s]", " ", text) # removing the punctuation
  return text.strip().lower()

In [57]:
df['Clean_text'] = df['review_text'].astype('str').apply(preprocess_text)
df['Clean_text'].head()

Unnamed: 0,Clean_text
0,excellent battery life and superb camera quali...
1,the phone lasts two days on a single charge g...
2,battery is good but the screen is average
3,amazing camera but the design feels outdated
4,very reliable battery performance


In [58]:
df.head()

Unnamed: 0,review_id,product_id,review_text,rating,Clean_text
0,1,P101,Excellent battery life and superb camera quali...,5,excellent battery life and superb camera quali...
1,2,P101,"The phone lasts two days on a single charge, g...",5,the phone lasts two days on a single charge g...
2,3,P101,Battery is good but the screen is average.,4,battery is good but the screen is average
3,4,P101,Amazing camera but the design feels outdated.,4,amazing camera but the design feels outdated
4,5,P101,Very reliable battery performance.,5,very reliable battery performance


In [59]:
## Chunkings

def chunk_text(text, chunk_size=50):
  words = text.split()
  return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]


In [60]:
chunks = []
metadata = []
for idx, row in df.iterrows():
  for chunk in chunk_text(row['Clean_text']):
    chunks.append(chunk)
    metadata.append({'product_id': row['product_id'], 'review_id': row['review_id']})



In [61]:
metadata

[{'product_id': 'P101', 'review_id': 1},
 {'product_id': 'P101', 'review_id': 2},
 {'product_id': 'P101', 'review_id': 3},
 {'product_id': 'P101', 'review_id': 4},
 {'product_id': 'P101', 'review_id': 5},
 {'product_id': 'P102', 'review_id': 6},
 {'product_id': 'P102', 'review_id': 7},
 {'product_id': 'P102', 'review_id': 8},
 {'product_id': 'P102', 'review_id': 9},
 {'product_id': 'P102', 'review_id': 10},
 {'product_id': 'P103', 'review_id': 11},
 {'product_id': 'P103', 'review_id': 12},
 {'product_id': 'P103', 'review_id': 13},
 {'product_id': 'P103', 'review_id': 14},
 {'product_id': 'P103', 'review_id': 15},
 {'product_id': 'P104', 'review_id': 16},
 {'product_id': 'P104', 'review_id': 17},
 {'product_id': 'P104', 'review_id': 18},
 {'product_id': 'P104', 'review_id': 19},
 {'product_id': 'P104', 'review_id': 20},
 {'product_id': 'P105', 'review_id': 21},
 {'product_id': 'P105', 'review_id': 22},
 {'product_id': 'P105', 'review_id': 23},
 {'product_id': 'P105', 'review_id': 24},
 

In [62]:
chunks

['excellent battery life and superb camera quality highly recommended for travelers',
 'the phone lasts two days on a single charge great for outdoor use',
 'battery is good but the screen is average',
 'amazing camera but the design feels outdated',
 'very reliable battery performance',
 'poor performance lags frequently when opening apps',
 'camera quality is below average not recommended',
 'the phone heats up quickly during use',
 'touch response is slow and frustrating',
 'good design but very slow',
 'blazing fast performance and smooth multitasking',
 'runs heavy games without lag excellent speed',
 'very responsive and fast loading times',
 'performance is top notch for its price range',
 'incredible speed but battery drains fast',
 'extremely lightweight and portable great for travel',
 'slim design but battery life is short',
 'very comfortable to carry around',
 'lightweight but gets scratched easily',
 'compact and fits in small bags',
 'affordable and good value for money'

In [63]:
## Vectorization

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(chunks, convert_to_numpy= True)
embeddings

array([[-0.03224297,  0.03523099, -0.00917663, ...,  0.00715621,
        -0.05709261,  0.05280614],
       [-0.00874082,  0.07436896,  0.09761342, ...,  0.00547879,
        -0.06712592,  0.03938162],
       [-0.02588259,  0.03064434,  0.00045392, ..., -0.01514561,
        -0.01618077,  0.03101025],
       ...,
       [ 0.04114283, -0.04011743,  0.0312217 , ...,  0.01517776,
        -0.1158591 ,  0.05362745],
       [ 0.04344716, -0.00924653,  0.09080452, ..., -0.03844206,
        -0.05648727,  0.02629595],
       [-0.01125789,  0.01765033,  0.03668481, ..., -0.07615221,
        -0.09957709,  0.06578503]], dtype=float32)

In [64]:
### Storing the embeddings into Faiss

dim = embeddings.shape[1]
dim

384

In [65]:
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [66]:
### Rag with LLM
## take the relevant documents based on user query
## also we take the metadata along with it
## Prepare the context properly using the metadata and relevant documents
## Pass that to LLM to get more precise results.

def recommend_products(user_query, top_k=3):
  query_vec = model.encode([user_query], convert_to_numpy=True)
  distances, indices = index.search(query_vec, top_k)

  # relevant Reviews
  retrieved_chunks = [chunks[i] for i in indices[0]]
  retrieved_metadata = [metadata[i] for i in indices[0]]

  context = "\n".join([f"'Product'{m['product_id']}:{txt}" for txt, m in zip(retrieved_chunks, retrieved_metadata)])

  completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
      {
        "role": "system",
        "content": "You are helpful Product Recommendation assistant."
      },
      {
        "role": "user",
        "content": f"Based on the following customer reviews:\n\n{context}\n\nPlease recommend the best product for: {user_query}\n\nFormat your answer with bullet points."
        #"content": f"Based on the following customer reviews:\n{context}.\n\n Suggest the best product for {user_query}"
      }
    ],
    temperature=0.2,
    max_completion_tokens=200,
  )

  return completion.choices[0].message.content, retrieved_metadata

In [67]:
### Test Queires for Teaching

test_queries = [('long battery life', 'P101'),
                ('fast performance', 'P103')]

test_queries

[('long battery life', 'P101'), ('fast performance', 'P103')]

In [68]:
correct = 0
for query, expected_product in test_queries:
  response, meta = recommend_products(query, top_k=1)
  if meta[0]['product_id'] == expected_product:
    correct += 1

accuracy = correct / len(test_queries)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5


In [69]:
recommendations = recommend_products('I need light weight camera phone')
print("Find the recommendations: \n", recommendations)

Find the recommendations: 
 ('Based on the customer reviews, here\'s a recommendation for a lightweight camera phone:\n\n* **Recommended Product:** \'Product\'P104\n* **Reason:** The customer review for \'Product\'P104 mentions that it is "extremely lightweight and portable, great for travel". This suggests that it is an ideal option for someone looking for a lightweight camera phone.', [{'product_id': 'P105', 'review_id': 22}, {'product_id': 'P104', 'review_id': 16}, {'product_id': 'P105', 'review_id': 25}])


In [70]:
text, meta = recommend_products("I need light weight camera phone")

print("Find the recommendations:\n")
print(text)  # This will respect \n and format properly

print("\nMetadata:")
for item in meta:
    print(item)


Find the recommendations:

Based on the customer reviews, here are the recommendations for a lightweight camera phone:

* **Product P104**: This product is described as "extremely lightweight and portable", making it a great option for travel or those who want a compact camera phone.

Metadata:
{'product_id': 'P105', 'review_id': 22}
{'product_id': 'P104', 'review_id': 16}
{'product_id': 'P105', 'review_id': 25}


In [71]:
## USE streamlit/Gradio for User interface