## Import Libraries


In [10]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
import json
import random
from collections import defaultdict
import tqdm
import json
from tqdm import tqdm
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import Chroma
from langchain.schema import Document
import hashlib
# NOTE(review): this flag is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
load_in_4bit=True

## Clean data by joining reviews to product metadata on `parent_asin`


In [2]:
# Load product metadata, keyed by parent_asin.
# Only the fields used downstream (title and aggregate rating info) are kept.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
meta_dict = {}
with open("meta_All_beauty.jsonl", "r", encoding="utf-8") as meta_file:
    for line in meta_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline) in the JSONL file
        data = json.loads(line)
        parent_asin = data.get("parent_asin")
        if parent_asin:
            meta_dict[parent_asin] = {
                "title": data.get("title"),
                "average_rating": data.get("average_rating"),
                "rating_number": data.get("rating_number"),
            }

# Load the reviews and attach the metadata of the product each one belongs to.
# Reviews whose parent_asin has no metadata entry are dropped.
merged_reviews = []
with open("All_beauty.jsonl", "r", encoding="utf-8") as review_file:
    for line in review_file:
        if not line.strip():
            continue
        review = json.loads(line)
        parent_asin = review.get("parent_asin")
        product_meta = meta_dict.get(parent_asin)  # single lookup instead of three
        if product_meta is None:
            continue
        merged_reviews.append({
            "parent_asin": parent_asin,
            "asin": review.get("asin"),
            "title": product_meta["title"],
            "average_rating": product_meta["average_rating"],
            "rating_number": product_meta["rating_number"],
            "rating": review.get("rating"),
            "text": review.get("text"),
        })

# Save merged data as a single JSON array
with open("merged_reviews.json", "w", encoding="utf-8") as output_file:
    json.dump(merged_reviews, output_file, indent=4)

print(f"Merged {len(merged_reviews)} reviews into 'merged_reviews.json'.")


Merged 701528 reviews into 'merged_reviews.json'.


## Filter Data to 15000 rows

In [6]:
# Load merged data (a single JSON array written by the merge step)
input_file = "merged_reviews.json"
with open(input_file, "r", encoding="utf-8") as f:
    reviews = json.load(f)

# Group reviews by the PRODUCT's average rating, truncated to an integer
# (e.g. 4.7 -> 4). Note this is `average_rating`, not the individual review's
# own `rating`.
# NOTE(review): confirm that stratifying on the product average (rather than
# the per-review rating) is what is intended.
rating_groups = defaultdict(list)
for review in reviews:
    average_rating = review.get("average_rating")
    if average_rating is None:
        continue  # the metadata may lack a rating; int(None) would raise
    rating = int(average_rating)
    rating_groups[rating].append(review)

# Define how many reviews to take per rating bucket
total_reviews = 15000
num_ratings = len(rating_groups)
# Equal distribution; guard against an empty input (no buckets at all)
reviews_per_rating = total_reviews // num_ratings if num_ratings else 0

# Sample reviews with a fixed seed so that Restart & Run All reproduces the
# same subset. A dedicated generator is used so the global `random` state is
# left untouched.
RANDOM_SEED = 42
sampler = random.Random(RANDOM_SEED)
filtered_reviews = []
for rating, group in rating_groups.items():
    # A bucket smaller than the quota contributes everything it has, so the
    # final total can be below `total_reviews`.
    sample_size = min(reviews_per_rating, len(group))
    filtered_reviews.extend(sampler.sample(group, sample_size))

# Save filtered reviews, one JSON object per line
output_file = "filtered_reviews.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for review in filtered_reviews:
        f.write(json.dumps(review) + "\n")

print(f"Filtered dataset saved to {output_file} with {len(filtered_reviews)} reviews.")


Filtered dataset saved to filtered_reviews.jsonl with 15000 reviews.


In [8]:
def _has_valid_title(review):
    """Return True when the review has a non-empty title other than 'n/a'."""
    title = review.get("title")
    if not title:
        return False
    return title.strip().lower() != "n/a"


# Keep only the reviews whose product title is present and not an "n/a" placeholder
cleaned_reviews = [review for review in filtered_reviews if _has_valid_title(review)]

# Report how many rows were dropped and how many are left
removed_count = len(filtered_reviews) - len(cleaned_reviews)
print(f" Removed {removed_count} reviews with invalid 'title'")
print(f" Remaining reviews: {len(cleaned_reviews)}")

# Write the cleaned set back to the same JSONL file produced by the sampling step
output_cleaned_file = "filtered_reviews.jsonl"
with open(output_cleaned_file, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(review) + "\n" for review in cleaned_reviews)

print(f" Cleaned reviews saved to '{output_cleaned_file}'")


 Removed 1 reviews with invalid 'title'
 Remaining reviews: 14999
 Cleaned reviews saved to 'filtered_reviews.jsonl'


## Semantic Chunk Product Reviews

In [11]:
# Embedding model used by the chunker here and by the vector store later on
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Semantic chunker configured with the "percentile" breakpoint threshold type
text_splitter = SemanticChunker(hf_embeddings, breakpoint_threshold_type="percentile")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the cleaned JSONL review data (one JSON object per line)
data = []
with open("filtered_reviews.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # ignore blank lines instead of failing in json.loads
            data.append(json.loads(line))

# Reviews must have MORE than this many whitespace-separated words to be chunked.
# NOTE(review): shorter reviews are skipped entirely and therefore never reach
# the vector store — confirm that this is intended.
MIN_WORDS_FOR_CHUNKING = 100

# Process & chunk reviews with a tqdm progress bar
chunks = []
print(" Chunking reviews with SemanticChunker (percentile-based)...")
for review in tqdm(data, desc="Chunking"):
    # The merge step stores `text` via .get(), so it can be missing or None.
    review_text = review.get("text") or ""
    if len(review_text.split()) <= MIN_WORDS_FOR_CHUNKING:
        continue

    product_name = review.get("title", "")
    avg_rating = review.get("average_rating", 0)
    rating_number = review.get("rating_number", 0)

    # Best effort: a review that fails to split is reported and skipped so a
    # single bad record does not abort the whole run.
    try:
        split_docs = text_splitter.create_documents([review_text])
    except Exception as e:
        print(f"Error processing review: {product_name} — {e}")
        continue

    # Every chunk carries the product-level metadata of its source review.
    chunks.extend(
        {
            "chunk": doc.page_content,
            "average_rating": avg_rating,
            "rating_number": rating_number,
            "product": product_name,
        }
        for doc in split_docs
    )

In [13]:
# Wrap each chunk in a Document: the text combines the product name and the
# review chunk, and the product-level fields are carried along as metadata.
METADATA_FIELDS = ("product", "average_rating", "rating_number")

documents_to_store = []
for chunk in chunks:
    page_text = f"Product: {chunk['product']}\nReview: {chunk['chunk']}"
    chunk_metadata = {field: chunk[field] for field in METADATA_FIELDS}
    doc = Document(page_content=page_text, metadata=chunk_metadata)
    documents_to_store.append(doc)

## Deduplicate the chunks and store them in ChromaDB

In [14]:
def hash_chunk(text):
    """Return the MD5 hex digest of ``text`` after normalisation.

    The text is stripped of surrounding whitespace and lower-cased before
    hashing, so inputs that differ only in case or in leading/trailing
    whitespace produce the same digest.
    """
    normalized = text.strip().lower()
    digest = hashlib.md5()
    digest.update(normalized.encode())
    return digest.hexdigest()

# Deduplicate on the hash of the normalised page_content. setdefault keeps the
# first document seen for each hash, and the dict preserves insertion order.
unique_chunks = {}
for doc in documents_to_store:
    unique_chunks.setdefault(hash_chunk(doc.page_content), doc)

# Documents and their content hashes, in matching order
unique_documents = list(unique_chunks.values())
unique_ids = list(unique_chunks.keys())

persist_directory = "chromadb_reviews"

# Save to Chroma. The content hashes are passed as explicit, deterministic ids
# so that re-running this cell against an existing persist directory does not
# insert the same chunks again under freshly generated ids.
# NOTE(review): this relies on the installed Chroma wrapper deduplicating or
# upserting on id — confirm for the version in use.
vectordb = Chroma.from_documents(
    documents=unique_documents,
    embedding=hf_embeddings,
    ids=unique_ids,
    persist_directory=persist_directory
)

# NOTE(review): the recorded output of this cell suggests this call emits a
# warning; newer Chroma versions may persist automatically — confirm before
# removing it.
vectordb.persist()
print(f"{len(unique_documents)} unique chunks saved to Chroma DB.")


1644 unique chunks saved to Chroma DB.


  vectordb.persist()
