In [2]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from pymongo import MongoClient
import utils, os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [3]:
# Read the source text through the project helper and report its length
data = utils.read_data()
print(f"Data loaded: {len(data)} characters")

# Embedding model reused by the semantic chunker and the vector store
# (text-embedding-3-small produces 1536-dimensional vectors)
embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


Data loaded: 45744 characters


  embedding = OpenAIEmbeddings(model='text-embedding-3-small')  # 1536-dim vectors


In [4]:

# Connect to MongoDB using the connection string from the environment.
# MongoClient(None) does not fail: it silently targets localhost:27017, so a
# missing MONGODB_URI is rejected here instead of surfacing later as a
# confusing timeout against the wrong server.
mongodb_uri = os.getenv("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; add it to your .env file or environment."
    )

client = MongoClient(mongodb_uri)
# Database holding the collections used by this notebook
db = client["RAG-evaluation"]



In [5]:
# --------------------------------------------------
# Semantic Chunking
# --------------------------------------------------
# Build a chunker that uses the embedding model to pick split points, with
# the "percentile" strategy for the breakpoint threshold.
semantic_splitter = SemanticChunker(
    embeddings=embedding,
    breakpoint_threshold_type="percentile",
)

# The whole text is passed as a single input; the result is a list of documents
semantic_docs = semantic_splitter.create_documents([data])
print(f"Number of semantic chunks: {len(semantic_docs)}")



Number of semantic chunks: 18


In [17]:
# --------------------------------------------------
# Store in new collection
# --------------------------------------------------
# NOTE(review): the collection name is kept exactly as-is (including its
# spelling); renaming it would point at a different collection than the one
# any existing data / Atlas index was created on.
semantic_collection = db["RAG-symantic-chunk"]

# from_documents() embeds and INSERTS every chunk each time it runs, so
# re-executing this cell would duplicate documents (and retrieval hits) and
# pay for the embeddings again. Insert only when the collection is empty;
# otherwise attach to the vectors that are already stored.
# index_name must match the Atlas Search index defined on this collection.
if semantic_collection.count_documents({}) == 0:
    semantic_vectorstore = MongoDBAtlasVectorSearch.from_documents(
        documents=semantic_docs,
        embedding=embedding,
        collection=semantic_collection,
        index_name="default",
    )
else:
    # To force a rebuild (e.g. after the source text changed), empty the
    # collection first and re-run this cell.
    semantic_vectorstore = MongoDBAtlasVectorSearch(
        collection=semantic_collection,
        embedding=embedding,
        index_name="default",
    )
    print("Collection already populated; reusing stored chunks.")



In [18]:
# Retriever over the semantic-chunk store: similarity search, top 5 hits
semantic_retriever = semantic_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)


In [19]:
# --------------------------------------------------
# Example Query
# --------------------------------------------------
query = "how to delay skin aging"

# Retrievers implement the Runnable interface; invoke() is the supported
# entry point, while get_relevant_documents() is deprecated.
results = semantic_retriever.invoke(query)

print("\n--- Semantic Chunk Retrieval ---")
for rank, doc in enumerate(results, start=1):
    # Show only the first 200 characters of each chunk to keep output short
    print(f"Rank {rank}: {doc.page_content[:200]}...\n")


--- Semantic Chunk Retrieval ---
Rank 1:  ey’re made up of short chains
of amino acids. In the realm of skin care, we mostly talk about peptides as building up collagen,
a protein your skin needs to keep its structure. Diﬀerent  pes of pepti...

Rank 2: Great. So how do I know which active ingredients are right
for my skin? Picking the right active ingredients depends on the speciﬁc skin concerns you’re hoping to
address. Here are a few of the most c...

Rank 3: Back to top
Sources:
 . Journal of the American Academy of Dermatology, Topical niacinamide-containing
product reduces facial skin sallowness (yellowing)
 . MedlinePlus, Azelaic Acid Topical
 . Britis...

Rank 4: Pro tip: You can use a moisturizer that has at least 30 SPF and
broad-spectrum protection to combine those two steps. When it comes to choosing a sunscreen you have options of physical sunscreens or c...

Rank 5: Moisturizing—almost to an excessive degree—and time. If that
doesn’t help, you should chat with a dermatol