In [1]:
import os

from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [2]:
# Read the Google Generative AI key from the environment instead of embedding it
# in the notebook, where it is exposed to anyone who can see the file or its history.
# NOTE: the key that used to be hardcoded on this line should be treated as
# compromised and revoked.
api = os.environ.get("GOOGLE_API_KEY")
if not api:
    print("WARNING: GOOGLE_API_KEY is not set; export it before creating the embeddings client.")

# Setting up directories

In [4]:
# '__file__' is quoted, so it is a literal file name rather than the module
# attribute: abspath() resolves it against the current working directory, and
# taking the head of that path yields the working directory itself.
_anchor_path = os.path.abspath('__file__')
current_dir = os.path.split(_anchor_path)[0]

# Persisted vector stores and the source text both live under that directory.
db_dir = os.path.join(current_dir, "db")
file_path = os.path.join(current_dir, "books", "romeo_and_juliet.txt")

# Checking file existence

In [8]:
# Stop early with a descriptive error when the source text is missing.
book_is_present = os.path.exists(file_path)
if not book_is_present:
    missing_book_message = f"The file {file_path} does not exist. Please check the path."
    raise FileNotFoundError(missing_book_message)


# file loader

In [11]:
# Read the source text from disk, decoding it as UTF-8.
loader = TextLoader(
    file_path,
    encoding="utf-8",
)
documents = loader.load()

# embeddings

In [13]:
# Embedding client shared by every vector store this notebook builds or queries.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=api,
    model="models/embedding-001",
)

In [14]:
def create_vector_store(docs, store_name: str) -> None:
    """Embed ``docs`` into a persistent Chroma store unless one already exists.

    The store is written to ``<db_dir>/<store_name>``. The presence of that
    directory is the only signal used to decide whether the store was built
    before, so a directory left behind by a failed or interrupted build is
    treated as an existing store; delete it manually to force a rebuild.

    Args:
        docs: Document chunks to embed; passed straight to
            ``Chroma.from_documents`` together with the notebook-level
            ``embeddings`` client.
        store_name: Name of the sub-directory of ``db_dir`` holding this store.
    """
    persistent_directory = os.path.join(db_dir, store_name)

    # Nothing to do when the target directory is already on disk.
    if os.path.exists(persistent_directory):
        print(
            f"Vector store {store_name} already exists. No need to initialize.")
        return

    print(f"\n--- Creating vector store {store_name} ---")
    # The store is persisted to disk; later cells reopen it by directory, so the
    # in-memory handle is not kept.
    Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory
    )
    print(f"--- Finished creating vector store {store_name} ---")

1.
Character-based splitting

In [15]:
# 1. Character-based Splitting
# Aims for chunks of about 1000 characters with a 100-character overlap.
# The size is a target rather than a hard cap: this cell's output reports
# several chunks longer than the requested 1000 characters.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
char_docs = char_splitter.split_documents(documents)
create_vector_store(docs=char_docs, store_name="chroma_db_char")

Created a chunk of size 1105, which is longer than the specified 1000
Created a chunk of size 1437, which is longer than the specified 1000
Created a chunk of size 1871, which is longer than the specified 1000
Created a chunk of size 1015, which is longer than the specified 1000
Created a chunk of size 1006, which is longer than the specified 1000
Created a chunk of size 1054, which is longer than the specified 1000
Created a chunk of size 1432, which is longer than the specified 1000
Created a chunk of size 1367, which is longer than the specified 1000
Created a chunk of size 2178, which is longer than the specified 1000
Created a chunk of size 1390, which is longer than the specified 1000
Created a chunk of size 1502, which is longer than the specified 1000
Created a chunk of size 1410, which is longer than the specified 1000
Created a chunk of size 1741, which is longer than the specified 1000
Created a chunk of size 1184, which is longer than the specified 1000
Created a chunk of s


--- Using Character-based Splitting ---

--- Creating vector store chroma_db_char ---
--- Finished creating vector store chroma_db_char ---


2. Sentence-transformers token-based splitting

In [17]:
# 2. Sentence-transformers Token Splitting
# Splits text by token count using the tokenizer of a sentence-transformers
# model; chunks are cut on token counts, not at sentence boundaries.
# The first run downloads the model (model.safetensors is 438M in this cell's
# output), which is slow regardless of the chunk size chosen here.
print("\n--- Using Sentence-based Splitting ---")
# `tokens_per_chunk` is the argument this splitter uses to size its chunks;
# `chunk_size` is only forwarded to the base class and does not bound them.
sent_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=200)
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")



--- Using Sentence-based Splitting ---



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/sentence-transformers/all-mpnet-base-v2/78c0197b6159d92658e319bc1d72e4c73a9a03dd03815e70e555c5ef05615658?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1724540135&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDU0MDEzNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9zZW50ZW5jZS10cmFuc2Zvcm1lcnMvYWxsLW1wbmV0LWJhc2UtdjIvNzhjMDE5N2I2MTU5ZDkyNjU4ZTMxOWJjMWQ3MmU0YzczYTlhMDNkZDAzODE1ZTcwZTU1NWM1ZWYwNTYxNTY1OD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=dJbkrQBwVM6qj-q3TJkkV-qwWhYufLWsNPSHvDhMDZpi-u5TgplJpQ8ELUrAlnocNUppe5F4d0RnZXNHnrvm-oIxmfegYI4DvFdmaYyVspX9b5xKlF1JkvyHib1dR7-lRLrWxt24NIb2Mls4993TmpXme64LrQdBwjd1d2gCoXyud%7EaoQ-SUwKR3CiftIGg4xMRnSpfLvmSZvRmWKKqmeFK7rqoldmo7uPNDBWMO7UtQpI8oCYZRTehOFJEUJkcvsInCAW2W0vuZyv1BlPuO-sZKBumiKxRRTwAB5so57%7EO18Iwus%7E6n%7EA1won7z88WHPb9vYmwB-

model.safetensors:  31%|###1      | 136M/438M [00:00<?, ?B/s]

KeyboardInterrupt: 

3. Token-based splitting

In [18]:
# 3. Token-based Splitting
# Chunk length is measured in tokens rather than characters: 512 tokens per
# chunk with no overlap between neighbouring chunks.
# Handy when the downstream model enforces a strict token limit.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=0,
)
token_docs = token_splitter.split_documents(documents)
create_vector_store(docs=token_docs, store_name="chroma_db_token")


--- Using Token-based Splitting ---

--- Creating vector store chroma_db_token ---
--- Finished creating vector store chroma_db_token ---


4. Recursive text splitter

In [19]:
# 4. Recursive Character-based Splitting
# Tries to cut at natural boundaries (paragraphs, sentences) while staying
# within the character budget: 1000 characters per chunk, 100 of overlap.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
rec_char_docs = rec_char_splitter.split_documents(documents)
create_vector_store(docs=rec_char_docs, store_name="chroma_db_rec_char")

# 5. Custom Splitting
# Banner only: the custom splitter class is defined and used further down.
# Custom logic suits documents whose structure the stock splitters cannot handle.
print("\n--- Using Custom Splitting ---")


--- Using Recursive Character-based Splitting ---

--- Creating vector store chroma_db_rec_char ---
--- Finished creating vector store chroma_db_rec_char ---

--- Using Custom Splitting ---


5. Custom text splitter

In [20]:
class CustomTextSplitter(TextSplitter):
    """Example custom splitter that breaks text into paragraphs."""

    def split_text(self, text: str) -> list:
        """Split ``text`` on blank lines and return the non-empty paragraphs.

        Args:
            text: Raw text to split.

        Returns:
            The pieces between two consecutive newlines, in their original
            order and with their original content. Pieces that are empty or
            whitespace-only are dropped, so runs of several blank lines do not
            produce empty chunks to embed.
        """
        # Example custom logic: a paragraph ends at two consecutive newlines.
        return [piece for piece in text.split("\n\n") if piece.strip()]


# Split the loaded documents by paragraph and index the resulting chunks.
custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)
create_vector_store(docs=custom_docs, store_name="chroma_db_custom")


--- Creating vector store chroma_db_custom ---
--- Finished creating vector store chroma_db_custom ---


In [21]:
def query_vector_store(store_name, query):
    """Print the best match for ``query`` from a persisted Chroma store.

    Opens the store found at ``<db_dir>/<store_name>`` with the notebook-level
    ``embeddings`` client, retrieves at most one document whose similarity
    score passes the 0.1 threshold, and prints its content and source.
    Prints a notice instead when the store directory does not exist.

    Args:
        store_name: Name of the sub-directory of ``db_dir`` holding the store.
        query: Question to search for.
    """
    store_path = os.path.join(db_dir, store_name)
    if not os.path.exists(store_path):
        print(f"Vector store {store_name} does not exist.")
        return

    print(f"\n--- Querying the Vector Store {store_name} ---")
    store = Chroma(
        embedding_function=embeddings,
        persist_directory=store_path,
    )
    search_options = {"k": 1, "score_threshold": 0.1}
    retriever = store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_options,
    )
    matches = retriever.invoke(query)

    # Show each hit followed by its source when metadata is present.
    print(f"\n--- Relevant Documents for {store_name} ---")
    for position, match in enumerate(matches, start=1):
        print(f"Document {position}:\n{match.page_content}\n")
        if match.metadata:
            print(f"Source: {match.metadata.get('source', 'Unknown')}\n")

In [22]:
# The question posed to every vector store
query = "How did Juliet die?"

# Run the same query against each store, in the order the stores were built,
# so the results of the different splitters can be compared.
for store_name in (
    "chroma_db_char",
    "chroma_db_sent",
    "chroma_db_token",
    "chroma_db_rec_char",
    "chroma_db_custom",
):
    query_vector_store(store_name, query)


--- Querying the Vector Store chroma_db_char ---

--- Relevant Documents for chroma_db_char ---
Document 1:
NURSE.
O lamentable day!

LADY CAPULET.
What is the matter?

NURSE.
Look, look! O heavy day!

LADY CAPULET.
O me, O me! My child, my only life.
Revive, look up, or I will die with thee.
Help, help! Call help.

 Enter Capulet.

CAPULET.
For shame, bring Juliet forth, her lord is come.

NURSE.
She’s dead, deceas’d, she’s dead; alack the day!

LADY CAPULET.
Alack the day, she’s dead, she’s dead, she’s dead!

CAPULET.
Ha! Let me see her. Out alas! She’s cold,
Her blood is settled and her joints are stiff.
Life and these lips have long been separated.
Death lies on her like an untimely frost
Upon the sweetest flower of all the field.

NURSE.
O lamentable day!

LADY CAPULET.
O woful time!

CAPULET.
Death, that hath ta’en her hence to make me wail,
Ties up my tongue and will not let me speak.

 Enter Friar Lawrence and Paris with Musicians.

FRIAR LAWRENCE.
Come, is the bride ready to 