In [1]:
# Scraped from https://en.wikipedia.org/wiki/Llama
# The article text contains non-ASCII characters (e.g. IPA pronunciation symbols),
# so decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("Llama_Wikipedia_Cleaned.txt", "r", encoding="utf-8") as file:
    content = file.read()

In [2]:
#chunking in the most naive way
def split_content_into_equal_length(content, char_length):
    """Split ``content`` into consecutive, non-overlapping fixed-width slices.

    Slicing is purely positional, so words and sentences may be cut in the
    middle. The last slice is shorter when ``len(content)`` is not a multiple
    of ``char_length``.

    Args:
        content: The sequence (typically a string) to slice.
        char_length: Width of each slice; must be a positive integer.

    Returns:
        A list of slices in original order; empty when ``content`` is empty.

    Raises:
        ValueError: If ``char_length`` is less than 1.
    """
    # A negative step would make range() yield nothing and silently drop all
    # of the content, so reject non-positive widths up front.
    if char_length < 1:
        raise ValueError(f"char_length must be a positive integer, got {char_length!r}")
    return [content[start:start + char_length] for start in range(0, len(content), char_length)]

# Preview the first three 300-character chunks.
split_content_into_equal_length(content, char_length=300)[:3]

# Drawback: fixed-width slicing ignores word and sentence boundaries,
# so chunks begin and end in the middle of a word.

['Llama - Wikipedia > Excerpt > Not to be confused with Ilama or Lama. --- Not to be confused with Ilama "Ilama (disambiguation)") or Lama. Eukaryota Kingdom: Chordata Class: Artiodactyla Family: Lama "Lama (genus)") Species: The llama (; Spanish pronunciation: [\\[ˈʎama\\]]( "Help:IPA/Spanish") or \\[ˈʝ',
 'ama\\]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tas',
 'ks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have origina']

In [3]:
# Next approach: split on sentence-ending punctuation and group whole sentences into chunks
def simple_sentence_splitter(text, chunk_size=5):
    """Group the sentences of ``text`` into chunks of ``chunk_size`` sentences.

    Sentences are detected naively by splitting on a period followed by a
    space (``'. '``); other terminators such as ``'? '`` or ``'! '`` do not
    start a new sentence.

    Args:
        text: The text to split.
        chunk_size: Number of sentences per chunk; must be a positive integer.

    Returns:
        A list of chunk strings. Each chunk ends with sentence-ending
        punctuation. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Splitting removes the '. ' separators; they are restored when joining below.
    sentences = text.split('. ')
    chunks = []

    for start in range(0, len(sentences), chunk_size):
        # Trailing whitespace appears when the text itself ends with '. ' (the
        # split then produces an empty final fragment); drop it.
        chunk = '. '.join(sentences[start:start + chunk_size]).rstrip()
        if not chunk.strip():
            # Nothing but the empty fragment left over from a trailing separator.
            continue
        # Restore the period that split() removed from the chunk's last sentence,
        # but do not double up when the tail of the text already ends with one.
        if not chunk.endswith(('.', '?', '!')):
            chunk += '.'
        chunks.append(chunk)

    return chunks

# Group the article into five-sentence chunks and preview the first three.
content_chunks = simple_sentence_splitter(content, chunk_size=5)
print(content_chunks[:3])

# Chunks now end on sentence boundaries, but ". " is only one of many possible
# sentence terminators; enumerating all of them by hand would take a lot of time.

['Llama - Wikipedia > Excerpt > Not to be confused with Ilama or Lama. --- Not to be confused with Ilama "Ilama (disambiguation)") or Lama. Eukaryota Kingdom: Chordata Class: Artiodactyla Family: Lama "Lama (genus)") Species: The llama (; Spanish pronunciation: [\\[ˈʎama\\]]( "Help:IPA/Spanish") or \\[ˈʝama\\]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions.', 'When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently mig

In [4]:
#the next way of splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    # Separators are tried in order, coarsest first. The trailing " " and ""
    # fallbacks let the splitter break up a piece that contains none of the
    # earlier separators; without them such a piece is emitted whole even when
    # it is longer than chunk_size.
    separators=["\n\n", "\n", ". ", "? ", "! ", " ", ""],
    chunk_size=600,  # The maximum size of your chunks
    chunk_overlap=50,  # The maximum overlap between chunks
)

In [6]:
# The single article string is wrapped in a list because create_documents is given a list of texts.
langchain_chunks = text_splitter.create_documents(texts=[content])

The result is a list of `Document` objects, each representing one chunk. We can inspect individual chunks by index.

In [7]:
# Show only the first chunk; slicing keeps the result a one-element list.
langchain_chunks[0:1]

[Document(page_content='Llama - Wikipedia > Excerpt > Not to be confused with Ilama or Lama. --- Not to be confused with Ilama "Ilama (disambiguation)") or Lama. Eukaryota Kingdom: Chordata Class: Artiodactyla Family: Lama "Lama (genus)") Species: The llama (; Spanish pronunciation: [\\[ˈʎama\\]]( "Help:IPA/Spanish") or \\[ˈʝama\\]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. Llamas are social animals and live with others as a herd')]

In [8]:
# Second configuration: larger chunks with more overlap, using the splitter's
# default separators (none are passed here).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=300, chunk_size=1500)

# Re-chunk the article with the new settings; this rebinds langchain_chunks.
langchain_chunks = text_splitter.create_documents(texts=[content])

# Display just the first document.
langchain_chunks[0:1]

[Document(page_content='Llama - Wikipedia > Excerpt > Not to be confused with Ilama or Lama. --- Not to be confused with Ilama "Ilama (disambiguation)") or Lama. Eukaryota Kingdom: Chordata Class: Artiodactyla Family: Lama "Lama (genus)") Species: The llama (; Spanish pronunciation: [\\[ˈʎama\\]]( "Help:IPA/Spanish") or \\[ˈʝama\\]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, a

In [9]:
import chromadb

# Instantiate a Chroma client
chroma_client = chromadb.Client()

# Fetch the "llama_chunks" collection, creating it if it does not exist yet
collection = chroma_client.get_or_create_collection(name="llama_chunks")

# Build parallel lists (one entry per chunk) so that all chunks are sent to
# Chroma in a single batched call instead of one call per chunk.
chunk_ids = [f"chunk_{index}" for index in range(len(langchain_chunks))]
chunk_documents = [chunk.page_content for chunk in langchain_chunks]
chunk_metadatas = [
    {"source": "https://en.wikipedia.org/wiki/Llama", "chunk_index": index}
    for index in range(len(langchain_chunks))
]

# The ids are deterministic, so upsert (rather than add) keeps this cell
# re-runnable: running it again overwrites the existing entries instead of
# attempting to insert duplicate ids. The guard skips the call when there is
# nothing to store, matching the original loop, which made no call in that case.
if chunk_ids:
    collection.upsert(
        ids=chunk_ids,
        documents=chunk_documents,
        metadatas=chunk_metadatas,
    )

# Query the collection for the single chunk closest to the question
results = collection.query(
    query_texts=["What are llamas used for?"],  # One query string
    n_results=1,  # Number of results to return per query
)

# Print the query results
print(results)

{'ids': [['chunk_0']], 'distances': [[0.677453339099884]], 'metadatas': [[{'chunk_index': 0, 'source': 'https://en.wikipedia.org/wiki/Llama'}]], 'embeddings': None, 'documents': [['Llama - Wikipedia > Excerpt > Not to be confused with Ilama or Lama. --- Not to be confused with Ilama "Ilama (disambiguation)") or Lama. Eukaryota Kingdom: Chordata Class: Artiodactyla Family: Lama "Lama (genus)") Species: The llama (; Spanish pronunciation: [\\[ˈʎama\\]]( "Help:IPA/Spanish") or \\[ˈʝama\\]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European s