**<h1>Import All Necessary Libraries and Modules</h1>**

In [10]:
import os
import glob
from dotenv import load_dotenv

from openai import OpenAI
import tiktoken

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from chromadb.utils import embedding_functions

from langchain.text_splitter import RecursiveCharacterTextSplitter


In [11]:
# Load settings from a .env file (python-dotenv) before reading them from the
# process environment.
load_dotenv()

# The key is kept in its own variable because the embedding step receives it
# explicitly; the OpenAI client is built from the same value.
openai_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_key)

**<h1>Load in Data</h1>**

In [12]:
# Ensure folder_path points to the directory and includes a wildcard for files
def load_text_files(folder_path):
    """Load every non-empty text file matched by a glob pattern.

    Args:
        folder_path (str): Glob pattern selecting the files to read, e.g.
            "cases/*.txt". Despite the name it must contain the wildcard:
            a bare directory path matches only the directory itself, which is
            skipped because it is not a regular file.

    Returns:
        list of dict: One {"text": ..., "source": ...} entry per matched file
        whose stripped content is non-empty, ordered by file path. "text" is
        the stripped file content and "source" is the file's base name.
    """
    all_text = []
    # glob.glob() yields matches in a filesystem-dependent order. Sorting keeps
    # the result stable between runs, so callers that derive identifiers from
    # list position get the same ones every time.
    for filepath in sorted(glob.glob(folder_path)):
        if not os.path.isfile(filepath):
            continue  # skip directories (or anything else) matching the pattern
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read().strip()
        if text:  # whitespace-only files carry nothing worth indexing
            all_text.append({"text": text, "source": os.path.basename(filepath)})

    return all_text

In [13]:
# implementation
# The case-document directory is read from the "20casedocs" environment variable.
casedocs_dir = os.getenv("20casedocs")
if casedocs_dir is None:
    # Without this check os.path.join(None, ...) fails with a TypeError that
    # does not mention the missing variable.
    raise RuntimeError(
        'Environment variable "20casedocs" is not set; it must point to the '
        "directory containing the case .txt files."
    )

# load_text_files expects a glob pattern, so append the wildcard here.
folder_path = os.path.join(casedocs_dir, "*.txt")
all_texts = load_text_files(folder_path)

**<h1>Chunking</h1>**

In [14]:
# chunking

def chunk_casedocs(all_texts, chunk_size=8100, chunk_overlap=500):
    """
    Break each loaded document into token-measured chunks with metadata.

    Args:
        all_texts (list of dict): Documents as dictionaries with "text" and "source" keys.
        chunk_size (int): Upper bound on chunk length, measured in tokens. Default is 8100.
        chunk_overlap (int): Tokens shared between neighbouring chunks. Default is 500.

    Returns:
        list of dict: One dictionary per chunk with "id" (built from the document
        and chunk positions), "text", and "metadata" holding the originating
        "source" and the chunk's "token_count".
    """
    # cl100k_base is the encoding used by text-embedding-ada-002
    encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text):
        """Return the number of cl100k_base tokens in text."""
        return len(encoding.encode(text))

    # Separators are tried in order, from the coarsest structure (blank-line
    # runs) down to single characters; lengths are measured in tokens.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n\n", "\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=count_tokens,
        is_separator_regex=False,
    )

    # Flatten documents -> chunks, keeping both positions for the id.
    return [
        {
            "id": f"doc_{doc_no}_chunk_{piece_no}",
            "text": piece,
            "metadata": {
                "source": doc["source"],
                "token_count": count_tokens(piece),
            },
        }
        for doc_no, doc in enumerate(all_texts)
        for piece_no, piece in enumerate(splitter.split_text(doc["text"]))
    ]


In [15]:
# implementation
chunked_texts = chunk_casedocs(all_texts)
print(f"Total chunked segments: {len(chunked_texts)}")

# Re-count tokens independently of the stored metadata as a sanity check on the
# splitter. default=0 keeps the cell from failing with "max() arg is an empty
# sequence" when no chunks were produced.
tokenizer = tiktoken.get_encoding("cl100k_base")
max_tokens = max(
    (len(tokenizer.encode(chunk["text"])) for chunk in chunked_texts),
    default=0,
)
print(f"Max tokens in any chunk: {max_tokens}")

Total chunked segments: 36
Max tokens in any chunk: 8016


**<h1>Embedding</h1>**

In [16]:
# embedding

def add_to_chroma_collection(
    chunked_texts,
    openai_key,
    collection_name="case-docs-collection",
    db_path="database",
    model_name="text-embedding-ada-002",
):
    """
    Stores chunked texts in a persistent Chroma collection using OpenAI embeddings.

    The collection is created on first use and reused afterwards. Entries are
    written with upsert, so calling this again with the same ids updates the
    stored entries rather than adding them a second time.

    Args:
        chunked_texts (list of dict): List of dictionaries containing "id", "text", and "metadata".
        openai_key (str): OpenAI API key for the embedding function.
        collection_name (str): Name of the Chroma collection. Default is "case-docs-collection".
        db_path (str): Directory of the persistent Chroma database. Default is "database".
        model_name (str): OpenAI embedding model. Default is "text-embedding-ada-002".

    Returns:
        chromadb.Collection: The Chroma collection holding the documents. When
        chunked_texts is empty the collection is returned without writing to it.
    """
    # Initialize OpenAI embedding function
    embedding_function = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_key,
        model_name=model_name,
    )

    # Initialize Chroma client
    chroma_client = chromadb.PersistentClient(path=db_path)

    # Pass the embedder on both the create and the reuse path, so documents
    # written on a re-run are embedded with the same OpenAI model as the
    # originals. get_or_create_collection also avoids depending on the element
    # type returned by list_collections().
    chroma_collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function,
    )

    # Nothing to embed: skip the write (and the OpenAI request) entirely.
    if not chunked_texts:
        return chroma_collection

    # Prepare data for Chroma
    ids = [item["id"] for item in chunked_texts]
    documents = [item["text"] for item in chunked_texts]
    metadatas = [item["metadata"] for item in chunked_texts]

    # upsert keeps the call idempotent when the same ids are submitted again.
    chroma_collection.upsert(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
    )

    return chroma_collection


In [17]:
# implementation
# Embed all chunks and store them in the default "case-docs-collection" Chroma
# collection. The returned collection is not bound to a name here.
add_to_chroma_collection(chunked_texts, openai_key)

In [19]:
# Fall back to the "database" directory used by add_to_chroma_collection when
# the "databasepath" environment variable is not set.
db_path = os.getenv("databasepath", "database")

# NOTE: this rebinds `client`, which held the OpenAI client created earlier in
# the notebook; from here on `client` is the Chroma client.
client = chromadb.PersistentClient(path=db_path)
collections = client.list_collections()

for collection in collections:
    # Depending on the chromadb release, list_collections() yields Collection
    # objects or plain name strings; print the name in either case.
    print(getattr(collection, "name", collection))

case-docs-collection
