In [1]:
import os
import re
from typing import Optional, List, Tuple
from dotenv import load_dotenv
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Load environment variables (presumably from a local .env file) so that
# OPENAI_API_KEY can be read with os.getenv() below.
load_dotenv()

# Initialize persistent ChromaDB client
# NOTE: the path is relative, so it depends on the current working directory.
client = chromadb.PersistentClient(path="../data/fre/.chromadb")

# Set up OpenAI embedding function
# os.getenv() yields None when OPENAI_API_KEY is unset; the key is not
# validated here.
embedding_function = OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=os.getenv("OPENAI_API_KEY")
)

# Create or get ChromaDB collection
# The same embedding function is attached whether the "fre_rules" collection
# is newly created or already exists.
collection = client.get_or_create_collection(
    name="fre_rules",
    embedding_function=embedding_function
)

def clean_markdown(text: str) -> str:
    """
    Strip markdown artifacts from a piece of text.

    Two literal substitutions are applied in order: bold markers (two
    consecutive asterisks) are deleted, and a backslash-escaped period
    is turned into a plain period.

    Args:
        text: Markdown string.

    Returns:
        The string with those markers removed.
    """
    # (marker, replacement) pairs, applied left to right.
    substitutions = (
        ("**", ""),
        ("\\.", "."),
    )
    cleaned = text
    for marker, plain in substitutions:
        cleaned = cleaned.replace(marker, plain)
    return cleaned

def extract_rule_id(rule_title: str) -> Optional[str]:
    """
    Derive an identifier such as 'Rule_101' from a rule title.

    The first occurrence of the literal text 'Rule ' followed by one or
    more digits is used; anything else in the title is ignored.

    Args:
        rule_title: The full title string of a rule.

    Returns:
        'Rule_<digits>' for the first match, or None when the title
        contains no such pattern (a notice is printed in that case).
    """
    found = re.search(r'Rule (\d+)', rule_title)

    # Guard clause: report the unmatched title and bail out.
    if found is None:
        print(f"{rule_title} not found!")
        return None

    number = found.group(1)
    return "Rule_" + number

def chunk_by_h3_with_article(markdown_text: str) -> List[Tuple[str, str, str]]:
    """
    Split markdown into chunks based on ## (article) and ### (rule) headers.

    A line starting with '## ' sets the article that applies to the rules
    that follow it. A line starting with '### ' starts a new rule; every
    subsequent non-header line is collected as that rule's content until
    the next '### ' line or the end of the text. Lines that appear before
    the first rule header are discarded. Header titles are passed through
    clean_markdown.

    Each rule is labeled with the article that was current when the rule's
    own header was read, so a later '## ' header never relabels a rule
    that is still being collected.

    Args:
        markdown_text: The full markdown text.

    Returns:
        A list of tuples in the format (article_title, rule_title,
        rule_content). article_title is "" for rules that appear before any
        article header; rule_content is stripped of surrounding whitespace.
    """
    current_article: Optional[str] = None
    current_rule: Optional[str] = None
    # Article in effect when current_rule's header was seen. Tracked
    # separately because current_article may advance (on a '## ' line)
    # before the pending rule is emitted.
    rule_article: Optional[str] = None
    current_content: List[str] = []
    chunks: List[Tuple[str, str, str]] = []

    def _flush() -> None:
        """Emit the rule collected so far, if there is one."""
        if current_rule:
            body = "\n".join(current_content).strip()
            chunks.append((rule_article or "", current_rule, body))

    for line in markdown_text.splitlines():
        if line.startswith("## "):
            # Only affects rules whose header comes after this line.
            current_article = clean_markdown(line[3:].strip())
        elif line.startswith("### "):
            _flush()
            current_rule = clean_markdown(line[4:].strip())
            rule_article = current_article
            current_content = []
        elif current_rule:
            current_content.append(line)

    # The final rule has no following header to trigger its emission.
    _flush()

    return chunks

def process_markdown_file(filepath: str) -> None:
    """
    Process a markdown file, extract chunks by rule, and add them to ChromaDB.

    The file is split with chunk_by_h3_with_article. Each chunk whose title
    yields a rule ID (via extract_rule_id) is added to the module-level
    `collection` as one document, using the rule ID as the document ID and
    storing the article title, rule title, and source file path as
    metadata. Chunks without a recognizable rule ID are reported and
    skipped. A failure to add one rule is printed and does not stop the
    remaining rules from being processed.

    Args:
        filepath: Path to the markdown file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        markdown = f.read()

    # Single pass: resolve the rule ID once per chunk so that unrecognized
    # titles are reported once rather than on every repeated lookup.
    for article_title, rule_title, content in chunk_by_h3_with_article(markdown):
        rule_id = extract_rule_id(rule_title)
        if rule_id is None:
            print(f"Skipping rule with unrecognized title format: {rule_title}")
            continue

        metadata = {
            "article": article_title,
            "rule": rule_title,
            "file": filepath
        }

        # NOTE(review): add() may not overwrite an entry whose ID already
        # exists in the collection; if re-runs should refresh stored text,
        # confirm against the Chroma docs whether upsert() is the right call.
        try:
            collection.add(
                documents=[content],
                metadatas=[metadata],
                ids=[rule_id]
            )
        except Exception as e:
            # Best-effort ingestion: report the failure and keep going.
            print(f"Failed to add Rule: {rule_title} | Article: {article_title}")
            print(f"Error: {e}")

# Process your single file
# Runs the ingestion for one markdown file. The path is relative, so it
# depends on the current working directory.
filepath: str = "../data/fre/federal-rules-of-evidence-dec-1-2024_0.md"
process_markdown_file(filepath)


In [None]:
# Fetch the entry stored under the ID "Rule_803" and print its metadata
# (the "article", "rule", and "file" keys written by process_markdown_file).
result = collection.get(ids=["Rule_803"])
print(result["metadatas"][0])  

{'article': 'ARTICLE VIII. HEARSAY', 'rule': 'Rule 803. Exceptions to the Rule Against Hearsay—Regardless of  Whether the Declarant Is Available as a Witness', 'file': '../data/fre/federal-rules-of-evidence-dec-1-2024_0.md'}


In [None]:
# Print the text content of the first entry in `result`, which is assigned
# by the collection.get(...) call in the preceding cell.
print(result["documents"][0])  

The following are not excluded by the rule against hearsay, re gardless of whether the declarant is available as a witness:  

(1) *Present Sense Impression.* A statement describing or explaining an event or condition, made while or immediately  after the declarant perceived it.    
(2) *Excited Utterance.* A statement relating to a startling  event or condition, made while the declarant was under the  stress of excitement that it caused.    
(3) *Then-Existing Mental, Emotional, or Physical Condition.* A  statement of the declarant’s then-existing state of mind (such  as motive, intent, or plan) or emotional, sensory, or physical  condition (such as mental feeling, pain, or bodily health), but  not including a statement of memory or belief to prove the  fact remembered or believed unless it relates to the validity or  terms of the declarant’s will.    
(4) *Statement Made for Medical Diagnosis or Treatment.* A  statement that:    

    (A) is made for—and is reasonably pertinent to—me