# Install the required packages


In [1]:
! source ../.venv/bin/activate
! python -m pip install langchain_community langchain-openai langchain-anthropic scikit-learn bs4 pandas pyarrow matplotlib lxml langgraph "mcp[cli]" python-dotenv





In [2]:
import re, os
import tiktoken

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import SKLearnVectorStore

def count_tokens(text, model="cl100k_base"):
    """
    Count the number of tokens in the text using tiktoken.
    
    Args:
        text (str): The text to count tokens for
        model (str): The tokenizer model to use (default: cl100k_base for GPT-4)
        
    Returns:
        int: Number of tokens in the text
    """
    encoder = tiktoken.get_encoding(model)
    return len(encoder.encode(text))

def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    
    # Target the main article content for Paychex documentation 
    main_content = soup.find("article", class_="md-content__inner")
    
    # If found, use that, otherwise fall back to the whole document
    content = main_content.get_text() if main_content else soup.text
    
    # Clean up whitespace
    content = re.sub(r"\n\n+", "\n\n", content).strip()
    
    return content

def load_paychex_docs():
    """
    Load Paychex documentation from the official website.
    
    This function:
    1. Uses RecursiveUrlLoader to fetch pages from the Paychex website
    2. Counts the total documents and tokens loaded
    
    Returns:
        list: A list of Document objects containing the loaded content
        list: A list of tokens per document
    """
    print("Loading Paychex documentation...")

    # Load the documentation 
    urls = ["https://www.paychex.com/payroll", 
    ] 

    docs = []
    for url in urls:

        loader = RecursiveUrlLoader(
            url,
            max_depth=5,
            extractor=bs4_extractor,
        )

        # Load documents using lazy loading (memory efficient)
        docs_lazy = loader.lazy_load()

        # Load documents and track URLs
        for d in docs_lazy:
            docs.append(d)

    print(f"Loaded {len(docs)} documents from Paychex documentation.")
    print("\nLoaded URLs:")
    for i, doc in enumerate(docs):
        print(f"{i+1}. {doc.metadata.get('source', 'Unknown URL')}")
    
    # Count total tokens in documents
    total_tokens = 0
    tokens_per_doc = []
    for doc in docs:
        total_tokens += count_tokens(doc.page_content)
        tokens_per_doc.append(count_tokens(doc.page_content))
    print(f"Total tokens in loaded documents: {total_tokens}")
    
    return docs, tokens_per_doc

def save_llms_full(documents):
    """ Save the documents to a file """

    # Open the output file
    output_filename = "llms_full.txt"

    with open(output_filename, "w") as f:
        # Write each document
        for i, doc in enumerate(documents):
            # Get the source (URL) from metadata
            source = doc.metadata.get('source', 'Unknown URL')
            
            # Write the document with proper formatting
            f.write(f"DOCUMENT {i+1}\n")
            f.write(f"SOURCE: {source}\n")
            f.write("CONTENT:\n")
            f.write(doc.page_content)
            f.write("\n\n" + "="*80 + "\n\n")

    print(f"Documents concatenated into {output_filename}")

def split_documents(documents):
    """
    Split documents into smaller chunks for improved retrieval.
    
    This function:
    1. Uses RecursiveCharacterTextSplitter with tiktoken to create semantically meaningful chunks
    2. Ensures chunks are appropriately sized for embedding and retrieval
    3. Counts the resulting chunks and their total tokens
    
    Args:
        documents (list): List of Document objects to split
        
    Returns:
        list: A list of split Document objects
    """
    print("Splitting documents...")
    
    # Initialize text splitter using tiktoken for accurate token counting
    # chunk_size=8,000 creates relatively large chunks for comprehensive context
    # chunk_overlap=500 ensures continuity between chunks
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=8000,  
        chunk_overlap=500  
    )
    
    # Split documents into chunks
    split_docs = text_splitter.split_documents(documents)
    
    print(f"Created {len(split_docs)} chunks from documents.")
    
    # Count total tokens in split documents
    total_tokens = 0
    for doc in split_docs:
        total_tokens += count_tokens(doc.page_content)
    
    print(f"Total tokens in split documents: {total_tokens}")
    
    return split_docs

def create_embeddings():
    """
    Create embeddings using Azure OpenAI
    """
    embeddings = AzureOpenAIEmbeddings( 
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        #api_version=os.getenv("AZURE_OPENAI_VERSION"),
    )
    return embeddings

def create_vectorstore(splits):
    """
    Create a vector store from document chunks using SKLearnVectorStore.
    
    This function:
    1. Initializes an embedding model to convert text into vector representations
    2. Creates a vector store from the document chunks
    
    Args:
        splits (list): List of split Document objects to embed
        
    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")
    

    
    # Create vector store from documents using SKLearn
    persist_path = os.getcwd()+"/sklearn_vectorstore.parquet"
    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=create_embeddings(),
        persist_path=persist_path   ,
        serializer="parquet",
    )
    print("SKLearnVectorStore created successfully.")
    
    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

In [3]:
# Load the documents
documents, tokens_per_doc = load_paychex_docs()

# Save the documents to a file
save_llms_full(documents)

# Split the documents
split_docs = split_documents(documents)

# Create the vector store
vectorstore = create_vectorstore(split_docs)


Loading Paychex documentation...
Loaded 15 documents from Paychex documentation.

Loaded URLs:
1. https://www.paychex.com/payroll
2. https://www.paychex.com/payroll/enterprise-payroll
3. https://www.paychex.com/payroll/800-741-6277
4. https://www.paychex.com/payroll/888-627-4735
5. https://www.paychex.com/payroll/compare-payroll-solutions
6. https://www.paychex.com/payroll/866-709-9401
7. https://www.paychex.com/payroll/payroll-protection
8. https://www.paychex.com/payroll/800-822-8704
9. https://www.paychex.com/payroll/833-729-8200
10. https://www.paychex.com/payroll/855-263-1021
11. https://www.paychex.com/payroll/small-business-payroll
12. https://www.paychex.com/payroll/paychex-pre-check
13. https://www.paychex.com/payroll/833-299-0168
14. https://www.paychex.com/payroll/800-741-6277.
15. https://www.paychex.com/payroll/switch-payroll-companies
Total tokens in loaded documents: 35553
Documents concatenated into llms_full.txt
Splitting documents...
Created 15 chunks from documents.


In [4]:
# Create retriever to get relevant documents (k=3 means return top 3 matches)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    
# Get relevant documents for the query
query = "What are the paychex services ?"    
relevant_docs = retriever.invoke(query)
print(f"Retrieved {len(relevant_docs)} relevant documents")

for d in relevant_docs:
    print(d.metadata['source'])
    print(d.page_content[0:500])
    print("\n--------------------------------\n")

Retrieved 3 relevant documents
https://www.paychex.com/payroll/payroll-protection
Protection Against Payroll Interruptions - Paychex Promise

  

Skip to main content
Skip to footer site map

 

Payroll Services

Paychex Promise

Payroll Protection and More Through Paychex Promise®
Paychex Promise will be offered free of charge to business owners for the first three months of service, and thereafter will be offered as a complete suite of services for a fixed, all-inclusive fee.1 Program and/or any of the services offered as part of Program are subject to eligibility and are v

--------------------------------

https://www.paychex.com/payroll/payroll-protection
Protection Against Payroll Interruptions - Paychex Promise

  

Skip to main content
Skip to footer site map

 

Payroll Services

Paychex Promise

Payroll Protection and More Through Paychex Promise®
Paychex Promise will be offered free of charge to business owners for the first three months of service, and thereafter will be of

In [5]:
from langchain_core.tools import tool

@tool
def paychex_website_query_tool(query: str):
    """
    Query the Paychex website using a retriever.
    
    Args:
        query (str): The query to search the documentation with

    Returns:
        str: A str of the retrieved documents
    """
    retriever = SKLearnVectorStore(
    embedding=create_embeddings(), 
    persist_path=os.getcwd()+"/sklearn_vectorstore.parquet", 
    serializer="parquet").as_retriever(search_kwargs={"k": 3})

    relevant_docs = retriever.invoke(query)
    print(f"Retrieved {len(relevant_docs)} relevant documents")
    formatted_context = "\n\n".join([f"==DOCUMENT {i+1}==\n{doc.page_content}" for i, doc in enumerate(relevant_docs)])
    return formatted_context

In [7]:
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),  # Your GPT-4 deployment name
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-11-20",
    temperature=0
)


# Create the messages with system instructions
messages = [
    {
        "role": "system",
        "content": """You are a helpful assistant that can answer questions about the Paychex website. 
        Use the provided tools to search the Paychex documentation when needed.
        If you don't know the answer, say "I don't know"."""
    },
    {
        "role": "user",
        "content": "What are the paychex services?"
    }
]
 
message = response = llm.invoke(messages, tools=[paychex_website_query_tool])

message.pretty_print()

PydanticSerializationError: Unable to serialize unknown type: <class 'pydantic._internal._model_construction.ModelMetaclass'>