# Step 1: Setup prerequisites

In [2]:
import os
from pymongo import MongoClient
from utils import track_progress

In [None]:
db_password = "tFdH6gHBNkU3fnjm"
db_user = "visilvestre_db_user"

# If you are using your own MongoDB Atlas cluster, use the connection string for your cluster here
MONGODB_URI = os.environ.get(f"mongodb+srv://{db_user}:{db_password}@cluster0.ft0kvvw.mongodb.net/retryWrites=true&w=majority&appName=Cluster0")

#Initialize a MongoDB Python Client
mongodb_client = MongoClient(MONGODB_URI)

# Check the connection to the server
mongodb_client.admin.command("ping")

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 68b0d3a46016e003bbf68cac, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [31]:
# Track progress of key steps-- DO NOT CHANGE
track_progress("cluster_creation", "ai_agents_lab")

Tracking progress for task cluster_creation


MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?

### **Pick an LLM provider**

In [30]:
SERVERLESS_URL = os.environ.get("SERVERLESS_URL")

# Can be one of "aws", "google", "microsoft"
LLM_PROVIDER = "aws"

# Step 2: Import data into MongoDB

In [5]:
import json

In [6]:
""" DO NOT CHANGE THE VALUES ASSIGNED TO THE VARIABLES BELOW """
# Database name
DB_NAME = "mongodb_genai_devday_agents"

# Name of the collection with full documents- used for summarization
FULL_COLLECTION_NAME = "mongodb_docs"

# Name of the collection for vector search- used for Q&A
VS_COLLECTION_NAME = "mongodb_docs_embeddings"

# Name of the vector search index
VS_INDEX_NAME = "vector_index"

In [7]:
# Connect to the "VS_COLLECTION_NAME" collection
vs_collection = mongodb_client[DB_NAME][VS_COLLECTION_NAME]

# Connect to the "FULL_COLLECTION_NAME" collection
full_collection = mongodb_client[DB_NAME][FULL_COLLECTION_NAME]

In [8]:
# Insert a dataset of MongoDB docs with embeddings into the "VS_COLLECTION_NAME" collection
with open(f"../data/{VS_COLLECTION_NAME}.json", "r") as data_file:
    json_data = data_file.read()
    
data = json.loads(json_data)

print(f"Deleting existing documents from the '{VS_COLLECTION_NAME}' collection...")
vs_collection.delete_many({})
vs_collection.insert_many(data)
print(f"{vs_collection.count_documents({})} documents inserted into the '{VS_COLLECTION_NAME}' collection.")

Deleting existing documents from the 'mongodb_docs_embeddings' collection...


ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 68b0d3a46016e003bbf68cac, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [9]:
# Insert a dataset of MongoDB docs with embeddings into the "FULL_COLLECTION_NAME" collection
with open(f"../data/{FULL_COLLECTION_NAME}.json", "r") as data_file:
    json_data = data_file.read()
    
data = json.loads(json_data)

print(f"Deleting existing documents from the '{FULL_COLLECTION_NAME}' collection...")
full_collection.delete_many({})
full_collection.insert_many(data)
print(f"{full_collection.count_documents({})} documents inserted into the '{FULL_COLLECTION_NAME}' collection.")

Deleting existing documents from the 'mongodb_docs' collection...


KeyboardInterrupt: 

# Step 3: Create a vector search index

In [10]:
from utils import create_index, check_index_ready

In [11]:
""" 
Create vector index definition specifying:
path: Path to the embeddings
numDimensions: Number of embedding dimensions- depends on the embedding model used
similarity: Similarity metric. One of cosine, euclidean, dotProduct
"""

model = {
    "name": VS_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 384,
                "similarity": "cosine",
            }
        ]
    },
}

In [12]:
# Use the "create_index" function from utils.py to create a vector search index with the above definition for the "vs_collection" collection
create_index(vs_collection, VS_INDEX_NAME, model)

Creating the vector_index index


KeyboardInterrupt: 

In [None]:
# Yse tge "check_index_ready" function from the utils.py to check if the index was created and is READY before proceeding
check_index_ready(vs_collection, VS_INDEX_NAME)

: 

In [None]:
# Track progress of key steps-- DO NOT CHANGE
track_progress("vs_index_creation", "ai_agents_lab")

# Step 4: Create agent tools

In [15]:
# If you get a warning running this cell, ignore it.
from langchain.agents import tool 
from sentence_transformers import SentenceTransformer
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


#### **Vector Search**

In [16]:
# Load the "gte-small" model using the Sentence Transformers Library
embedding_model = SentenceTransformer("thenlper/gte-small")

In [17]:
"""
Define a function that takes a text ("text") as input, embeds it using the "embedding_model" above and returns the embedding as a list
An Array can be converted to a list using the tolist() method
"""
def get_embedding(text: str) -> List[float]:
    """
    Generate the embedding for a piece of text.

    Args:
        text (str): text to embed

    Returns:
        List[float]: Embedding of the text as a list.
    """
    embedding = embedding_model.encode(text)
    return embedding.tolist()

In [18]:
# Define a tool to retrieve relevant documents for a user query using vector search
@tool
def get_information_for_question_answering(user_query: str) -> str:
    """
    Retrieve information using vector search to answer a user query.

    Args:
        user_query (str): The user's query.

    Returns:
        str: Concatenated relevant documents as a single string.
    """
    
    # Embed the user query
    query_embedding = get_embedding(user_query)
    
    """ Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage
        Set the number of candidates to 150 and only return the top 5 documents from the vector search
        In the $project stage, exclude the "_id" field and include only the "body" field and vectorSearchScore
        NOTE: Use variables defined previously for the "index", "queryVector", and "path" fields in the $vectorSearch stage
    """
    
    pipeline = [
        {
            "$vectorSearch": {
                "index": VS_INDEX_NAME,
                "queryVector": query_embedding,
                "path": "embedding",
                "limit": 5,
                "numCandidates": 150
            }
        },
        {
            "$project": {
                "_id": 0,
                "body": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]
    
    # Execute the aggregation pipeline against the "vs_collection" collection and store results in "results"
    results = vs_collection.aggregate(pipeline)
    
    # Concatenate the results into a string
    context = "/n/n".join([doc.get("body") for doc in results])
    return context

#### **Get page content**

In [20]:
# Define a tool to retrieve the content of a documentation page for summarization
@tool
def get_page_content_for_summarization(user_query: str) -> str:
    """
    Retrieve the page content based on a provided title.
    
    Args:
    user_query (str): The user's query string i.e. title of the documentation page.
    
    Returns:
    str: The content of the page.
    """
    # Query the documents where the "title" field is equal to the "user_query"
    query = {"title": user_query}
    
    # Only return the "body" field from the retrieved documents. NOTE: Set te fields to include 1, those to exclude to 0. "_id" is included by default, so exclude that.
    projection = {"_id": 0, "body": 1}
    
    # Use the "query" and "projection" with the "find_one" method to retrieve the "body" of the document with the "title" equal to the "user_query" from the "full_collection" collection
    document = full_collection.find_one(query, projection)
    if document:
        return document["body"]
    else:
        return "No document found with the given title."

In [21]:
tools = [
    get_information_for_question_answering,
    get_page_content_for_summarization
]

#### **Test out the tools**

In [23]:
# Test the "get_information_for_question_answering" tool with the query "What are some best practices for data backups in MongoDB?", you should see a non-empty response
get_information_for_question_answering("What are some best practices for data backups in MongoDB?")

  get_information_for_question_answering("What are some best practices for data backups in MongoDB?")


ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 68b0d3a46016e003bbf68cac, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

# Step 5: Define a graph state

In [25]:
from typing import Annotated
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict

In [26]:
# Define the Graph State - We are only tracking chat messages but you can track other attributes as well
class GraphState(TypedDict):
    messages: Annotated[list, add_messages]

# Step 6: Instantiate the LLM

In [27]:
from langchain_core.load import load
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import requests

In [33]:
SERVERLESS_URL = "https://vtqjvgchwcjwsrela2oyhlegu0hwqnw.lambda-url.us-west-2.on.aws/"

In [35]:
# Obtain the Langchain LLM object from our serverless endpoint
llm_dict = requests.post(url=SERVERLESS_URL, json={"task": "get_llm", "data": LLM_PROVIDER}).json()
llm = load(llm_dict["llm"], secrets_map=llm_dict["secrets_map"])

KeyError: 'llm'

In [37]:
# Create a Chain-of-Thoughts (CoT) prompt template for the agent - This includes a system prompt with a placeholder for tool names, and a placeholder for messages i.e. user queries and assistant responses
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "You are a helpful AI assistant."
            " Your are provided with toolls to answer questions and summarize technical documentation related to MongoDB."
            " Think step-by-step and use these tools to get the information required to answer the user query."
            " Do not re-run tools unless absolutely necessary."
            " If you are not able to get enough information using the tools, respond with 'I don't know'."
            " You have access to the following tools: {tool_names}."
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

In [38]:
# Fill in the promp template with the tool names
prompt = prompt.partial(tool_names=", ".join([tool.name for tool in tools]))

In [39]:
# Bind the "tools" to the "llm" instantiated above
bind_tools = llm.bind_tools(tools)

NameError: name 'llm' is not defined

In [42]:
# Chain the "prompt" with the tool-bound LLM using the "|" operator
llm_with_tools = prompt | bind_tools

NameError: name 'bind_tools' is not defined

In [43]:
# Test that the LLM is making the right tools calls
llm_with_tools.invoke(
    ["Give me a summary of the page titled Create a MongoDB Deployment."]
).tool_calls

NameError: name 'llm_with_tools' is not defined

In [44]:
# Test that the LLM is making the right tool calls
llm_with_tools.invoke(
    ["What are some best practices for data backups in MongoDB?"]
).tool_calls

NameError: name 'llm_with_tools' is not defined

# Step 7: Define Graph Nodes