# Introduction to LangChain and MongoDB Atlas Vector Search
URL: https://www.mongodb.com/developer/products/mongodb/langchain-vector-search/

## Setup

In [None]:
# Setup: load credentials from the environment, define the names shared by the
# rest of the notebook, and open (and verify) the MongoDB connection.
import os

from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import PyMongoError

# Load environment variables from a .env file, overriding any variables that are
# already set in the environment. This keeps API keys and database URIs out of the code.
load_dotenv(override=True)

# Note for users: create a .env file in the notebook's root directory containing BOTH
# variables read below, for example:
#   OPENAI_API_KEY="your_openai_api_key"
#   MONGO_URI="your_mongodb_uri"

# Retrieve the OPENAI_API_KEY and MONGO_URI from the environment variables.
# Indexing os.environ (rather than .get) raises KeyError immediately if either is missing.
# OPENAI_API_KEY is not referenced again in this notebook; presumably the OpenAI
# classes used later pick it up from the environment themselves.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MONGO_URI = os.environ["MONGO_URI"]

# Database and collection names, and the name of the Atlas vector search index.
DB_NAME = "langchain-test-2"
COLLECTION_NAME = "test"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "default"

# Name of the document field in which embeddings are stored.
# It must match the "path" of the vector field in the Atlas search index definition.
EMBEDDING_FIELD_NAME = "embedding"

# Establish a connection to MongoDB using the URI from the environment variables.
# This client object is used for all database operations in this cell.
client = MongoClient(MONGO_URI)

# Access the specific database and collection where the document chunks will be stored.
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# Ping the deployment to confirm that the client can actually reach the database.
# Only driver errors are handled here; anything else is a programming error and
# should surface with its traceback.
try:
    client.admin.command("ping")
except PyMongoError as e:
    print(e)  # Connection or command failure: report it and let the reader decide how to proceed.
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")


## Loading the data

In [None]:
# Load a PDF, split it into chunks, embed the chunks and store them in MongoDB.
from langchain.document_loaders import PyPDFLoader  # For loading PDF documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter  # For splitting text into manageable chunks.
from langchain_openai import OpenAIEmbeddings  # For generating text embeddings using OpenAI's models.
from langchain.vectorstores import MongoDBAtlasVectorSearch  # For storing and searching document vectors.

# Load a PDF document from a URL (an arXiv paper in this case).
loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf")
data = loader.load()

# Split the loaded text into chunks of 500 characters, with a 50-character overlap
# between consecutive chunks so that context is not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(data)

# Embed every chunk with OpenAIEmbeddings and insert it into the MongoDB collection;
# each chunk becomes a separate document.
# This call only inserts data. The Atlas vector search index named by `index_name`
# is NOT created here; it has to be created separately (see the next section).
# NOTE: presumably re-running this cell inserts the same chunks again; clear the
# collection first if you need to reload.
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,  # The chunks of text to be inserted.
    embedding=OpenAIEmbeddings(disallowed_special=()),  # The embedding model to use.
    collection=collection,  # The MongoDB collection where the documents are to be inserted.
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,  # The name of the vector search index.
    embedding_key=EMBEDDING_FIELD_NAME,  # The document field that stores each embedding.
)


In [None]:
# Check that the data was loaded by inspecting the first chunk.
docs[0]

## Create the Vector search index

Let’s head over to our MongoDB Atlas user interface to create our Vector Search Index. First, click on the “Search” tab and then on “Create Search Index.” You’ll be taken to this page. Please click on “JSON Editor.”

Check out our [Vector Search documentation](https://www.mongodb.com/docs/atlas/atlas-search/vector-search/) for more information on the index configuration settings.

```json
{
  "fields": [
    {
      "type": "vector",
      "path": "embedding",  
      "numDimensions": 1536, 
      "similarity": "cosine"  
    },
    {
      "type": "filter",
      "path": "page"  
    }
   
  ]
}
```


Each entry's `path` names the document field it applies to. For *embedding*, we are specifying that the vectors produced by our embedding model have *1536* dimensions and that the similarity function used to find the k nearest neighbors is *cosine*. It’s crucial that the dimensions in our search index match those of the model we are using to embed our data.

We are also adding a metadata field (`page`) as a filter field; this will allow us to use pre-filtering in our RAG application.

## Querying our data

### Semantic search in LangChain

In [None]:
# Run a plain semantic (vector similarity) search against the stored chunks.
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Build a vector store object on top of the existing collection.
# The same embedding model used at load time is used to embed the query, and the
# same embedding field and index name are passed so that the search agrees with
# how the documents were stored.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGO_URI,  # The MongoDB connection string loaded from the environment.
    f"{DB_NAME}.{COLLECTION_NAME}",  # The target namespace in "db.collection" format.
    OpenAIEmbeddings(disallowed_special=()),  # OpenAI embeddings with no special tokens disallowed.
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,  # The name of the index to use for vector searches.
    embedding_key=EMBEDDING_FIELD_NAME,  # The document field that holds the embeddings.
)

# The query text to search for.
query = "gpt-4"

# Return the 20 stored chunks most similar to the query (`k=20`).
results = vector_search.similarity_search(
    query=query,
    k=20,
)

# Print the text of every match, followed by a blank line as a separator.
for result in results:
    print(result.page_content + '\n')


This gives us the relevant results that semantically match the intent behind the question. Now, let’s see what happens when we ask a question using LangChain.

### Question and answering in LangChain

In [None]:
# Answer a question with RetrievalQA on top of the vector search retriever.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # Imported by the original tutorial; not used in this cell.
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

# Initialize a document retriever for semantic similarity searches.
# It retrieves the 200 most similar documents (`"k": 200`); the post-filter pipeline
# then keeps only the first 2 (`"$limit": 2`) and drops the embedding field from
# them so that the raw vectors are not carried along (`"$project": {<field>: 0}`).
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 200,
        "post_filter_pipeline": [
            {"$limit": 2},
            {"$project": {EMBEDDING_FIELD_NAME: 0}},
        ],
    },
)

# Define a prompt template that structures how the context and the question are presented
# to the language model, and tells it not to make up an answer it does not know.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.


{context}


Question: {question}
"""

# Create a PromptTemplate object, naming the variables that are filled in at query time.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Configure the QA chain:
# - llm: the OpenAI LLM class from langchain_openai (not ChatOpenAI);
# - chain_type "stuff": the retrieved documents are inserted ("stuffed") into the
#   {context} slot of a single prompt;
# - return_source_documents: the retrieved documents are returned alongside the answer;
# - chain_type_kwargs: passes the prompt template defined above to the chain.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# Ask a question about GPT-4.
# The result is bound to `response` rather than `docs`, so the list of document
# chunks created in the loading cell is not overwritten by this dict.
# `invoke` is used instead of calling the chain object directly, which is deprecated.
response = qa.invoke({"query": "What is gpt4?"})

# Print only the generated answer.
print(response["result"])


### Pre-filtering with Similarity Search

In [None]:
# Answer the same question, but restrict retrieval to one page with a pre-filter.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # Imported by the original tutorial; not used in this cell.
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

# Set up a retriever using vector search for document similarity.
# It retrieves up to 200 documents (`"k": 200`) that satisfy the pre-filter, which
# only admits documents whose `page` metadata equals 3 (`"page": {"$eq": 3}`).
# NOTE(review): the page numbers written by the PDF loader are presumably zero-based,
# in which case this selects the fourth page of the PDF; verify against the stored documents.
# `page` must be declared as a "filter" field in the Atlas search index for this to work.
# After retrieval, the post-filter pipeline keeps only the first 2 results
# (`"$limit": 2`) and removes the embedding field from them (`"$project": {<field>: 0}`).
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 200,
        "pre_filter": {"page": {"$eq": 3}},
        "post_filter_pipeline": [
            {"$limit": 2},
            {"$project": {EMBEDDING_FIELD_NAME: 0}},
        ],
    },
)

# Define a prompt template for the QA chain.
# It places the retrieved context before the question and tells the model not to
# make up an answer it does not know.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.


{context}


Question: {question}
"""

# Create a PromptTemplate object, naming the variables that are filled in at query time.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Set up the QA chain:
# - llm: the OpenAI LLM class from langchain_openai (not ChatOpenAI);
# - chain_type "stuff": the retrieved documents are inserted ("stuffed") into the
#   {context} slot of a single prompt;
# - return_source_documents: the retrieved documents are returned alongside the answer;
# - chain_type_kwargs: passes the prompt template defined above to the chain.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# Ask a question about GPT-4.
# The result is bound to `response` rather than `docs`, so the list of document
# chunks created in the loading cell is not overwritten by this dict.
# `invoke` is used instead of calling the chain object directly, which is deprecated.
response = qa.invoke({"query": "What is gpt4?"})

# Print only the generated answer; this does not print the source documents
# that were returned alongside it.
print(response["result"])
