# RAG with MongoDB For Documentation - Data Ingestion, Retrieval, Data Ingestion Pipeline

## Data Ingestion

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_KEY = os.getenv("OPENAI_API_KEY")

if not OPENAI_KEY:
    print("WARNING: OPENAI_KEY not found in environment variables!")
    print(f"Current working directory: {os.getcwd()}")
    print(f".env file exists: {os.path.exists('.env')}")
    
    # List all environment variables that start with OPENAI
    openai_vars = [k for k in os.environ.keys() if 'OPENAI' in k.upper()]
    print(f"OpenAI-related env vars: {openai_vars}")
    
    # Try alternative names
    OPENAI_KEY = os.getenv("OPENAI_KEY") or os.getenv("OPENAI_API_KEY")
    print(f"Key found: {'Yes' if OPENAI_KEY else 'No'}")

In [2]:
from openai import OpenAI

openai_client = OpenAI()

model="text-embedding-3-large"

def gen_embedding(text, input_type="document"):
    response = openai_client.embeddings.create(
        model = model,
        input = text
    )
    print(response.data)
    return response.data[0].embedding

In [3]:
embed = gen_embedding("RAG Technology")

[Embedding(embedding=[-0.03790051490068436, -0.03420914337038994, -0.02823350764811039, -0.0022911173291504383, -0.041884273290634155, 0.02127107046544552, -0.026406096294522285, 0.02657056413590908, 0.005628427490592003, 0.042140111327171326, 0.041811175644397736, -0.023537060245871544, 0.0409705676138401, -0.031230462715029716, -0.02359188348054886, 0.04371168464422226, -0.024925893172621727, -0.0106903575360775, 0.0045936559326946735, -0.008154824376106262, 0.02156345546245575, -0.018447719514369965, -0.01625482551753521, 0.01868528313934803, -0.013632490299642086, -0.015560409054160118, -0.014125891029834747, 0.005148732103407383, -0.06238783150911331, -0.002207741606980562, 0.06169341504573822, 0.03322234004735947, 0.005404569674283266, -0.0007412438280880451, -0.015542135573923588, -0.01706802472472191, 0.013522845692932606, 0.029256857931613922, 0.006263453047722578, -0.026972593739628792, 0.02152690850198269, -0.015761423856019974, -0.035214219242334366, 0.04396752268075943, 0.

In [4]:
len(embed)

3072

## Data Ingestion Pipeline

## Creating embedding out of data source through chunking

splitting the text using chunking

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# loading the pdf
loader = PyPDFLoader("https://media.geeksforgeeks.org/courses/LPU.pdf")
data = loader.load()

# splitting the data
splitting_definition= RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = splitting_definition.split_documents(data)

In [6]:
documents

[Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-01-15T00:56:36+06:30', 'moddate': '2022-02-11T18:39:35+05:30', 'title': 'cip', 'source': 'https://media.geeksforgeeks.org/courses/LPU.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1'}, page_content='COMPLETE\nPREPARATION\n150+ HOURS | ONLINE COURSE | PLACEMENT ASSISTANCE | MOCK TESTS\n~GET INTO YOUR DREAM COMPANY~\n4.86500+\nENROLLED\nRESUME\nBUILDING\nVIDEOS'),
 Document(metadata={'producer': 'Adobe PDF library 16.03', 'creator': 'Adobe Illustrator 26.0 (Windows)', 'creationdate': '2022-01-15T00:56:36+06:30', 'moddate': '2022-02-11T18:39:35+05:30', 'title': 'cip', 'source': 'https://media.geeksforgeeks.org/courses/LPU.pdf', 'total_pages': 14, 'page': 1, 'page_label': '2'}, page_content='CONTENT\nEXPERT ADVICE BY C.E.O.\nWHERE OUR ALUMNI \nWORKS?\nCOURSE OVERVIEW\nQUESTION & ANSWER\nCONTACT US\nCOURSE BENEFITS\n2'),
 Document(metadata={'producer': 'Adobe 

Preparing documents for insertion into VectorDB by turning whole document into embeddings

In [7]:
docs_to_insert=[{"text": doc.page_content,"embedding": gen_embedding(doc.page_content)} for doc in documents]

[Embedding(embedding=[-0.015241400338709354, -0.011760440655052662, -0.0026554521173238754, -0.0074824378825724125, 0.005009980406612158, 0.023293154314160347, -0.010499812662601471, 0.07684138417243958, -0.02940923348069191, -0.001984472619369626, 0.012842141091823578, -0.04138113558292389, 0.006437336560338736, -0.0629175454378128, -0.007014785427600145, 0.00810055248439312, 0.007242511957883835, 0.021243616938591003, -0.026188531890511513, -0.029067644849419594, 0.0001272065273951739, -0.03666394576430321, -0.019373007118701935, 0.018706094473600388, -0.02506616711616516, 0.003218668280169368, -0.00022874301066622138, 0.012516817077994347, 0.013492787256836891, -0.02023511566221714, 0.006766726262867451, 0.020251380279660225, 0.022203320637345314, -0.015509792603552341, -0.03236967697739601, 0.007087983191013336, 0.0026839179918169975, -0.00828354712575674, -0.032060619443655014, 0.002031237818300724, -0.002801847644150257, 0.012638813816010952, -0.007059517316520214, 0.016721623018

## Inserting documents with context (text and embeddings) into our MongoDB (acting as VectorDB)

In [None]:
from pymongo import MongoClient

## connecting to mongo db
client = MongoClient("mongodb+srv://")
collection = client["sample_mflix"]["ragpdf_geeks4geeks"]

# inserting documents into collection
result = collection.insert_many(docs_to_insert)

result

InsertManyResult([ObjectId('696d8d4dcb6d84f89da826f0'), ObjectId('696d8d4dcb6d84f89da826f1'), ObjectId('696d8d4dcb6d84f89da826f2'), ObjectId('696d8d4dcb6d84f89da826f3'), ObjectId('696d8d4dcb6d84f89da826f4'), ObjectId('696d8d4dcb6d84f89da826f5'), ObjectId('696d8d4dcb6d84f89da826f6'), ObjectId('696d8d4dcb6d84f89da826f7'), ObjectId('696d8d4dcb6d84f89da826f8'), ObjectId('696d8d4dcb6d84f89da826f9'), ObjectId('696d8d4dcb6d84f89da826fa'), ObjectId('696d8d4dcb6d84f89da826fb'), ObjectId('696d8d4dcb6d84f89da826fc'), ObjectId('696d8d4dcb6d84f89da826fd'), ObjectId('696d8d4dcb6d84f89da826fe'), ObjectId('696d8d4dcb6d84f89da826ff'), ObjectId('696d8d4dcb6d84f89da82700'), ObjectId('696d8d4dcb6d84f89da82701'), ObjectId('696d8d4dcb6d84f89da82702'), ObjectId('696d8d4dcb6d84f89da82703'), ObjectId('696d8d4dcb6d84f89da82704'), ObjectId('696d8d4dcb6d84f89da82705'), ObjectId('696d8d4dcb6d84f89da82706'), ObjectId('696d8d4dcb6d84f89da82707'), ObjectId('696d8d4dcb6d84f89da82708'), ObjectId('696d8d4dcb6d84f89da827

Querying from mongodb cluster (VectorDB) through creating search index

In [9]:
from pymongo.operations import SearchIndexModel
import time

index_name="vector_index"
search_index_model = SearchIndexModel(
    definition = {
        "fields": [
            {
                "type": "vector",
                "numDimensions": 3072,
                "path": "embedding",
                "similarity": "cosine"
            }
        ]
    },
    name = index_name,
    type = "vectorSearch"
)
collection.create_search_index(model=search_index_model)

'vector_index'

In [10]:
print("Polling to check if the index is ready. This may take up to a minute")

found = lambda index: index.get("queryable") is True

while True:
    indices = list(collection.list_search_indexes(index_name))
    if len(indices) and found(indices[0]):
        break
    time.sleep(5)
    
print(index_name + " is ready for querying.")

Polling to check if the index is ready. This may take up to a minute
vector_index is ready for querying.


In [11]:
query_embedding=gen_embedding("Interview Prep")

query_embedding

[Embedding(embedding=[-0.022642355412244797, -0.009958522394299507, -0.011261186562478542, 0.010601283982396126, 0.027235958725214005, 0.016720376908779144, -0.003770868992432952, 0.05118097737431526, -0.019917044788599014, 0.014783521182835102, 0.0036230338737368584, -0.02987556718289852, -0.012966647744178772, -0.0048292833380401134, 0.020894043147563934, 0.008583012036979198, 0.014500705525279045, 0.01822015456855297, -0.055157531052827835, -0.022573793306946754, 0.01929999329149723, -0.023379389196634293, -0.020259851589798927, 0.004927840083837509, -0.03784581273794174, -0.01668609492480755, 0.04456481710076332, 0.0008243419579230249, -0.018614381551742554, -0.023722194135189056, 0.0013101627118885517, 0.035206206142902374, 0.01035274937748909, -0.050221119076013565, -0.0639333724975586, -0.02715025655925274, 0.02747592329978943, 0.004807858262211084, -0.006877550855278969, 0.01696034148335457, -0.022728055715560913, 0.016780367121100426, 0.0032973678316920996, 0.01226389501243829

[-0.022642355412244797,
 -0.009958522394299507,
 -0.011261186562478542,
 0.010601283982396126,
 0.027235958725214005,
 0.016720376908779144,
 -0.003770868992432952,
 0.05118097737431526,
 -0.019917044788599014,
 0.014783521182835102,
 0.0036230338737368584,
 -0.02987556718289852,
 -0.012966647744178772,
 -0.0048292833380401134,
 0.020894043147563934,
 0.008583012036979198,
 0.014500705525279045,
 0.01822015456855297,
 -0.055157531052827835,
 -0.022573793306946754,
 0.01929999329149723,
 -0.023379389196634293,
 -0.020259851589798927,
 0.004927840083837509,
 -0.03784581273794174,
 -0.01668609492480755,
 0.04456481710076332,
 0.0008243419579230249,
 -0.018614381551742554,
 -0.023722194135189056,
 0.0013101627118885517,
 0.035206206142902374,
 0.01035274937748909,
 -0.050221119076013565,
 -0.0639333724975586,
 -0.02715025655925274,
 0.02747592329978943,
 0.004807858262211084,
 -0.006877550855278969,
 0.01696034148335457,
 -0.022728055715560913,
 0.016780367121100426,
 0.0032973678316920996

Now taking embedding from query to do vector search in VectorDB

In [12]:
results = collection.ragpdf_geeks4geeks.aggregate([
    {
        "$vectorSearch": {
            "index": "vector_index",
            "path": "embedding",
            "queryVector": query_embedding,
            "numCandidates": 3072,
            "limit": 5
        }
    }
])

In [13]:
results

<pymongo.synchronous.command_cursor.CommandCursor at 0x26c37a027b0>

In [14]:
def get_query_results(query):
  """Gets results from a vector search query."""

  query_embedding = gen_embedding(query, input_type="query")
  print(query_embedding)
  pipeline = [
      {
            "$vectorSearch": {
              "index": "vector_index",
              "queryVector": query_embedding,
              "path": "embedding",
              "numCandidates":3072,
              "limit": 5
            }
      }, {
            "$project": {
              "_id": 0,
              "text": 1
         }
      }
  ]

  results = collection.aggregate(pipeline)
  print(results)

  array_of_results = []
  for doc in results:
      array_of_results.append(doc)
  return array_of_results


## Relevant response from VectorDB with improved context window

In [15]:
get_query_results("DSA")

[Embedding(embedding=[0.007518500089645386, 0.018508343026041985, -0.01304632518440485, 0.01069370936602354, 0.018195757642388344, 0.048730410635471344, -0.0077570523135364056, 0.04678909108042717, 0.009920472279191017, 0.026586206629872322, -0.009969827719032764, -0.01714283786714077, 0.006704133003950119, -0.045867785811424255, -0.011047424748539925, 0.019067706540226936, -0.04882912337779999, -0.008480934426188469, 0.017751557752490044, -0.03249242529273033, 0.001332600717432797, -0.031702734529972076, -0.0009423831361345947, 0.030123356729745865, 0.03142305463552475, 0.05109947919845581, 0.0011896751821041107, 0.026438139379024506, -0.023657776415348053, 0.013498751446604729, -0.004437889438122511, 0.047842010855674744, -0.04277483746409416, -0.026076199486851692, -0.015374263748526573, 0.03606247901916504, 0.01678912341594696, -0.004680554382503033, -0.027392348274588585, 0.0061077531427145, -0.015530556440353394, 0.003057989524677396, -0.053863391280174255, 0.06406354159116745, 0

[{'text': 'content of DSA. While Complete Interview Preparation \nCourse covers DSA, Additional topics of DSA that aren’t \ncovered in the DSA SP course, Not only this but also core \nsubjects like OS DBMS, OOPS concept, and much more. \nHence CIP is the complete package for Interview \nPreparations.\n11'},
 {'text': 'content of DSA. While Complete Interview Preparation \nCourse covers DSA, Additional topics of DSA that aren’t \ncovered in the DSA SP course, Not only this but also core \nsubjects like OS DBMS, OOPS concept, and much more. \nHence CIP is the complete package for Interview \nPreparations.\n11'},
 {'text': 'Various\nOffers from:\nInfosys,\nCapgemini,\nTCS\nDSA 200+\nLIFE\nTIME\nACCESS\nContest & Mock Test\nLifetime access\nto Course\nCoding problems\n200+ Algorithmic \nCoding Problems\nWeekly Schedule\nCompanywise Q&A Premium Lecture\nVideo\nCourse Completion\nCertificate\nResume \nBuilding Videos\n?\n6'},
 {'text': 'Various\nOffers from:\nInfosys,\nCapgemini,\nTCS\nDSA 2

## Generation Pipeline

## Relevant response from VectorDB with shown experiments on improved context window BASED ON OUR PDF for ChatGPT 

In [16]:
query = "How can I learn DSA?"
context_docs = get_query_results(query)
context_string = " ".join([doc["text"] for doc in context_docs])

# Construct prompt for the LLM using the retrieved documents as the context
prompt = f"""Use the following pieces of context to answer the question at the end.
    {context_string}
    Question: {query}
"""

openai_client = OpenAI()

# OpenAI model to use
model_name = "gpt-4o"

completion = openai_client.chat.completions.create(
model=model_name,
messages=[{"role": "user",
    "content": prompt
  }]
)
print(completion.choices[0].message.content)

[Embedding(embedding=[-0.00638221250846982, 0.01911129802465439, -0.02749369479715824, 0.01112469844520092, 0.01723126694560051, 0.02292790450155735, 0.01779668964445591, 0.014616185799241066, 0.034405991435050964, 0.0247796643525362, -0.02405874989926815, 0.007152601610869169, 0.02451108768582344, -0.04169994965195656, 0.029967421665787697, 0.0037176564801484346, 0.003714122576639056, 0.013259170576930046, -0.018192486837506294, -0.029882607981562614, 0.011280189268290997, -0.054619863629341125, -0.043876826763153076, 0.022560378536581993, -0.010538071393966675, 0.019026484340429306, 0.024369733408093452, -0.011237782426178455, -0.03963615372776985, 0.017287809401750565, 0.025387493893504143, 0.009223463013768196, -0.02654661238193512, -0.061970364302396774, -0.0319746732711792, -0.016637573018670082, 0.033614400774240494, 0.001055750879459083, -0.07243069261312485, 0.0015407777391374111, 0.0016238242387771606, 0.02835596539080143, -0.024087021127343178, 0.0035091566387563944, 0.02461