# Simple RAG agent demo

## Setup

### Install dependencies

In [1]:
from dataclasses import dataclass
# Install the notebook's dependencies into the kernel's environment.
# Every package is pinned with a compatible-release specifier (~=).
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
%pip install chromadb~=0.5.18 sentence-transformers~=3.3 lark~=1.2 --upgrade --quiet
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 langchain-chroma~=0.1.4 --upgrade --quiet
%pip install langgraph~=0.2.46 --upgrade --quiet

# If running locally, you can install from the requirements file instead:
#%pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;3

### Load environment variables

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Find the nearest .env file and load its entries into the process environment.
# Binding the result to `_` keeps the return value out of the cell output.
_ = load_dotenv(dotenv_path=find_dotenv())

# If running in Google Colab, you can use this code instead:
# from google.colab import userdata
# os.environ["AZURE_OPENAI_API_KEY"] = userdata.get("AZURE_OPENAI_API_KEY")
# os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get("AZURE_OPENAI_ENDPOINT")
# NOTE(review): the chat/embedding models in this notebook are built with
# ChatOpenAI / OpenAIEmbeddings, which presumably expect OPENAI_API_KEY rather
# than the Azure variables above — confirm which ones are actually needed.

### Setup Chat Model

In [3]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Chat model used by the demo; temperature is pinned to 0.0.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.0,
)

# Embedding model handed to the vector store as its embedding function.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

## Setup ingestion / retrieval

### Setup vector DB (Chroma)

In [4]:
from langchain_chroma import Chroma

# Directory handed to Chroma as its persistence location.
persist_directory = "./db/chroma/"

# Vector store for the document chunks, embedded with `embedding_model`.
vectordb: Chroma = Chroma(
    collection_name="my_index",
    embedding_function=embedding_model,
    persist_directory=persist_directory,  # Optionally persist the database
)

### Setup a text splitter

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter that cuts loaded pages into overlapping chunks before indexing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)

### Setup documents to load

In [6]:
# Documents to load (each entry is a DocInfo holding a document id and its URL)

@dataclass
class DocInfo:
    """Describes one source document to ingest into the vector index.

    Attributes:
        id: Identifier stored as ``doc_id`` metadata on every page of the
            document and used to detect that it was already ingested.
        url: Location the PDF is loaded from.
        title: Optional human-readable title, read by the ingestion step as
            ``doc_info.title``. Defaults to an empty string so existing
            two-argument constructions keep working.
    """

    id: str
    url: str
    # The ingestion code accesses `doc_info.title`; without this field that
    # access raises AttributeError.
    title: str = ""

# Source documents that the ingestion step will add to the index.
documents: list[DocInfo] = [
    DocInfo(
        id="1",
        url="https://data.riksdagen.se/fil/85C4C51C-4FFD-4284-BDE5-37782DFD686B",
    ),
]

### Ingest - split and add to vector index

In [7]:
from langchain_community.document_loaders import PyPDFLoader

def ingest_documents(doc_info: "DocInfo") -> None:
    """Load one PDF, split it into chunks and add the chunks to the vector index.

    Nothing is loaded or added when the index already holds an entry whose
    ``doc_id`` metadata equals ``doc_info.id``.

    Args:
        doc_info: Descriptor of the document. ``id`` and ``url`` are required.
            A non-empty ``title`` attribute, when present, is copied into the
            metadata of every page.
    """
    # Check if document already exists; a single match is enough, so avoid
    # fetching every stored chunk of the document.
    existing = vectordb.get(where={"doc_id": doc_info.id}, limit=1)
    if existing["documents"]:
        print(f"Document {doc_info.id} already exists in index")
        return

    # Load
    print(f"Loading {doc_info.url} to index...")
    pages = PyPDFLoader(doc_info.url).load()

    # `title` is optional on the descriptor: read it defensively instead of
    # failing with AttributeError, and only store it when there is one.
    title = getattr(doc_info, "title", None)
    for page in pages:
        page.metadata["doc_id"] = doc_info.id
        if title:
            page.metadata["title"] = title

    # Split
    doc_splits = text_splitter.split_documents(pages)

    # Add to index
    print(f"Adding to {doc_info.url} to index...")
    vectordb.add_documents(documents=doc_splits)

    print(f"Added {doc_info.url} ({len(pages)} pages) - {len(doc_splits)} splits")


# Ingest every configured document; ingest_documents returns early for
# documents whose id is already present in the index.
for doc_info in documents:
    ingest_documents(doc_info)

Loading https://data.riksdagen.se/fil/85C4C51C-4FFD-4284-BDE5-37782DFD686B to index...
Adding to https://data.riksdagen.se/fil/85C4C51C-4FFD-4284-BDE5-37782DFD686B to index...
Added https://data.riksdagen.se/fil/85C4C51C-4FFD-4284-BDE5-37782DFD686B (137 pages) - 536 splits
