# RAG

Date: 13 June, 2025

Nomic and OpenAI Embeddings

In [None]:
from pathlib import Path

import pandas as pd
from langchain_core.documents import Document
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

In [None]:
# Directory that holds the input CSV and the saved indexes, relative to the notebook.
data_dir = Path("..") / "data"

In [None]:
# Load zeno_data_clean.csv into a DataFrame and preview its first two rows.
zeno_csv_path = data_dir / "zeno_data_clean.csv"
zeno_data = pd.read_csv(zeno_csv_path)
zeno_data.head(2)

In [None]:
# One in-memory vector store per embedding backend, so both can be queried with the same documents.
nomic_embeddings = OllamaEmbeddings(model="nomic-embed-text")
nomic_index = InMemoryVectorStore(nomic_embeddings)

openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
openai_index = InMemoryVectorStore(openai_embeddings)

In [None]:
# Build one Document per dataset row. The page content is a series of
# "LABEL\nvalue" sections separated by blank lines; sections whose value is
# missing (NaN) are left out.
# NOTE(review): `zeno_data[:-1]` leaves out the final row of the DataFrame —
# confirm that skipping it is intentional.
zeno_docs = []

for _, row in zeno_data[:-1].iterrows():
    sections = (
        ("DATA_LAYER", row["data_layer"]),
        ("DESCRIPTION", row["description"]),
        ("CONTEXTUAL_LAYERS", row["context_layer"]),
        ("DATE", row["date"]),
        ("VARIABLES", row["variables"]),
    )
    page_text = "\n\n".join(
        f"{label}\n{value}" for label, value in sections if pd.notna(value)
    )

    zeno_docs.append(
        Document(
            id=row["dataset_id"],
            page_content=page_text,
            metadata={
                "data_layer": row["data_layer"],
                "source": row["source"],
                "tile_url": row["tile_url"],
            },
        )
    )

In [None]:
# Show the first document to check its formatted content and metadata.
zeno_docs[0]

In [None]:
# Add the documents to the Nomic-backed index; the call's return value is kept as `ids`.
ids = nomic_index.add_documents(documents=zeno_docs)

In [None]:
# Add the same documents to the OpenAI-backed index.
# NOTE: this rebinds `ids`, replacing the value returned for the Nomic index.
ids = openai_index.add_documents(documents=zeno_docs)

In [None]:
# Dump the Nomic index to data_dir, then load it back from the same path.
nomic_index_path = data_dir / "zeno-docs-nomic-index"
nomic_index.dump(nomic_index_path)
nomic_index = InMemoryVectorStore.load(nomic_index_path, embedding=nomic_embeddings)

In [None]:
# Dump the OpenAI index to data_dir, then load it back from the same path.
openai_index_path = data_dir / "zeno-docs-openai-index"
openai_index.dump(openai_index_path)
openai_index = InMemoryVectorStore.load(openai_index_path, embedding=openai_embeddings)

In [None]:
# Sample questions run against the retrievers below.
# Each entry is one standalone question. Every line ends with a comma so that
# adjacent string literals are never silently concatenated into a single entry.
queries = [
    "What percent of 2000 forest did Kalimantan Barat lose from 2001 through 2024?",
    "What percent of Johor was tree cover in 2000?",
    "Which year recorded more alerts within Protected Areas in Ucayali, Peru? 2023 or 2024?",
    "Since 2001, do the forests within Gomba's KBAs act as a net carbon source or a net carbon sink, and by how much?",
    "Has Chai Nat or Krabi in Thailand has had the most forest-related annual carbon emissions since 2001?",
    "I'm researching carbon sequestration in Indonesian forests. Specifically, for Key Biodiversity Areas (KBAs), how much carbon has been absorbed from the atmosphere between 2000 and 2024?",
    "How many deforestation alerts were reported in protected areas of the Republic of the Congo april 2024 - april 2025?",
    "which country had the most deforestation in 2018",
]

In [None]:
# Similarity retriever over the Nomic index, configured with k=3.
nomic_retriever = nomic_index.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
# Similarity retriever over the OpenAI index, configured with k=3.
openai_retriever = openai_index.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
# Run the first two sample questions through the Nomic retriever in one batch call.
r = nomic_retriever.batch(queries[:2])

In [None]:
# Show the second entry of the batch result.
r[1]

# ColBERT

In [None]:
from pylate import indexes, models, retrieve

In [None]:
# ColBERT encoder used for both documents and queries in the cells below.
model = models.ColBERT(model_name_or_path="lightonai/GTE-ModernColBERT-v1")

# PLAID index stored under data_dir/colbert-index.
# NOTE(review): override=True presumably replaces an existing index of the
# same name — confirm against the PyLate documentation.
colbert_index_dir = data_dir / "colbert-index"
colbert_index = indexes.PLAID(index_folder=colbert_index_dir, index_name="dataset", override=True)

In [None]:
# Show the first document; its page_content is what gets encoded in the next cell.
zeno_docs[0]

In [None]:
# Encode every document's page_content in document mode (is_query=False).
document_texts = [doc.page_content for doc in zeno_docs]
colbert_embeddings = model.encode(document_texts, batch_size=4, is_query=False, show_progress_bar=True)

In [None]:
# Add the document embeddings to the PLAID index under their document IDs.
# The trailing semicolon keeps the notebook from echoing the call's return value.
document_ids = [doc.id for doc in zeno_docs]
colbert_index.add_documents(documents_ids=document_ids, documents_embeddings=colbert_embeddings);

In [None]:
# Re-create the index handle for the same folder and name, this time without override=True.
# NOTE(review): presumably this reopens the index written above rather than rebuilding it — confirm.
colbert_index = indexes.PLAID(index_folder=data_dir / "colbert-index", index_name="dataset")

In [None]:
# Wrap the PLAID index in a ColBERT retriever.
colbert_retriever = retrieve.ColBERT(index=colbert_index)

In [None]:
# Encode the last sample question in query mode (is_query=True).
query_embedding = model.encode(queries[-1], batch_size=1, is_query=True, show_progress_bar=True)

In [None]:
# Retrieve the top three matches for the encoded query.
scores = colbert_retriever.retrieve(queries_embeddings=query_embedding, k=3)

In [None]:
# Show the retrieval output.
scores

In [None]:
# Example question reused by the retrieval and agent cells that follow.
user_query = "What percent of tree cover (intact forest) did Kalimantan Barat lose from 2015 through 2018?"

def get_relevant_documents(query: str, k: int = 3) -> list[dict]:
    """Return the `zeno_data` rows matching the top-`k` ColBERT hits for `query`.

    Relies on the notebook-level `model`, `colbert_retriever` and `zeno_data`.

    Args:
        query: Question to encode and search the ColBERT index with.
        k: Number of documents to retrieve. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        One dict per retrieved hit, in the order the retriever returned them.
        Each dict is the first `zeno_data` row whose `dataset_id` equals the
        hit's ID.

    Raises:
        IndexError: If a retrieved ID has no matching row in `zeno_data`.
    """
    query_embedding = model.encode(query, batch_size=1, is_query=True, show_progress_bar=False)
    matches = colbert_retriever.retrieve(queries_embeddings=query_embedding, k=k)
    # Only one query is encoded, so only the first result list is read. The hit
    # ID is cast to int before it is compared with the `dataset_id` column.
    return [
        zeno_data[zeno_data.dataset_id == int(match["id"])].iloc[0].to_dict()
        for match in matches[0]
    ]

# Retrieve the candidate dataset rows for the example question.
result = get_relevant_documents(user_query)

In [None]:
# Show the retrieved rows.
result

# Agents

In [None]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Chat model used by the structured-output chains defined below.
sonnet = ChatAnthropic(model="claude-3-7-sonnet-latest")

In [None]:
# Tabulate the retrieved rows and display the columns that are later passed to the selection prompt.
# NOTE: `result` is rebound by the dataset-selection cell further down, so this
# cell has to run while `result` still holds the list of retrieved rows.
candidate_datasets = pd.DataFrame(result)
candidate_columns = ["dataset_id", "data_layer", "description", "context_layer", "date", "variables"]
candidate_datasets[candidate_columns]


In [None]:
from pydantic import BaseModel, Field


# Structured-output schema for the dataset-selection step: the chosen dataset's
# ID plus a short justification.
# NOTE(review): documented with comments rather than a class docstring, because
# a pydantic class docstring ends up in the generated schema description.
class DatasetOption(BaseModel):
    id: int = Field(description="ID of the dataset that best matches the user query.")
    reason: str = Field(description="Short reason why the dataset is the best match.")

# Single-message prompt asking the model to pick one dataset from the candidates.
# Template variables: {candidate_datasets} and {user_query}.
DATASET_SELECTION_PROMPT = ChatPromptTemplate.from_messages(
    [
        (
            "user",
            """
            Based on the query, return the ID of the dataset that can best answer the user query and provide reason why it is the best match.
            Look at the dataset description and contextual layers they have access to that can add additional context to better answer the query - also check date & variables when required.

            Candidate datasets:

            {candidate_datasets}

            Query:

            {user_query}
            """,
        )
    ]
)

# Pipe the prompt into the model, with the reply structured as a DatasetOption.
DATASET_SELECTION_CHAIN = DATASET_SELECTION_PROMPT | sonnet.with_structured_output(DatasetOption)

In [None]:
# Ask the model to choose among the candidates, which are passed to the prompt as CSV text.
# NOTE: this rebinds `result` from the list of retrieved rows to the chain's structured output.
candidates_csv = candidate_datasets[
    ["dataset_id", "data_layer", "description", "context_layer", "date", "variables"]
].to_csv(index=False)
result = DATASET_SELECTION_CHAIN.invoke({"candidate_datasets": candidates_csv, "user_query": user_query})


In [None]:
# Show the dataset chosen by the selection chain.
result

In [None]:
from typing import Optional


# Structured-output schema for the extraction step (used by DATASET_CHAIN below).
# dataset_id, source and data_layer are required; the remaining fields default to None.
# NOTE(review): documented with comments rather than a class docstring, because
# a pydantic class docstring ends up in the generated schema description.
class Dataset(BaseModel):
    dataset_id: int
    source: str
    data_layer: str
    # The field description asks the model for exactly one context layer.
    context_layer: Optional[str] = Field(None, description="Pick a single context layer from the dataset")
    daterange: Optional[str] = None
    threshold: Optional[int] = None

# Single-message prompt asking the model to extract the query-relevant fields from one dataset record.
# Template variables: {dataset} and {user_query}.
DATASET_PROMPT = ChatPromptTemplate.from_messages([
    ("user", """
    Given the user query and the dataset - extract the relevant information from the dataset to pull data from source.

    Dataset: 

    {dataset}

    User Query: 
    
    {user_query}    
    """),
    ])

# Pipe the prompt into the model, with the reply structured as a Dataset.
DATASET_CHAIN = DATASET_PROMPT | sonnet.with_structured_output(Dataset)

In [None]:
# Look up the row for the dataset chosen by the selection chain and pass it, as JSON,
# to the extraction chain together with the example question.
chosen_row = zeno_data[zeno_data.dataset_id == result.id].iloc[0]
selection = DATASET_CHAIN.invoke({"user_query": user_query, "dataset": chosen_row.to_json()})

In [None]:
# Show the extracted fields as a plain dict.
selection.model_dump()