In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# mypy: disable-error-code="import-not-found"

from __future__ import annotations  # noqa: F404

import os
import tempfile
import zipfile
from typing import TYPE_CHECKING, List, Tuple

if TYPE_CHECKING:
    import pathlib

    from langchain.schema import Document

import re
import sys
import textwrap
from pathlib import Path

import nltk
import yaml
from langchain.text_splitter import MarkdownTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from pydantic import BaseModel

# The notebook should be executed from the project root directory.
# `_correct_path` acts as a sentinel: once it exists in the namespace, re-running
# this cell does not move up another directory level or append to sys.path again.
if "_correct_path" not in locals():
    os.chdir("..")
    sys.path.append(".")
    # Report the directory we ended up in (the original message had a stray ")").
    print(f"changed dir to {Path('.').resolve()}")
    _correct_path = True

# Settings

In [None]:
# `diy_rag_nb_output` supplies the output locations used by the cells further down
# (embedding model dir, vector db dir, RAG settings file).
try:
    from infra.settings_rag import diy_rag_nb_output
except ImportError as exc:
    # Chain explicitly so the underlying import failure stays visible in the traceback
    # as the direct cause, instead of only as implicit context.
    raise ValueError(
        "Make sure you have set rag_type=RAGType.DIY in `settings_main.py` before using this notebook."
    ) from exc


class DiyVectorStoreSettings(BaseModel):
    """Validation schema for VDB settings.

    Groups the values this notebook needs to chunk the source documents and to
    embed the resulting chunks into the vector database.
    """

    # Model name handed to the embedding wrapper when the vector db is built,
    # and exported again in the retrieval-time RAG settings.
    sentence_transformer_model_name: str
    # Target size of each chunk, forwarded to the text splitter
    # (presumably measured in characters — confirm against the splitter's length function).
    chunk_size: int
    # Overlap between consecutive chunks, forwarded to the text splitter (same unit as chunk_size).
    chunk_overlap: int


# Zip archive holding the source documents. The path is relative, so it is resolved
# against the current working directory at the time the archive is opened.
PATH_TO_DOCS = "assets/datarobot_english_documentation_docsassist.zip"

# Embedding model and chunking parameters used when building the vector database.
# Note that chunk_overlap is half of chunk_size.
VECTORSTORE_SETTINGS = DiyVectorStoreSettings(
    sentence_transformer_model_name="all-MiniLM-L6-v2",
    chunk_size=2000,
    chunk_overlap=1000,
)

In [None]:
from joblib import Memory

# On-disk cache used to memoize document chunking and vectorization between runs.
cache_dir = "./.cache"
Path(cache_dir).mkdir(parents=True, exist_ok=True)
memory = Memory(location=cache_dir, verbose=0)
# Trim the cache so it does not grow past the given byte limit.
memory.reduce_size(bytes_limit=512e6)

# Chunk documents and build vector database

In [None]:
def _source_to_url(source: str, root_dir: str) -> str:
    """Rewrite a loader-reported source path into a documentation URL where possible.

    Adapt this to the needs of your specific document collection.

    Args:
        source: The ``source`` metadata value reported for a document.
        root_dir: Absolute path of the directory the documents were loaded from;
            it is stripped from ``source``.

    Returns:
        The trailing ``https://...`` URL if the rewritten source contains one
        after at least one leading character, otherwise the cleaned-up source.
    """
    # "|" is turned back into "/" before the root prefix is removed
    # (presumably the archive encodes path separators as "|" — confirm).
    cleaned = source.replace("|", "/").replace(root_dir, "")

    # Map an exported text file onto its public documentation page.
    cleaned = re.sub(
        r"datarobot_docs/en/(.+)\.txt",
        r"https://docs.datarobot.com/en/docs/\1.html",
        cleaned,
    )

    # Keep only the URL part when one is present. A source without a URL is left
    # as is; this replaces the former blanket `except Exception: pass`.
    url_match = re.search(r".+(https://.+)$", cleaned)
    return url_match.group(1) if url_match else cleaned


def make_chunks(
    path_to_source_documents: pathlib.Path,
    chunk_size: int,
    chunk_overlap: int,
    source_documents_filter: str = "**/*.*",
) -> List[Document]:
    """Convert raw documents into document chunks that can be ingested into a vector db.

    Args:
        path_to_source_documents: Directory containing the raw documents.
        chunk_size: Chunk size passed to the markdown text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        source_documents_filter: Glob pattern selecting which files are loaded,
            e.g. ``"**/*.pdf"`` or ``"**/*.txt"``. Defaults to every file with an extension.

    Returns:
        The split documents, with ``metadata["source"]`` rewritten in place by
        ``_source_to_url``.

    Raises:
        KeyError: If a document carries no ``"source"`` metadata entry.
    """
    # Resolve once; the same string is used for loading and for stripping the prefix.
    root_dir = str(path_to_source_documents.resolve())

    loader = DirectoryLoader(root_dir, glob=source_documents_filter)
    splitter = MarkdownTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # NLTK resources fetched up front (presumably required by the loader's
    # document parsing — confirm).
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    nltk.download("averaged_perceptron_tagger_eng", quiet=True)

    data = loader.load()
    docs = splitter.split_documents(data)

    for doc in docs:
        doc.metadata["source"] = _source_to_url(doc.metadata["source"], root_dir)

    return docs

In [None]:
@memory.cache
def process_zip_documents(
    path_to_docs_zip: Path, chunk_size: int, chunk_overlap: int
) -> List[Document]:
    """Extract a zip archive of documents into a scratch directory and chunk them.

    The scratch directory only lives for the duration of the call; the returned
    chunks are produced by ``make_chunks`` while the extracted files still exist.

    NOTE(review): results are memoized through ``memory.cache``; the cache
    presumably keys on the argument values rather than on the archive contents,
    so an archive modified in place may need the cache cleared — confirm.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # The archive is closed again before chunking starts.
        with zipfile.ZipFile(path_to_docs_zip, "r") as archive:
            archive.extractall(scratch_dir)
        chunks = make_chunks(Path(scratch_dir), chunk_size, chunk_overlap)
    return chunks

In [None]:
@memory.cache
def make_vector_db(
    documents: List[Document],
    embedding_model_name: str,
    embedding_model_output_dir: Path,
    vdb_output_dir: Path,
) -> Tuple[Path, Path]:
    """Embed the document chunks, index them with FAISS and save the index to disk.

    Args:
        documents: Chunks whose ``page_content`` is embedded and whose ``metadata``
            is stored alongside each vector.
        embedding_model_name: Model name given to ``HuggingFaceEmbeddings``.
        embedding_model_output_dir: Passed as the embedding model's ``cache_folder``
            (presumably where the model files end up — confirm).
        vdb_output_dir: Directory the FAISS index is saved to.

    Returns:
        ``(embedding_model_output_dir, vdb_output_dir)``, exactly as passed in.

    NOTE(review): this function is memoized through ``memory.cache`` although its
    main effect is writing files; on a cache hit the body is presumably skipped and
    nothing is re-written, so deleted output directories would not be restored — confirm.
    """
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        cache_folder=str(embedding_model_output_dir),
    )

    # Split each chunk into its text and its metadata, keeping both lists aligned.
    texts = []
    metadatas = []
    for chunk in documents:
        texts.append(chunk.page_content)
        metadatas.append(chunk.metadata)

    vector_store = FAISS.from_texts(texts, embedder, metadatas=metadatas)
    vector_store.save_local(str(vdb_output_dir))

    return embedding_model_output_dir, vdb_output_dir

In [None]:
print("Chunking documents...")
# PATH_TO_DOCS is a plain string; wrap it so the argument matches the `Path`
# annotation of `process_zip_documents`.
doc_chunks = process_zip_documents(
    path_to_docs_zip=Path(PATH_TO_DOCS),
    chunk_size=VECTORSTORE_SETTINGS.chunk_size,
    chunk_overlap=VECTORSTORE_SETTINGS.chunk_overlap,
)

In [None]:
print("Building vector database...")
# Embed the chunks and persist the vector db; the call returns the two output
# locations it was given (embedding model dir, vector db dir).
embedding_path, db_path = make_vector_db(
    documents=doc_chunks,
    embedding_model_name=VECTORSTORE_SETTINGS.sentence_transformer_model_name,
    embedding_model_output_dir=diy_rag_nb_output.embedding_model,
    vdb_output_dir=diy_rag_nb_output.vdb,
)

# Export settings needed at retrieval time to the RAG deployment directory

In [None]:
from docsassist.schema import RAGModelSettings

# Retrieval-time configuration; the embedding model name must match the one the
# vector database was built with, so it is taken from the same settings object.
rag_model_settings = RAGModelSettings(
    embedding_model_name=VECTORSTORE_SETTINGS.sentence_transformer_model_name,
    max_retries=0,
    request_timeout=30,
    temperature=0.0,
    stuff_prompt=textwrap.dedent("""\
            Use the following pieces of context to answer the user's question.
            If you don't know the answer, just say that you don't know, don't try to make up an answer.
            ----------------
            {context}"""),
)

# Serialize the settings as YAML into the file the RAG deployment reads from.
with open(diy_rag_nb_output.rag_settings, "w") as settings_file:
    settings_payload = rag_model_settings.model_dump(mode="json")
    settings_file.write(yaml.safe_dump(settings_payload))