# PDF RAG

This is a PDF-based RAG application. While answering questions, it accesses relevant information from the PDF and displays the corresponding paragraphs in the form of images.

In [73]:
APPLY = True
COLLECTION_NAME = '<var:table_name>' if not APPLY else 'sample_pdf_rag'

In [74]:
from superduper import superduper, CFG
CFG.bytes_encoding = 'str'
CFG.native_json = False

In [135]:
db = superduper()

[32m2024-Dec-04 00:19:50.30[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.misc.plugins[0m:[36m13  [0m | [1mLoading plugin: mongodb[0m
[32m2024-Dec-04 00:19:50.34[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m68  [0m | [1mBuilding Data Layer[0m
[32m2024-Dec-04 00:19:50.35[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.build[0m:[36m184 [0m | [1mConfiguration: 
 +---------------+-----------------------------------+
| Configuration |               Value               |
+---------------+-----------------------------------+
|  Data Backend | mongodb://localhost:27017/test_db |
+---------------+-----------------------------------+[0m


In [None]:
def getter():
    #import subprocess
    import os
    #subprocess.run(['curl', '-O', 'https://superduperdb-public-demo.s3.amazonaws.com/pdfs.zip'])
    #subprocess.run(['unzip', '-o', 'pdfs.zip'])
    #subprocess.run(['rm', 'pdfs.zip'])
    pdf_folder = "/Users/kartiksharma/Work/superduperdb/code/superduper/small"
    pdf_names = [pdf for pdf in os.listdir(pdf_folder) if pdf.endswith(".pdf")]
    pdf_paths = [os.path.join(pdf_folder, pdf) for pdf in pdf_names]
    data = [{"url": pdf_path, "file": pdf_path} for pdf_path in pdf_paths]
    return data

In [None]:
if APPLY:
    data = getter()

## Create a table to store PDFs.

In [54]:
import os
from superduper import Schema, Table
from superduper.components.datatype import file

schema = Schema(identifier="myschema", fields={'url': 'str', 'file': file})
table = Table(identifier=COLLECTION_NAME, schema=schema)

if APPLY:
    db.apply(table, force=True)
    #db[COLLECTION_NAME].insert(data).execute()

[32m2024-Dec-03 23:36:46.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (datatype, file) not found in cache, loading from db[0m
[32m2024-Dec-03 23:36:46.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('datatype', 'file')) from metadata...[0m
[32m2024-Dec-03 23:36:46.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m312 [0m | [1mFound new datatype:file:121dd11d21464f2b[0m
[32m2024-Dec-03 23:36:46.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (schema, myschema) not found in cache, loading from db[0m
[32m2024-Dec-03 23:36:46.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('schema', 'myschema')) from metadata...[0m
[3

## Split the PDF file into images for later result display

In [55]:
from superduper import ObjectModel, logging
from pdf2image import convert_from_path
import os


def split_image(pdf_path):
    logging.info(f"Splitting images from {pdf_path}")

    image_folders = "data/pdf-images"
    pdf_name = os.path.basename(pdf_path)
    images = convert_from_path(pdf_path)
    logging.info(f"Number of images: {len(images)}")

    image_folder = os.path.join(image_folders, pdf_name)
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)

    data = []
    for i, image in enumerate(images):
        path = os.path.join(image_folder, f"{i}.jpg")
        image.save(os.path.join(path))
        data.append(path)
    return data


model_split_image = ObjectModel(
    identifier="split_image",
    object=split_image,
    datatype=file,
)

listener_split_image = model_split_image.to_listener(
    key="file",
    select=db[COLLECTION_NAME].find(),
    flatten=True,
)

if APPLY:
    db.apply(listener_split_image, force=True)

[32m2024-Dec-03 23:36:57.71[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m225 [0m | [1mFound identical datatype:file:121dd11d21464f2b[0m
[32m2024-Dec-03 23:36:57.71[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (model, split_image) not found in cache, loading from db[0m
[32m2024-Dec-03 23:36:57.71[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('model', 'split_image')) from metadata...[0m
[32m2024-Dec-03 23:36:57.72[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m312 [0m | [1mFound new model:split_image:2bf48621088043b0[0m
[32m2024-Dec-03 23:36:57.77[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (listener, split_image) not found in cache, loading fr

0it [00:00, ?it/s]


## Build a chunks model and return chunk results with coordinate information.

In [56]:
def remove_sidebars(elements):
    import re
    from collections import defaultdict

    from unstructured.documents.elements import ElementType

    if not elements:
        return elements
    points_groups = defaultdict(list)
    min_x = 99999999
    max_x = 0
    e2index = {e.id: i for i, e in enumerate(elements)}
    for e in elements:
        x_l = int(e.metadata.coordinates.points[0][0])
        x_r = int(e.metadata.coordinates.points[2][0])
        points_groups[(x_l, x_r)].append(e)
        min_x = min(min_x, x_l)
        max_x = max(max_x, x_r)
    sidebars_elements = set()
    for (x_l, x_r), es in points_groups.items():
        first_id = e2index[es[0].id]
        last_id = e2index[es[-1].id]
        on_left = first_id == 0 and x_l == min_x
        on_right = (last_id == len(elements) - 2) and x_r == max_x
        loc_match = [on_left, on_right]
        total_text = "".join(map(str, es))
        condiction = [
            any(loc_match),
            len(es) >= 3,
            re.findall("^[A-Z\s\d,]+$", total_text),
        ]
        if not all(condiction):
            continue
        sidebars_elements.update(map(lambda x: x.id, es))
        if on_left:
            check_page_num_e = elements[last_id + 1]
        else:
            check_page_num_e = elements[-1]
        if (
            check_page_num_e.category == ElementType.UNCATEGORIZED_TEXT
            and check_page_num_e.text.strip().isalnum()
        ):
            sidebars_elements.add(check_page_num_e.id)

    elements = [e for e in elements if e.id not in sidebars_elements]
    return elements


def remove_annotation(elements):
    from collections import Counter

    from unstructured.documents.elements import ElementType

    page_num = max(e.metadata.page_number for e in elements)
    un_texts_counter = Counter(
        [e.text for e in elements if e.category == ElementType.UNCATEGORIZED_TEXT]
    )
    rm_text = set()
    for text, count in un_texts_counter.items():
        if count / page_num >= 0.5:
            rm_text.add(text)
    elements = [e for e in elements if e.text not in rm_text]
    return elements


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    page_elements = remove_sidebars(page_elements)
    for index, page_element in enumerate(page_elements):
        page_element.metadata.num = index
    datas = []
    for i in range(0, len(page_elements), stride):
        windown_elements = page_elements[i : i + window]
        chunk = "\n".join([e.text for e in windown_elements])
        source_elements = [e.to_dict() for e in windown_elements]
        datas.append(
            {
                "txt": chunk,
                "source_elements": source_elements,
            }
        )
    return datas


def get_chunks(pdf):
    from collections import defaultdict

    from unstructured.documents.coordinates import RelativeCoordinateSystem
    from unstructured.partition.pdf import partition_pdf

    elements = partition_pdf(pdf)
    elements = remove_annotation(elements)

    pages_elements = defaultdict(list)
    for element in elements:
        element.convert_coordinates_to_new_system(
            RelativeCoordinateSystem(), in_place=True
        )
        pages_elements[element.metadata.page_number].append(element)

    all_chunks_and_links = sum(
        [
            create_chunk_and_metadatas(page_elements)
            for _, page_elements in pages_elements.items()
        ],
        [],
    )
    return all_chunks_and_links

In [57]:
from superduper.components.schema import FieldType

model_chunk = ObjectModel(
    identifier="chunk",
    object=get_chunks,
    datatype=FieldType(identifier="json")
)

listener_chunk = model_chunk.to_listener(
    key="file",
    select=db[COLLECTION_NAME].select(),
    flatten=True,
)

if APPLY:
    db.apply(listener_chunk, force=True)

[32m2024-Dec-03 23:37:25.28[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (model, chunk) not found in cache, loading from db[0m
[32m2024-Dec-03 23:37:25.28[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('model', 'chunk')) from metadata...[0m
[32m2024-Dec-03 23:37:25.29[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m312 [0m | [1mFound new model:chunk:3ea7689ae7834c5b[0m
[32m2024-Dec-03 23:37:25.33[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (listener, chunk) not found in cache, loading from db[0m
[32m2024-Dec-03 23:37:25.33[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('listener', 'chunk')) from metadata...[0m
[32m2024-D

0it [00:00, ?it/s]


## Build a vector index for vector search

OpenAI:

In [58]:
from superduper_openai import OpenAIEmbedding
os.environ['OPENAI_API_KEY'] = '<key>'
openai_embedding = OpenAIEmbedding(identifier='text-embedding-ada-002')

Sentence-transformers:

In [59]:
import sentence_transformers
from superduper_sentence_transformers import SentenceTransformer

sentence_transformers_embedding = SentenceTransformer(
    identifier="sentence-transformers-embedding",
    model="BAAI/bge-small-en",
    postprocess=lambda x: x.tolist(),
    predict_kwargs={"show_progress_bar": True},
)

In [60]:
from superduper.components.model import ModelRouter

model_embedding = ModelRouter(
    'embedding',
    models={'openai': openai_embedding, 'sentence_transformers': sentence_transformers_embedding},
    model='<var:embedding_model>' if not APPLY else 'openai',
    example='this is a test',
    signature='singleton'
)


In [None]:
model_embedding.signature

In [61]:
from superduper_openai.model import OpenAIEmbedding
from superduper import VectorIndex

listener_embedding = model_embedding.to_listener(
    key=f"{listener_chunk.outputs}.txt",
    select=db[listener_chunk.outputs].select(),
)

vector_index = VectorIndex(
    identifier="vector-index",
    indexing_listener=listener_embedding,
)

if APPLY:
    db.apply(vector_index, force=True)


[32m2024-Dec-03 23:37:39.27[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.components.model[0m:[36m1327[0m | [1mPredicting with model openai[0m
[32m2024-Dec-03 23:37:40.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (datatype, vector[1536]) not found in cache, loading from db[0m
[32m2024-Dec-03 23:37:40.47[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('datatype', 'vector[1536]')) from metadata...[0m
[32m2024-Dec-03 23:37:40.48[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m312 [0m | [1mFound new datatype:vector[1536]:3619f576961e4665[0m
[32m2024-Dec-03 23:37:40.48[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (model, text-embedding-ada-002) not found in cache, lo

0it [00:00, ?it/s]


## Create a plugin

When applying the processor, saves the plugin in the database, thereby saving the related dependencies as well.

The processor will integrate the returned chunks information with the images, and return a visualized image.​

In [62]:
from superduper import Plugin
from utils import Processor

processor = Processor(
    identifier="processor",
    db=db,
    chunk_key=listener_chunk.outputs,
    split_image_key=listener_split_image.outputs,
    plugins=[Plugin(path="./utils.py")],
)

## Create a RAG model

Create a RAG model to perform retrieval-augmented generation (RAG) and return the results.

In [63]:
from superduper import Model, logging


class Rag(Model):
    llm_model: Model
    vector_index_name: str
    prompt_template: str
    processor: None | Model = None

    def __post_init__(self, *args, **kwargs):
        assert "{context}" in self.prompt_template, 'The prompt_template must include "{context}"'
        assert "{query}" in self.prompt_template, 'The prompt_template must include "{query}"'
        super().__post_init__(*args, **kwargs)

    def init(self, db=None):
        db = db or self.db
        self.vector_index = self.db.load("vector_index", self.vector_index_name)
        super().init(db=db)
        
    
    def predict(self, query, top_k=5, format_result=False):
        vector_search_out = self.vector_search(query, top_k=top_k)
        key = self.vector_index.indexing_listener.key
        context = "\n\n---\n\n".join([x[key] for x in vector_search_out])
        
        prompt = self.prompt_template.format(context=context, query=query)
        output = self.llm_model.predict(prompt)
        result = {
            "answer": output,
            "docs": vector_search_out,
        }
        if format_result and self.processor:
            result["images"] = list(self.processor.predict(
                vector_search_out,
                match_text=output,
            ))
        return result

    def vector_search(self, query, top_k=5, format_result=False):
        logging.info(f"Vector search query: {query}")
        select = self.db[self.vector_index.indexing_listener.select.table].like(
            {self.vector_index.indexing_listener.key:query},
            vector_index=self.vector_index.identifier, 
            n=top_k,
        ).select()
        out = select.execute()
        if out:
            out = sorted(out, key=lambda x: x["score"], reverse=True)
        return out

In [64]:
from superduper_openai import OpenAIChatCompletion

llm_openai = OpenAIChatCompletion(identifier='llm-openai', model='gpt-3.5-turbo')

In [65]:
from superduper_anthropic import AnthropicCompletions

predict_kwargs = {
    "max_tokens": 1024,
    "temperature": 0.8,
}

llm_anthropic = AnthropicCompletions(identifier='llm-anthropic', model='claude-2.1', predict_kwargs=predict_kwargs)

TypeError: Anthropic.__post_init__() missing 1 required positional argument: 'example'

In [None]:
from superduper_vllm import VllmCompletion

predict_kwargs = {
    "max_tokens": 1024,
    "temperature": 0.8,
}

llm_vllm = VllmCompletion(
    identifier="llm-vllm",
    vllm_params={
        'model': 'TheBloke/Mistral-7B-Instruct-v0.2-AWQ',
        "gpu_memory_utilization": 0.7,
        "max_model_len": 1024,
        "quantization": "awq",
    },
    predict_kwargs=predict_kwargs,
)

In [66]:
llm = ModelRouter(
    'llm',
    models={
        'openai': llm_openai,
        #'anthropic': llm_anthropic,
        #'vllm': llm_vllm,
    },
    model='<var:llm_model>' if not APPLY else 'openai',
)

In [67]:
from superduper_openai.model import OpenAIChatCompletion

prompt_template = (
    "The following is a document and question\n"
    "Only provide a very concise answer\n"
    "Context:\n\n"
    "{context}\n\n"
    "Here's the question:{query}\n"
    "answer:"
)

rag = Rag(identifier="rag", llm_model=llm, vector_index_name=vector_index.identifier, prompt_template=prompt_template, db=db, processor=processor)

In [68]:
from IPython.display import Image, Markdown, display

if APPLY:
    db.apply(rag, force=True)
    result = rag.predict("What is code of Premium Tech Tool 2 SA Source Address?", format_result=True)
    
    display(Markdown(result["answer"]))
    
    for message, img in result["images"]:
        display(Markdown(message))
        display(img)

[32m2024-Dec-03 23:37:58.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (model, llm-openai) not found in cache, loading from db[0m
[32m2024-Dec-03 23:37:58.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('model', 'llm-openai')) from metadata...[0m
[32m2024-Dec-03 23:37:58.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.apply[0m:[36m312 [0m | [1mFound new model:llm-openai:a0698b169cd14975[0m
[32m2024-Dec-03 23:37:58.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (model, llm) not found in cache, loading from db[0m
[32m2024-Dec-03 23:37:58.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('model', 'llm')) from metadata...[0m
[32m2

[32m2024-Dec-03 23:37:59.72[0m| [31m[1mERROR   [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.backends.local.vector_search[0m:[36m146 [0m | [31m[1mTried to search on an empty vector database Vectors are not yet loaded in vector database. 
Please check if model outputs are ready.[0m


[32m2024-Dec-03 23:37:59.72[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.components.model[0m:[36m1327[0m | [1mPredicting with model openai[0m


The code for Premium Tech Tool 2 SA Source Address is PTT2-SA.

## Create template

In [95]:
from superduper import Application

app = Application(
    'pdf-rag',
    components=[
        table,
        listener_split_image,
        listener_chunk,
        vector_index,
        rag
    ]
)


[32m2024-Dec-03 23:51:18.71[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.components.application[0m:[36m39  [0m | [1mResorting components based on topological order.[0m
[32m2024-Dec-03 23:51:18.74[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.components.application[0m:[36m56  [0m | [1mNew order of components: ['table:sample_pdf_rag:6e50f63c63cd442e', 'listener:split_image:dba4c23fd1984eea', 'listener:chunk:2b26440ac7db46eb', 'vector_index:vector-index:e87a555abc8d4d65', 'model:rag:5b395ea9057043e3'][0m


In [70]:
from superduper import Template, CFG, Table
from superduper.components.dataset import RemoteData

template = Template(
    'pdf-rag',
    db=db,
    template=app,
    substitutions={prompt_template: 'prompt_template', COLLECTION_NAME: 'table_name'},
    template_variables=['table_name', 'prompt_template', 'llm_model', 'embedding_model'],
    default_table=Table(
        'sample_pdf_rag',
        schema=Schema(
            'sample_pdf_rag/schema',
            fields={"url": "str", "file": file}
        ),
        data=RemoteData('sample_pdfs', getter=getter),
    ),
    types={
        'prompt_template':{
            'type': 'str',
            'default': prompt_template
        },
        'table_name': {
            'type': 'str',
            'default': 'sample_pdf_rag'
        },
        'llm_model': {
            'type': 'str',
            'choices': ['openai', 'anthropic', 'vllm'],
            'default': 'openai',
        },
        'embedding_model': {
            'type': 'str',
            'choices': ['openai', 'sentence_transformers'],
            'default': 'openai',
        },
    }
)



In [96]:
#db.apply(template)
template.export(".")




In [76]:
app = Template.read('./', db=db)



In [None]:
app = db.load('template', 'pdf-rag')

In [81]:
db.apply(app)

[32m2024-Dec-03 23:40:57.31[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper_mongodb.artifacts[0m:[36m128 [0m | [1mUploading file ./files/0b4d904bb053e289821fd6f15ea1911784736167/utils.py to GridFS with file_id 0b4d904bb053e289821fd6f15ea1911784736167[0m
[32m2024-Dec-03 23:40:57.32[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper_mongodb.artifacts[0m:[36m128 [0m | [1mUploading file ./files/9498b7a11ec9428e68b04e01475a2d85/utils.py to GridFS with file_id 9498b7a11ec9428e68b04e01475a2d85[0m
[32m2024-Dec-03 23:40:57.34[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (datatype, file) not found in cache, loading from db[0m
[32m2024-Dec-03 23:40:57.34[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('datatype', 'file')) from metadata...[0m
[32m2024-Dec-03 23:40:57.34[0m| [

  y


[32m2024-Dec-03 23:40:59.92[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m590 [0m | [1mComponent 121dd11d21464f2b not found in cache, loading from db with uuid[0m
[32m2024-Dec-03 23:40:59.93[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m612 [0m | [1mAdding datatype:file:121dd11d21464f2b to cache[0m
[32m2024-Dec-03 23:40:59.95[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.components.component[0m:[36m594 [0m | [1mAdding datatype: file to cache[0m
[32m2024-Dec-03 23:40:59.95[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m590 [0m | [1mComponent fb963abf6f1041eb not found in cache, loading from db with uuid[0m
[32m2024-Dec-03 23:40:59.96[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m612 [0m | [1mAdding schema:sample_pdf_rag/schem

Template(identifier='pdf-rag', uuid='f1a53de11ece4f38', upstream=None, plugins=None, cache=True, status=<Status.ready: 'ready'>, build_variables=None, build_template=None, template={'_base': '?pdf-rag', '_builds': {'str': {'_path': 'superduper.components.schema.FieldType'}, 'datatype:file': {'_path': 'superduper.components.datatype.FileType', 'upstream': None, 'plugins': None, 'cache': True, 'build_variables': None, 'build_template': None}, 'schema:myschema': {'_path': 'superduper.components.schema.Schema', 'upstream': None, 'plugins': None, 'cache': True, 'build_variables': None, 'build_template': None, 'fields': {'url': '?str', 'file': '?datatype:file', '_fold': '?str'}}, 'table:<var:table_name>': {'_path': 'superduper.components.table.Table', 'upstream': None, 'plugins': None, 'cache': True, 'build_variables': None, 'build_template': None, 'schema': '?schema:myschema', 'primary_id': 'id', 'data': None}, 'model:split_image': {'_path': 'superduper.components.model.ObjectModel', 'upstr

In [89]:
from superduper import Document, Component
x = Document(app.template)

In [100]:
from superduper import Document, Component, Application
app = Application.read('./pdf-app/', db=db)

[32m2024-Dec-03 23:54:52.69[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m590 [0m | [1mComponent 2b26440ac7db46eb not found in cache, loading from db with uuid[0m


FileNotFoundError: Can't find 2b26440ac7db46eb in metadata

In [111]:
x = app.encode()



In [130]:
x['_blobs'].keys()
#new_app = Document.decode(x, db=db)['_base']

dict_keys(['568534984ca7688b45c9a5caaa73a5756e1a78a6', '1087871ffeeb5420a415b53f5e137701fd991584', 'c190af19f02c96ce9f7c38409748ccfc98758827', '58d51953d88d26b2a08024f471d6896b12063698'])

In [None]:
#new_app.db = None
db.apply(new_app)

In [139]:
db = superduper()
template = Template.read('./', db=db)
template()

[32m2024-Dec-04 00:31:16.85[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.misc.plugins[0m:[36m13  [0m | [1mLoading plugin: mongodb[0m
[32m2024-Dec-04 00:31:16.89[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.datalayer[0m:[36m68  [0m | [1mBuilding Data Layer[0m
[32m2024-Dec-04 00:31:16.90[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduper.base.build[0m:[36m184 [0m | [1mConfiguration: 
 +---------------+-----------------------------------+
| Configuration |               Value               |
+---------------+-----------------------------------+
|  Data Backend | mongodb://localhost:27017/test_db |
+---------------+-----------------------------------+[0m
table:<var:table_name>
superduper_<var:databackend>.query.parse_query
<var:table_name>.find()
superduper_<var:databackend>.query.parse_query
<var:table_name>.select()
superduper_<var:databackend>.query.parse_query
<var:output_prefi

FileNotFoundError: Can't find vector_index: vector-index in metadata