In [None]:
import os
from superduperdb import superduper
from superduperdb.backends.mongodb import Collection

# MongoDB connection string; set the MONGODB_URI environment variable to override the local default.
mongodb_uri = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/volvo-demo")

# Wrap the MongoDB database with SuperDuperDB, configured with a filesystem
# artifact store and a local downloads folder.
db = superduper(
    mongodb_uri,
    artifact_store='filesystem://data/artifacts/',
    downloads_folder='./data/downloads',
)

## Load Data

In [None]:
from superduperdb.ext.unstructured.encoder import unstructured_encoder
from superduperdb import Document

# Name of the PDF file to ingest.
TEST_FILE = "volvo-GetStarted.pdf"

# Register the encoder with the database before inserting data that uses it.
db.add(unstructured_encoder)

# Store the encoded PDF as a single document, under the "elements" key,
# in the 'source' collection.
collection = Collection('source')
encoded_pdf = unstructured_encoder(TEST_FILE)
to_insert = [Document({"elements": encoded_pdf})]
insert_query = collection.insert_many(to_insert)
db.execute(insert_query)

## Chunk Model

In [None]:
from collections import defaultdict
from superduperdb import Listener, Model, Schema, vector

# Sliding-window chunking parameters, both counted in lines:
# the window advances STRIDE lines per step and spans WINDOW lines.
STRIDE, WINDOW = 3, 10
# Large sentinel value.
MAX_NUM = 999_999_999

def merge_metadatas(metadatas):
    """Merge per-element metadata dicts into one bounding box and page number.

    Each metadata dict may carry ``metadata['coordinates']['points']``, a
    sequence of four ``(x, y)`` corner points. The merged box is built
    corner by corner:

    * corner 1 -> (min x, min y) over every element's first corner
    * corner 2 -> (min x, max y) over every element's second corner
    * corner 3 -> (max x, max y) over every element's third corner
    * corner 4 -> (max x, min y) over every element's fourth corner

    Elements without coordinates are skipped instead of raising ``KeyError``.
    The extremes are computed from the data itself rather than from sentinel
    starting values, so negative or very large coordinates are not clamped.

    Args:
        metadatas: list of metadata dicts for the elements of one chunk.

    Returns:
        ``{}`` when ``metadatas`` is empty; otherwise a dict with
        ``'points'`` (tuple of four ``(x, y)`` tuples, or ``None`` when no
        element has coordinates) and ``'page_number'`` (taken from the last
        metadata dict, ``None`` if absent).
    """
    if not metadatas:
        return {}

    # Keep only the corner sets of elements that actually have coordinates.
    corner_sets = []
    for metadata in metadatas:
        corners = (metadata.get('coordinates') or {}).get('points')
        if corners:
            corner_sets.append(corners)

    if corner_sets:
        # zip(*...) groups the i-th corner of every element together.
        firsts, seconds, thirds, fourths = zip(*corner_sets)
        points = (
            (min(p[0] for p in firsts), min(p[1] for p in firsts)),
            (min(p[0] for p in seconds), max(p[1] for p in seconds)),
            (max(p[0] for p in thirds), max(p[1] for p in thirds)),
            (max(p[0] for p in fourths), min(p[1] for p in fourths)),
        )
    else:
        points = None

    # The original read the page number from the last element visited.
    page_number = metadatas[-1].get('page_number')
    return {'points': points, 'page_number': page_number}

def create_chunk_and_metadatas(page_elements):
    """Cut one page's elements into overlapping text windows.

    A window of up to WINDOW consecutive elements is taken every STRIDE
    elements. Each window produces a dict with the newline-joined element
    texts under "txt" and the merged element metadata under 'metadata'.
    """
    chunks = []
    for start in range(0, len(page_elements), STRIDE):
        window = page_elements[start : start + WINDOW]
        window_metadatas = [element.metadata.to_dict() for element in window]
        window_text = "\n".join(element.text for element in window)
        merged_metadata = merge_metadatas(window_metadatas)
        chunks.append({"txt": window_text, 'metadata': merged_metadata})
    return chunks

def get_chunks(elements):
    """Group elements by page number and chunk each page separately.

    Returns one flat list containing the chunk dicts of every page, with
    pages in the order they were first encountered in ``elements``.
    """
    # Local import keeps the function self-contained.
    from collections import defaultdict

    elements_by_page = defaultdict(list)
    for element in elements:
        page = element.metadata.page_number
        elements_by_page[page].append(element)

    # Chunk every page first, then flatten the per-page lists into one.
    per_page_chunks = [
        create_chunk_and_metadatas(page_elements)
        for page_elements in elements_by_page.values()
    ]
    flattened = []
    for page_chunks in per_page_chunks:
        flattened.extend(page_chunks)
    return flattened

# Output schema for the chunk model: only the "txt" field is declared, as a string.
chunk_schema = Schema(identifier="myschema", fields={"txt": "string"})

# Model wrapping get_chunks.
# NOTE(review): flatten=True presumably stores each returned chunk as its own
# record, and document_embedded=False presumably writes outputs to a separate
# collection — confirm against the SuperDuperDB documentation.
chunk_model = Model(
    identifier="chunk",
    object=get_chunks,
    flatten=True,
    model_update_kwargs={"document_embedded": False},
    output_schema=chunk_schema,
)

# Apply the chunk model to the "elements" field of the documents selected
# from the source collection.
chunk_listener = Listener(
    model=chunk_model,
    select=collection.find(),
    key="elements",
)
db.add(chunk_listener)

## Create a vector-search Index

In [None]:
import sentence_transformers
from superduperdb import vector, VectorIndex
# Collection that the chunk outputs are read from for embedding and search.
chunk_collection = Collection("_outputs.elements.chunk")

# Sentence-embedding model used for both indexing and querying.
model = Model(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("BAAI/bge-large-en-v1.5"),
    # bge-large-en-v1.5 emits 1024-dimensional embeddings; 384 is the size of
    # the "small" variant and does not match this model.
    encoder=vector(shape=(1024,)),
    predict_method="encode",  # Specify the prediction method
    # Dict inputs are reduced to the text stored under ['0']['txt'];
    # anything else (e.g. a plain query string) is passed through unchanged.
    preprocess=lambda x: x['0']['txt'] if isinstance(x, dict) else x,
    postprocess=lambda x: x.tolist(),  # Convert the model output to a plain list
    batch_predict=True,  # Generate predictions for a set of observations all at once
)

db.add(
    VectorIndex(
        # Fixed identifier; the search cells refer to the index by this name
        identifier="vector-index",
        # Indexing listener: embeds the chunk outputs with the model above
        indexing_listener=Listener(
            select=chunk_collection.find(),
            key="_outputs.elements.chunk",  # Key for the documents
            model=model,  # Specify the model for processing
            predict_kwargs={"max_chunk_size": 1000},
        ),
    )
)

## Search Content from PDF

In [None]:
query = "How to call support"

# Vector search over the chunk collection: request 5 results for the query.
similar_chunks_query = chunk_collection.like(
    Document({"_outputs.elements.chunk": query}),
    vector_index="vector-index",
    n=5,
).find({})
out = db.execute(similar_chunks_query)

# Print each hit's score and text, highest score first.
for hit in sorted(out, key=lambda doc: doc.content['score'], reverse=True):
    print(hit.content['score'])
    print(hit.outputs('elements', 'chunk')['txt'])

## QA System For PDF

In [None]:
from superduperdb.ext.llm.vllm import VllmModel

# Prompt for the LLM; {context} and {input} are format placeholders.
prompt_template = "".join(
    [
        "The following is a document and question about the volvo user manual\n",
        "Only provide a very concise answer\n",
        "{context}\n\n",
        "Here's the question:{input}\n",
        "answer:",
    ]
)

# Settings for the vLLM-backed model, collected in one place.
llm_settings = {
    "identifier": 'llm',
    "model_name": 'mistralai/Mistral-7B-Instruct-v0.2',
    "prompt_template": prompt_template,
    "vllm_kwargs": {"max_model_len": 2048},
    "inference_kwargs": {"max_tokens": 2048},
}

# Build the LLM from those settings and register it with the database.
llm = VllmModel(**llm_settings)
db.add(llm)

# Show which models are now registered in the SuperDuperDB database.
registered_models = db.show('model')
print(registered_models)

In [None]:
from superduperdb import Document

# Use the SuperDuperDB model to generate a response based on the search term and context
# Ask the LLM the query, using the results of a vector search (n=5) as context.
output, sources = db.predict(
    model_name='llm',
    input=query,
    context_select=chunk_collection.like(
        Document({"_outputs.elements.chunk": query}),
        vector_index="vector-index",
        n=5,
    ).find({}),
    context_key='_outputs.elements.chunk.0.txt',
)

# Build one line per context source: page number, similarity score, bounding box.
# NOTE(review): assumes the unpacked source exposes 'metadata' and 'score' at
# the top level — confirm against the actual document layout.
page_messages = []
for source in sources:
    unpack_data = source.unpack()
    metadata = unpack_data['metadata']
    page_number = metadata['page_number']
    points = metadata['points']
    score = unpack_data["score"]
    # `.3f` gives three decimals; the previous `3f` only set a minimum width of 3.
    message = f"page_number: {page_number}, score: {score:.3f}, coordinates: {points}"
    page_messages.append(message)

# Print the generated answer followed by its source references.
print(output.content)
print("\n".join(page_messages))

Log channel is reconnecting. Logs produced while the connection was down can be found on the head node of the cluster in `ray_client_server_[port].out`
