In [1]:
import os
import click

import sentence_transformers
from dotenv import load_dotenv
from superduperdb import (
    Document,
    Listener,
    Model,
    Schema,
    VectorIndex,
    superduper,
    vector,
)
from superduperdb.backends.mongodb import Collection

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## Connect to mongodb database

In [2]:
mongodb_uri = os.getenv("MONGODB_URI", "superduperdb-demo")
artifact_store = os.getenv("ARTIFACT_STORE", "data/artifact_store")

db = superduper(mongodb_uri, artifact_store=f"filesystem://{artifact_store}")

2024-01-18 16:23:39,570	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[32m 2024-Jan-18 16:23:39.57[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.base.build[0m:[36m60  [0m | [1mData Client is ready. MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Jan-18 16:23:39.58[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.base.build[0m:[36m35  [0m | [1mConnecting to Metadata Client with engine:  MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)[0m
[32m 2024-Jan-18 16:23:39.58[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.base.datalayer[0m:[36m80  [0m | [1mBuilding Data Layer[0m


## Parse pdf files and store them in the database

In [3]:
from superduperdb import Document
from superduperdb.ext.unstructured.encoder import unstructured_encoder

db.add(unstructured_encoder)

pdf_folder = 'pdf-folders'

pdf_paths = [os.path.join(pdf_folder, pdf) for pdf in os.listdir(pdf_folder)]
collection = Collection("source")
to_insert = [
    Document({"elements": unstructured_encoder(pdf_path)}) for pdf_path in pdf_paths
]
db.execute(collection.insert_many(to_insert))

[2024-01-18 16:23:41] pikepdf._core INFO pikepdf C++ to Python logger bridge initialized


[32m 2024-Jan-18 16:23:43.17[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.backends.local.compute[0m:[36m32  [0m | [1mSubmitting job. function:<function callable_job at 0x28e631480>[0m
[32m 2024-Jan-18 16:23:43.18[0m| [32m[1mSUCCESS [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.backends.local.compute[0m:[36m38  [0m | [32m[1mJob submitted.  function:<function callable_job at 0x28e631480> future:7ce3420e-8646-4e79-ad0f-0f4846325a82[0m


([ObjectId('65a8e00fe2023d47d469d8f6')],
 TaskWorkflow(database=<superduperdb.base.datalayer.Datalayer object at 0x298dce1d0>, G=<networkx.classes.digraph.DiGraph object at 0x2b36140d0>))

## Create a chunking model to chunk pdf chunks

In [4]:
def merge_metadatas(metadatas, return_center=False):
    MAX_NUM = 999999999
    if not metadatas:
        return {}
    p1, p2, p3, p4 = (MAX_NUM, MAX_NUM), (MAX_NUM, 0), (0, 0), (0, MAX_NUM)
    for metadata in metadatas:
        p1_, p2_, p3_, p4_ = metadata["coordinates"]["points"]
        p1 = (min(p1[0], p1_[0]), min(p1[1], p1_[1]))
        p2 = (min(p2[0], p2_[0]), max(p2[1], p2_[1]))
        p3 = (max(p3[0], p3_[0]), max(p3[1], p3_[1]))
        p4 = (max(p4[0], p4_[0]), min(p4[1], p4_[1]))
    points = (p1, p2, p3, p4)
    if return_center:
        points = {"x": (p1[0] + p3[0]) / 2, "y": (p1[1] + p3[1]) / 2}
        page_number = metadata["page_number"]
    return {"points": points, "page_number": page_number}


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    datas = []
    for i in range(0, len(page_elements), stride):
        windown_elements = page_elements[i : i + window]
        metadatas = [e.metadata.to_dict() for e in windown_elements]
        chunk = "\n".join([e.text for e in windown_elements])
        datas.append(
            {"txt": chunk, "metadata": merge_metadatas(metadatas, return_center=True)}
        )
    return datas


def get_chunks(elements):
    from collections import defaultdict

    pages_elements = defaultdict(list)
    for element in elements:
        pages_elements[element.metadata.page_number].append(element)

    all_chunks_and_links = sum(
        [
            create_chunk_and_metadatas(page_elements)
            for _, page_elements in pages_elements.items()
        ],
        [],
    )
    return all_chunks_and_links


In [5]:
MODEL_IDENTIFIER_CHUNK = "chunk"
chunk_model = Model(
    identifier=MODEL_IDENTIFIER_CHUNK,
    object=get_chunks,
    flatten=True,
    model_update_kwargs={"document_embedded": False},
    output_schema=Schema(identifier="myschema", fields={"txt": "string"}),
)

db.add(
    Listener(
        model=chunk_model,
        select=Collection("source").find(),
        key="elements",
    )
)

1it [00:00, 146.78it/s]

[32m 2024-Jan-18 16:23:43.32[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 1 model outputs to `db`[0m





([None],
 Listener(identifier='chunk/elements', key='elements', model=Model(identifier='chunk', encoder=None, output_schema=Schema(identifier='myschema', fields={'txt': 'string', '_fold': FieldType(identifier='String')}), flatten=True, preprocess=None, postprocess=None, collate_fn=None, batch_predict=False, takes_context=False, metrics=(), model_update_kwargs={'document_embedded': False}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, object=<Artifact artifact=<function get_chunks at 0x2b0b75870> serializer=dill>, model_to_device_method=None, metric_values={}, predict_method=None, serializer='dill', device='cpu', preferred_devices=('cuda', 'mps', 'cpu'), training_configuration=None, train_X=None, train_y=None, train_select=None), select=<superduperdb.backends.mongodb.query.MongoCompoundSelect[
     [92m[1msource.find({'_id': "{'$in': '[65a8e00fe2023d47d469d8f6]'}"}, {})[0m}
 ] object at 0x2b3649e70>, active=True, predict_

## Embedding all text blocks and building vector indexes

In [6]:
SOURCE_KEY = "elements"
MODEL_IDENTIFIER_EMBEDDING = "embedding"
VECTOR_INDEX_IDENTIFIER = "vector-index"
COLLECTION_NAME_CHUNK = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
CHUNK_OUTPUT_KEY = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"

chunk_collection = Collection(COLLECTION_NAME_CHUNK)

def preprocess(x):
    if isinstance(x, dict):
        # For model chains, the logic of this key needs to be optimized.
        chunk = sorted(x.items())[-1][1]
        return chunk["txt"]
    return x

model = Model(
    identifier=MODEL_IDENTIFIER_EMBEDDING,
    object=sentence_transformers.SentenceTransformer("BAAI/bge-large-en-v1.5"),
    encoder=vector(shape=(384,)),
    predict_method="encode",
    preprocess=preprocess,
    postprocess=lambda x: x.tolist(),
    batch_predict=True,
)

db.add(
    VectorIndex(
        identifier=VECTOR_INDEX_IDENTIFIER,
        indexing_listener=Listener(
            select=chunk_collection.find(),
            key=CHUNK_OUTPUT_KEY,  # Key for the documents
            model=model,  # Specify the model for processing
            predict_kwargs={"max_chunk_size": 64},
        ),
    )
)

[2024-01-18 16:23:43] sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
[2024-01-18 16:23:46] sentence_transformers.SentenceTransformer INFO Use pytorch device: cpu
38it [00:00, 11092.18it/s]


[32m 2024-Jan-18 16:23:53.15[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.components.model[0m:[36m417 [0m | [1mComputing chunk 0/0[0m


Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.46s/it]

[32m 2024-Jan-18 16:24:08.08[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.components.model[0m:[36m477 [0m | [1mAdding 38 model outputs to `db`[0m





([None],
 VectorIndex(identifier='vector-index', indexing_listener=Listener(identifier='embedding/elements', key='_outputs.elements.chunk', model=Model(identifier='embedding', encoder=Encoder(identifier='vector[384]', decoder=None, encoder=None, shape=(384,), load_hybrid=True), output_schema=None, flatten=False, preprocess=<Artifact artifact=<function preprocess at 0x2b373cdc0> serializer=dill>, postprocess=<Artifact artifact=<function <lambda> at 0x2b373ce50> serializer=dill>, collate_fn=None, batch_predict=True, takes_context=False, metrics=(), model_update_kwargs={}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, object=<Artifact artifact=SentenceTransformer(
   (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mod

## Define a vector search function

In [7]:
from pprint import pprint
def vector_search(query, top_k=5):
    collection = Collection(COLLECTION_NAME_CHUNK)
    out = db.execute(
        collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=top_k,
        ).find({})
    )
    if out:
        out = sorted(out, key=lambda x: x.content["score"], reverse=True)
    for r in out:
        score = r.content["score"]
        chunk_data = r.outputs("elements", "chunk")
        metadata = chunk_data["metadata"]
        chunk_message = {}
        chunk_message["score"] = score
        chunk_message["metadata"] = metadata
        txt = chunk_data["txt"]
        print(txt)
        print()
        print(chunk_message)
        print("\n\n", '-' * 20)

In [8]:
vector_search("What is the function of keys 10 to 12 on the left steering wheel keypad?")

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.83it/s]


[32m 2024-Jan-18 16:24:08.42[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.base.datalayer[0m:[36m132 [0m | [1mloading of vectors of vector-index: 'vector-index'[0m
[32m 2024-Jan-18 16:24:08.42[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.base.datalayer[0m:[36m166 [0m | [1m<superduperdb.backends.mongodb.query.MongoCompoundSelect[
    [92m[1m_outputs.elements.chunk.elements.chunk.find({'_id': "{'$in': '[65a8e00fe2023d47d469d8fc, 65a8e00fe2023d47d469d8fd, 65a8e00fe2023d47d469d8fe, 65a8e00fe2023d47d469d8ff, 65a8e00fe2023d47d469d900, 65a8e00fe2023d47d469d901, 65a8e00fe2023d47d469d902, 65a8e00fe2023d47d469d903, 65a8e00fe2023d47d469d904, 65a8e00fe2023d47d469d905, 65a8e00fe2023d47d469d906, 65a8e00fe2023d47d469d907, 65a8e00fe2023d47d469d908, 65a8e00fe2023d47d469d909, 65a8e00fe2023d47d469d90a, 65a8e00fe2023d47d469d90b, 65a8e00fe2023d47d469d90c, 65a8e00

Loading vectors into vector-table...: 38it [00:00, 826.70it/s]


To apply manually, pull the parking brake lever all the way out, past the click.
Always check that the symbol in the instrument and the indicator in the lever are illuminated before you leave the cab.
Right steering wheel keypad Keys 10 and 11 are used for phone calls. The others are used for navigating in the displays and controlling the infotainment system. The function of each key is the following:
1 Navigate left. 2 Navigate up. 3 Navigate right. 4 Navigate down. 5 Select. 6 Return to the home screen. 7 Open a menu. 8 Back. 9 Shift focus between the side display and
the instrument display.
10 End/Reject a call. 11 Accept a call. 12 Push to talk.
Position A is the recommended position for the auxiliary brake. In position A the auxiliary brake is used together with the wheel brakes when the brake pedal is depressed.
Gear selector
1 Gear lever. Select a drive program (A, M or R) or put the gearbox in neutral (N). 2 Lock button. Hold to allow the gear lever to be moved from neutral (N)

## Define an LLM model

In [9]:
MODEL_IDENTIFIER_LLM = "llm"
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)

# from superduperdb.ext.llm.vllm import VllmModel

# llm = VllmModel(
#     identifier=MODEL_IDENTIFIER_LLM,
#     model_name="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
#     prompt_template=prompt_template,
#     vllm_kwargs={"max_model_len": 2048, "quantization": "awq"},
#     inference_kwargs={"max_tokens": 2048},
# )
# # Add the llm instance

from superduperdb.ext.llm.openai import OpenAI

llm = OpenAI(identifier=MODEL_IDENTIFIER_LLM, prompt_template=prompt_template)


db.add(llm)

([],
 OpenAI(encoder=None, output_schema=None, flatten=False, preprocess=None, postprocess=None, collate_fn=None, batch_predict=False, takes_context=True, metrics=(), model_update_kwargs={}, validation_sets=None, predict_X=None, predict_select=None, predict_max_chunk_size=None, predict_kwargs=None, identifier='llm', prompt_template="The following is a document and question about the volvo user manual\nOnly provide a very concise answer\n{context}\n\nHere's the question:{input}\nanswer:", prompt_func=None, max_batch_size=4, inference_kwargs={}, api_url='https://api.openai.com/v1', openai_api_base='https://api.openai.com/v1', openai_api_key=None, model_name='gpt-3.5-turbo', chat=True, system_prompt=None, user_role='user', system_role='system'))

## Define a QA function

In [10]:
from IPython.display import Markdown
from IPython.display import display
import pandas as pd
def qa(query, vector_search_top_k=5):
    collection = Collection(COLLECTION_NAME_CHUNK)
    output, out = db.predict(
        model_name=MODEL_IDENTIFIER_LLM,
        input=query,
        context_select=collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).find({}),
        context_key=f"{CHUNK_OUTPUT_KEY}.0.txt",
    )
    if out:
        out = sorted(out, key=lambda x: x.content["score"], reverse=True)
    page_messages = []
    for source in out:
        chunk_data = source.outputs("elements", "chunk")
        metadata = chunk_data["metadata"]
        page_number = metadata["page_number"]
        points = metadata["points"]
        score = source["score"]
        page_messages.append(
            {"page_number": page_number, "points": points, "score": score}
        )
    df = pd.DataFrame(page_messages)
    display(output.content)
    display(df)

In [11]:
qa("What is the function of keys 10 to 12 on the left steering wheel keypad?")

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.49it/s]


[32m 2024-Jan-18 16:24:08.94[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.ext.llm.base[0m:[36m29  [0m | [1mInitializing OpenAI : llm[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32m 2024-Jan-18 16:24:10.33[0m| [1mINFO    [0m | [36m183eefeaab2d[0m| [36m3c87f48a-c135-4be5-adb7-fc0cd3023e5f[0m| [36msuperduperdb.ext.llm.base[0m:[36m32  [0m | [1mInitialized  OpenAI : llm successfully[0m


'The function of keys 10 to 12 on the left steering wheel keypad is to control the audio in the truck.'

Unnamed: 0,page_number,points,score
0,1,"{'x': 1041.2174049999999, 'y': 319.44622500000...",0.772663
1,1,"{'x': 399.51374, 'y': 349.85872500000005}",0.765555
2,1,"{'x': 207.576865, 'y': 287.1374900000001}",0.726659
3,1,"{'x': 210.14336500000002, 'y': 287.1374900000001}",0.722947
4,1,"{'x': 313.606095, 'y': 252.39644000000013}",0.720475
