# Vector Search and RAG function application based on SuperDuperDB

In [1]:
import os
import click
from tqdm import tqdm

import sentence_transformers
from dotenv import load_dotenv
from superduper import (
    Document,
    Listener,
    model,ObjectModel,
    Schema,
    VectorIndex,
    superduper,
    vector
)
# from superduper.backends.mongodb import
import superduper_mongodb
load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

## Connect to mongodb database

In [2]:
mongodb_uri = os.getenv("MONGODB_URI", "superduperdb-demo")
artifact_store = os.getenv("ARTIFACT_STORE", "data/artifact_store")

db = superduper(mongodb_uri, artifact_store=f"filesystem://{artifact_store}")

2024-Sep-06 11:14:39.39| INFO     | localhost.localdomain| superduper.base.build:56   | Data Client is ready. MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-06 11:14:39.39| INFO     | localhost.localdomain| superduper.base.build:35   | Connecting to Metadata Client with engine:  MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-06 11:14:39.40| INFO     | localhost.localdomain| superduper.base.build:141  | Connecting to compute client: Compute(uri=None, compute_kwargs={}, _path='superduper.backends.local.compute.LocalComputeBackend')
2024-Sep-06 11:14:39.40| INFO     | localhost.localdomain| superduper.base.datalayer:106  | Building Data Layer
2024-Sep-06 11:14:39.40| INFO     | localhost.localdomain| superduper.base.build:208  | Configuration: 
 +----------------+---------------------------------------+
| Configuration  |      

## Parse pdf files and store them in the database

In [None]:
from superduper.ext.unstructured.encoder import unstructured_encoder

db.apply(unstructured_encoder)

pdf_folder = 'pdf-folders'

pdf_paths = [os.path.join(pdf_folder, pdf) for pdf in os.listdir(pdf_folder)]
# collection = superduper_mongodb("source")
to_insert = [
    Document({"elements": unstructured_encoder(pdf_path)}) for pdf_path in pdf_paths
]
# db.execute(collection.insert_many(to_insert))
# _ = db['source'].insert_many(to_insert).execute()
db['source'].insert_many(to_insert).execute()

In [None]:
db.show()

In [None]:
db['source'].find_one().execute().unpack()

## Create a chunking model to chunk pdf chunks

In [None]:
def merge_metadatas(metadatas, return_center=False):
    MAX_NUM = 999999999
    if not metadatas:
        return {}
    p1, p2, p3, p4 = (MAX_NUM, MAX_NUM), (MAX_NUM, 0), (0, 0), (0, MAX_NUM)
    for metadata in metadatas:
        p1_, p2_, p3_, p4_ = metadata["coordinates"]["points"]
        p1 = (min(p1[0], p1_[0]), min(p1[1], p1_[1]))
        p2 = (min(p2[0], p2_[0]), max(p2[1], p2_[1]))
        p3 = (max(p3[0], p3_[0]), max(p3[1], p3_[1]))
        p4 = (max(p4[0], p4_[0]), min(p4[1], p4_[1]))
    points = (p1, p2, p3, p4)
    if return_center:
        points = {"x": (p1[0] + p3[0]) / 2, "y": (p1[1] + p3[1]) / 2}
        page_number = metadata["page_number"]
    return {"points": points, "page_number": page_number}


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    datas = []
    for i in range(0, len(page_elements), stride):
        windown_elements = page_elements[i : i + window]
        metadatas = [e.metadata.to_dict() for e in windown_elements]
        chunk = "\n".join([e.text for e in windown_elements])
        datas.append(
            {"txt": chunk, "metadata": merge_metadatas(metadatas, return_center=True)}
        )
    return datas


@model(flatten=True, model_update_kwargs={'document_embedded': False})
def get_chunks(elements):
    from collections import defaultdict

    pages_elements = defaultdict(list)
    for element in elements:
        pages_elements[element.metadata.page_number].append(element)

    all_chunks_and_links = sum(
        [
            create_chunk_and_metadatas(page_elements)
            for _, page_elements in pages_elements.items()
        ],
        [],
    )
    return all_chunks_and_links


In [None]:
MODEL_IDENTIFIER_CHUNK = "chunker"
upstream_listener= Listener(
        model=get_chunks,
        select=db['source'].select(),
        key="elements",
       uuid=MODEL_IDENTIFIER_CHUNK
)
db.apply(upstream_listener)

In [None]:
db.show()

In [None]:
db.databackend.db.list_collection_names() 


In [None]:
upstream_listener.outputs_key
# '_outputs.chunker'

In [None]:
# MODEL_IDENTIFIER_CHUNK = "chunk"
# from superduper import ObjectModel
# chunk_model = ObjectModel(
#     identifier=MODEL_IDENTIFIER_CHUNK,
#     object=get_chunks,
#     flatten=True,
#     model_update_kwargs={"document_embedded": False},
#     output_schema=Schema(identifier="myschema", fields={"txt": "string"}),
# )

# db.add(
#     Listener(
#         model=chunk_model,
#         select=select,
#         key="elements",
#     )
# )
# upstream_listener= Listener(
#         model=get_chunks,
#         select=db['source'].find(),
#         key="elements",
#        uuid=MODEL_IDENTIFIER_CHUNK
# )
# db.apply(upstream_listener)

## Embedding all text blocks and building vector indexes

In [None]:
# SOURCE_KEY = "elements"
MODEL_IDENTIFIER_EMBEDDING = "embedding"
VECTOR_INDEX_IDENTIFIER = "vector-index"
# COLLECTION_NAME_CHUNK = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
COLLECTION_NAME_CHUNK = f"_outputs.{MODEL_IDENTIFIER_CHUNK}" # _outputs.chunk
# CHUNK_OUTPUT_KEY = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
CHUNK_OUTPUT_KEY = f"_outputs.{MODEL_IDENTIFIER_CHUNK}.txt"
indexing_key = upstream_listener.outputs_key # Same as CHUNK_OUTPUT_KEY
chunk_collection = db[COLLECTION_NAME_CHUNK]

def preprocess(x):
    if isinstance(x, dict):
        # For model chains, the logic of this key needs to be optimized.
        chunk = sorted(x.items())[-1][1]
        return chunk["txt"]
    return x
from superduper_sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(
    identifier=MODEL_IDENTIFIER_EMBEDDING,
    object=sentence_transformers.SentenceTransformer("BAAI/bge-large-en-v1.5", device="cuda"),
    datatype=vector(shape=(1024,)),
    device="cuda",
    # predict_method="encode",
    # preprocess=preprocess,
    postprocess=lambda x: x.tolist(),
    # batch_predict=True,
    predict_kwargs={"show_progress_bar": True},
    # device='cuda'

)
# Create vector-index
vector_index = \
    VectorIndex(
        VECTOR_INDEX_IDENTIFIER,
        indexing_listener=Listener(
            select=chunk_collection.select(),
            key=CHUNK_OUTPUT_KEY,  # Key for the documents
            # key=indexing_key,  # Key for the documents
            model=embedding_model,  # Specify the model for processing
            # predict_kwargs={"max_chunk_size": 64},
            uuid="embedding-bge-large",
            identifier="embedding-bge-large-listener"
        )
    )
# db.apply()

## Start Indexing Embeddings

In [None]:
db.apply(vector_index)

In [None]:
print(len(embedding_model.predict("What is superduper")))

In [None]:
db.show()

In [None]:
db[COLLECTION_NAME_CHUNK].find_one().execute().unpack()


## Define a vector search function

In [None]:
from pprint import pprint
def vector_search(query, top_k=5):
    collection = db[COLLECTION_NAME_CHUNK]
    out = db.execute(
        collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=top_k,
        ).select({})
    )

    if out:
        out = sorted(out, key=lambda x: x['score'], reverse=True)
    for r in out:
        score = r["score"]
        # chunk_data = r.outputs("elements", "chunk")
        chunk_data = r[upstream_listener.outputs_key] # upstream_listener.outputs_key
        metadata = chunk_data["metadata"]
        chunk_message = {}
        chunk_message["score"] = score
        chunk_message["metadata"] = metadata
        txt = chunk_data["txt"]
        print(txt)
        print()
        print(chunk_message)
        print("\n\n", '-' * 20)

In [None]:
vector_search("What is the function of keys 10 to 12 on the left steering wheel keypad?")

## Define an LLM model Anthropic

In [54]:
from superduper_anthropic import AnthropicCompletions
MODEL_IDENTIFIER_LLM = "llm"
# import os
# os.environ["ANTHROPIC_API_KEY"] = "sk-ant-api-xxx"
predict_kwargs = {
    "max_tokens": 1024,
    "temperature": 0.8,
}

llm = AnthropicCompletions(
    identifier=MODEL_IDENTIFIER_LLM,
    model='claude-2.1',
    predict_kwargs=predict_kwargs
)
llm.predict("Tell me a joke")

"Why can't a bicycle stand up by itself? Because it's two-tired!"

In [12]:
db.drop(llm)

In [13]:
db.show()

[]

## Define an LLM model OpenAI

In [39]:
from superduper_openai import OpenAIChatCompletion
# import os
# os.environ['OPENAI_API_KEY'] = 'sk-prox-xxx'
llm = OpenAIChatCompletion(identifier='llm', model='gpt-3.5-turbo')        
llm.predict("Tell me a joke")


"Why couldn't the bicycle find its way home?\nBecause it lost its bearings!"

In [7]:
print(db.load("model","llm").predict("Tell me a joke"))

Why can't a bicycle stand up by itself? Because it's two-tired!


## Generate Questions 

In [45]:
generate_template = """
Based on the information provided, please formulate one question related to the document excerpt. Answer in JSON format.

**Context**:
{%s}

Using the information above, generate your questions. Your question can be one of the following types: What, Why, When, Where, Who, How. Please respond in the following format:

 
{
  \"question_type\": \"Type of question, e.g., 'What'\",
  \"question\": \"Your question \",
}

 
"""

text ="""
The automatic activation of I-Roll that takes places when cruise control is active cannot be switched off. You can however disengage I-Roll so that it is not activated automatically when cruise control is not active.
To temporarily disengage I-Roll, press and hold the minus (-) button on the gear selector.
To engage I-Roll again, gently depress the accelerator pedal.
I-See
I-See is a set of functions that use information about the road topography ahead of the truck to optimise the gear selection and, as a result, save fuel. It lowers the fuel consumption and
Gearbox
improves the driveability when cruise control is active.
When you drive with cruise control active on a road, a sensor records the road topography. The recorded information is combined with geographical coordinates from the truck's GPS system. The data are saved, either in the system's memory or in an external topography database (via mobile network).
I-See uses these data to save fuel. When you drive with cruise control active on a road, for which data are available, I-See receives the data and can predict when hills and crests will appear. I-See automatically adapts throttle application, gear strategies and truck speed for more fuel efficient driving.
Activating I-See
"""
# llm_qna=db.load("model","llm")
prompt = lambda x: generate_template % x
res=llm.predict(prompt(text))
# print(prompt(text))

In [53]:
try:
    out=eval(res)
except (SyntaxError, NameError, TypeError, ZeroDivisionError):
    out=res.split("\n",2)[2]
    pass
eval(res)

SyntaxError: invalid syntax (<string>, line 1)

In [55]:
print(res)

Here is a possible question based on the context provided:

{
  "question_type": "What", 
  "question": "What does I-Roll do when cruise control is active?"
}


In [None]:
MODEL_IDENTIFIER_LLM = "llm"
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)

# from superduper.ext.vllm import VllmModel
from superduper_vllm import VllmModel
from superduper.ext.openai import OpenAIChatCompletion

# llm = VllmModel(
#     identifier=MODEL_IDENTIFIER_LLM,
#     model_name="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
#     prompt_func=prompt_template,
#     vllm_kwargs={ 
#         "gpu_memory_utilization": 0.50,
#         "max_model_len": 2048,
#         "quantization": "awq"
#                    },
#     predict_kwargs={"max_tokens": 1024, "temperature": 0.8},
# )
# Add the llm instance

# db.apply(llm)

## Prompt Template for LLM

In [None]:
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)

# @model
def build_prompt(query, docs):
    # print(docs)
    chunks = [doc["text"]["txt"] for doc in docs]
    context = "\n\n".join(chunks)
    # context="blah"
    prompt = prompt_template.format(context=context, input=query)
    return prompt

## Test Prompt with documents from vector search output

In [None]:
from superduper.components.model import QueryModel
item = {'_outputs.chunker.txt': '<var:query>'}
top_k = 3
vector_search_model = QueryModel(
    identifier="VectorSearch",
    select=chunk_collection.like(
        item, 
        vector_index=VECTOR_INDEX_IDENTIFIER, 
        n=top_k
    ).select(),
    # The _source is the identifier of the upstream data, which can be used to locate the data from upstream sources using `_source`.
    postprocess=lambda docs: [{"text": doc['_outputs.chunker'], "_source": doc["_source"],"score": doc["score"]} for doc in docs],
    db=db
)

## Test Vector Search Model

In [None]:
query="What is the function of keys 10 to 12 on the left steering wheel keypad?"
pprint(vector_search_model.predict(query=query))

In [None]:
query="What is the function of keys 10 to 12 on the left steering wheel keypad?"
docs=vector_search_model.predict(query=query)
type(docs[0])
print(len(docs))
prompt=build_prompt(query,docs)

In [None]:
print(db.load("model","llm").predict(prompt))

## Define a QA function

In [None]:
from IPython.display import display
import pandas as pd
from superduper.components.model import QueryModel

def qa(query, vector_search_top_k=5):
    item = {'_outputs.chunker.txt': '<var:query>'}
    vector_search_model = QueryModel(
        identifier="VectorSearch",
        select=chunk_collection.like(
            item, 
            vector_index=VECTOR_INDEX_IDENTIFIER, 
            n=vector_search_top_k
        ).select(),
        postprocess=lambda docs: [{"text": doc['_outputs.chunker'], "_source": doc["_source"],"score": doc["score"]} for doc in docs],
        db=db
    )
    out=vector_search_model.predict(query=query)
    if out:
        out = sorted(out, key=lambda x: x["score"], reverse=True)
        prompt= build_prompt(query,out)
        output = db.load("model","llm").predict(prompt)
    page_messages = []
    for source in out:
        chunk_data = source['text'] # upstream_listener.outputs_key
        metadata = chunk_data["metadata"]
        page_number = metadata["page_number"]
        points = metadata["points"]
        score = source["score"]
        page_messages.append(
            {"page_number": page_number, "points": points, "score": score}
        )
    df = pd.DataFrame(page_messages)
    display(output)
    display(df)
    

In [None]:
query="What is the function of keys 10 to 12 on the left steering wheel keypad?"
qa(query, vector_search_top_k=5)

## Define a QA function (Legacy)

In [None]:
from IPython.display import Markdown
from IPython.display import display
import pandas as pd
def qa(query, vector_search_top_k=5):
    collection = db[COLLECTION_NAME_CHUNK]
    output, out = db.execute(
        model_name=MODEL_IDENTIFIER_LLM,
        query=query,
        context_select=collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).select({}),
        context_key=f"{CHUNK_OUTPUT_KEY}.0.txt",
    )
    if out:
        out = sorted(out, key=lambda x: x["score"], reverse=True)
    page_messages = []
    for source in out:
        chunk_data = source.outputs("elements", "chunk")
        metadata = chunk_data["metadata"]
        page_number = metadata["page_number"]
        points = metadata["points"]
        score = source["score"]
        page_messages.append(
            {"page_number": page_number, "points": points, "score": score}
        )
    df = pd.DataFrame(page_messages)
    display(output.content)
    display(df)

In [None]:
qa("What is the function of keys 10 to 12 on the left steering wheel keypad?")