<!-- TABS -->
# Retrieval augmented generation

<!-- TABS -->
## Connect to superduper

:::note
Note that this is only relevant if you are running superduper in development mode.
Otherwise refer to "Configuring your production system".
:::

In [1]:
# Master switch for this notebook: when True the cells below run against a
# concrete sample collection; when False the `<var:...>` placeholders are used
# so that the components can be exported as a reusable template.
APPLY = False

# Name of the table/collection that holds the raw documents.
COLLECTION_NAME = '_sample_rag' if APPLY else '<var:table_name>'

# Name of the id field selected alongside the text field.
ID_FIELD = 'id' if APPLY else '<var:id_field>'

# Value assigned to `CFG.output_prefix` when connecting.
OUTPUT_PREFIX = 'outputs__'

In [2]:
from superduper import superduper, CFG

# Configure superduper *before* creating the datalayer so the settings are in
# place when `superduper()` builds the connection.
CFG.output_prefix = OUTPUT_PREFIX
# NOTE(review): these two presumably control how bytes and JSON values are
# serialized by the datalayer -- confirm against the superduper config docs.
CFG.bytes_encoding = 'str'
CFG.json_native = False

# Build the datalayer from the active configuration.
db = superduper()

[32m2024-Oct-18 12:32:41.32[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.misc.plugins[0m:[36m13  [0m | [1mLoading plugin: mongodb[0m
[32m2024-Oct-18 12:32:41.39[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m73  [0m | [1mBuilding Data Layer[0m
[32m2024-Oct-18 12:32:41.40[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent 5a71d0219dc2400ca6a2a5fed148aea4 not found in cache, loading from db[0m
[32m2024-Oct-18 12:32:41.40[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent b9dcd973f85e48caa6b2c9663e09826d not found in cache, loading from db[0m
[32m2024-Oct-18 12:32:41.40[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m646 [0m | [1mAdding model:chunker:b9dcd973f85e48caa6b2c9663e09826d to cache[0m
[32m2024-Oct-18 

  from tqdm.autonotebook import tqdm, trange


[32m2024-Oct-18 12:32:45.43[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m646 [0m | [1mAdding model:sentence-transformers-embedding:25719c24c4c04e549a54299148b83eab to cache[0m
[32m2024-Oct-18 12:32:45.43[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m646 [0m | [1mAdding model:embedding:7a624151d93e4e268c4c998c0b820d6a to cache[0m
[32m2024-Oct-18 12:32:45.44[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent 4f5bbc64a872451f8b975720adb2ed46 not found in cache, loading from db[0m
[32m2024-Oct-18 12:32:45.44[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent 341f9670ceb94795bb484268928c4bca not found in cache, loading from db[0m
[32m2024-Oct-18 12:32:45.44[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalaye

In [3]:
# Start from a clean slate.
# NOTE(review): with `data=True` this presumably deletes stored data as well
# as components -- only run it against a development database.
db.drop(force=True, data=True)

In [4]:
import json

# Read the sample records and wrap each one under the 'x' key, which is the
# field the chunker listener is configured to read.
with open('data.json', 'r') as f:
    data = [{'x': r} for r in json.load(f)]

<!-- TABS -->
## Insert simple data

With auto_schema turned on, we can insert data directly: superduper automatically analyzes the data types and creates the table and datatypes to match.

In [5]:
if APPLY:
    from superduper import Document

    # Wrap every raw record in a Document and insert them with one query.
    insert_query = db[COLLECTION_NAME].insert(list(map(Document, data)))
    ids = db.execute(insert_query)

<!-- TABS -->
## Apply a chunker for search

:::note
Note that applying a chunker is ***not*** mandatory for search.
If your data is already chunked (e.g. short text snippets or audio) or if you
are searching through something like images, which can't be chunked, then this
won't be necessary.
:::

In [6]:
from superduper import Model


class Chunker(Model):
    # Maximum number of whitespace-separated words per chunk.
    chunk_size: int = 200
    signature: str = 'singleton'

    def predict(self, text):
        """Split ``text`` into chunks of at most ``chunk_size`` words.

        :param text: The string to split; it is tokenized on whitespace.
        :return: A list of strings, each the space-joined words of one chunk.
        """
        words = text.split()
        chunks = []
        for start in range(0, len(words), self.chunk_size):
            window = words[start:start + self.chunk_size]
            chunks.append(' '.join(window))
        return chunks

Now we apply this chunker to the data by wrapping it in a `Listener`:

In [7]:
from superduper import Listener


# Build the parts first, then wire them together.
chunker_model = Chunker(identifier='chunker')
source_query = db[COLLECTION_NAME].select(ID_FIELD, 'x')

# The listener feeds field 'x' of every selected row to the chunker.
# NOTE(review): `flatten=True` presumably stores each returned chunk as its
# own output row -- confirm against the Listener documentation.
upstream_listener = Listener(
    identifier='chunker',
    model=chunker_model,
    select=source_query,
    key='x',
    flatten=True,
)

In [8]:
# Only register the chunking listener with the database in APPLY mode;
# in template mode the component is merely defined, not executed.
if APPLY:
    db.apply(upstream_listener, force=True)

## Select outputs of upstream listener

:::note
This is useful if you have performed a first step, such as pre-computing 
features, or chunking your data. You can use this query to 
operate on those outputs.
:::

<!-- TABS -->
## Build text embedding model

OpenAI:

In [9]:
import os
from superduper.components.vector_index import sqlvector

from superduper_openai import OpenAIEmbedding

# OpenAI embedding model; its outputs are declared as 1536-dimensional vectors.
openai_embedding = OpenAIEmbedding(
    identifier='text-embedding-ada-002',
    datatype=sqlvector(shape=(1536,)),
)

Sentence-transformers

In [10]:
import sentence_transformers
from superduper_sentence_transformers import SentenceTransformer

# BAAI/bge-small-en produces 384-dimensional embeddings, so the declared
# vector shape must be 384 (1024 is the output size of the bge-large models).
sentence_transformers_embedding = SentenceTransformer(
    identifier="sentence-transformers-embedding",
    model="BAAI/bge-small-en",
    datatype=sqlvector(shape=(384,)),
    # Convert the model output to a plain Python list before it is stored.
    postprocess=lambda x: x.tolist(),
    predict_kwargs={"show_progress_bar": True},
)

In [11]:
from superduper.components.model import ModelRouter
from superduper.components.vector_index import sqlvector

# Candidate embedding backends, keyed by the name used to select one.
embedding_backends = {
    'openai': openai_embedding,
    'sentence_transformers': sentence_transformers_embedding,
}

# In APPLY mode the OpenAI backend is used; otherwise the choice is left as a
# template variable.
embedding_model = ModelRouter(
    'embedding',
    models=embedding_backends,
    model='openai' if APPLY else '<var:embedding_model>',
    example='this is a test',
)

## Create vector-index

In [12]:
from superduper import VectorIndex, Listener

vector_index_name = 'vectorindex'

# Listener that embeds the chunks written by the upstream chunker listener.
# It reads from the chunker's output table and declares the chunker as an
# upstream dependency.
indexing_listener = Listener(
    key=upstream_listener.outputs,
    select=db[upstream_listener.outputs].select(ID_FIELD, '_source', upstream_listener.outputs),
    model=embedding_model,
    identifier='embeddinglistener',
    upstream=[upstream_listener],
)

# The vector index is built on top of the embedding listener's outputs.
vector_index = VectorIndex(vector_index_name, indexing_listener=indexing_listener)

In [13]:
# Only register the vector index (and its embedding listener) in APPLY mode.
if APPLY:
    db.apply(vector_index, force=True)

<!-- TABS -->
## Build LLM

In [14]:
from superduper_openai import OpenAIChatCompletion

# OpenAI chat-completion backend for the RAG answer step.
llm_openai = OpenAIChatCompletion(
    identifier='llm-openai',
    model='gpt-3.5-turbo',
)

In [15]:
from superduper_anthropic import AnthropicCompletions

# Generation options passed to the model through `predict_kwargs`.
predict_kwargs = {
    "max_tokens": 1024,
    "temperature": 0.8,
}

# Each component needs its own identifier: 'llm-vllm' belongs to the vLLM
# model, so the Anthropic model is registered as 'llm-anthropic'.
llm_anthropic = AnthropicCompletions(identifier='llm-anthropic', model='claude-2.1', predict_kwargs=predict_kwargs)

In [16]:
from superduper_vllm import VllmCompletion

# Generation options passed to the model through `predict_kwargs`.
predict_kwargs = {"max_tokens": 1024, "temperature": 0.8}

# Settings handed to vLLM: an AWQ-quantized Mistral-7B-Instruct checkpoint.
vllm_params = {
    "model": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    "gpu_memory_utilization": 0.7,
    "max_model_len": 1024,
    "quantization": "awq",
}

llm_vllm = VllmCompletion(
    identifier="llm-vllm",
    vllm_params=vllm_params,
    predict_kwargs=predict_kwargs,
)

[32m2024-Oct-18 12:32:47.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper_vllm.model[0m:[36m31  [0m | [1mSetting num_gpus to 1[0m


In [17]:
# # !huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
# from superduper_llamacpp.model import LlamaCpp

# llm_llamacpp = LlamaCpp(identifier="llm-llamacpp", model_name_or_path="mistral-7b-instruct-v0.2.Q4_K_M.gguf")

In [18]:
# Available LLM backends, keyed by the name used to select one.
llm_backends = {
    'openai': llm_openai,
    'anthropic': llm_anthropic,
    'vllm': llm_vllm,
    # 'llamacpp': llm_llamacpp,
}

# In APPLY mode the OpenAI backend is used; otherwise the choice is left as a
# template variable.
llm = ModelRouter(
    'llm',
    models=llm_backends,
    model='openai' if APPLY else '<var:llm_model>',
)

## Answer question with LLM

In [19]:
from superduper import model
from superduper.components.model import RAGModel

# Prompt handed to the LLM; it contains `{context}` and `{query}` placeholders.
prompt_template = (
    "Use the following context snippets, these snippets are not ordered!, "
    "Answer the question based on this context.\n"
    "{context}\n\n"
    "Here's the question: {query}"
)

# Vector-search query over the chunk table: the 5 nearest chunks to the
# `<var:query>` placeholder, looked up through the vector index built above.
chunk_table = db[upstream_listener.outputs]
retrieval_query = chunk_table.select().like(
    {upstream_listener.outputs: '<var:query>'},
    vector_index=vector_index_name,
    n=5,
)

rag = RAGModel(
    'rag-model',
    select=retrieval_query,
    prompt_template=prompt_template,
    key=upstream_listener.outputs,
    llm=llm,
)

In [20]:
# Only register the RAG model with the database in APPLY mode.
if APPLY:
    db.apply(rag, force=True)

In [21]:
# Smoke test: ask the RAG model a sample question (APPLY mode only, since it
# needs the applied components and indexed data).
if APPLY:
    print(rag.predict('Tell me about vector-search'))

Once the RAG model has been applied to the database, it is accessible for use in other services.

In [22]:
from superduper import Application

# Bundle the chunking listener, the vector index and the RAG model into a
# single application so they can be applied and exported together.
app_components = [upstream_listener, vector_index, rag]
app = Application('rag-app', components=app_components)

In [23]:
# Only register the whole application with the database in APPLY mode.
if APPLY:
    db.apply(app, force=True)

You can now load the model elsewhere and make predictions using the following command.

## Create template

In [24]:
from superduper import Template

# Package the application as a reusable template. Concrete values used in
# this notebook are mapped back to variable names via `substitutions`, and
# `types` describes each variable (type, default, allowed choices).
template = Template(
    'rag',
    template=app,
    data=data,
    substitutions={'_sample_rag': 'table_name', OUTPUT_PREFIX: 'output_prefix'},
    template_variables=['llm_model', 'embedding_model', 'table_name', 'id_field', 'output_prefix'],
    types={
        'id_field': {
            'type': 'str',
            'default': '_id',
        },
        'llm_model': {
            'type': 'str',
            # Must match the keys registered in the `llm` ModelRouter.
            # 'llamacpp' is not offered because that backend is commented
            # out of the router, so selecting it could not resolve.
            'choices': ['openai', 'anthropic', 'vllm'],
            'default': 'openai',
        },
        'embedding_model': {
            'type': 'str',
            # Must match the keys registered in the `embedding` ModelRouter.
            'choices': ['openai', 'sentence_transformers'],
            'default': 'openai',
        },
        'table_name': {
            'type': 'str',
            'default': '_sample_rag'
        },
        'output_prefix': {
            'type': 'str',
            'default': OUTPUT_PREFIX,
        }
    }
)



In [25]:
# Sanity check: display the prefix that was substituted into the template.
OUTPUT_PREFIX

'outputs__'

In [26]:
# Write the template to the current directory.
template.export('.')

In [27]:
from superduper import Template

In [28]:
# Round-trip check: load the template back from the directory it was exported to.
t = Template.read('.')

In [29]:
# Instantiate the template without passing any variable values.
# NOTE(review): the captured output still shows `<var:...>` placeholders, so
# variables are presumably left unresolved when called this way -- confirm.
c = t()

<var:table_name>.select("<var:id_field>", "x")
<var:output_prefix>chunker__?(listener:chunker.uuid).select("<var:id_field>", "_source", "<var:output_prefix>chunker__?(listener:chunker.uuid)")
<var:output_prefix>chunker__?(listener:chunker.uuid)
<var:output_prefix>chunker__?(listener:chunker.uuid)
<var:output_prefix>embeddinglistener__?(listener:embeddinglistener.uuid)
<var:output_prefix>chunker__?(listener:chunker.uuid)
<var:output_prefix>chunker__?(listener:chunker.uuid).select().like(documents[0], vector_index="vectorindex", n=5)
<var:output_prefix>chunker__?(listener:chunker.uuid)
[32m2024-Oct-18 12:32:49.33[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper_vllm.model[0m:[36m31  [0m | [1mSetting num_gpus to 1[0m


In [30]:
# Display a summary of the instantiated component.
# NOTE(review): the argument 4 presumably sets the verbosity/depth -- confirm.
c.info(4)

