# Simple retrieval augmented generation with OpenAI

<!-- TABS -->
## Connect to superduper

:::note
Note that this is only relevant if you are running superduper in development mode.
Otherwise refer to "Configuring your production system".
:::

In [1]:
# Master switch: cells guarded by `if APPLY:` only do their work when this is True.
APPLY = True
# Name of the sample table; used as the default for the template's `table_name`.
SAMPLE_COLLECTION_NAME = 'sample_simple_rag'
# With APPLY on, use concrete names; otherwise keep the `<var:...>` placeholders.
COLLECTION_NAME = 'docs' if APPLY else '<var:table_name>'
ID_FIELD = 'id' if APPLY else '<var:id_field>'
# Assigned to `CFG.output_prefix` when connecting to superduper.
OUTPUT_PREFIX = '_outputs__'
# When False, the per-component `if APPLY and EAGER:` apply cells are skipped.
EAGER = False

In [2]:
from superduper import superduper, CFG

# Configure superduper before building the data layer: the output prefix comes
# from the config cell, and `bytes_encoding` is set to 'str'.
CFG.output_prefix = OUTPUT_PREFIX
CFG.bytes_encoding = 'str'

# Connect using the `mongomock://` URI (reported as the data backend in the log output).
# NOTE(review): presumably an in-memory mock, so nothing persists across kernels — confirm.
db = superduper('mongomock://')

[32m2024-Dec-03 14:42:08.83[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.misc.plugins[0m:[36m13  [0m | [1mLoading plugin: mongodb[0m
[32m2024-Dec-03 14:42:08.87[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m68  [0m | [1mBuilding Data Layer[0m
[32m2024-Dec-03 14:42:08.87[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.build[0m:[36m184 [0m | [1mConfiguration: 
 +---------------+--------------+
| Configuration |    Value     |
+---------------+--------------+
|  Data Backend | mongomock:// |
+---------------+--------------+[0m


In [3]:
import json
import requests
import io
from superduper import logging


def getter():
    """Download the sample text corpus and wrap each entry as a record.

    Returns:
        A list of dicts of the form ``{'x': entry}``, one per element of the
        decoded JSON payload.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    logging.info('Downloading data...')
    response = requests.get(
        'https://superduperdb-public-demo.s3.amazonaws.com/text.json',
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=60,
    )
    # Fail here with a clear HTTP error instead of a confusing JSON decode
    # error when the server returns an error page.
    response.raise_for_status()
    logging.info('Downloading data... (Done)')
    data = json.loads(response.content.decode('utf-8'))
    return [{'x': r} for r in data]

In [4]:
# Download the sample records only when the notebook is actually being applied.
if APPLY:
    data = getter()

[32m2024-Dec-03 14:42:08.88[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36m__main__[0m:[36m8   [0m | [1mDownloading data...[0m
[32m2024-Dec-03 14:42:09.64[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36m__main__[0m:[36m10  [0m | [1mDownloading data... (Done)[0m


<!-- TABS -->
## Insert simple data

With auto_schema turned on, we can insert data directly: superduper automatically infers the data types and creates the table with a matching schema.

In [5]:
if APPLY:
    from superduper import Document
    # Wrap every record in a Document and insert them all in a single query.
    # The target table is auto-created on first insert (see this cell's log output).
    # `ids` holds whatever the insert returns — presumably the inserted ids; confirm.
    ids = db.execute(db[COLLECTION_NAME].insert([Document(r) for r in data]))

[32m2024-Dec-03 14:42:09.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (table, docs) not found in cache, loading from db[0m
[32m2024-Dec-03 14:42:09.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('table', 'docs')) from metadata...[0m
[32m2024-Dec-03 14:42:09.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m330 [0m | [1mTable docs does not exist, auto creating...[0m
[32m2024-Dec-03 14:42:09.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m336 [0m | [1mCreating table docs with schema {('x', 'str'), ('_fold', 'str')}[0m
[32m2024-Dec-03 14:42:09.73[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (schema, AUTO-_fold=<class 'str'>&x=<class 'str'>) not found in

<!-- TABS -->
## Apply a chunker for search

:::note
Note that applying a chunker is ***not*** mandatory for search.
If your data is already chunked (e.g. short text snippets or audio) or if you
are searching through something like images, which can't be chunked, then this
won't be necessary.
:::

In [18]:
from superduper import Model


class Chunker(Model):
    """Model that splits a text into chunks of at most ``chunk_size`` words."""

    chunk_size: int = 200
    signature: str = 'singleton'

    def predict(self, text):
        """Return ``text`` as a list of space-joined word groups.

        The text is split on whitespace and regrouped into consecutive runs
        of ``chunk_size`` words; the final chunk may be shorter.
        """
        words = text.split()
        step = self.chunk_size
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + step]))
        return chunks

Now we apply this chunker to the data by wrapping it in a `Listener`:

In [19]:
from superduper import Listener


# Listener that runs the Chunker over field 'x' of the source table.
# Only the id field and 'x' are selected from the table.
# NOTE(review): `flatten=True` presumably stores each chunk of the returned list
# as its own output row — confirm against the Listener documentation.
upstream_listener = Listener(
    model=Chunker(identifier='chunker'),
    select=db[COLLECTION_NAME].select(ID_FIELD, 'x'),
    key='x',
    identifier='chunker',
    flatten=True,
)

In [20]:
# Skipped by default: EAGER is False in the config cell. The listener is
# applied later as one of the Application's components.
if APPLY and EAGER:
    db.apply(upstream_listener, force=True)

## Select outputs of upstream listener

:::note
This is useful if you have performed a first step, such as pre-computing 
features, or chunking your data. You can use this query to 
operate on those outputs.
:::

<!-- TABS -->
## Build text embedding model

OpenAI:

In [21]:
import os

from superduper_openai import OpenAIEmbedding

# Embedding model used by the vector index's indexing listener.
# NOTE(review): no credentials are passed here; presumably the OpenAI client
# reads the API key from the environment (e.g. OPENAI_API_KEY) — confirm.
openai_embedding = OpenAIEmbedding(
    identifier='text-embedding',
    model='text-embedding-ada-002',
)

## Create vector-index

In [22]:
from superduper import VectorIndex, Listener

# Identifier of the vector index; the RAG model's `like` query refers to it by this name.
vector_index_name = 'vectorindex'

# The indexing listener embeds the chunker's outputs: it reads from the table
# named `upstream_listener.outputs` and uses that same name as the key.
# `upstream=[upstream_listener]` declares the chunker listener as a dependency.
# NOTE(review): 'id' is hard-coded in the select rather than using ID_FIELD; both
# are 'id' while APPLY is True — confirm this is intended for the outputs table.
vector_index = VectorIndex(
    vector_index_name,
    indexing_listener=Listener(
        key=upstream_listener.outputs,
        select=db[upstream_listener.outputs].select('id', '_source', upstream_listener.outputs),
        model=openai_embedding,
        identifier='embeddinglistener',
        upstream=[upstream_listener],
    )
)

In [23]:
# Skipped by default: EAGER is False in the config cell. The vector index is
# applied later as one of the Application's components.
if APPLY and EAGER:
    db.apply(vector_index, force=True)

<!-- TABS -->
## Build LLM

In [24]:
from superduper_openai import OpenAIChatCompletion


# Chat-completion model that the RAG model uses to produce the final answer.
# NOTE(review): as with the embedding model, credentials are presumably taken
# from the environment — confirm.
llm_openai = OpenAIChatCompletion(
    identifier='llm-model',
    model='gpt-3.5-turbo',
)

## Answer question with LLM

In [25]:
from superduper import model
from superduper.components.model import RAGModel

# Prompt with two placeholders: {context} and {query}.
prompt_template = (
    "Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\n"
    "These snippets are samples from our internal data-repositories, and should be used exclusively and as a matter"
    " of priority to answer the question. Please answer in 20 words or less.\n\n"
    "{context}\n\n"
    "Here's the question: {query}"
)

# RAG model: `select` is a `like` query over the chunk table using the vector
# index defined earlier; the chunk field is read via `key`.
# NOTE(review): '<var:query>' is presumably replaced with the question passed to
# `predict`, and n=5 presumably caps the number of retrieved chunks — confirm.
rag = RAGModel(
    'simple_rag',
    select=db[upstream_listener.outputs].select().like({upstream_listener.outputs: '<var:query>'}, vector_index=vector_index_name, n=5),
    prompt_template=prompt_template,
    key=upstream_listener.outputs,
    llm=llm_openai,
)

In [26]:
# Skipped by default: EAGER is False in the config cell. The RAG model is
# applied later as one of the Application's components.
if APPLY and EAGER:
    db.apply(rag, force=True)

Once the RAG model has been applied to the database, it is accessible for use in other services.

In [27]:
from superduper import Application

# Bundle the chunking listener, the vector index and the RAG model into one
# Application so they can be applied (and later templated) as a unit.
# The log output shows the components being re-sorted into topological order.
app = Application(
    'simple-rag-app',
    components=[
        upstream_listener,
        vector_index,
        rag,
    ]
)

[32m2024-Dec-03 14:42:49.74[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.application[0m:[36m39  [0m | [1mResorting components based on topological order.[0m
[32m2024-Dec-03 14:42:49.74[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.application[0m:[36m56  [0m | [1mNew order of components: ['listener:chunker:29ef473ef2504c37', 'vector_index:vectorindex:72d3c4b5d5cf4874', 'model:simple_rag:387b2d1d9a7e4ca7'][0m


In [28]:
# Apply the whole application; with EAGER False this is the only `db.apply`
# call in the notebook that runs.
if APPLY:
    db.apply(app, force=True)

[32m2024-Dec-03 14:42:49.76[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m621 [0m | [1mComponent (table, _outputs__chunker__29ef473ef2504c37) not found in cache, loading from db[0m
[32m2024-Dec-03 14:42:49.76[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m627 [0m | [1mLoad (('table', '_outputs__chunker__29ef473ef2504c37')) from metadata...[0m
[32m2024-Dec-03 14:42:50.12[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m225 [0m | [1mFound identical model:chunker:52f7183df54d4d7b[0m
[32m2024-Dec-03 14:42:50.13[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m298 [0m | [1mFound update model:chunker:6e1a47e7de2b4272[0m
[32m2024-Dec-03 14:42:50.14[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m225 [0m | [1mFound identical listener:chunker:29ef473e

[32m2024-Dec-03 14:42:50.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m73  [0m | [1mFound these changes and/ or additions that need to be made:[0m
[32m2024-Dec-03 14:42:50.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m75  [0m | [1m----------------------------------------------------------------------------------------------------[0m
[32m2024-Dec-03 14:42:50.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m76  [0m | [1mMETADATA EVENTS:[0m
[32m2024-Dec-03 14:42:50.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m77  [0m | [1m----------------------------------------------------------------------------------------------------[0m
[32m2024-Dec-03 14:42:50.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.apply[0m:[36m86  [0m | [1m[0]: model:chunker:6e1a47

In [31]:
if APPLY:
    # Reload the applied model from the database by type and identifier, then query it.
    # Note: this rebinds `rag` to the loaded instance.
    rag = db.load('model', 'simple_rag')
    print(rag.predict('Tell me about the project'))

[32m2024-Dec-03 14:43:03.80[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m796 [0m | [1mGetting vector-index[0m
[32m2024-Dec-03 14:43:03.80[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m804 [0m | [1m{}[0m
[32m2024-Dec-03 14:43:04.23[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m796 [0m | [1mGetting vector-index[0m
[32m2024-Dec-03 14:43:04.23[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m804 [0m | [1m{}[0m
Superduper is a data management and AI platform that allows users to add components using various client-side mechanisms.


You can now load the model elsewhere and make predictions using the same `db.load` / `predict` calls shown in the cell above.

## Create template

In [None]:
from superduper import Template, Table, Schema
from superduper.components.dataset import RemoteData

# Package the application as a reusable template.
template = Template(
    'simple_rag',
    template=app,
    # Maps concrete values used in this notebook to the template variable names
    # listed in `template_variables`.
    substitutions={
        COLLECTION_NAME: 'table_name',
        'text-embedding-ada-002': 'embedding_model',
        'gpt-3.5-turbo': 'llm_model',
    },
    # NOTE(review): 'id_field' is declared here but no entry in `substitutions`
    # maps a value to it — confirm whether that is intended.
    template_variables=['table_name', 'id_field', 'embedding_model', 'llm_model'],
    # The default table reuses SAMPLE_COLLECTION_NAME so that its name cannot
    # drift from the `table_name` default declared in `types` below.
    default_table=Table(
        SAMPLE_COLLECTION_NAME,
        schema=Schema(f'{SAMPLE_COLLECTION_NAME}/schema', fields={'x': 'str'}),
        data=RemoteData(
            'superduper-docs',
            getter=getter,
        )
    ),
    # Type, default and (where given) allowed choices for each template variable.
    types={
        'id_field': {
            'type': 'str',
            'default': '_id',
        },
        'embedding_model': {
            'type': 'str',
            'default': 'text-embedding-ada-002',
            'choices': ['text-embedding-ada-002', 'nomic-embed-text:latest'],
        },
        'llm_model': {
            'type': 'str',
            'default': 'gpt-3.5-turbo',
            'choices': ['gpt-3.5-turbo', 'gpt-4-turbo', 'llama3.1:8b']
        },
        'table_name': {
            'type': 'str',
            'default': SAMPLE_COLLECTION_NAME,
        }
    },
    # NOTE(review): 'databackend' appears here but not in `template_variables`
    # or `types` — confirm whether it is required.
    schema={
        "id_field": "str",
        "embedding_model": "str",
        "llm_model": "str",
        "table_name": "str",
        "databackend": "str",
    },
    db=db
)

In [None]:
# Export the template, passing '.' (the current directory) as the destination.
template.export('.')