# Building Private Q&A Assistant Using Postgres + Pgvector and Open Source Model

## Prerequisites

Before starting the implementation, make sure you have the required libraries installed by running the following commands:

In [1]:
# !pip install superduperdb
# !pip install vllm
# !pip install sentence_transformers numpy==1.24.4
# !pip install 'ibis-framework[postgres]'
# !pip install pgvector
# !pip install psycopg2 

In [2]:
!rm -rf .superduperdb/ && mkdir -p .superduperdb

## Connect to datastore 

First, we need to establish a connection to a Postgres datastore via SuperDuperDB. You can configure the `Postgres_URI` based on your specific setup. 
Here are some examples of postgres URIs:

* For testing (default connection): `postgres://test`
* Local postgres instance: `postgres://localhost:27017`
* postgres with authentication: `postgres://superduper:superduper@postgres:27017/documents`
* postgres Atlas: `postgres+srv://<username>:<password>@<atlas_cluster>/<database>`

In [3]:
from superduperdb.base.config import VectorSearch, Compute

In [4]:
from superduperdb import superduper
from superduperdb.backends.ibis import Table
import os
from superduperdb.backends.ibis.field_types import dtype
from superduperdb.ext.pillow import pil_image
from superduperdb import Schema

connection_uri = "postgresql://postgres:test@localhost:8000/qa"


# It just super dupers your database
db = superduper(
    connection_uri,
    metadata_store='sqlite:///.superduperdb/metadata.sqlite',
#     cluster__vector_search = vs,
)

[32m 2024-Apr-18 02:14:21.59[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.base.build[0m:[36m65  [0m | [1mData Client is ready. <ibis.backends.postgres.Backend object at 0x13fba5890>[0m
[32m 2024-Apr-18 02:14:21.60[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.base.build[0m:[36m30  [0m | [1mConnecting to Metadata Client: sqlite:///.superduperdb/metadata.sqlite[0m
[32m 2024-Apr-18 02:14:21.62[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.artifacts[0m:[36m29  [0m | [1mCreating artifact store directory[0m
[32m 2024-Apr-18 02:14:21.62[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.base.build[0m:[36m160 [0m | [1mConnecting to compute client: None[0m
[32m 2024-Apr-18 02:14:21.62[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.base.datalayer[0m:[36m89  [0m | [1mBuilding Data Layer[0m


In [5]:
!python -m superduperdb info

```
{
  "cfg": {
    "data_backend": "mongodb://localhost:27017/test_db",
    "lance_home": ".superduperdb/vector_indices",
    "artifact_store": null,
    "metadata_store": null,
    "cluster": {
      "compute": {
        "uri": null,
        "compute_kwargs": {}
      },
      "vector_search": {
        "uri": "postgresql://postgres:test@localhost:8000/qa",
        "type": "pg_vector",
        "backfill_batch_size": 100
      },
      "cdc": {
        "uri": null,
        "strategy": null
      }
    },
    "retries": {
      "stop_after_attempt": 2,
      "wait_max": 10.0,
      "wait_min": 4.0,
      "wait_multiplier": 1.0
    },
    "downloads": {
      "folder": null,
      "n_workers": 0,
      "headers": {
        "User-Agent": "me"
      },
      "timeout": null
    },
    "fold_probability": 0.05,
    "log_level": "INFO",
    "logging_type": "SYSTEM",
    "bytes_encoding": "Bytes"
  },
  "cwd": "/Users/tarun/Desktop/superduperDB/superduperdb/examples",
  "freeze": [
    "aio

In [6]:
import glob
import re

ROOT = '../docs/hr/content/docs/'

STRIDE = 3       # stride in numbers of lines
WINDOW = 25       # length of window in numbers of lines

files = sorted(glob.glob(f'{ROOT}/**/*.md', recursive=True))

def get_chunk_link(chunk, file_name):
    # Get the original link of the chunk
    file_link = file_name[:-3].replace(ROOT, 'https://docs.superduperdb.com/docs/docs/')
    # If the chunk has subtitles, the link to the first subtitle will be used first.
    first_title = (re.findall(r'(^|\n)## (.*?)\n', chunk) or [(None, None)])[0][1]
    if first_title:
        # Convert subtitles and splice URLs
        first_title = first_title.lower()
        first_title = re.sub(r'[^a-zA-Z0-9]', '-', first_title)
        file_link = file_link + '#' + first_title
    return file_link

def create_chunk_and_links(file, file_prefix=ROOT):
    with open(file, 'r') as f:
        lines = f.readlines()
    if len(lines) > WINDOW:
        chunks = ['\n'.join(lines[i: i + WINDOW]) for i in range(0, len(lines), STRIDE)]
    else:
        chunks = ['\n'.join(lines)]
    return [{'txt': chunk, 'link': get_chunk_link(chunk, file)}  for chunk in chunks]


all_chunks_and_links = sum([create_chunk_and_links(file) for file in files], [])

In [7]:
# Use !curl to download the 'superduperdb_docs.json' file
!curl -O https://datas-public.s3.amazonaws.com/superduperdb_docs.json

import json
from IPython.display import Markdown

# Open the downloaded JSON file and load its contents into the 'chunks' variable
with open('superduperdb_docs.json') as f:
    all_chunks_and_links = json.load(f)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  737k  100  737k    0     0   332k      0  0:00:02  0:00:02 --:--:--  333k


In [8]:
all_chunks_and_links[0]

{'txt': '# Anthropic\n\n\n\n`superduperdb` allows users to work with `anthropic` API models.\n\n\n\nRead more about this [here](/docs/docs/walkthrough/ai_models#anthropic).',
 'link': 'https://docs.superduperdb.com/docs/docs/ai_integrations/anthropic'}

In [9]:
new_all_chunks_and_links = list()
for i, e in enumerate(all_chunks_and_links):
    e['id'] = i
    new_all_chunks_and_links.append(e)

## Define Schema and Create table

For this use-case, you need a table with images and another table with text. SuperDuperDB extends standard SQL functionality, allowing developers to define their own data types through the `Encoder` abstraction.

In [10]:
Schema(
        'questiondocs-schema',
        fields={'id': dtype(str), 'txt': dtype(str), 'link': dtype(str)},
    )

Schema(identifier='questiondocs-schema', fields={'id': FieldType(identifier='String'), 'txt': FieldType(identifier='String'), 'link': FieldType(identifier='String')})

In [11]:
# 
# Define the 'captions' table
table = Table(
    'questiondocs',
    primary_id='id',
    schema=Schema(
        'questiondocs-schema',
        fields={'id': dtype(str), 'txt': dtype(str), 'link': dtype(str)},
    )
)



# Add the 'captions' and 'images' tables to the SuperDuperDB database
db.add(table)

([], Table(identifier='questiondocs'))

In [12]:
import pandas as pd

In [13]:
new_all_chunks_and_links_df = pd.DataFrame(new_all_chunks_and_links)

In [14]:
df = new_all_chunks_and_links_df.astype(str)

In [15]:
from superduperdb.base.document import Document as D


In [16]:
insert = table.insert(
        [
            D(
                {
                    'id': d['id'],
                    'txt': d['txt'],
                    'link': d['link'],
                }
            )
            for i, d in df.iterrows()
        ]
    )
_ =  db.execute(insert)

[32m 2024-Apr-18 02:14:26.91[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m34  [0m | [1mSubmitting job. function:<function callable_job at 0x13a392ca0>[0m
[32m 2024-Apr-18 02:14:27.15[0m| [32m[1mSUCCESS [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m40  [0m | [32m[1mJob submitted on <superduperdb.backends.local.compute.LocalComputeBackend object at 0x13fbb1410>.  function:<function callable_job at 0x13a392ca0> future:16073078-3d35-4e73-90ba-e7a862f6b03a[0m


In [17]:
q = table.select('txt', 'link')

In [18]:
result = db.execute(q)

In [19]:
result[0]

{'txt': '# Anthropic\n\n\n\n`superduperdb` allows users to work with `anthropic` API models.\n\n\n\nRead more about this [here](/docs/docs/walkthrough/ai_models#anthropic).',
 'link': 'https://docs.superduperdb.com/docs/docs/ai_integrations/anthropic'}

A `Model` is a wrapper around a self-built or ecosystem model, such as `torch`, `transformers`, `openai`.

In [20]:
from superduperdb import vector
vector(shape=(1024,))

DataType(identifier='vector[1024]', encoder=None, decoder=None, info=None, shape=(1024,), directory=None, encodable='native', bytes_encoding=<BytesEncoding.BYTES: 'Bytes'>)

In [21]:
import sentence_transformers
from superduperdb.ext.sentence_transformers import SentenceTransformer
from superduperdb.ext.numpy import array

### Model

In [22]:

model = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("BAAI/bge-large-en-v1.5"),
    postprocess=lambda x: x.tolist(),
    datatype=vector(shape=(1024,)),
    predict_kwargs={"show_progress_bar": True},
)

In [23]:
vector = model.predict_one('This is a test')
print('vector size: ', len(vector))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

vector size:  1024


In [24]:
vector

[0.011527689173817635,
 0.029759284108877182,
 0.0037916165310889482,
 0.03594416752457619,
 -0.015849102288484573,
 -0.014957858249545097,
 -0.01805204339325428,
 -0.0027551832608878613,
 0.030829966068267822,
 0.03400183096528053,
 0.014781944453716278,
 0.013992846943438053,
 0.028235789388418198,
 -0.022385237738490105,
 -0.020612549036741257,
 -0.009806545451283455,
 -0.03423108160495758,
 0.01943149045109749,
 -0.047497015446424484,
 -0.022495148703455925,
 -0.04039725288748741,
 0.026462944224476814,
 -0.03274437040090561,
 -0.02925405651330948,
 -0.01557212695479393,
 0.021357305347919464,
 -0.009792919270694256,
 0.015228939242661,
 0.03327728807926178,
 0.06047121807932854,
 -0.015124496072530746,
 -0.008077792823314667,
 -0.013754688203334808,
 -0.059881988912820816,
 0.010653371922671795,
 -0.032427672296762466,
 0.06399141997098923,
 -0.025481119751930237,
 -0.011209827847778797,
 -0.05197564512491226,
 -0.017725445330142975,
 0.017224730923771858,
 0.023844042792916298,
 

In [25]:
# Import the Listener class from the superduperdb module
from superduperdb import Listener


# Create a Listener instance with the specified model, key, and selection criteria
listener1 = Listener(
    model=model,          # The model to be used for listening
    key='txt',            # The key field in the documents to be processed by the model
    select=table.select('id', 'txt'),  # The selection criteria for the documents
    predict_kwargs={'max_chunk_size': 3000},
    identifier='listener1'
)

In [26]:
db.vector_indices

LoadDict(database=<superduperdb.base.datalayer.Datalayer object at 0x13fae9110>, field='vector_index', callable=None)

In [27]:
db.add(listener1)

[32m 2024-Apr-18 02:14:37.48[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.components.component[0m:[36m374 [0m | [1mInitializing DataType : dill[0m
[32m 2024-Apr-18 02:14:37.48[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.components.component[0m:[36m377 [0m | [1mInitialized  DataType : dill successfully[0m
[32m 2024-Apr-18 02:14:50.09[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m34  [0m | [1mSubmitting job. function:<function method_job at 0x13a392d40>[0m
[32m 2024-Apr-18 02:14:52.33[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.components.model[0m:[36m446 [0m | [1mQuery <superduperdb.backends.ibis.query.IbisCompoundSelect[
    [92m[1mquestiondocs.select('id', 'txt')[0m
] object at 0x1e949f890> not found in metadata, adding...[0m
[32m 2024-Apr-18 02:14:52.33[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperdup

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 992/992 [00:00<00:00, 457526.89it/s]


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

[32m 2024-Apr-18 02:27:32.08[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.components.model[0m:[36m659 [0m | [1mAdding 992 model outputs to `db`[0m
[32m 2024-Apr-18 02:27:38.05[0m| [32m[1mSUCCESS [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m40  [0m | [32m[1mJob submitted on <superduperdb.backends.local.compute.LocalComputeBackend object at 0x13fbb1410>.  function:<function method_job at 0x13a392d40> future:29e8f5e8-18db-4e67-b443-8d550d080cf6[0m


([<superduperdb.jobs.job.ComponentJob at 0x1e970a6d0>],
 Listener(identifier='listener1', key='txt', model=SentenceTransformer(preferred_devices=('cuda', 'mps', 'cpu'), device='cpu', identifier='embedding', signature='singleton', datatype=DataType(identifier='vector[1024]', encoder=None, decoder=None, info=None, shape=(1024,), directory=None, encodable='native', bytes_encoding=<BytesEncoding.BYTES: 'Bytes'>), output_schema=None, flatten=False, model_update_kwargs={}, predict_kwargs={'show_progress_bar': True}, compute_kwargs={}, object=SentenceTransformer(
   (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
   (2): Normalize()
 ), model='embedd

In [28]:
from superduperdb import VectorIndex

In [29]:
vi = VectorIndex(
    identifier='my-index',        # Unique identifier for the VectorIndex
    indexing_listener=listener1,    # Listener to be used for indexing documents
    measure='cosine'
)

In [30]:
jobs, _ = db.add(vi)

[32m 2024-Apr-18 02:27:43.92[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m34  [0m | [1mSubmitting job. function:<function callable_job at 0x13a392ca0>[0m
[32m 2024-Apr-18 02:27:43.92[0m| [32m[1mSUCCESS [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.backends.local.compute[0m:[36m40  [0m | [32m[1mJob submitted on <superduperdb.backends.local.compute.LocalComputeBackend object at 0x13fbb1410>.  function:<function callable_job at 0x13a392ca0> future:bae6a1be-697e-437d-addd-8084463a392f[0m


## Inference

In [38]:
%%time
from superduperdb.backends.ibis import Table
from superduperdb import Document as D
from IPython.display import *

# Define the query for the search
query = 'Code snippet how to create a `VectorIndex` with a torchvision model'
# query = 'can you explain vector-indexes with `superduperdb`?'

# Execute a search using SuperDuperDB to find documents containing the specified query
result = db.execute(
    query=table.like(D({'txt': query}), vector_index='my-index', n=5).select('id', 'txt', 'link')
)

# Display a horizontal rule to separate results
display(Markdown('---'))

# Display each document's 'txt' field and separate them with a horizontal rule
for r in result:
    display(Markdown(r['txt']))
    display(r['link'])
    display(Markdown('---'))

[32m 2024-Apr-18 03:06:23.01[0m| [1mINFO    [0m | [36mTaruns-Laptop.local[0m| [36msuperduperdb.base.datalayer[0m:[36m974 [0m | [1m{}[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

---

```



Read more about the `VectorIndex` concept [here](../walkthrough/vector_search.md).


'https://docs.superduperdb.com/docs/docs/fundamentals/procedural_vs_declarative_api'

---

the vectors calculated by the `Listener`, and, is fitted

based on those vectors and the label set.



```python

from sklearn.svm import SVC

from my_models.vision import MyTorchModule, prepare_image



from superduperdb.ext.numpy import array

from superduperdb.ext.sklearn import Estimator

from superduperdb.ext.torch import TorchModel

from superduperdb import Stack, VectorIndex, Listener

from superduperdb.backends.mongodb import Collection



collection = Collection('images')



my_listener=Listener(

    'my-listener',

    model=TorchModel(

        'my-cnn-vectorizer',

        object=MyTorchModule(),

        preprocess=prepare_image,

        postprocess=lambda x: x.numpy(),

        encoder=array(dtype='float', shape=(512,))

    )

    key='img',


'https://docs.superduperdb.com/docs/docs/walkthrough/creating_stacks_of_functionality'

---

from superduperdb.ext.torch import TorchModel

from superduperdb import Stack, VectorIndex, Listener

from superduperdb.backends.mongodb import Collection



collection = Collection('images')



my_listener=Listener(

    'my-listener',

    model=TorchModel(

        'my-cnn-vectorizer',

        object=MyTorchModule(),

        preprocess=prepare_image,

        postprocess=lambda x: x.numpy(),

        encoder=array(dtype='float', shape=(512,))

    )

    key='img',

    select=collection.find({'_fold': 'train'})

)



db.add(

    Stack(

        'my-stack',

        [

            my_listener,

            VectorIndex(


'https://docs.superduperdb.com/docs/docs/walkthrough/creating_stacks_of_functionality'

---



collection = Collection('images')



my_listener=Listener(

    'my-listener',

    model=TorchModel(

        'my-cnn-vectorizer',

        object=MyTorchModule(),

        preprocess=prepare_image,

        postprocess=lambda x: x.numpy(),

        encoder=array(dtype='float', shape=(512,))

    )

    key='img',

    select=collection.find({'_fold': 'train'})

)



db.add(

    Stack(

        'my-stack',

        [

            my_listener,

            VectorIndex(

                'my-index',

                indexing_listener=my_listener,

            ),


'https://docs.superduperdb.com/docs/docs/walkthrough/creating_stacks_of_functionality'

---

my_listener=Listener(

    'my-listener',

    model=TorchModel(

        'my-cnn-vectorizer',

        object=MyTorchModule(),

        preprocess=prepare_image,

        postprocess=lambda x: x.numpy(),

        encoder=array(dtype='float', shape=(512,))

    )

    key='img',

    select=collection.find({'_fold': 'train'})

)



db.add(

    Stack(

        'my-stack',

        [

            my_listener,

            VectorIndex(

                'my-index',

                indexing_listener=my_listener,

            ),

            Estimator(

                'my-classifier',

                object=SVC()


'https://docs.superduperdb.com/docs/docs/walkthrough/creating_stacks_of_functionality'

---

CPU times: user 1.33 s, sys: 273 ms, total: 1.6 s
Wall time: 360 ms


## Future Works
1. HNSW
2. IVFFlat 