<!-- TABS -->
# Text Vector Search

You'll find this example as well as the saved template in the main repository of `superduper`.
See [here](https://github.com/superduper-io/superduper/tree/main/templates/text_vector_search).

If you'd like to modify the template, or practice building it yourself, then you can rerun the `build.ipynb` notebook
in the template directory.

<!-- TABS -->
## Connect to superduper

In [None]:
from superduper import superduper, CFG

# Turn on the `auto_schema` setting before opening the connection.
# NOTE(review): presumably this lets superduper infer schemas from inserted data — confirm.
CFG.auto_schema = True

# Open the datalayer on a `mongomock://` URI, using the database name `test_db`.
db = superduper('mongomock://test_db')

<!-- TABS -->
## Get useful sample data

In [None]:
# <tab: Text>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json
import json

# Load the downloaded sample file into `data`. The encoding is stated
# explicitly so the read does not depend on the platform's default encoding.
with open('text.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# <tab: PDF>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/pdfs.zip && unzip -o pdfs.zip
import os

# Collect the relative path (`pdfs/<name>`) of every `.pdf` file found in `./pdfs`.
data = [f'pdfs/{x}' for x in os.listdir('./pdfs') if x.endswith('.pdf')]

In [None]:
import json

# Load the local text sample into `text_data`. The encoding is stated
# explicitly so the read does not depend on the platform's default encoding.
with open('./text_data.json', encoding='utf-8') as f:
    text_data = json.load(f)

In [None]:
import json

# Load the local PDF sample. The encoding is stated explicitly so the read
# does not depend on the platform's default encoding.
# NOTE(review): the result is stored in `text_data` even though the file is
# `pdf_data.json` — confirm this name is intended.
with open('./pdf_data.json', encoding='utf-8') as f:
    text_data = json.load(f)

<!-- TABS -->
## Apply a chunker for search

:::note
Note that applying a chunker is ***not*** mandatory for search.
If your data is already chunked (e.g. short text snippets or audio) or if you
are searching through something like images, which can't be chunked, then this
won't be necessary.
:::

In [None]:
# <tab: Text>
from superduper import model

CHUNK_SIZE = 200

@model
def chunker(text):
    """Split ``text`` into chunks of at most ``CHUNK_SIZE`` words.

    The text is tokenised on whitespace, and each chunk is re-joined with
    single spaces. The final chunk may hold fewer than ``CHUNK_SIZE`` words;
    empty text yields an empty list.
    """
    words = text.split()
    starts = range(0, len(words), CHUNK_SIZE)
    return [' '.join(words[start:start + CHUNK_SIZE]) for start in starts]

In [None]:
# Insert one record per sample into `docs`, storing each sample under the key `x`.
db['docs'].insert([{'x': r} for r in data]).execute()

In [None]:
# <tab: PDF>
!pip install -q "unstructured[pdf]"
from superduper import model
from unstructured.partition.pdf import partition_pdf

CHUNK_SIZE = 500

@model(flatten=True)
def chunker(pdf_file):
    """Extract the text of ``pdf_file`` and cut it into ``CHUNK_SIZE``-character pieces.

    The text of every element returned by ``partition_pdf`` is joined with
    newlines, then sliced into consecutive, non-overlapping windows. The last
    window may be shorter than ``CHUNK_SIZE``.
    """
    full_text = '\n'.join([element.text for element in partition_pdf(pdf_file)])
    offsets = range(0, len(full_text), CHUNK_SIZE)
    return [full_text[offset:offset + CHUNK_SIZE] for offset in offsets]

Now we wrap this chunker as a `Listener`, so that it processes incoming data.

In [None]:
from superduper import Listener

# Listener that passes field `x` of every record selected from `docs` to `chunker`.
upstream_listener = Listener(
    model=chunker,
    select=db['docs'].select(),
    key='x',
    identifier='chunker',
    # NOTE(review): presumably stores each returned chunk as its own output row — confirm.
    flatten=True,
)

In [None]:
db.apply(upstream_listener)

<!-- TABS -->
## Build text embedding model

In [None]:
# <tab: OpenAI>
from superduper_openai import OpenAIEmbedding
from superduper import vector

import os

os.environ['OPENAI_API_KEY'] = 'sk-<your-secret>'

# `text-embedding-ada-002` returns 1536-dimensional embeddings, so the declared
# vector datatype must use that length.
embedding_model = OpenAIEmbedding(
    identifier='text-embedding-ada-002',
    example='this is a test',
    datatype=vector(shape=(1536,)),
)

In [None]:
# <tab: JinaAI>
import os
from superduper_jina import JinaEmbedding

os.environ["JINA_API_KEY"] = "jina_xxxx"
 
# Define the Jina embedding model, identified as `jina-embeddings-v2-base-en`.
# NOTE(review): presumably authenticates with the `JINA_API_KEY` variable set above — confirm.
embedding_model = JinaEmbedding(identifier='jina-embeddings-v2-base-en')

In [None]:
# <tab: Sentence-Transformers>
from superduper import vector
import sentence_transformers
from superduper_sentence_transformers import SentenceTransformer

# `BAAI/bge-small-en` produces 384-dimensional sentence embeddings, so the
# declared vector datatype must use that length.
embedding_model = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("BAAI/bge-small-en"),
    datatype=vector(shape=(384,)),
    # Convert the model output to a plain list before it is stored.
    postprocess=lambda x: x.tolist(),
    predict_kwargs={"show_progress_bar": True},
    example='this is a test',
)

In [None]:
# from superduper.components.model import ModelRouter

# embedding_model = ModelRouter(
#     'embedding',
#     models={'openai': openai_model, 'sentence_transformers': st_model},
#     model='openai',
#     example='this is a test',
#     signature='singleton',
# )

## Create vector-index

In [None]:
from superduper import VectorIndex, Listener

# Vector index named `my-vector-index`; its indexing listener runs
# `embedding_model` over the outputs of `upstream_listener`.
vector_index = VectorIndex(
    'my-vector-index',
    indexing_listener=Listener(
        # the `Document` key that `model` ingests to create the embedding
        key=upstream_listener.outputs,
        # a `Select` query telling which data to search over
        select=db[upstream_listener.outputs].select(),
        # the `_Predictor` that converts data to embeddings
        model=embedding_model,
        identifier=f'{embedding_model.identifier}-listener',
        # this makes sure that the table is already set up when the other components are triggered
        upstream=[upstream_listener],
    )
)

In [None]:
db.apply(vector_index)

In [None]:
from superduper import Application

# Bundle the chunking listener and the vector index into a single
# `Application` named `text-vector-search`.
application = Application(
    'text-vector-search', 
    components=[
        upstream_listener,
        vector_index,
    ]
)

In [None]:
db.apply(application)

## Perform a vector search

In [None]:
from superduper import Document
# Build the query document for the vector search, keyed by the field that the
# vector index embeds (the output key of `upstream_listener`).
item = Document({upstream_listener.outputs: "Tell me about vector-search"})

In [None]:
# Run a `like` query against `my-vector-index` over the chunk outputs,
# requesting `n=10` matches for the query text.
results = (
    db[upstream_listener.outputs]
    .like(
        {upstream_listener.outputs: "Tell me about vector-search"},
        vector_index='my-vector-index',
        n=10,
    )
    .select()
    .execute()
)

In [None]:
from superduper import Document

# Print the chunk text of every search hit, preceded by a dashed separator line.
for result in results:
    print("\n", '-' * 20, '\n')
    # Unpack the raw result and read the field holding the chunker output.
    print(Document(result.unpack())[upstream_listener.outputs])

In [None]:
from superduper import Template

# Package the application as a reusable `Template` named `vector-search`.
# NOTE(review): `substitutions` presumably replaces the literal `docs` with a
# `table_name` template variable — confirm.
t = Template(
    'vector-search',
    template=application,
    substitutions={'docs': 'table_name'},
)

In [None]:
t.export('.')

In [None]:
!cat component.json | jq .