<!-- TABS -->
# Multimodal vector search

<!-- TABS -->
## Start your system

In [None]:
# <tab: Development>
# Nothing to do here (everything runs in-process)

In [None]:
# <tab: Experimental Cluster>
# Run the `local_cluster` command of the `superduperdb` module in a shell.
!python -m superduperdb local_cluster

In [None]:
# <tab: Docker-Compose>
# Run the `testenv_image` and then the `testenv_init` make targets
# (presumably build and start the Docker-Compose test environment — confirm in the Makefile).
!make testenv_image
!make testenv_init

<!-- TABS -->
## Connect to SuperDuperDB

In [None]:
# <tab: MongoDB>
from superduperdb import superduper

# Connect to the `documents` database of a MongoDB server on localhost:27017.
mongodb_uri = 'mongodb://localhost:27017/documents'
db = superduper(mongodb_uri)

In [None]:
# <tab: SQLite>
from superduperdb import superduper

# Connect to the SQLite database file `my_db.db`.
sqlite_uri = 'sqlite://my_db.db'
db = superduper(sqlite_uri)

In [None]:
# <tab: MySQL>
from superduperdb import superduper

# Tutorial connection settings for a local MySQL server (do not reuse these credentials elsewhere).
host, port = 'localhost', 3306
user, password = 'superduper', 'superduper'
database = 'test_db'

mysql_uri = f"mysql://{user}:{password}@{host}:{port}/{database}"
db = superduper(mysql_uri)

In [None]:
# <tab: MSSQL>
from superduperdb import superduper

# Tutorial connection settings; the URI below uses the `mssql://` scheme.
host, port = 'localhost', 1433
user, password = 'sa', 'Superduper#1'

mssql_uri = f"mssql://{user}:{password}@{host}:{port}"
db = superduper(mssql_uri)

In [None]:
# <tab: PostgreSQL>
from superduperdb import superduper

# Tutorial connection settings for a local PostgreSQL server.
host, port = 'localhost', 5432
user, password = 'superduper', 'superduper'
database = 'test_db'

postgres_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"
db = superduper(postgres_uri)

In [None]:
# <tab: Snowflake>
from superduperdb import superduper

# Placeholder Snowflake account and credentials — substitute your own values.
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
user = "superduperuser"
password = "superduperpassword"
database = "FREE_COMPANY_DATASET/PUBLIC"

snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"

# A separate SQLite URI is passed as the metadata store.
metadata_uri = 'sqlite:///your_database_name.db'
db = superduper(snowflake_uri, metadata_store=metadata_uri)

In [None]:
# <tab: Clickhouse>
from superduperdb import superduper

# Tutorial connection settings for a local Clickhouse server (empty password).
host, port = 'localhost', 8123
user, password = 'default', ''

clickhouse_uri = f"clickhouse://{user}:{password}@{host}:{port}"

# A mongomock URI is passed as the metadata store.
db = superduper(clickhouse_uri, metadata_store='mongomock://meta')

In [None]:
# <tab: DuckDB>
from superduperdb import superduper

# Connect to the DuckDB database file `mydb.duckdb`.
duckdb_uri = 'duckdb://mydb.duckdb'
db = superduper(duckdb_uri)

In [None]:
# <tab: Pandas>
from superduperdb import superduper

# Pass a list of CSV paths as the data source; a mongomock URI is passed as the metadata store.
csv_files = ['my.csv']
db = superduper(csv_files, metadata_store='mongomock://meta')

In [None]:
# <tab: MongoMock>
from superduperdb import superduper

# Connect to the `test_db` database through the `mongomock://` scheme.
mongomock_uri = 'mongomock:///test_db'
db = superduper(mongomock_uri)

In [None]:
# <testing: >
import pandas as pd

# Two rows with disjoint keys, written to `my.csv` (the file the Pandas tab opens).
data = [
    {'A': 10, 'B': 20, 'C':30},
    {'x':100, 'y': 200, 'z': 300},
]
df = pd.DataFrame(data)
df.to_csv('my.csv')

<!-- TABS -->
## Get useful sample data

In [None]:
# <tab: Text>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json
import json

# Parse the downloaded `text.json` into `data`.
with open('text.json', 'r') as json_file:
    data = json.load(json_file)

In [None]:
# <tab: PDF>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/pdfs.zip && unzip pdfs.zip
import os

data = [f'pdfs/{x}' for x in os.listdir('./pdfs')]
data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6813k  100 6813k    0     0  1881k      0  0:00:03  0:00:03 --:--:-- 1883k   2948      0  0:39:26 --:--:--  0:39:26  2970
Archive:  pdfs.zip
replace pdfs/4.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# <tab: Image>
!curl -O s3://superduperdb-public-demo/images.zip && unzip images.zip
import os

data = [f'images/{x}' for x in os.listdir('./images')]

In [None]:
# <tab: Video>
!curl -O s3://superduperdb-public-demo/videos.zip && unzip videos.zip
import os

data = [f'videos/{x}' for x in os.listdir('./videos')]

In [None]:
# <tab: Audio>
!curl -O s3://superduperdb-public-demo/audio.zip && unzip audio.zip
import os

data = [f'audios/{x}' for x in os.listdir('./audios')]

<!-- TABS -->
## Create datatype

Data types such as "text" or "integer", which are natively supported by your `db.databackend`, don't need a datatype.

In [None]:
# Default: no custom datatype (used when the databackend supports the data natively).
datatype = None

Otherwise do one of the following:

In [None]:
# <testing: >
from superduperdb import superduper

# Throwaway connection used by the testing cells.
testing_uri = "mongomock://test"
db = superduper(testing_uri)

In [None]:
# <tab: PDF>
!pip install PyPDF2
from superduperdb import DataType
from superduperdb.components.datatype import File

# Datatype named 'pdf' that uses the 'file' encodable.
datatype = DataType(identifier='pdf', encodable='file')

In [None]:
# <tab: Image>
from superduperdb.ext.pillow import pil_image
import PIL.Image

# Use the ready-made `pil_image` datatype imported from `superduperdb.ext.pillow`.
datatype = pil_image

In [None]:
# <tab: Audio>
from superduperdb.ext.numpy import array
from superduperdb import DataType
import scipy.io.wavfile
import io


def encoder(data):
    """Serialize a ``(rate, samples)`` pair to WAV bytes.

    ``data[0]`` is handed to ``scipy.io.wavfile.write`` as the sample rate and
    ``data[1]`` as the sample array; the bytes written to an in-memory buffer
    are returned.
    """
    rate, samples = data[0], data[1]
    wav_buffer = io.BytesIO()
    scipy.io.wavfile.write(wav_buffer, rate, samples)
    return wav_buffer.getvalue()


def decoder(data):
    """Parse WAV bytes and return the result of ``scipy.io.wavfile.read`` for them."""
    return scipy.io.wavfile.read(io.BytesIO(data))


# Datatype named 'wav' using the 'artifact' encodable, with the WAV
# `encoder` / `decoder` functions defined in this cell.
datatype = DataType('wav', encodable='artifact', encoder=encoder, decoder=decoder)

In [None]:
# <testing: >
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/audio.zip && unzip audio.zip
test = scipy.io.wavfile.read('./audio/1.wav')
datatype.decoder(datatype.encoder(test))

In [None]:
# <tab: Video>
from superduperdb import DataType

# Datatype named 'video_on_file' that uses the 'artifact' encodable.
datatype = DataType(identifier='video_on_file', encodable='artifact')

In [None]:
# <testing: >
# Register the datatype with the database.
db.add(datatype)

from superduperdb.backends.mongodb import Collection
from superduperdb import Document
collection = Collection("data")

# NOTE(review): `origin_data` is not defined anywhere in the visible notebook —
# it must be set to a sample value for the chosen datatype before running this cell.
print(origin_data)

# Insert one document whose 'x' field is the sample wrapped in `datatype`.
db.execute(collection.insert_one(Document({"x": datatype(origin_data)})))

# Read one document back and print its unpacked 'x' field.
data = db.execute(collection.find_one())
print(data.unpack()["x"])

<!-- TABS -->
## Setup tables or collections

In [None]:
# <tab: MongoDB>
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from superduperdb import DataType, Schema
from superduperdb.backends.mongodb import Collection

# Collection handle used by the MongoDB insert and search cells.
collection = Collection('documents')

# A schema is only created when `datatype` is a custom `DataType`;
# otherwise `schema` stays `None`.
schema = None
if isinstance(datatype, DataType):
    # Give the schema an explicit identifier: the MongoDB insert cell refers to it as 'my_schema'.
    schema = Schema(identifier='my_schema', fields={'x': datatype})
    db.add(schema)

In [None]:
# <tab: SQL>
from superduperdb import DataType, Schema
from superduperdb.backends.ibis import Table
from superduperdb.backends.ibis.field_types import FieldType

# A custom `DataType` is used as the field type directly; anything else is wrapped in `FieldType`.
# NOTE(review): with `datatype = None` this builds `FieldType(None)` — confirm whether a
# native type name should be supplied instead.
if isinstance(datatype, DataType):
    field_type = datatype
else:
    field_type = FieldType(datatype)
schema = Schema(identifier='my_schema', fields={'x': field_type})

# Bind the table to a name: the SQL insert and search cells call `table.insert(...)` / `table.like(...)`.
table = Table('documents', schema=schema)
db.add(table)

<!-- TABS -->
## Insert data

In order to insert data, we need a `Schema` for encoding our special `DataType` column(s) in the databackend.

In [None]:
# Number of items to insert: everything except (floor of) one quarter of `data`.
quarter = len(data) // 4
N_DATA = len(data) - quarter

In [None]:
# <tab: MongoDB>
from superduperdb import Document

# Build one `Document` per item (not a single `Document` around the whole list) and
# keep `data` itself untouched so later cells can still take raw items from it.
if schema is None:
    # No schema registered: wrap values with `datatype` only when one is set.
    documents = [
        Document({'x': datatype(x) if datatype is not None else x})
        for x in data[:N_DATA]
    ]
    db.execute(collection.insert_many(documents))
else:
    # With a schema, raw values are inserted and the schema is referenced by its identifier.
    documents = [Document({'x': x}) for x in data[:N_DATA]]
    db.execute(collection.insert_many(documents, schema=schema.identifier))

In [None]:
# <tab: SQL>
from superduperdb import Document

# Insert the first N_DATA items, one `Document` per item, under key 'x'.
rows = [Document({'x': x}) for x in data[:N_DATA]]
db.execute(table.insert(rows))

In [None]:
# Keep the last element of `data` as the example used for the search later on.
sample_datapoint = data[-1]

<!-- TABS -->
## Apply a chunker for search

In [None]:
# <tab: Text>
from superduperdb import objectmodel

CHUNK_SIZE = 200

@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False})
def chunker(text):
    """Split ``text`` into chunks of at most ``CHUNK_SIZE`` whitespace-separated words.

    Each chunk is returned as one space-joined string rather than a list of
    words, so it can be passed directly to a text embedding model.
    An empty or whitespace-only ``text`` yields an empty list.
    """
    words = text.split()
    return [
        ' '.join(words[i:i + CHUNK_SIZE])
        for i in range(0, len(words), CHUNK_SIZE)
    ]

In [None]:
# <tab: PDF>
!pip install PyPDF2
from superduperdb import objectmodel

CHUNK_SIZE = 500

@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False})
def chunker(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(reader.pages)
    print(f'Number of pages {num_pages}')
    text = []    
    for i in range(num_pages):
        page = reader.pages[i]        
        page_text = page.extract_text()
        text.append(page_text)
    text = '\n\n'.join(text)
    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
    return chunks

In [None]:
# <testing: >
!curl -O 'https://arxiv.org/pdf/2303.08774.pdf?fbclid=IwAR2XS6JT2NLIP4MjFn9npot34FhddoqStNbLwIvWETf5ZGlCPsIbuYneo8s&mibextid=Zxz2cZ'
chunks = chunker('2303.08774.pdf')
len(chunks)

In [None]:
# <tab: Video>
!pip install opencv-python
import cv2
import tqdm
from PIL import Image
from superduperdb.ext.pillow import pil_image
from superduperdb import ObjectModel, Schema, objectmodel


@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    output_schema=Schema(identifier='output-schema', fields={'image': pil_image}),
)
def chunker(video_file):
    """Sample every 10th frame of ``video_file`` as a PIL image.

    Returns a list of dicts with keys ``'image'`` (the frame converted from
    OpenCV's BGR order to RGB) and ``'current_timestamp'`` (the frame index
    floor-divided by the FPS value reported by OpenCV).
    """
    # Set the sampling frequency for frames
    sample_freq = 10

    # Open the video file using OpenCV
    cap = cv2.VideoCapture(video_file)

    frame_count = 0
    fps = cap.get(cv2.CAP_PROP_FPS)
    extracted_frames = []
    progress = tqdm.tqdm()

    try:
        # Iterate through video frames until OpenCV reports no more frames
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Sample frames based on the specified frequency
            if frame_count % sample_freq == 0:
                extracted_frames.append({
                    'image': Image.fromarray(frame[:,:,::-1]),  # Convert BGR to RGB
                    'current_timestamp': frame_count // fps,
                })
            frame_count += 1
            progress.update(1)
    finally:
        # Release the progress bar and the capture even if decoding fails part-way
        progress.close()
        cap.release()
    cv2.destroyAllWindows()

    # Return the list of extracted frames
    return extracted_frames

In [None]:
# <tab: Audio>
from superduperdb import objectmodel, Schema

CHUNK_SIZE = 10  # in seconds

@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    output_schema=Schema(identifier='output-schema', fields={'audio': datatype}),
)
def chunker(audio):
    """Split a ``(rate, samples)`` pair into consecutive pieces of ``CHUNK_SIZE`` seconds.

    Returns a list of ``(rate, samples_chunk)`` tuples; the final piece may be shorter.
    """
    rate, samples = audio[0], audio[1]
    # CHUNK_SIZE is given in seconds: convert it to a number of samples (at least one).
    step = max(1, int(CHUNK_SIZE * rate))
    # Iterate over the sample array, not over the 2-element `(rate, samples)` tuple.
    # NOTE(review): `output_schema` declares an 'audio' field but plain tuples are
    # returned — confirm whether each item should be `{'audio': (rate, chunk)}`.
    return [(rate, samples[i:i + step]) for i in range(0, len(samples), step)]

Now we apply this chunker to the data by wrapping the chunker in `Listener`:

In [None]:
from superduperdb import Listener

# Apply `chunker` to the 'x' field of the documents selected by `select`.
# NOTE(review): `select` must already be defined as a query over the inserted data
# before this cell runs — confirm which query is intended for each backend.
db.add(
    Listener(
        model=chunker,
        select=select,
        key='x',
    )
)

<!-- TABS -->
## Build multimodal embedding models

Some embedding models such as [CLIP](https://github.com/openai/CLIP) come in pairs of `model` and `compatible_model`.
Otherwise:

In [None]:
# Default: no second (compatible) model.
compatible_model = None

In [9]:
# <tab: Text>
from superduperdb.ext.sentence_transformers import SentenceTransformer

# Create the sentence-transformer model wrapper with identifier 'all-MiniLM-L6-v2'
model = SentenceTransformer(identifier='all-MiniLM-L6-v2')

In [10]:
# <testing: >
import numpy as np
from PIL import Image

# Smoke test: run `model` on a single string.
model.predict_one('some text')

In [11]:
# <tab: Image>
import torch
import clip
from torchvision import transforms
from superduperdb.ext.torch import TorchModel

class CLIPVisionEmbedding:
    """Loads the CLIP "RN50" network together with its image preprocessing callable.

    Attributes set in ``__init__``:
        device: "cuda" when available, otherwise "cpu".
        model: the network returned by ``clip.load``.
        preprocess: the preprocessing callable returned by ``clip.load``.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # `preprocess` is stored as an instance attribute. A method with the same name
        # would be shadowed by it and never run, so no such method is defined.
        self.model, self.preprocess = clip.load("RN50", device=self.device)


# Keep the helper under its own name instead of rebinding `model` twice.
clip_vision = CLIPVisionEmbedding()
model = TorchModel(
    identifier='clip-vision',
    object=clip_vision.model,
    preprocess=clip_vision.preprocess,
    forward_method='encode_image',
)

In [12]:
# <tab: Text+Image>

import torch
import clip
from torchvision import transforms
from superduperdb import Model
from superduperdb.ext.torch import TorchModel

class CLIPTextEmbedding:
    """Callable that tokenizes one string and encodes it with the CLIP "RN50" text encoder."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _ = clip.load("RN50", device=self.device)

    def __call__(self, text):
        # Move the tokens to the same device the model was loaded on.
        tokens = clip.tokenize([text]).to(self.device)
        return self.model.encode_text(tokens)


class CLIPVisionEmbedding:
    """Loads the CLIP "RN50" network together with its image preprocessing callable."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # `preprocess` is stored as an instance attribute. A method with the same name
        # would be shadowed by it and never run, so no such method is defined.
        self.model, self.preprocess = clip.load("RN50", device=self.device)


# Text side.
superdupermodel_text = Model(identifier='clip-text', object=CLIPTextEmbedding())

# Image side: the vision helper supplies both the network and its preprocessing.
# (Previously these were read from the text embedding instance, which has no `preprocess`.)
clip_vision = CLIPVisionEmbedding()
superdupermodel_image = TorchModel(
    identifier='clip-vision',
    object=clip_vision.model,
    preprocess=clip_vision.preprocess,
    forward_method='encode_image',
)

# NOTE(review): this assignment follows the testing cells, which call
# `model.predict_one('some text')` and `compatible_model.predict_one(<image>)`.
# Swap the two if images, not text, are the indexed modality.
model = superdupermodel_text
compatible_model = superdupermodel_image

In [13]:
# <testing: >
import numpy as np
from PIL import Image

# Smoke test: run `compatible_model` on a 256x256 RGB image whose pixels are all 1.
compatible_model.predict_one(Image.fromarray(np.ones((256,256,3)).astype(np.uint8)))

[32m 2024-Mar-27 14:47:07.83[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduperdb.components.component[0m:[36m344 [0m | [1mInitializing TorchModel : clip-vision[0m
[32m 2024-Mar-27 14:47:07.83[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduperdb.components.component[0m:[36m347 [0m | [1mInitialized  TorchModel : clip-vision successfully[0m


In [14]:
# <testing: >
import numpy as np
from PIL import Image

# Smoke test: run `model` on a single string.
model.predict_one('some text')

[32m 2024-Mar-27 14:47:07.92[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduperdb.components.component[0m:[36m344 [0m | [1mInitializing ObjectModel : clip-text[0m
[32m 2024-Mar-27 14:47:07.92[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36msuperduperdb.components.component[0m:[36m347 [0m | [1mInitialized  ObjectModel : clip-text successfully[0m


In [2]:
# <tab: Audio>
!pip install librosa
import librosa
import numpy as np
from superduperdb import Model

def audio_embedding(audio_file):
    """Return the MFCC features of the audio file at ``audio_file``.

    The file is read with ``librosa.load`` and the loaded signal and sample
    rate are passed to ``librosa.feature.mfcc``.
    """
    waveform, sample_rate = librosa.load(audio_file)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate)

# Wrap `audio_embedding` as a model with identifier 'my-model-audio'.
model = Model(
    identifier='my-model-audio',
    object=audio_embedding,
)

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.59.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.3-py3-none-any.whl.metadata (4.3 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m[31m1

In [4]:
# <testing: >
import wave
import struct

import numpy as np

# Parameters of the test tone: one second of a 440 Hz sine at half amplitude.
sample_rate = 44100
duration = 1
frequency = 440
amplitude = 0.5

# Generate the sine wave
num_samples = int(sample_rate * duration)
t = np.linspace(0, duration, num_samples, False)
signal = amplitude * np.sin(2 * np.pi * frequency * t)

output_file = 'dummy_audio.wav'

# Parameters for the WAV file
nchannels = 1  # Mono audio
sampwidth = 2  # Sample width in bytes (2 for 16-bit audio)
framerate = sample_rate
nframes = num_samples

# Scale to the 16-bit range; `astype` truncates toward zero, like `int()` on each sample.
pcm = (signal * (2 ** 15 - 1)).astype(np.int16)

# The context manager closes the file even if writing fails, and all frames
# are written with one call instead of one call per sample.
with wave.open(output_file, 'w') as wav_file:
    wav_file.setparams((nchannels, sampwidth, framerate, nframes, 'NONE', 'not compressed'))
    wav_file.writeframes(pcm.tobytes())

# Smoke test: run `model` on the path of the generated WAV file.
model.predict_one(output_file)

## Create vector-index

In [None]:
# <tab: 1-Modality>
from superduperdb import VectorIndex, Listener

# The result of `db.add` is unpacked into two values; only the first (`jobs`) is kept.
jobs, _ = db.add(
    VectorIndex(
        'my-vector-index',
        indexing_listener=Listener(
            key='<my_key>',      # the `Document` key `model` should ingest to create embedding
            select=select,       # a `Select` query telling which data to search over
            model=model,         # a `_Predictor` how to convert data to embeddings
        )
    )
)

In [None]:
# <tab: 2-Modalities>
from superduperdb import VectorIndex, Listener

# The result of `db.add` is unpacked into two values; only the first (`jobs`) is kept.
jobs, _ = db.add(
    VectorIndex(
        'my-vector-index',
        indexing_listener=Listener(
            key='<my_key>',      # the `Document` key `model` should ingest to create embedding
            select=select,       # a `Select` query telling which data to search over
            model=model,         # a `_Predictor` how to convert data to embeddings
        ),
        compatible_listener=Listener(
            key='<other_key>',      # the `Document` key `compatible_model` should ingest to create embedding
            model=compatible_model, # a `_Predictor` how to convert data to embeddings
            active=False,           # this listener is created inactive
        )
    )
)

<!-- TABS -->
## Perform a vector search

In [None]:
from superduperdb import Document

# Build the search target. When no custom datatype is in use (`datatype is None`)
# the raw datapoint is used directly instead of calling `None`.
query_value = datatype(sample_datapoint) if datatype is not None else sample_datapoint
item = Document({'x': query_value})

Once we have this search target, we can execute a search as follows:

In [None]:
# <tab: MongoDB>
# Search with the `Document` built above (not the raw datapoint) against the index
# that was registered under the name 'my-vector-index'.
select = collection.find().like(item, vector_index='my-vector-index')

In [None]:
# <tab: SQL>
# Search with `item` against the index that was registered under the name 'my-vector-index'.
select = table.like(item, vector_index='my-vector-index')

In [None]:
# Execute the search query and keep what `db.execute` returns.
results = db.execute(select)