<!-- TABS -->
# Compute features

In [None]:
# <testing: >
from superduperdb import superduper, Document
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.numpy import array
import numpy as np

# Throwaway mongomock-backed datalayer shared by every cell below.
db = superduper('mongomock://temp')
# Query over the "data" collection; the listeners reuse it as their `select`.
select = Collection("data").find()

# Register the float64 (256, 256, 3) array datatype used for the "image" field.
image_array = array("float64", shape=(256, 256, 3))
db.add(image_array)

# Ten sample records: a text field, a random image and a plain integer.
datas = [
    {
        "text": str(index),
        "image": image_array(np.random.random((256, 256, 3))),
        "input_data": index,
    }
    for index in range(10)
]

db.execute(
    Collection("data").insert_many([Document(record) for record in datas])
)

In [None]:
# <tab: Text>
# !pip install sentence-transformers
import sentence_transformers
from superduperdb import vector, Listener
from superduperdb.ext.sentence_transformers import SentenceTransformer

# Wrap the MiniLM sentence encoder so its outputs are stored as plain lists
# under a 384-dimensional vector datatype.
superdupermodel = SentenceTransformer(
    identifier="embedding",
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
    object=sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
)

# Attach the model to the "text" field of every document matched by `select`.
db.add(
    Listener(
        identifier="features",
        key="text",
        select=select,
        model=superdupermodel,
    )
)

In [None]:
# <tab: Image>
# !pip install torch
# !pip install torchvision

import torch
import torchvision.models as models
from torchvision import transforms
from superduperdb.ext.torch import TorchModel
from superduperdb import Listener
from PIL import Image

class TorchVisionEmbedding:
    """Bundle a pre-trained ResNet-18 with the image preprocessing it needs.

    Attributes:
        resnet: the torchvision ResNet-18 network, switched to eval mode.
    """

    def __init__(self):
        # Load the pre-trained ResNet-18 model.
        # NOTE(review): newer torchvision releases deprecate `pretrained=` in
        # favour of `weights=`; kept as-is so older installs keep working.
        self.resnet = models.resnet18(pretrained=True)

        # Set the model to evaluation mode
        self.resnet.eval()

        # Build the transform pipeline once instead of on every call; it does
        # not depend on the image being processed.
        self._transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def preprocess(self, image_array):
        """Convert a numpy image array into a normalized tensor.

        Args:
            image_array: numpy array accepted by ``PIL.Image.fromarray`` after
                being cast to ``uint8``.

        Returns:
            The tensor produced by resizing, center-cropping to 224, converting
            to a tensor and normalizing the image.
        """
        image = Image.fromarray(image_array.astype(np.uint8))
        return self._transform(image)
        
model = TorchVisionEmbedding()

# Expose the ResNet through superduperdb: arrays go through the embedding's
# preprocess step, and output tensors are converted to nested Python lists.
superdupermodel = TorchModel(
    identifier='my-vision-model-torch',
    object=model.resnet,
    preprocess=model.preprocess,
    postprocess=lambda x: x.numpy().tolist(),
)

# Attach the model to the "image" field of every document matched by `select`.
db.add(
    Listener(
        identifier="features",
        key="image",
        select=select,
        model=superdupermodel,
    )
)

In [None]:
# <tab: Text-And-Image>
# !pip install torch
# !pip install torchvision
# !pip install git+https://github.com/openai/CLIP.git
import torch
import clip
from torchvision import transforms
from superduperdb import ObjectModel
from superduperdb import Listener

import torch
import clip
from PIL import Image

class CLIPModel:
    """Callable that embeds a text and an image with the CLIP ``RN50`` model.

    Attributes:
        device: ``"cuda"`` when a CUDA device is available, otherwise ``"cpu"``.
        model: the network returned by ``clip.load``.
        preprocess: the image transform returned alongside the network.
    """

    def __init__(self):
        # Load the CLIP model
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("RN50", device=self.device)

    def __call__(self, text, image):
        """Embed one text/image pair.

        Args:
            text: the string to tokenize and encode.
            image: numpy array accepted by ``PIL.Image.fromarray`` after being
                cast to ``uint8``.

        Returns:
            ``[image_features, text_features]``, each a plain Python list.
        """
        with torch.no_grad():
            tokens = clip.tokenize([text]).to(self.device)
            pil_image = Image.fromarray(image.astype(np.uint8))
            # unsqueeze(0) adds the batch dimension; [0] below removes it again.
            pixels = self.preprocess(pil_image).unsqueeze(0).to(self.device)
            # Move results to host memory first: calling .numpy() directly on
            # a CUDA tensor raises a TypeError. On CPU, .cpu() is a no-op.
            image_features = self.model.encode_image(pixels)[0].cpu().numpy().tolist()
            text_features = self.model.encode_text(tokens)[0].cpu().numpy().tolist()
        return [image_features, text_features]
        
model = CLIPModel()

# Wrap the callable; it is invoked with keyword arguments (`text`, `image`).
superdupermodel = ObjectModel(
    identifier="clip",
    object=model,
    signature="**kwargs",
    flatten=True,
    model_update_kwargs={"document_embedded": False},
)

# Map the document fields onto the keyword arguments of the CLIP callable.
db.add(
    Listener(
        identifier="features",
        key={"text": "text", "image": "image"},
        select=select,
        model=superdupermodel,
    )
)


In [None]:
# <testing: Text-And-Image>
# Read the listener results back from their own collection and print the
# length of each stored output.
datas = list(
    db.execute(
        Collection("_outputs.features::0").find()
    )
)
for record in datas:
    print(len(record["_outputs.features::0"]))

In [None]:
# <tab: Random>
import numpy as np
from superduperdb import superduper, ObjectModel, Listener

def random(*args, **kwargs):
    """Return 1024 random floats as a list, ignoring every argument."""
    values = np.random.random(1024)
    return values.tolist()

# Wrap the plain function so it can be used as a model.
superdupermodel = ObjectModel(
    identifier="random",
    object=random,
)

# Attach the model to the "text" field of every document matched by `select`.
db.add(
    Listener(
        identifier="features",
        key="text",
        select=select,
        model=superdupermodel,
    )
)

In [None]:
# <tab: Custom>
import numpy as np
from superduperdb import superduper, ObjectModel, Listener


# Define any feature calculation function
def calc_fake_feature(input_data):
    """Return a placeholder feature: the integers 0 through 9 as a new list.

    ``input_data`` is accepted but not used; swap in a real computation here.
    """
    return [*range(10)]

# Wrap the custom feature function so it can be used as a model.
superdupermodel = ObjectModel(
    identifier="fake_feature",
    object=calc_fake_feature,
)

db.add(
    Listener(
        identifier="features",
        # key of input_data
        key="input_data",
        select=select,
        model=superdupermodel,
    )
)

In [None]:
# <testing>
# Fetch the documents together with the listener outputs and print the length
# of each computed feature.
outputs_query = select.outputs("features::0")
datas = list(db.execute(outputs_query))
for record in datas:
    print(len(record["_outputs.features::0"]))