<!-- TABS -->
# Create datatype

SuperduperDB supports automatic data conversion, so users don’t need to worry about the compatibility of different data formats (`PIL.Image`, `numpy.array`, `pandas.DataFrame`, etc.) with the database.

It also supports custom data conversion: you can define your own `DataType`, optionally with encoder and decoder functions, as shown in the examples below.

In [1]:
# <testing: >
from superduper import superduper

# Build the `db` handle from a mongomock connection string; the testing cells
# further down use it to apply the datatype and to run the insert/find queries.
db = superduper("mongomock://test")

  from .autonotebook import tqdm as notebook_tqdm
2024-06-04 21:43:53,489	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


2024-Jun-04 21:43:53.49| INFO     | zhouhaha-2.local| superduper.base.build:69   | Data Client is ready. mongomock.MongoClient('localhost', 27017)
2024-Jun-04 21:43:53.58| INFO     | zhouhaha-2.local| superduper.base.build:42   | Connecting to Metadata Client with engine:  mongomock.MongoClient('localhost', 27017)
2024-Jun-04 21:43:53.58| INFO     | zhouhaha-2.local| superduper.base.build:155  | Connecting to compute client: None
2024-Jun-04 21:43:53.58| INFO     | zhouhaha-2.local| superduper.base.datalayer:85   | Building Data Layer
2024-Jun-04 21:43:53.59| INFO     | zhouhaha-2.local| superduper.base.build:220  | Configuration: 
 +---------------+------------------+
| Configuration |      Value       |
+---------------+------------------+
|  Data Backend | mongomock://test |
+---------------+------------------+


In [None]:
# <tab: Vector>
from superduper import vector

# Datatype for vectors declared with shape (3,).
datatype = vector(shape=(3, ))

In [None]:
# <tab: Tensor>
from superduper.ext.torch import tensor
import torch

# Datatype for torch.float tensors declared with shape (32, 32, 3).
datatype = tensor(torch.float, shape=(32, 32, 3))

In [None]:
# <tab: Array>
from superduper.ext.numpy import array
import numpy as np  # NOTE(review): `np` is not referenced in this cell

# Datatype for "float64" arrays declared with shape (32, 32, 3).
datatype = array(dtype="float64", shape=(32, 32, 3))

In [None]:
# <tab: Text>
# Text uses the plain string 'str' instead of a DataType instance, so the
# later `isinstance(datatype, DataType)` guard skips `db.apply` for it.
datatype = 'str'

In [None]:
# <tab: PDF>
from superduper import DataType

# Create a datatype for saving PDF files by setting its `encodable` to 'file'.
# Every datatype encoded as 'file' has its files uploaded to the artifact store;
# references are recorded in the database, and the files are downloaded locally
# when needed.

datatype = DataType('pdf', encodable='file')

In [None]:
# <tab: Image>
from superduper.ext.pillow import pil_image
import PIL.Image  # NOTE(review): not referenced in this cell

# Reuse the ready-made `pil_image` object exported by the pillow extension.
datatype = pil_image

In [1]:
# <tab: URI>

# URIs need no datatype; the later `if datatype and ...` guard skips `db.apply`
# when the value is None.
datatype = None

In [None]:
# <tab: Audio>
from superduper.ext.numpy import array
from superduper import DataType
import scipy.io.wavfile
import io


def encoder(data, info=None):
    """Serialize a ``(sample_rate, samples)`` pair into WAV-formatted bytes.

    :param data: Indexable pair where ``data[0]`` is the sample rate and
                 ``data[1]`` is the sample array (the layout returned by
                 ``scipy.io.wavfile.read``).
    :param info: Optional extra argument, accepted so the signature matches
                 the other encoders in this file; it is not used here.
    :return: The complete WAV content as ``bytes``.
    """
    sample_rate = data[0]
    samples = data[1]
    # Write into an in-memory buffer so no temporary file is required.
    buffer = io.BytesIO()
    scipy.io.wavfile.write(buffer, sample_rate, samples)
    return buffer.getvalue()


def decoder(data, info=None):
    """Parse WAV-formatted bytes back into audio data.

    :param data: Raw WAV content as ``bytes``.
    :param info: Optional extra argument, accepted so the signature matches
                 the other decoders in this file; it is not used here.
    :return: Whatever ``scipy.io.wavfile.read`` returns for the buffer
             (a ``(sample_rate, samples)`` pair).
    """
    # Wrap the bytes in a file-like object, which is what wavfile.read expects.
    return scipy.io.wavfile.read(io.BytesIO(data))


# Bundle the WAV encoder/decoder pair defined in this cell into a datatype
# identified as 'wav', with `encodable` set to 'artifact'.
datatype = DataType(
    'wav',
    encoder=encoder,
    decoder=decoder,
    encodable='artifact',
)

In [None]:
# <testing: >
# Download and unpack the sample audio archive, read one clip, and round-trip
# it through the datatype's encoder and decoder.
!curl -O https://superduper-public-demo.s3.amazonaws.com/audio.zip && unzip audio.zip
test = scipy.io.wavfile.read('./audio/1.wav')
datatype.decoder(datatype.encoder(test))

In [None]:
# <tab: Video>
from superduper import DataType

# Create a DataType with the identifier 'video_on_file' and `encodable` set to 'file'.
datatype = DataType(
    identifier='video_on_file',
    encodable='file',
)

In [None]:
# <tab: Encodable>
from superduper import DataType
import pandas as pd

def encoder(x, info=None):
    """Serialize *x* to a JSON string by calling its ``to_json`` method.

    :param x: Object exposing ``to_json()`` (this cell targets pandas objects).
    :param info: Accepted but not used.
    :return: The value returned by ``x.to_json()``.
    """
    json_text = x.to_json()
    return json_text

def decoder(x, info=None):
    """Rebuild a pandas object from JSON text.

    :param x: JSON content; a ``str`` is wrapped in a file-like object, any
              other value is handed to ``pd.read_json`` unchanged.
    :param info: Optional extra argument (defaulted for parity with the
                 encoder in this cell); it is not used here.
    :return: The result of ``pd.read_json``.
    """
    import io  # local import: this cell does not import `io` at its top

    # Passing a literal JSON string directly to read_json is deprecated in
    # recent pandas releases, so present it as a text buffer instead.
    source = io.StringIO(x) if isinstance(x, str) else x
    return pd.read_json(source)
    
# Datatype identified as "pandas" built from the encoder/decoder pair defined
# in this cell; no `encodable` is passed, so DataType's own default applies.
datatype = DataType(
    identifier="pandas",
    encoder=encoder,
    decoder=decoder
)

In [None]:
# <tab: Artifact>
from superduper import DataType
import numpy as np
import pickle


def pickle_encode(object, info=None):
    """Return the pickled byte representation of *object*.

    :param object: Any picklable value.
    :param info: Accepted but not used.
    :return: ``bytes`` produced by ``pickle.dumps``.
    """
    payload = pickle.dumps(object)
    return payload

def pickle_decode(b, info=None):
    """Reconstruct the Python object stored in the pickled bytes *b*.

    :param b: Bytes previously produced by ``pickle.dumps``.
    :param info: Accepted but not used.
    :return: The unpickled object.
    """
    # NOTE(review): pickle.loads can execute arbitrary code while loading;
    # confirm the stored bytes always come from a trusted source.
    restored = pickle.loads(b)
    return restored


# Datatype identified as "VectorSearchMatrix" that pickles values with the
# helper pair defined in this cell, with `encodable` set to 'artifact'.
datatype = DataType(
    identifier="VectorSearchMatrix",
    encoder=pickle_encode,
    decoder=pickle_decode,
    encodable='artifact',
)

In [None]:
# <testing: >
from superduper import DataType

# Register the datatype only when it is truthy and an actual DataType instance;
# plain strings (e.g. 'str') and None fall through without calling db.apply.
if datatype:
    if isinstance(datatype, DataType):
        db.apply(datatype)

In [None]:
# <testing: >

from superduper.backends.mongodb import Collection
from superduper import Document
collection = Collection("data")

# NOTE(review): `origin_data` is not defined anywhere in this notebook; it is
# presumably supplied by another snippet executed beforehand — confirm.
print(origin_data)

# Wrap the sample with the datatype and insert it as field "x" of one document.
# NOTE(review): `datatype(...)` requires a callable; the Text ('str') and
# URI (None) tabs assign non-callable values — confirm how those are handled.
db.execute(collection.insert_one(Document({"x": datatype(origin_data)})))

# Read a single document back and print its unpacked "x" field.
data = db.execute(collection.find_one())
print(data.unpack()["x"])