<!-- TABS -->
# Transfer learning

<!-- TABS -->
## Configure your production system

:::note
If you would like to use the production features 
of SuperDuperDB, then you should set the relevant 
connections and configurations in a configuration 
file. Otherwise you are welcome to use "development" mode 
to get going with SuperDuperDB quickly.
:::

In [None]:
import os

# Location of the SuperDuperDB config file; its parent directory is created
# up front so the file can be written later.
CONFIG_PATH = '.superduperdb/config.yaml'

os.makedirs(os.path.dirname(CONFIG_PATH), exist_ok=True)
os.environ['SUPERDUPERDB_CONFIG'] = CONFIG_PATH

In [None]:
# <tab: MongoDB Community>
# YAML configuration text for a MongoDB instance on 127.0.0.1. `CFG` is
# written to the file named by SUPERDUPERDB_CONFIG in a later cell.
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: ray://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''

In [None]:
# <tab: MongoDB Atlas>
# Template configuration for MongoDB Atlas: replace every <placeholder>
# before use.
# - The key is `data_backend` and `compute` nests a `uri`, matching the
#   MongoDB Community configuration.
# - `mongodb+srv://` URIs resolve hosts and ports through DNS SRV records,
#   so they must not contain an explicit port.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
        type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''

In [None]:
# <tab: SQLite>
# Template configuration for SQLite: replace every <placeholder> before use.
# The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''

In [None]:
# <tab: MySQL>
# Template configuration for MySQL: replace every <placeholder> before use.
# The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration; the database name is a placeholder like
# the other URI parts.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: MSSQL>
# Template configuration for Microsoft SQL Server (the URI scheme is
# `mssql://`): replace every <placeholder> before use.
# The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''

In [None]:
# <tab: PostgreSQL>
# Template configuration for PostgreSQL: replace every <placeholder> before
# use. The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Snowflake>
# Template configuration for Snowflake with a separate SQLite metadata
# store: replace every <placeholder> before use.
# The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''

In [None]:
# <tab: Clickhouse>
# Template configuration for Clickhouse with a separate SQLite metadata
# store: replace every <placeholder> before use.
# The key is `data_backend` and `compute` nests a `uri`, matching the
# MongoDB Community configuration.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''

In [None]:
# Persist the chosen configuration text to the file that
# SUPERDUPERDB_CONFIG points at, replacing any previous content.
config_path = os.environ['SUPERDUPERDB_CONFIG']
with open(config_path, 'w') as config_file:
    config_file.write(CFG)

<!-- TABS -->
## Start your cluster

:::note
Starting a SuperDuperDB cluster is useful in production and model development
if you want to enable scalable compute, access to the models by multiple users for collaboration, 
and monitoring.

If you don't need this, then it is simpler to start in development mode.
:::

In [None]:
# <tab: Experimental Cluster>
# IPython shell escape: runs the `superduperdb` module's `local-cluster up`
# command with the current `python` interpreter.
!python -m superduperdb local-cluster up

In [None]:
# <tab: Docker-Compose>
# IPython shell escapes invoking two Makefile targets in order.
# NOTE(review): presumably these must be run from a directory containing the
# project Makefile that defines both targets — confirm.
!make testenv_image
!make testenv_init

<!-- TABS -->
## Connect to SuperDuperDB

:::note
Note that this is only relevant if you are running SuperDuperDB in development mode.
Otherwise refer to "Configure your production system".
:::

In [None]:
# <tab: MongoDB>
from superduperdb import superduper

# MongoDB server on this machine, `documents` database.
mongodb_uri = 'mongodb://localhost:27017/documents'
db = superduper(mongodb_uri)

In [None]:
# <tab: SQLite>
from superduperdb import superduper

# SQLite database file `my_db.db`.
sqlite_uri = 'sqlite://my_db.db'
db = superduper(sqlite_uri)

In [None]:
# <tab: MySQL>
from superduperdb import superduper

# Connection settings for the MySQL server; adjust to your deployment.
user, password = 'superduper', 'superduper'
host, port = 'localhost', 3306
database = 'test_db'

mysql_uri = f"mysql://{user}:{password}@{host}:{port}/{database}"
db = superduper(mysql_uri)

In [None]:
# <tab: MSSQL>
from urllib.parse import quote_plus

from superduperdb import superduper

# Connection settings for Microsoft SQL Server (`mssql://` scheme).
user = 'sa'
password = 'Superduper#1'
port = 1433
host = 'localhost'

# The password is percent-encoded because a raw '#' starts the fragment
# part of a URI, which would cut the URI off before the host and port.
db = superduper(f"mssql://{user}:{quote_plus(password)}@{host}:{port}")

In [None]:
# <tab: PostgreSQL>
!pip install psycopg2
from superduperdb import superduper

user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"

db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))

In [None]:
# <tab: Snowflake>
from superduperdb import superduper

# Snowflake credentials and target; replace with your own values.
user, password = "superduperuser", "superduperpassword"
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
database = "FREE_COMPANY_DATASET/PUBLIC"

snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"
# Metadata is kept in a separate SQLite database.
metadata_uri = 'sqlite:///your_database_name.db'

db = superduper(snowflake_uri, metadata_store=metadata_uri)

In [None]:
# <tab: Clickhouse>
from superduperdb import superduper

# Connection settings for the Clickhouse server; the password is empty here.
user = 'default'
password = ''
port = 8123
host = 'localhost'

# Metadata is kept in a separate `mongomock://meta` store (a plain string:
# it has no placeholders, so no f-string is needed).
db = superduper(f"clickhouse://{user}:{password}@{host}:{port}", metadata_store='mongomock://meta')

In [None]:
# <tab: DuckDB>
from superduperdb import superduper

# DuckDB database file `mydb.duckdb`.
duckdb_uri = 'duckdb://mydb.duckdb'
db = superduper(duckdb_uri)

In [None]:
# <tab: Pandas>
from superduperdb import superduper

# A list of CSV paths is passed in place of a URI; metadata is kept in a
# separate `mongomock://meta` store (a plain string: it has no placeholders,
# so no f-string is needed).
db = superduper(['my.csv'], metadata_store='mongomock://meta')

In [None]:
# <tab: MongoMock>
from superduperdb import superduper

# `mongomock` URI for the `test_db` database.
mongomock_uri = 'mongomock:///test_db'
db = superduper(mongomock_uri)

<!-- TABS -->
## Get useful sample data

In [None]:
from superduperdb.backends.ibis import dtype


In [None]:
# <tab: labeled_text>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_classification.json
import json

with open("text_classification.json", "r") as f:
    data = json.load(f)
sample_datapoint = data[-1]

In [None]:
# <tab: labeled_image>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_classification.zip && unzip images.zip
import json
from PIL import Image

with open('images/images.json', 'r') as f:
    data = json.load(f)

data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data]
sample_datapoint = data[-1]

<!-- TABS -->
## Setup tables or collections

In [None]:
# <tab: MongoDB>
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from superduperdb import Schema, DataType
from superduperdb.backends.mongodb import Collection

table_or_collection = Collection('documents')
USE_SCHEMA = False
# No special encoding by default. Set this to a `DataType` instance (and
# USE_SCHEMA to True) to register a schema for the 'x' field. It must be
# defined here because the MongoDB insert cell reads `datatype`.
datatype = None

if USE_SCHEMA and isinstance(datatype, DataType):
    # The schema is given an explicit identifier, as in the SQL tab.
    schema = Schema(identifier='schema', fields={'x': datatype})
    db.apply(schema)

In [None]:
# <tab: SQL>
from superduperdb.backends.ibis import Table
from superduperdb import Schema, DataType
from superduperdb.backends.ibis.field_types import dtype

datatype = "str"

# A `DataType` instance is used for the 'x' field as-is; any other value is
# passed through `dtype(...)` first.
# NOTE(review): the insert cells also write a 'y' field for labelled data;
# confirm whether the schema should declare it.
x_field = datatype if isinstance(datatype, DataType) else dtype(datatype)
schema = Schema(identifier="schema", fields={"id": dtype("str"), "x": x_field})

table_or_collection = Table('documents', schema=schema)

db.apply(table_or_collection)

<!-- TABS -->
## Insert data

In order to insert data, we need to create a `Schema` for encoding our special `DataType` column(s) in the databackend.

In [None]:
# <tab: MongoDB>
from superduperdb import Document, DataType

def do_insert(data, schema = None):
    """Insert `data` into `table_or_collection` as `Document` objects.

    An item that is a dict holding both 'x' and 'y' becomes a document with
    those two fields; any other item is stored whole under 'x'.

    :param data: Iterable of raw items (labelled dicts or bare values).
    :param schema: Optional schema; when given it is forwarded to
                   `insert_many` and the global `datatype` is not consulted.
    """
    # 'x' values are passed through the global `datatype` only when no
    # schema is supplied and `datatype` is a `DataType` instance.
    wrap_with_datatype = schema is None and isinstance(datatype, DataType)

    def encode(value):
        return datatype(value) if wrap_with_datatype else value

    documents = []
    for item in data:
        if isinstance(item, dict) and 'x' in item and 'y' in item:
            documents.append(Document({'x': encode(item['x']), 'y': item['y']}))
        else:
            documents.append(Document({'x': encode(item)}))

    # The `schema` keyword is omitted when no schema is given and `datatype`
    # is None, a string or a `DataType`; otherwise it is passed explicitly.
    if schema is None and (datatype is None or isinstance(datatype, (str, DataType))):
        db.execute(table_or_collection.insert_many(documents))
    else:
        db.execute(table_or_collection.insert_many(documents, schema=schema))



In [None]:
# <tab: SQL>
from superduperdb import Document

def do_insert(data):
    """Insert `data` into `table_or_collection`, one row per item.

    Every row gets a string 'id' equal to the item's position. An item that
    is a dict holding both 'x' and 'y' contributes those two fields; any
    other item is stored whole under 'x'.

    :param data: Iterable of raw items (labelled dicts or bare values).
    """
    rows = []
    for idx, item in enumerate(data):
        row = {'id': str(idx)}
        if isinstance(item, dict) and 'x' in item and 'y' in item:
            row['x'] = item['x']
            row['y'] = item['y']
        else:
            row['x'] = item
        rows.append(Document(row))

    db.execute(table_or_collection.insert(rows))



In [None]:
# Insert everything except the trailing quarter of `data` (rounded up).
# `-len(data) // 4` is `(-len(data)) // 4`, a negative (or zero) slice end.
split_end = -len(data) // 4
do_insert(data[:split_end])

<!-- TABS -->
## Compute features

In [None]:
# <tab: Text>

# The insert cells store each item's content under the 'x' field, so the
# listener has to read that field.
key = 'x'

import sentence_transformers
from superduperdb import vector, Listener
from superduperdb.ext.sentence_transformers import SentenceTransformer

# Sentence-embedding model; outputs are declared as 384-element vectors and
# converted to plain lists by `postprocess`.
superdupermodel = SentenceTransformer(
    identifier="embedding",
    object=sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
)

# NOTE(review): `select` is not assigned by any earlier cell shown in this
# notebook; define the query over `table_or_collection` before running this.
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features"
    )
)

In [None]:
# <tab: Image>

# The insert cells store each item's content under the 'x' field, so the
# listener has to read that field.
key = 'x'

import numpy as np
import torchvision.models as models
from torchvision import transforms
from superduperdb.ext.torch import TorchModel
from superduperdb import Listener
from PIL import Image

class TorchVisionEmbedding:
    """Pre-trained ResNet-18 together with its image preprocessing."""

    def __init__(self):
        # Load the pre-trained ResNet-18 model
        self.resnet = models.resnet18(pretrained=True)

        # Set the model to evaluation mode
        self.resnet.eval()

        # Built once and reused for every image.
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def preprocess(self, image_array):
        """Turn one image into a normalized tensor for the ResNet.

        :param image_array: A PIL image (as produced by the sample-data
                            cell) or an array-like of pixel values.
        :return: The transformed image tensor.
        """
        if isinstance(image_array, Image.Image):
            image = image_array
        else:
            image = Image.fromarray(np.asarray(image_array).astype(np.uint8))
        # The normalization constants are per-channel for three channels.
        return self.transform(image.convert('RGB'))

model = TorchVisionEmbedding()
superdupermodel = TorchModel(identifier='my-vision-model-torch', object=model.resnet, preprocess=model.preprocess, postprocess=lambda x: x.numpy().tolist())

# NOTE(review): `select` is not assigned by any earlier cell shown in this
# notebook; define the query over `table_or_collection` before running this.
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=select,
        key=key,
        identifier="features"
    )
)

<!-- TABS -->
## Choose input key from listener outputs

:::note
This is useful if you have performed a first step, such as pre-computing 
features, or chunking your data. You can use this query to 
choose the input key for further models such as classification models.
:::

In [None]:
# <tab: MongoDB>
# Query over the collection with no filter arguments.
select = table_or_collection.find()
# The listener's `outputs` value names the input for downstream models.
input_key = listener.outputs

In [None]:
# <tab: SQL>
# The listener's `outputs` value names the input for downstream models.
input_key = listener.outputs
# Start from the listener's outputs query, then keep only the label column
# 'y' and the listener output column.
listener_outputs = table_or_collection.outputs(listener.predict_id)
select = listener_outputs.select('y', input_key)



<!-- TABS -->
## Build and train classifier

In [None]:
# <tab: Scikit-Learn>
from sklearn.linear_model import LogisticRegression
from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator

# Training setup: reads the pair (`input_key`, 'y') from the rows returned
# by `select`.
trainer = SklearnTrainer(key=(input_key, 'y'), select=select)

# Wrap a default scikit-learn logistic regression as a SuperDuperDB model.
model = Estimator(
    identifier='my-model',
    object=LogisticRegression(),
    trainer=trainer,
)

In [None]:
# <tab: Torch>
import torch
from torch import nn
from superduperdb.ext.torch.model import TorchModel
from superduperdb.ext.torch.training import TorchTrainer


class SimpleModel(nn.Module):
    """Two-layer fully connected network with a ReLU in between."""

    def __init__(self, input_size=16, hidden_size=32, num_classes=3):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Loss function
def my_loss(X, y):
    """Binary cross-entropy between the first output column and the labels.

    :param X: Batch of model outputs; only column 0 is used as the logit.
    :param y: Batch of labels, cast to float before the comparison.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(
        X[:, 0], y.type(torch.float)
    )


# Create a small feed-forward classifier
# NOTE(review): `input_size` defaults to 16; confirm it matches the length
# of the feature vectors stored under `input_key`.
model = SimpleModel()
model = TorchModel(
    identifier='my-model',
    object=model,         
    trainer=TorchTrainer(
        key=(input_key, 'y'),
        identifier='my_trainer',
        objective=my_loss,
        loader_kwargs={'batch_size': 10},
        max_iterations=100,
        validation_interval=10,
        select=select,
    ),
)

The following command adds the model to the system and trains it in a single step.

In [None]:
# Hand the model (with its attached trainer) to the datalayer. Left as the
# cell's final expression so the notebook displays the returned value.
db.apply(model)