<!-- TABS -->
# Transfer learning

<!-- TABS -->
## Connect to superduper

In [1]:
from superduper import superduper

# Build the data layer on a mongomock backend (a mock MongoDB, convenient for
# local experimentation); swap the URI to point at a real deployment.
db = superduper('mongomock:///test_db')

[32m2024-Aug-30 21:34:07.16[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.misc.plugins[0m:[36m13  [0m | [1mLoading plugin: mongodb[0m
[32m2024-Aug-30 21:34:07.29[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m103 [0m | [1mBuilding Data Layer[0m
[32m2024-Aug-30 21:34:07.29[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.build[0m:[36m171 [0m | [1mConfiguration: 
 +---------------+----------------------+
| Configuration |        Value         |
+---------------+----------------------+
|  Data Backend | mongomock:///test_db |
+---------------+----------------------+[0m


<!-- TABS -->
## Get useful sample data

In [2]:
# <tab: Text-Classification>
# Download the sample dataset into the current working directory.
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_classification.json
import json

# Parse the downloaded file; later cells read the 'x' and 'y' fields of each record.
with open("text_classification.json", "r") as f:
    data = json.load(f)
# Number of target classes; passed to the Torch classifier in a later cell.
num_classes = 2

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1298k  100 1298k    0     0  1127k      0  0:00:01  0:00:01 --:--:-- 1130k


In [None]:
# <tab: Image-Classification>
!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_classification.zip && unzip images_classification.zip
import json
from PIL import Image

with open('images/images.json', 'r') as f:
    data = json.load(f)
    
data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data]
num_classes = 2

After obtaining the data, we reshape it into records with named fields, ready to be inserted into the database.

In [3]:
# <tab: Text-Classification>
# Rename the generic 'x'/'y' fields to the field names stored in the database.
datas = []
for sample in data:
    datas.append({'txt': sample['x'], 'label': sample['y']})

In [None]:
# <tab: Image-Classification>
# Rename the generic 'x'/'y' fields to the field names stored in the database.
datas = []
for sample in data:
    datas.append({'image': sample['x'], 'label': sample['y']})

<!-- TABS -->
## Insert simple data

With auto_schema enabled, we can insert data directly: superduper automatically infers the data types and creates the table with a matching schema and datatypes.

In [4]:
from superduper import Document

table_or_collection = db['docs']

# Wrap each record in a Document, build the insert query, then run it.
documents = [Document(record) for record in datas]
insert_query = table_or_collection.insert(documents)
ids = db.execute(insert_query)

# Base query over the table; later cells attach the feature listener to it.
select = table_or_collection.select()

[32m2024-Aug-30 21:34:13.87[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m363 [0m | [1mTable docs does not exist, auto creating...[0m
[32m2024-Aug-30 21:34:13.87[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m369 [0m | [1mCreating table docs with schema {('label', 'int'), ('_fold', 'str'), ('txt', 'str')}[0m
[32m2024-Aug-30 21:34:13.98[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m344 [0m | [1mInserted 1000 documents into docs[0m


<!-- TABS -->
## Compute features

In [5]:
# <tab: Text>
key = 'txt'
import sentence_transformers
from superduper import vector, Listener
from superduper_sentence_transformers import SentenceTransformer

# Pre-trained sentence encoder used as a fixed feature extractor.
encoder = sentence_transformers.SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Wrap the encoder as a superduper model; outputs are converted to plain lists.
superdupermodel = SentenceTransformer(
    identifier="embedding",
    object=encoder,
    postprocess=lambda x: x.tolist(),
)

# The listener applies the model to the `key` field of the selected documents.
feature_listener = Listener(
    model=superdupermodel,
    select=select,
    key=key,
    identifier="features",
)
jobs, listener = db.apply(feature_listener)

  from tqdm.autonotebook import tqdm, trange


[32m2024-Aug-30 21:34:23.96[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.listener[0m:[36m94  [0m | [1mRequesting listener setup on CDC service[0m
[32m2024-Aug-30 21:34:23.96[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.listener[0m:[36m104 [0m | [1mSkipping listener setup on CDC service since no URI is set[0m
[32m2024-Aug-30 21:34:23.96[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.jobs.queue[0m:[36m210 [0m | [1mRunning jobs for listener::features[0m
[32m2024-Aug-30 21:34:23.96[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.backends.local.compute[0m:[36m67  [0m | [1mSubmitting job. function:<function method_job at 0x1121a74c0>[0m
[32m2024-Aug-30 21:34:23.99[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.model[0m:[36m720 [0m | [1mRequesting prediction in db - [embedding] with predict_id features
[0

  return torch.load(io.BytesIO(b))


[32m2024-Aug-30 21:34:28.66[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.components.model[0m:[36m853 [0m | [1mAdding 1000 model outputs to `db`[0m
[32m2024-Aug-30 21:34:29.94[0m| [1mINFO    [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.base.datalayer[0m:[36m344 [0m | [1mInserted 1000 documents into _outputs__features[0m
[32m2024-Aug-30 21:34:29.94[0m| [32m[1mSUCCESS [0m | [36mDuncans-MBP.fritz.box[0m| [36msuperduper.backends.local.compute[0m:[36m73  [0m | [32m[1mJob submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x2a63a6950>.  function:<function method_job at 0x1121a74c0> future:2638f7c5-655d-47e6-ba13-a214afd3dd3c[0m


In [None]:
# <tab: Image>
# Document field that the feature listener reads its inputs from.
key = 'image'
import torchvision.models as models
from torchvision import transforms
from superduper_torch import TorchModel
from superduper import Listener
from PIL import Image

class TorchVisionEmbedding:
    """Bundle a pre-trained ResNet-18 with the image preprocessing applied to its inputs.

    Attributes:
        resnet: The ResNet-18 network, switched to evaluation mode.
    """

    def __init__(self):
        # Load the pre-trained ResNet-18 model. `weights=` replaces the deprecated
        # `pretrained=True` flag and selects the same ImageNet weights.
        self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        # Set the model to evaluation mode
        self.resnet.eval()

        # Build the preprocessing pipeline once instead of on every call.
        self._transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def preprocess(self, image):
        """Resize, crop, tensorize and normalize a single image.

        Args:
            image: The image to transform (a PIL image in this notebook).

        Returns:
            The tensor produced by the preprocessing pipeline.
        """
        return self._transform(image)
        
model = TorchVisionEmbedding()

# Wrap the network as a superduper model; outputs are converted to plain lists.
superdupermodel = TorchModel(
    identifier='my-vision-model-torch',
    object=model.resnet,
    preprocess=model.preprocess,
    postprocess=lambda x: x.numpy().tolist(),
)

# The listener applies the model to the `key` field of the selected documents.
feature_listener = Listener(
    model=superdupermodel,
    select=select,
    key=key,
    identifier="features",
)
jobs, listener = db.apply(feature_listener)

## Choose features key from feature listener

In [6]:
# Key under which the listener stores its outputs (shown as '_outputs__features' in the output below).
input_key = listener.outputs
# Query over the same table that also includes the listener's outputs.
training_select = select.outputs(listener.predict_id)

We can now retrieve the computed features from the database.

In [7]:
# Fetch a single document and read its feature vector to learn the input dimensionality.
sample_rows = list(training_select.limit(1).execute())
feature = sample_rows[0][input_key]
feature_size = len(feature)

<!-- TABS -->
## Build and train classifier

In [8]:
# <tab: Scikit-Learn>
from superduper_sklearn import Estimator, SklearnTrainer
from sklearn.svm import SVC

# The trainer fits on (features, label) pairs drawn from the feature-augmented query.
sklearn_trainer = SklearnTrainer(
    "my-trainer",
    key=(input_key, "label"),
    select=training_select,
)

# Support-vector classifier trained on top of the pre-computed features.
model = Estimator(identifier="my-model", object=SVC(), trainer=sklearn_trainer)

In [None]:
# <tab: Torch>
import torch
from torch import nn
from superduper_torch.model import TorchModel
from superduper_torch.training import TorchTrainer
from torch.nn.functional import cross_entropy


class SimpleModel(nn.Module):
    """Feed-forward classifier with a single hidden layer and ReLU activation."""

    def __init__(self, input_size=16, hidden_size=32, num_classes=3):
        """Create the two linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Width of the hidden layer.
            num_classes: Number of output scores.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the output of the second linear layer for input `x`."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

def preprocess(x):
    """Convert raw feature values into a tensor."""
    return torch.tensor(x)


# Postprocess function for the model output
def postprocess(x):
    """Return the index of the highest-scoring entry of `x` as a plain int."""
    _, top_index = x.topk(1)
    return int(top_index.item())


def data_transform(features, label):
    """Turn one training example into a (feature tensor, label) pair."""
    feature_tensor = torch.tensor(features)
    return feature_tensor, label

# Create a small feed-forward classifier (one hidden layer) on top of the features.
# feature_size is the dimensionality of the input feature vectors.
classifier = SimpleModel(feature_size, num_classes=num_classes)
model = TorchModel(
    identifier='my-model',
    object=classifier,
    preprocess=preprocess,
    postprocess=postprocess,
    trainer=TorchTrainer(
        key=(input_key, 'label'),
        identifier='my_trainer',
        objective=cross_entropy,
        loader_kwargs={'batch_size': 10},
        max_iterations=1000,
        validation_interval=100,
        # Train on the query that includes the listener outputs: `input_key`
        # is only present on documents returned by `training_select`.
        select=training_select,
        transform=data_transform,
    ),
)

Define a validation to evaluate the model's performance after training.

In [9]:
from superduper import Dataset, Metric, Validation


def acc(x, y):
    """Return the fraction of positions where `x` and `y` agree, relative to len(x)."""
    paired = zip(x, y)
    n_correct = sum(prediction == target for prediction, target in paired)
    return n_correct / len(x)


accuracy = Metric(identifier="acc", object=acc)

# Validation data: the feature-augmented query with the 'valid' fold added.
valid_dataset = Dataset(identifier="my-valid", select=training_select.add_fold('valid'))

validation = Validation(
    "transfer_learning_performance",
    key=(input_key, "label"),
    datasets=[valid_dataset],
    metrics=[accuracy],
)

# Attach the validation to the model defined in the previous cell.
model.validation = validation

When we call `db.apply`, the model is added to the database and, because the model has a trainer attached, training is run as well.

In [10]:
# Add the model to the database; since it has a trainer attached, this also runs training.
db.apply(model)



([],
 Estimator(trainer=SklearnTrainer(identifier='my-trainer', uuid='4ed22cfa3a9c4175ae89b7511eda1afb', upstream=None, plugins=None, cache=False, key=('_outputs__features', 'label'), select=docs.select().outputs("features"), transform=None, metric_values={}, signature='*args', data_prefetch=False, prefetch_size=1000, prefetch_factor=100, in_memory=True, compute_kwargs={}, fit_params={}, predict_params={}, y_preprocess=None), identifier='my-model', uuid='1e6b4f4000ac4cb8b5123e2cc944a071', upstream=None, plugins=None, cache=False, signature='singleton', datatype=None, output_schema=None, flatten=False, model_update_kwargs={}, predict_kwargs={}, compute_kwargs={}, validation=Validation(identifier='transfer_learning_performance', uuid='b01186bf87cb49b0a6dcf86f53a660c5', upstream=None, plugins=None, cache=False, metrics=[Metric(identifier='acc', uuid='e83f664b7f6947cfb67947e472b7d467', upstream=None, plugins=None, cache=False, object=<function acc at 0x2d41cae80>)], key=('_outputs__feature

In [11]:
# Display the model's encoded (dictionary) form, including its nested components.
model.encode()

{'_base': '?my-model',
 '_builds': {'docs-select-outputs-features': {'_path': 'superduper_mongodb.query.parse_query',
   'documents': [],
   'query': 'docs.select().outputs("features")'},
  'my-trainer': {'_path': 'superduper_sklearn.model.SklearnTrainer',
   'uuid': '4ed22cfa3a9c4175ae89b7511eda1afb',
   'upstream': None,
   'plugins': None,
   'cache': False,
   'key': ('_outputs__features', 'label'),
   'select': '?docs-select-outputs-features',
   'transform': None,
   'metric_values': {},
   'signature': '*args',
   'data_prefetch': False,
   'prefetch_size': 1000,
   'prefetch_factor': 100,
   'in_memory': True,
   'compute_kwargs': {},
   'fit_params': {},
   'predict_params': {},
   'y_preprocess': None,
   'type_id': 'trainer',
   'version': 0,
   'hidden': False},
  'dill': {'_path': 'superduper.components.datatype.get_serializer',
   'method': 'dill',
   'encodable': 'artifact',
   'type_id': 'datatype',
   'version': None,
   'uuid': '2e98be744c274ffabc472e74bcdb8b13'},
  '

Get the training metrics

In [None]:
# Reload the model from the database by its identifier, then display its metric values.
model = db.load('model', model.identifier)
model.metric_values

In [12]:
from superduper import Template

# Wrap the model in a Template.
# NOTE(review): `substitutions` presumably replaces the literal 'docs' with a
# template variable named 'table' — confirm against the Template documentation.
t = Template('transfer-learner', template=model, substitutions={'docs': 'table'})

In [13]:
# Export the template to the current directory ('.').
t.export('.')