In [1]:
import dotenv

dotenv.load_dotenv('../.env')
dotenv.load_dotenv('../.env.secret')

import os

os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000'
from pathlib import Path
from datetime import datetime
import numpy as np
import numpy.random as random
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from fleet.base_schemas import TorchModelSpec
from fleet.utils import data
from fleet.model_builder.splitters import apply_split_indexes
from fleet.preprocessing import TransformConfig, FeaturizerConfig
from fleet.dataset_schemas import DatasetConfig, TorchDatasetConfig
from fleet.torch_.schemas import TorchTrainingConfig
from fleet import model_functions
from fleet.base_schemas import TorchModelSpec
from fleet.model_builder import optimizers
from fleet.torch_.models import CustomModel

In [2]:
mol_featurizer_config = {
    'name': 'mol_feat',
    'type': 'fleet.model_builder.featurizers.MoleculeFeaturizer',
    'forward_args': { 'mol': '$x1' },
    'constructor_args': {
        'allow_unknown': False,
        'sym_bond_list': False,
        'per_atom_fragmentation': False
    }
}
mol_feat = FeaturizerConfig.parse_obj(mol_featurizer_config).__root__.create()

In [3]:
dataset_config = TorchDatasetConfig.parse_obj({
    'name': 'sampl',
    'feature_columns': [
        { 'name': 'x1', 'dataType': { 'domainKind': 'smiles', } },
        { 'name': 'x2', 'dataType': { 'domainKind': 'categorical', 'classes': {'a': 1, 'b': 2}} },
        { 'name': 'x3', 'dataType': { 'domainKind': 'numeric', } },
    ],
    'target_columns': [{ 'name': 'y', 'dataType': { 'domainKind': 'numeric', }, 'outModule': 'out' }],
    'featurizers': [mol_featurizer_config]
})
pipe = data.PreprocessingPipeline(dataset_config)

N = 100

df = pd.DataFrame({
    'x1': ['C' * random.randint(4, 16) for i in range(N)],
    'x2': [ random.choice(['a', 'b']) for i in range(N) ],
    'x3': random.randn(N),
    'y': random.randn(N),
})
apply_split_indexes(df)


tdf = pipe.transform(df)[pipe.output_columns]

tdf

name='sampl' target_columns=[TargetTorchColumnConfig(name='y', data_type=NumericDataType(domain_kind='numeric'), out_module='out', loss_fn='torch.nn.MSELoss', column_type='regression')] feature_columns=[ColumnConfig(name='x1', data_type=SmileDataType(domain_kind='smiles')), ColumnConfig(name='x2', data_type=CategoricalDataType(domain_kind='categorical', classes={'a': 1, 'b': 2})), ColumnConfig(name='x3', data_type=NumericDataType(domain_kind='numeric'))] featurizers=[FleetmoleculefeaturizerLayerConfig(type='fleet.model_builder.featurizers.MoleculeFeaturizer', name='mol_feat', constructor_args=FleetmoleculefeaturizerConstructorArgs(allow_unknown=False, sym_bond_list=False, per_atom_fragmentation=False), forward_args=FleetmoleculefeaturizerForwardArgsReferences(mol='$x1'))] transforms=[]
X:
                x1 x2        x3         y step
0            CCCCC  a -0.203286  0.461845    1
1      CCCCCCCCCCC  b  0.099522 -1.759513    1
2    CCCCCCCCCCCCC  b -0.026042  0.060164    3
3       CCCC

Unnamed: 0,mol_feat,y,y.1,x2,x3
0,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",0.461845,,1,-0.203286
1,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",-1.759513,,2,0.099522
2,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",0.060164,,2,-0.026042
3,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",-1.219134,,1,0.638083
4,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",1.483105,,1,-1.339847
...,...,...,...,...,...
95,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",0.378631,,1,0.161128
96,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",0.378355,,2,-0.763055
97,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",-0.939447,,1,-0.664866
98,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",-1.733151,,2,0.265815


In [4]:
dataset = data.MarinerTorchDataset(data=df, dataset_config=dataset_config)
dataset[0:5]['y'].shape


name='sampl' target_columns=[TargetTorchColumnConfig(name='y', data_type=NumericDataType(domain_kind='numeric'), out_module='out', loss_fn='torch.nn.MSELoss', column_type='regression')] feature_columns=[ColumnConfig(name='x1', data_type=SmileDataType(domain_kind='smiles')), ColumnConfig(name='x2', data_type=CategoricalDataType(domain_kind='categorical', classes={'a': 1, 'b': 2})), ColumnConfig(name='x3', data_type=NumericDataType(domain_kind='numeric'))] featurizers=[FleetmoleculefeaturizerLayerConfig(type='fleet.model_builder.featurizers.MoleculeFeaturizer', name='mol_feat', constructor_args=FleetmoleculefeaturizerConstructorArgs(allow_unknown=False, sym_bond_list=False, per_atom_fragmentation=False), forward_args=FleetmoleculefeaturizerForwardArgsReferences(mol='$x1'))] transforms=[]
X:
                x1  x2        x3
0            CCCCC   1 -0.203286
1      CCCCCCCCCCC   2  0.099522
2    CCCCCCCCCCCCC   2 -0.026042
3       CCCCCCCCCC   1  0.638083
4   CCCCCCCCCCCCCC   1 -1.339847
..

RuntimeError: Element 1 of type <class 'int'> is not defined in the classes dictionary {'a': 1, 'b': 2}

In [None]:

spec = TorchModelSpec.parse_obj({
        'name': 'test model',
        'dataset': dataset_config,
        'spec': {
            'layers': [
                {
                    'name': 'concat',
                    'type': 'fleet.model_builder.layers.Concat',
                    'constructor_args': {
                        'dim': -1
                    },
                    'forward_args': {
                        'xs': [
                            '$pool',
                            '$x2',
                            '$x3',
                        ]
                    }
                },
                {
                    'name': 'out',
                    'type': 'torch.nn.Linear',
                    'constructor_args': {
                        'in_features': 26 + 1 + 1,
                        'out_features': 1
                    },
                    'forward_args': {
                        'input': '$concat'
                    }
                },
                 {
                    'name': 'gcn',
                    'type': 'torch_geometric.nn.GCNConv',
                    'constructor_args': {
                        'in_channels': 26,
                        'out_channels': 10
                    },
                    'forward_args': {
                        'x': '$mol_feat.x',
                        'edge_index': '$mol_feat.edge_index',
                    }
                },
                {
                    'name': 'pool',
                    'type': 'fleet.model_builder.layers.GlobalPooling',
                    'constructor_args': {
                        'aggr': 'sum'
                    },
                    'forward_args': {
                        'x': '$mol_feat.x',
                        'edge_index': '$mol_feat.edge_index',
                        'batch': '$mol_feat.batch',
                    }
                }, 
            ]
        }
    })

In [None]:
dm = data.DataModule(data=df, config=dataset_config)
dm.setup()
dataloader = dm.train_dataloader()
batch = next(iter(dataloader))
batch['mol_feat'].x.shape

In [None]:
batch['y'].shape

In [None]:
batch['x2'].shape

In [None]:
batch['x3'].shape

In [None]:
model = CustomModel(config=spec.spec, dataset_config=spec.dataset)
model(batch)

In [None]:
model_functions.fit(
    spec,
    dataset=df,
    train_config=TorchTrainingConfig(epochs=1,optimizer=optimizers.AdamOptimizer()),
    mlflow_model_name=f"Test Model {datetime.now()}",
    mlflow_experiment_name=f"Test Experiment {datetime.now()}",
)