In [1]:
import dotenv

# Load environment variables from the shared env file and then from the
# secrets file; both paths are relative (one directory above the notebook).
dotenv.load_dotenv('../.env')
dotenv.load_dotenv('../.env.secret')

import os

# Set the MLflow tracking URI to a local server. This assignment runs after
# the env files are loaded, so it overrides any MLFLOW_TRACKING_URI value that
# is already present in the environment at this point.
os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000'
from pathlib import Path
from datetime import datetime
import numpy as np
import numpy.random as random
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from fleet.base_schemas import TorchModelSpec
from fleet.utils import data
from fleet.model_builder.splitters import apply_split_indexes
from fleet.preprocessing import TransformConfig, FeaturizerConfig
from fleet.dataset_schemas import DatasetConfig, TorchDatasetConfig
from fleet.torch_.schemas import TorchTrainingConfig
from fleet import model_functions
from fleet.base_schemas import TorchModelSpec
from fleet.model_builder import optimizers
from fleet.torch_.models import CustomModel

# Declarative config for the molecule featurizer, registered under the name
# 'mol_feat'. `forward_args` binds the featurizer's `mol` argument to
# '$smiles' (presumably a reference to an input named `smiles` -- confirm
# against the fleet config conventions).
mol_featurizer_config = {
    'name': 'mol_feat',
    'type': 'fleet.model_builder.featurizers.MoleculeFeaturizer',
    'forward_args': { 'mol': '$smiles' },
    'constructor_args': {
        'allow_unknown': False,
        'sym_bond_list': False,
        'per_atom_fragmentation': False
    }
}
# Instantiate the featurizer from its parsed config. `transformer` is bound
# exactly once, directly to the object that the following cells call.
transformer = FeaturizerConfig.parse_obj(mol_featurizer_config).__root__.create()

In [2]:
# Sanity check: call the featurizer directly on one SMILES string. The
# recorded output is a `Data` object with x, edge_index and edge_attr fields.
transformer('CCC')

Data(x=[3, 26], edge_index=[2, 2], edge_attr=[2, 9])

In [3]:
# Wrap the featurizer callable in scikit-learn's FunctionTransformer so it can
# be driven through the fit / transform / fit_transform API in the next cells.
sk_transformer = FunctionTransformer(transformer)

In [4]:
# Drive the wrapped featurizer with a single SMILES string and compare the
# result of transform() with that of fit_transform(); the recorded output
# shows the same repr for both.
single_smiles = 'CCC'
sk_transformer.fit(single_smiles)
a = sk_transformer.transform(single_smiles)
b = sk_transformer.fit_transform(single_smiles)
print(a, b)


Data(x=[3, 26], edge_index=[2, 2], edge_attr=[2, 9]) Data(x=[3, 26], edge_index=[2, 2], edge_attr=[2, 9])


In [5]:

# Same check as the single-string case, now with a list of SMILES strings:
# the recorded output is one `Data` object per input string.
smiles_batch = ['CCC', 'CC', 'CCCC']
sk_transformer.fit(smiles_batch)
a = sk_transformer.transform(smiles_batch)
b = sk_transformer.fit_transform(smiles_batch)
print(a, b)

# Check that the featurized objects can be stored as a DataFrame column next
# to a label column.
frame_columns = {'mol_feat': a, 'y': ['a', 'b', 'a']}
df = pd.DataFrame(frame_columns)
df


[Data(x=[3, 26], edge_index=[2, 2], edge_attr=[2, 9]), Data(x=[2, 26], edge_index=[2, 1], edge_attr=[1, 9]), Data(x=[4, 26], edge_index=[2, 3], edge_attr=[3, 9])] [Data(x=[3, 26], edge_index=[2, 2], edge_attr=[2, 9]), Data(x=[2, 26], edge_index=[2, 1], edge_attr=[1, 9]), Data(x=[4, 26], edge_index=[2, 3], edge_attr=[3, 9])]


Unnamed: 0,mol_feat,y
0,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",a
1,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",b
2,"[(x, [tensor([0., 1., 0., 0., 0., 0., 0., 0., ...",a


In [6]:
# Rebind the featurizer config so that `mol` is taken from '$x1' -- the name
# of the SMILES feature column declared in the dataset config further down.
featurizer_constructor_args = {
    'allow_unknown': False,
    'sym_bond_list': False,
    'per_atom_fragmentation': False,
}
mol_featurizer_config = {
    'name': 'mol_feat',
    'type': 'fleet.model_builder.featurizers.MoleculeFeaturizer',
    'forward_args': {'mol': '$x1'},
    'constructor_args': featurizer_constructor_args,
}

# Build the featurizer from the parsed config and run it through a
# FunctionTransformer on two SMILES strings.
parsed_featurizer_config = FeaturizerConfig.parse_obj(mol_featurizer_config)
mol_feat = parsed_featurizer_config.__root__.create()
t = FunctionTransformer(mol_feat)
t.transform(['CCCC', 'CC'])

[Data(x=[4, 26], edge_index=[2, 3], edge_attr=[3, 9]),
 Data(x=[2, 26], edge_index=[2, 1], edge_attr=[1, 9])]

In [7]:
# Dataset schema: a SMILES column (x1), a categorical column (x2, classes
# 'a' and 'b'), a numeric column (x3) and a numeric target `y`. The target's
# 'outModule' is 'out' (presumably the name of the model layer that produces
# it -- confirm). The molecule featurizer config is attached as the only
# featurizer.
dataset_config = TorchDatasetConfig.parse_obj({
    'name': 'sampl',
    'feature_columns': [
        { 'name': 'x1', 'dataType': { 'domainKind': 'smiles', } },
        { 'name': 'x2', 'dataType': { 'domainKind': 'categorical', 'classes': {'a': 1, 'b': 2}} },
        { 'name': 'x3', 'dataType': { 'domainKind': 'numeric', } },
    ],
    'target_columns': [{ 'name': 'y', 'dataType': { 'domainKind': 'numeric', }, 'outModule': 'out' }],
    'featurizers': [mol_featurizer_config]
})
pipe = data.PreprocessingPipeline(dataset_config)

N = 100   # number of synthetic rows
SEED = 0  # fixed so the synthetic data is identical on every run

# Seed NumPy's global RNG (`random` is `numpy.random` in this notebook) so
# that Restart & Run All regenerates exactly the same frame.
random.seed(SEED)

# Synthetic data: x1 is a carbon chain of random length, x2 a random class
# label, x3 and y standard-normal noise.
df = pd.DataFrame({
    'x1': ['C' * random.randint(4, 16) for _ in range(N)],
    'x2': [ random.choice(['a', 'b']) for _ in range(N) ],
    'x3': random.randn(N),
    'y': random.randn(N),
})
# The return value is discarded, so this presumably annotates `df` in place
# with split assignments -- confirm against apply_split_indexes.
apply_split_indexes(df)
# pipe.fit(*pipe.get_X_and_y(df))
# tdf = pipe.transform(df)

In [8]:
# Wrap the synthetic frame in the torch dataset and inspect the target of a
# slice of rows.
# NOTE(review): the recorded output for the 0:5 slice is (6,), i.e. six
# elements -- confirm whether end-inclusive slicing is intended.
dataset = data.MarinerTorchDataset(
    data=df,
    dataset_config=dataset_config,
)
first_rows = dataset[0:5]
first_rows['y'].shape


(6,)

In [9]:

# Layer definitions for the test model. Each layer names its inputs with
# '$<name>' references in `forward_args`. The layers are deliberately kept in
# the same list order as before, which differs from their dependency order
# (e.g. 'concat' refers to '$pool', which is listed last).

# Concatenates the pooled molecule features with the x2 and x3 columns along
# the last dimension.
concat_layer = {
    'name': 'concat',
    'type': 'fleet.model_builder.layers.Concat',
    'constructor_args': {'dim': -1},
    'forward_args': {'xs': ['$pool', '$x2', '$x3']},
}

# Linear head producing the single output. in_features is 26 + 1 + 1,
# presumably 26 pooled node features plus one value each for x2 and x3.
out_layer = {
    'name': 'out',
    'type': 'torch.nn.Linear',
    'constructor_args': {'in_features': 26 + 1 + 1, 'out_features': 1},
    'forward_args': {'input': '$concat'},
}

# NOTE(review): no other layer references '$gcn', so the output of this
# convolution does not appear to reach 'out' -- confirm whether 'pool' was
# meant to consume '$gcn' instead of '$mol_feat.x'.
gcn_layer = {
    'name': 'gcn',
    'type': 'torch_geometric.nn.GCNConv',
    'constructor_args': {'in_channels': 26, 'out_channels': 10},
    'forward_args': {
        'x': '$mol_feat.x',
        'edge_index': '$mol_feat.edge_index',
    },
}

# Sum-aggregating pooling over the featurizer output.
pool_layer = {
    'name': 'pool',
    'type': 'fleet.model_builder.layers.GlobalPooling',
    'constructor_args': {'aggr': 'sum'},
    'forward_args': {
        'x': '$mol_feat.x',
        'edge_index': '$mol_feat.edge_index',
        'batch': '$mol_feat.batch',
    },
}

spec = TorchModelSpec.parse_obj({
    'name': 'test model',
    'dataset': dataset_config,
    'spec': {
        'layers': [concat_layer, out_layer, gcn_layer, pool_layer],
    },
})

In [10]:
# Build the data module from the synthetic frame, pull one training batch and
# look at the node-feature matrix of the featurized molecules in it.
dm = data.DataModule(
    data=df,
    config=dataset_config,
)
dm.setup()
dataloader = dm.train_dataloader()
batch = next(iter(dataloader))
batch_mol_graphs = batch['mol_feat']
batch_mol_graphs.x.shape

torch.Size([316, 26])

In [11]:
# Shape of the target tensor in the sampled batch (recorded output: [32, 1]).
batch['y'].shape

torch.Size([32, 1])

In [12]:
# Shape of the categorical x2 feature in the sampled batch (recorded output:
# [32, 1]).
batch['x2'].shape

torch.Size([32, 1])

In [13]:
# Shape of the numeric x3 feature in the sampled batch (recorded output:
# [32, 1]).
batch['x3'].shape

torch.Size([32, 1])

In [14]:
# Build the model from the parsed spec and run one forward pass on the sampled
# batch; the recorded output is a dict with one tensor under the key 'y'.
model = CustomModel(
    config=spec.spec,
    dataset_config=spec.dataset,
)
batch_predictions = model(batch)
batch_predictions

{'y': tensor([[2.6467],
         [1.6439],
         [2.5513],
         [3.2856],
         [2.2475],
         [2.3474],
         [2.7054],
         [1.3772],
         [2.0324],
         [2.8186],
         [2.2451],
         [2.4320],
         [2.7756],
         [2.2322],
         [1.5156],
         [2.6630],
         [1.8293],
         [2.6450],
         [2.9735],
         [1.7742],
         [3.1206],
         [3.0497],
         [1.5635],
         [1.6194],
         [2.3243],
         [2.9685],
         [1.0631],
         [1.4155],
         [2.3788],
         [1.6472],
         [2.5427],
         [2.9208]], grad_fn=<AddmmBackward0>)}

In [15]:
# Take the timestamp once so the MLflow model name and experiment name created
# by this run share exactly the same suffix and can be matched to each other.
run_started_at = datetime.now()

# Train for a single epoch with the Adam optimizer as an end-to-end smoke test
# of the fit entry point.
model_functions.fit(
    spec,
    dataset=df,
    train_config=TorchTrainingConfig(epochs=1, optimizer=optimizers.AdamOptimizer()),
    mlflow_model_name=f"Test Model {run_started_at}",
    mlflow_experiment_name=f"Test Experiment {run_started_at}",
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name   | Type       | Params
--------------------------------------
0 | _model | ModuleDict | 299   
--------------------------------------
299       Trainable params
0         Non-trainable params
299       Total params
0.001     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=1` reached.


Result(mlflow_experiment_id='90', mlflow_model_version=None)