## scikit-learn dataset preparation as a matrix

This notebook shows how to prepare datasets for scikit-learn based models. scikit-learn's API is very consistent, so models should work "out of the box" when given numeric (2D) matrices.

In [1]:
!pip install -q seaborn molfeat


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from typing import Literal, Union, Dict, List

import os
import seaborn as sns
import pandas as pd
from pathlib import Path
from humps import camel
from pydantic import BaseModel
import numpy as np
import sklearn.base
import mlflow.sklearn
import mlflow
import mlflow.tracking

from fleet.base_schemas import BaseModelFunctions
from fleet.model_builder.utils import get_references_dict
from fleet.dataset_schemas import DatasetConfigBuilder, DatasetConfig
from fleet import data_types
from fleet.utils import data
from fleet.yaml_model import YAML_Model
from fleet.model_builder.utils import get_class_from_path_string
from fleet.model_builder import splitters
from fleet.metrics import Metrics

from mariner.core.mlflowapi import log_sklearn_model_and_create_version

### Loading the dataset

In [3]:
# Download each CSV only when it is not already present in the working directory.
! [ ! -f HIV.csv ] && wget https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/HIV.csv
! [ ! -f SAMPL.csv ] && wget https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv

In [4]:
def load_split_dataset(csv_path: str, split_column: str = "smiles") -> pd.DataFrame:
    """
    Load a CSV and make sure it carries a ``step`` split column.

    When the column is missing, an 80-10-10 scaffold split over ``split_column``
    is applied and the frame is written back to ``csv_path`` so that later loads
    reuse the same split.

    Args:
        csv_path: Path of the CSV file to read (and rewrite when a split is added).
        split_column: Column handed to the splitter as ``split_column``.

    Returns:
        The loaded dataframe.
    """
    df = pd.read_csv(csv_path)
    if 'step' not in df.columns:
        # NOTE(review): the return value is not used, so apply_split_indexes is
        # assumed to add the split column to `df` in place -- confirm.
        splitters.apply_split_indexes(
            df,
            split_type="scaffold",
            split_column=split_column,
            split_target="80-10-10")
        df.to_csv(csv_path, index=False)
    return df


hiv_df = load_split_dataset('HIV.csv')
sampl_df = load_split_dataset('SAMPL.csv')


In [5]:
# Inspect the SAMPL frame, including the `step` split column checked for above.
sampl_df

Unnamed: 0,iupac,smiles,expt,calc,step
0,"4-methoxy-N,N-dimethyl-benzamide",CN(C)C(=O)c1ccc(cc1)OC,-11.01,-9.625,1
1,methanesulfonyl chloride,CS(=O)(=O)Cl,-4.87,-6.219,1
2,3-methylbut-1-ene,CC(C)C=C,1.83,2.452,1
3,2-ethylpyrazine,CCc1cnccn1,-5.45,-5.809,3
4,heptan-1-ol,CCCCCCCO,-4.21,-2.917,1
...,...,...,...,...,...
637,methyl octanoate,CCCCCCCC(=O)OC,-2.04,-3.035,1
638,pyrrolidine,C1CCNC1,-5.48,-4.278,3
639,4-hydroxybenzaldehyde,c1cc(ccc1C=O)O,-8.83,-10.050,1
640,1-chloroheptane,CCCCCCCCl,0.29,1.467,1


## Config Classes

First we need to create the interface to interact with sklearn classes and molfeat transforms.

To keep the same concepts from other parts of the app, we have 2 kinds of classes:

- `ConstructorArgs` classes: describe the arguments given to a class through its constructor
- `Config` classes: describe the interaction with the class; `constructor_args` works as explained above, and `fit_args` models the arguments passed to the ML model class during fit.

In [6]:

class CamelCaseModel(BaseModel):
    """
    Subclass this class to work with camel case serialization of the model.

    Field aliases are produced by ``camel.case`` and population by field name
    is enabled as well.
    """
    class Config:
        # Alias generator applied to the model's field names.
        alias_generator = camel.case
        allow_population_by_field_name = True
        # NOTE(review): this looks like the legacy spelling of
        # `allow_population_by_field_name` -- confirm whether both are needed.
        allow_population_by_alias = True
        underscore_attrs_are_private = True

class CreateFromType:
    """
    Mixin that instantiates a class from its class path (type) and constructor_args.

    Attributes:
        type (str): The class path of the class that will be instantiated.
        constructor_args (BaseModel): Keyword arguments for that class, if any.
    """
    type: str
    constructor_args:  Union[None, BaseModel] = None

    def create(self):
        """Return a new instance of the class named by ``type``.

        The fields of ``constructor_args`` are passed as keyword arguments;
        without constructor_args the class is called with no arguments.
        """
        target_class = get_class_from_path_string(self.type)
        if not self.constructor_args:
            return target_class()
        keyword_args = self.constructor_args.dict()
        return target_class(**keyword_args)


class FPVecFilteredTransformerConstructorArgs(BaseModel):
    """
    Models the constructor arguments of a FPVecFilteredTransformer.

    Both arguments default to ``None``; the annotations spell that out instead
    of relying on an implicit Optional.
    """
    del_invariant: Union[None, bool] = None
    length: Union[None, int] = None


class FPVecFilteredTransformerConfig(CamelCaseModel, CreateFromType):
    """
    Models the usage of FPVecFilteredTransformer.

    Combines the class path (``type``), its constructor arguments and the
    arguments used when the transform is applied (``forward_args``).
    """
    # Identifier of the transform; model specs in this notebook reference it as `$<name>`.
    name: str
    constructor_args: FPVecFilteredTransformerConstructorArgs = FPVecFilteredTransformerConstructorArgs()
    # Class path resolved by CreateFromType.create().
    type = 'molfeat.trans.fp.FPVecFilteredTransformer'
    # Argument name -> value; values such as '$smiles' appear to reference a
    # dataset column by name (presumably resolved by the fleet helpers -- confirm).
    forward_args: dict
    
    
# HIV dataset description: `smiles` feature, categorical `activity` target and a
# fingerprint transform that takes the `smiles` column as its X argument.
hiv_fp_transform = FPVecFilteredTransformerConfig(
    name="MolFPFeaturizer",
    forward_args={'X': '$smiles'},
)
hiv_activity_type = data_types.CategoricalDataType(classes={'CI': 0, 'CM': 1, 'CA': 2})

hiv_dataset_config = (
    DatasetConfigBuilder('Test')
    .with_features(smiles=data_types.SmileDataType())
    .with_targets(activity=hiv_activity_type)
    .add_transforms(hiv_fp_transform)
    .build()
)

hiv_dataset_config

DatasetConfig(name='Test', target_columns=[ColumnConfig(name='activity', data_type=CategoricalDataType(domain_kind='categorical', classes={'CI': 0, 'CM': 1, 'CA': 2}))], feature_columns=[ColumnConfig(name='smiles', data_type=SmileDataType(domain_kind='smiles'))], featurizers=[], transforms=[FPVecFilteredTransformerConfig(name='MolFPFeaturizer', constructor_args=FPVecFilteredTransformerConstructorArgs(del_invariant=None, length=None), forward_args={'X': '$smiles'}, type='molfeat.trans.fp.FPVecFilteredTransformer')])

In [7]:
# Task kinds a model config can declare in its `task_type` list.
TaskType = Literal['regressor', 'multiclass', 'multilabel']

class KNeighborsRegressorConstructorArgs(BaseModel):
    """
    Constructor arguments exposed for the k-nearest-neighbors regressor.
    """
    # Number of neighbors; defaults to 5.
    n_neighbors: int = 5
    # 'kd_tree' is the only value this schema accepts.
    algorithm: Literal['kd_tree'] = 'kd_tree'


class KNeighborsRegressorConfig(CamelCaseModel, CreateFromType):
    """
    Describes how a ``sklearn.neighbors.KNeighborsRegressor`` is built and fitted.

    Attributes:
        type: Class path of the estimator; also discriminates this config from
            the other model configs.
        constructor_args: Arguments passed to the estimator constructor.
        fit_args: Maps fit() argument names to `$name` references
            (e.g. ``X: $MolFPFeaturizer``).
        task_type: Task kinds the estimator supports.
    """
    # The default must equal the Literal: the previous default was misspelled
    # ("KNeighborstRegressor"), so create() could not resolve the class whenever
    # `type` was omitted.
    type: Literal['sklearn.neighbors.KNeighborsRegressor'] = 'sklearn.neighbors.KNeighborsRegressor'
    constructor_args: KNeighborsRegressorConstructorArgs = KNeighborsRegressorConstructorArgs()
    fit_args: Dict[str, str]
    task_type: List[TaskType] = [ 'regressor' ]
    

class RandomForestRegressorConstructorArgs(BaseModel):
    """
    Constructor arguments exposed for ``sklearn.ensemble.RandomForestRegressor``.

    Several arguments accept either an int (absolute count) or a float
    (fraction) in scikit-learn, so the numeric type given by the user must be
    preserved when the arguments are validated.
    """
    class Config:
        # Without this, Union members are tried left to right and the first one
        # that can coerce wins (e.g. 2 -> 2.0 for Union[float, int], or
        # 0.5 -> 0 for Union[int, float]), which changes the meaning of the
        # argument. NOTE(review): `smart_union` needs pydantic >= 1.9 -- confirm
        # the installed version.
        smart_union = True

    n_estimators: int = 50
    max_depth: Union[None, int] = None
    min_samples_split: Union[float, int] = 2
    min_samples_leaf: Union[float, int] = 1
    min_weight_fraction_leaf: float = .0
    # The annotation now covers the default (1.0) and the int/float forms, which
    # the previous `Union[None, Literal['sqrt', 'log2']]` rejected.
    max_features: Union[None, int, float, Literal['sqrt', 'log2']] = 1.
    max_leaf_nodes: Union[None, int] = None
    min_impurity_decrease: float = .0
    bootstrap: bool = True
    oob_score: bool = False
    n_jobs: Union[int, None] = None
    ccp_alpha: float = .0
    max_samples: Union[None, int, float] = None


class RandomForestRegressorConfig(CamelCaseModel, CreateFromType):
    """
    Describes how a ``sklearn.ensemble.RandomForestRegressor`` is built and fitted.
    """
    # Class path resolved by CreateFromType.create(); the Literal pins it to this estimator.
    type: Literal['sklearn.ensemble.RandomForestRegressor'] = 'sklearn.ensemble.RandomForestRegressor'
    # Task kinds declared for this estimator.
    task_type: List[TaskType] = [ 'regressor' ]
    # Arguments passed to the estimator constructor.
    constructor_args: RandomForestRegressorConstructorArgs = RandomForestRegressorConstructorArgs()
    # Maps fit() argument names to `$name` references (e.g. X: $MolFPFeaturizer).
    fit_args: Dict[str, str]


class SklearnDatasetConfig(DatasetConfig):
    """Dataset configuration used by sklearn model specs; adds nothing to DatasetConfig."""
    pass


class SklearnModelSchema(CamelCaseModel, YAML_Model):
    """Wraps the configuration of a single sklearn estimator; built from YAML in this notebook."""
    # One of the supported estimator configs.
    model: Union[KNeighborsRegressorConfig, RandomForestRegressorConfig]


class SklearnModelSpec(CamelCaseModel, YAML_Model):
    """Full specification of a sklearn model: its name, dataset config and estimator schema."""
    # Constant marker of the framework this spec targets.
    framework = 'sklearn'
    # Human-readable name of the model.
    name: str
    # Columns and transforms of the dataset the model consumes.
    dataset: SklearnDatasetConfig
    # Estimator configuration.
    spec: SklearnModelSchema



We also create a class that trains, evaluates and runs the models on a dataset. Later, this class can be improved to log different metrics depending on the model task.

In [8]:


class SklearnModelFunctions(BaseModelFunctions):
    """
    Trains, evaluates, runs and logs the sklearn model described by a spec.

    Rows are selected through the dataset's ``step`` column: 1 is used for
    training, 2 for validation and 3 for testing.

    Attributes:
        spec: The model specification (dataset config + estimator config).
        dataset: The dataframe the model is trained and evaluated on.
        metrics: Metrics helper; currently always configured for regression.
        model: The estimator created by ``train``; ``None`` until then.
    """

    model: Union[None, sklearn.base.RegressorMixin, sklearn.base.ClassifierMixin]

    def __init__(self, spec: SklearnModelSpec, dataset: pd.DataFrame):
        self.spec = spec
        self.dataset = dataset
        self.metrics = Metrics(model_type="regression", return_type="float")
        # Start explicitly untrained so val/test/predict can raise the intended
        # ValueError instead of an AttributeError on a missing attribute.
        self.model = None

    def _prepare_X_and_y(self, dataset: Union[None, pd.DataFrame]=None, filter_step: Union[None, int] = None, targets=True):
        """
        Build the arrays handed to the estimator from the spec's ``fit_args``.

        Args:
            dataset: Frame to read from; defaults to ``self.dataset``.
            filter_step: When given, only rows whose ``step`` equals it are used.
            targets: When True return ``(X, y)``, otherwise only ``X``.

        Returns:
            ``(X, y)`` or ``X``, where X is the stacked content of the column
            referenced by the ``X`` fit argument.
        """
        dataset = dataset if dataset is not None else self.dataset

        if filter_step is not None:
            dataset = dataset[dataset['step'] == filter_step]

        references = get_references_dict(self.spec.spec.model.fit_args)
        # The target column is only looked up when targets are requested, so
        # prediction works on frames that do not contain it.
        args = {
            key: dataset[ref][:]
            for key, ref in references.items()
            if targets or key != 'y'
        }

        assert 'X' in args, 'sklearn models take an X argument'
        X = np.stack(args['X'].to_numpy())

        if not targets:
            return X
        assert 'y' in args, 'sklearn models take an Y argument'
        return X, args['y'].to_numpy()

    def _require_model(self):
        """Return the trained estimator or raise ValueError when train() has not run."""
        if self.model is None:
            raise ValueError('sklearn model not trained')
        return self.model

    def _evaluate(self, filter_step: int, compute_metrics):
        """Predict on the rows of one split, compute its metrics and log them to mlflow."""
        model = self._require_model()
        X, y = self._prepare_X_and_y(filter_step=filter_step)
        metrics_dict = compute_metrics(model.predict(X), y)
        mlflow.log_metrics(metrics_dict)
        return metrics_dict

    def train(self, *, params: BaseModel=None):
        """
        Create the estimator, fit it on the training rows and log training metrics.

        Args:
            params: Currently unused.

        Returns:
            The training metrics that were logged to mlflow.
        """
        self.model = self.spec.spec.model.create()
        # NOTE(review): the return value was never used here; the rest of this
        # class reads the transformed columns from self.dataset, so this relies
        # on build_columns_numpy adding them to the frame in place -- confirm.
        data.build_columns_numpy(
            dataset_config=self.spec.dataset,
            df=self.dataset
        )
        X, y = self._prepare_X_and_y(filter_step=1)
        self.model.fit(X, y)
        y_pred = self.model.predict(X)
        metrics_dict = self.metrics.get_training_metrics(y_pred, y)
        mlflow.log_metrics(metrics_dict)
        return metrics_dict

    def val(self):
        """Compute and log metrics on the validation rows (step 2).

        Raises:
            ValueError: If the model has not been trained.
        """
        return self._evaluate(2, self.metrics.get_validation_metrics)

    def test(self):
        """Compute and log metrics on the test rows (step 3).

        Raises:
            ValueError: If the model has not been trained.
        """
        return self._evaluate(3, self.metrics.get_test_metrics)

    def predict(self, X: pd.DataFrame):
        """
        Predict for the rows of ``X`` after applying the dataset transforms to it.

        Raises:
            ValueError: If the model has not been trained.
        """
        model = self._require_model()
        # NOTE(review): the transforms are applied again to the incoming frame.
        # The run recorded in this notebook fails here with "X has 357 features,
        # but RandomForestRegressor is expecting 1224 features", so a transform
        # can yield a different width than the model was fitted on; reusing the
        # transform state from training is probably required -- confirm.
        dataset = data.build_columns_numpy(
            dataset_config=self.spec.dataset,
            df=X
        )
        features = self._prepare_X_and_y(dataset=dataset, targets=False)
        return model.predict(features)

    def log_model(
        self,
        model_name: Union[None, str] = None,
        version_description: Union[None, str] = None,
        run_id: Union[None, str] = None
    ):
        """Log the estimator through ``log_sklearn_model_and_create_version`` and return its result."""
        return log_sklearn_model_and_create_version(
            self.model,
            model_name=model_name,
            version_description=version_description,
            run_id=run_id
        )


In [9]:


# SAMPL dataset description built with DatasetConfigBuilder: `smiles` feature,
# numeric `expt` target and a fingerprint transform fed from the `smiles` column.
sampl_fp_transform = FPVecFilteredTransformerConfig(
    name="MolFPFeaturizer",
    forward_args={'X': '$smiles'},
)
sampl_dataset_config = (
    DatasetConfigBuilder('SAMPL')
    .with_features(smiles=data_types.SmileDataType())
    .with_targets(expt=data_types.NumericDataType())
    .add_transforms(sampl_fp_transform)
    .build()
)

# Reload the data from disk; it must already contain the split column.
sampl_df = pd.read_csv('SAMPL.csv')
assert 'step' in sampl_df.columns


# Random forest spec. `$MolFPFeaturizer` matches the name given to the transform
# and `$expt` the target column declared in the dataset config above.
rf_model_yaml = """
model:
    type: sklearn.ensemble.RandomForestRegressor
    constructorArgs:
        n_estimators: 100
    fitArgs:
        X: $MolFPFeaturizer
        y: $expt
"""

# KNN spec; no constructorArgs are given, so the config's defaults apply.
knn_model_yaml = """
model:
    type: sklearn.neighbors.KNeighborsRegressor
    fitArgs:
        X: $MolFPFeaturizer
        y: $expt
"""

# Fetch the mlflow registered model, creating it on first use.
client = mlflow.tracking.MlflowClient(
    tracking_uri=os.getenv('MLFLOW_TRACKING_URI'),
)
try:
    reg_model = client.get_registered_model('SAMPL Regressor')
except mlflow.exceptions.MlflowException as exc:
    # get_registered_model raises for an unknown name instead of returning a
    # falsy value, so the lookup has to be guarded. Anything other than
    # "not found" is a real failure and is re-raised.
    if exc.error_code != 'RESOURCE_DOES_NOT_EXIST':
        raise
    reg_model = client.create_registered_model('SAMPL Regressor')


# One SklearnModelFunctions per model, each built from its YAML spec string.
model_yaml_by_key = {
    'rf': ("SAMPL Regressor - Random Forest", rf_model_yaml),
    'knn': ("SAMPL Regressor - KNN", knn_model_yaml),
}
model_functions = {
    key: SklearnModelFunctions(
        dataset=sampl_df,
        spec=SklearnModelSpec(
            name=display_name,
            dataset=sampl_dataset_config,
            spec=SklearnModelSchema.from_yaml_str(yaml_str),
        ),
    )
    for key, (display_name, yaml_str) in model_yaml_by_key.items()
}

# Train, validate, test and register every model; collect one metrics row per model.
results = []
for key, model in model_functions.items():
    with mlflow.start_run(nested=True) as active_run:
        train_metrics = model.train()
        val_metrics = model.val()
        test_metrics = model.test()
        mlflow_model_version = model.log_model(
            model_name="SAMPL Regressor",
            run_id=active_run.info.run_id,
        )
        print(mlflow_model_version)
        all_metrics = train_metrics | val_metrics | test_metrics
        results.append({'key': key, **all_metrics})
pd.DataFrame(results)

  from .autonotebook import tqdm as notebook_tqdm
2023/06/09 16:14:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: SAMPL Regressor, version 3


<ModelVersion: creation_timestamp=1686338085866, current_stage='None', description=None, last_updated_timestamp=1686338085866, name='SAMPL Regressor', run_id='3ba51681dabb47b7b02ebf64a4d1ac2b', run_link=None, source='file:///home/vilma/github.com/trident-bio/mariner/backend/notebooks/mlruns/0/3ba51681dabb47b7b02ebf64a4d1ac2b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>


2023/06/09 16:14:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: SAMPL Regressor, version 4


<ModelVersion: creation_timestamp=1686338087517, current_stage='None', description=None, last_updated_timestamp=1686338087517, name='SAMPL Regressor', run_id='298645bbd1ee499285e0477042d114fa', run_link=None, source='file:///home/vilma/github.com/trident-bio/mariner/backend/notebooks/mlruns/0/298645bbd1ee499285e0477042d114fa/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>


Unnamed: 0,key,train/mse,train/mae,train/ev,train/mape,train/R2,train/pearson,val/mse,val/mae,val/ev,val/mape,val/R2,val/pearson,test/mse,test/mae,test/ev,test/mape,test/R2,test/pearson
0,rf,0.509258,0.3971,0.953504,884.732117,0.952593,0.98098,33.169895,3.657475,0.361493,0.679886,0.102549,0.648435,18.981701,3.393853,0.261331,0.58328,-0.436132,0.515757
1,knn,3.327777,1.317216,0.735269,977.110596,0.690215,0.866206,48.096344,4.645344,0.130169,0.998109,-0.301304,0.364089,29.303942,4.282523,-0.010579,0.678985,-1.2171,0.189845


In [10]:
# Test prediction of models on the validation rows (step 2).
# NOTE(review): the run recorded below stops inside predict() with
# "X has 357 features, but RandomForestRegressor is expecting 1224 features";
# that mismatch is independent of this cell and still has to be resolved.
val_df = sampl_df[sampl_df['step'] == 2]
# Ground truth for the scatter plot; `y` was previously undefined in this cell.
y = val_df['expt'].to_numpy()
for key, model in model_functions.items():
    # predict() adds transform columns to the frame it receives, so give it a
    # private copy instead of a slice of sampl_df (avoids SettingWithCopyWarning).
    y_pred = model.predict(val_df.copy())
    sns.scatterplot(x=y_pred, y=y, label=key)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[transform.name] = apply(transform.create(), value)


ValueError: X has 357 features, but RandomForestRegressor is expecting 1224 features as input.