# Preprocessing Pipeline

In [1]:
# Download SAMPL.csv only when it is not already present, then install molfeat quietly.
# NOTE(review): the molfeat version is not pinned, and `!pip` may target a different
# environment than the running kernel (`%pip` would not) — confirm before sharing.
! [ ! -f SAMPL.csv ] && wget https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv
! pip install molfeat -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from typing import Union, List, Literal, Annotated, Callable, Dict, Any, Tuple

from pprint import pformat
from humps import camel
from pydantic import BaseModel, Field

import numpy as np
import pandas as pd
import sklearn.base
import sklearn.preprocessing
import sklearn.datasets
import sklearn.utils.validation
import sklearn.exceptions
from molfeat.trans.fp import FPVecFilteredTransformer
import torch.nn

from fleet.model_builder import splitters
from fleet import data_types
from fleet.utils.data import (
    dataset_topo_sort,
    get_default_data_type_featurizer,
    get_args,
)
from fleet.yaml_model import YAML_Model
from fleet.dataset_schemas import DatasetConfig, ColumnConfig
from fleet.model_builder.featurizers import DNASequenceFeaturizer
from fleet.model_builder.utils import get_class_from_path_string

In [3]:
# Some utility classes:


class CamelCaseModel(BaseModel):
    """
    Base model whose fields receive aliases generated by ``camel.case``.

    Subclass this class to work with camel case serialization of the model.
    """

    class Config:
        # Aliases are derived from the field names with humps' camel.case.
        alias_generator = camel.case
        # Allow populating fields by their declared names in addition to the aliases.
        allow_population_by_field_name = True
        # NOTE(review): looks like an older spelling of the option above —
        # confirm whether the installed pydantic version still reads it.
        allow_population_by_alias = True
        # Attributes whose names start with an underscore are treated as private.
        underscore_attrs_are_private = True


class CreateFromType(BaseModel):
    """
    Mixin that instantiates a class from its class path (type) and constructor_args.

    Attributes:
        type (str): The class path of the class that will be instantiated.
        constructor_args (BaseModel): The constructor arguments passed to the
            class; may be None, in which case the class is called without
            arguments.
    """

    type: str
    constructor_args: Union[None, BaseModel]

    def create(self):
        """
        Resolves ``type`` to a class and returns a new instance of it.

        The fields of ``constructor_args`` (when set) are forwarded as keyword
        arguments.
        """
        target_class = get_class_from_path_string(self.type)
        keyword_args = (
            self.constructor_args.dict() if self.constructor_args else {}
        )
        return target_class(**keyword_args)

In [4]:
# Featurizers


class FPVecFilteredTransformerConstructorArgs(BaseModel):
    """
    Models the constructor arguments of a FPVecFilteredTransformer.
    """

    # Both fields default to None. CreateFromType.create() forwards
    # constructor_args.dict() as keyword arguments, so these values reach the
    # FPVecFilteredTransformer constructor as given.
    # NOTE(review): confirm that the transformer accepts None for both.
    del_invariant: bool = None
    length: int = None


class FPVecFilteredTransformerConfig(CamelCaseModel, CreateFromType):
    """
    Models the usage of FPVecFilteredTransformer.
    """

    # Key under which the pipeline stores the created featurizer.
    name: str
    # Constructor arguments; defaults to an args object with all fields unset.
    constructor_args: FPVecFilteredTransformerConstructorArgs = (
        FPVecFilteredTransformerConstructorArgs()
    )
    # Discriminator value of the Featurizer union and the class path that
    # create() resolves.
    type: Literal[
        "molfeat.trans.fp.FPVecFilteredTransformer"
    ] = "molfeat.trans.fp.FPVecFilteredTransformer"
    # References to the step inputs, e.g. ["$<column name>"].
    forward_args: Union[Dict[str, str], list[str]]


class DNASequenceFeaturizerConfig(CamelCaseModel, CreateFromType):
    """
    Models the usage of DNASequenceFeaturizer.

    Inherits ``CreateFromType`` like the other step configs so that
    ``create()`` is available: the preprocessing pipeline calls ``create()``
    on every featurizer config, which previously failed for this class.
    ``constructor_args`` is inherited and left unset, so ``create()`` calls
    the featurizer class without arguments.
    NOTE(review): assumes DNASequenceFeaturizer can be constructed without
    arguments — confirm.
    """

    # Key under which the pipeline stores the created featurizer.
    name: str
    # Discriminator value of the Featurizer union and the class path that
    # create() resolves.
    type: Literal[
        "fleet.model_builder.featurizers.DNASequenceFeaturizer"
    ] = "fleet.model_builder.featurizers.DNASequenceFeaturizer"
    # References to the step inputs, e.g. ["$<column name>"].
    forward_args: Union[Dict[str, str], list[str]]


# Union of all featurizer configs; the "type" field selects which member
# a given object is validated against.
Featurizer = Annotated[
    Union[
        FPVecFilteredTransformerConfig,
        DNASequenceFeaturizerConfig,
    ],
    Field(discriminator="type"),
]


class FeaturizerConfig(CamelCaseModel):
    # Root-model wrapper used to parse a single Featurizer; the parsed config
    # is read back through ``__root__``.
    __root__: Featurizer

In [5]:
# Transforms


class LabelEncoderConfig(CreateFromType, CamelCaseModel):
    # Models the usage of sklearn.preprocessing.LabelEncoder. No
    # constructor_args are declared here, so the inherited (optional) field
    # applies.

    # Discriminator value of the Transformer union and the class path that
    # create() resolves.
    type: Literal[
        "sklearn.preprocessing.LabelEncoder"
    ] = "sklearn.preprocessing.LabelEncoder"
    # Key under which the pipeline stores the created transform.
    name: str
    # References to the step inputs, e.g. ["$<column name>"].
    forward_args: Union[Dict[str, str], list[str]]


class StandardScalerConstructorArgs(BaseModel):
    # Models the constructor arguments forwarded to StandardScaler; both
    # options are enabled by default.
    with_mean: bool = True
    with_std: bool = True


class StandardScalerConfig(CreateFromType, CamelCaseModel):
    # Models the usage of sklearn.preprocessing.StandardScaler.

    # Discriminator value of the Transformer union and the class path that
    # create() resolves.
    type: Literal[
        "sklearn.preprocessing.StandardScaler"
    ] = "sklearn.preprocessing.StandardScaler"
    # Constructor arguments; defaults to with_mean=True, with_std=True.
    constructor_args: StandardScalerConstructorArgs = (
        StandardScalerConstructorArgs()
    )
    # Key under which the pipeline stores the created transform.
    name: str
    # References to the step inputs, e.g. ["$<column name>"].
    forward_args: Union[Dict[str, str], list[str]]


# Union of all transform configs; the "type" field selects which member
# a given object is validated against.
Transformer = Annotated[
    Union[
        StandardScalerConfig,
        LabelEncoderConfig,
    ],
    Field(discriminator="type"),
]


class TransformConfig(CamelCaseModel):
    # Root-model wrapper used to parse a single Transformer; the parsed config
    # is read back through ``__root__``.
    __root__: Transformer

In [6]:
# Smoke test: parse a transform description, let the "type" discriminator pick
# StandardScalerConfig, and instantiate the configured class through create().
TransformConfig.parse_obj(
    {
        "name": "transformer",
        "type": "sklearn.preprocessing.StandardScaler",
        "forward_args": ["$sepal_width"],
    }
).__root__.create()

In [7]:
class ColumnConfigWithPreprocessing(CamelCaseModel):
    # Describes one dataset column together with the preprocessing steps
    # declared directly on it.

    # Column name in the data frame.
    name: str
    data_type: data_types.DataType
    # Optional per-column steps; only ``type`` and ``constructor_args`` are
    # given here, the remaining step fields are filled in by
    # DatasetConfigWithPreprocessing.to_dataset_config().
    transforms: Union[None, List[CreateFromType]] = None
    featurizers: Union[None, List[CreateFromType]] = None


# NOTE(review): this definition shadows the DatasetConfig imported from
# fleet.dataset_schemas at the top of the notebook; every later use of the
# name in this notebook refers to the class below.
class DatasetConfig(CamelCaseModel):
    # Flat dataset description: plain column configs plus the featurizers and
    # transforms as separate, named steps.
    name: str
    feature_columns: List[ColumnConfig]
    target_columns: List[ColumnConfig]
    featurizers: List[Featurizer]
    transforms: List[Transformer]


class DatasetConfigWithPreprocessing(CamelCaseModel, YAML_Model):
    """
    Dataset description in which every column carries its own featurizers
    and transforms.
    """

    name: str
    target_columns: List[ColumnConfigWithPreprocessing]
    feature_columns: List[ColumnConfigWithPreprocessing]

    def to_dataset_config(self) -> DatasetConfig:
        """
        Flattens the per-column steps into a :class:`DatasetConfig`.

        Steps of a column are chained: the first one reads ``$<column name>``
        and every following one reads the output of its predecessor. Step
        names follow the pattern ``<column name>-feat-<index>`` for
        featurizers and transforms alike.
        """

        def parse_steps(column, attr, parser):
            # Returns the parsed step configs declared under `attr` on `column`.
            declared = getattr(column, attr, None)
            if not isinstance(declared, list):
                return []
            parsed = []
            input_key = column.name
            for position, step in enumerate(declared):
                step_key = f"{column.name}-feat-{position}"
                step_args = step.dict() | {
                    "name": step_key,
                    "forward_args": [f"${input_key}"],
                }
                # Drop an unset constructor_args so the parser's default applies.
                if step_args["constructor_args"] is None:
                    del step_args["constructor_args"]
                parsed.append(parser.parse_obj(step_args).__root__)
                input_key = step_key
            return parsed

        featurizers = []
        transforms = []
        for column in self.feature_columns + self.target_columns:
            featurizers.extend(
                parse_steps(column, "featurizers", FeaturizerConfig)
            )
            transforms.extend(parse_steps(column, "transforms", TransformConfig))

        plain_targets = [
            ColumnConfig(name=column.name, data_type=column.data_type)
            for column in self.target_columns
        ]
        plain_features = [
            ColumnConfig(name=column.name, data_type=column.data_type)
            for column in self.feature_columns
        ]
        return DatasetConfig(
            name=self.name,
            target_columns=plain_targets,
            feature_columns=plain_features,
            featurizers=featurizers,
            transforms=transforms,
        )

In [8]:
# Load iris as a Bunch of pandas objects; kept under its own name so that
# `iris_df` only ever refers to the final DataFrame.
iris_bunch = sklearn.datasets.load_iris(as_frame=True)

classes = iris_bunch.target_names


def get_class(item):
    """Maps an integer class index (or an array of indices) to the class name(s)."""
    return classes[item]


# Feature columns plus a readable "species" column derived from the integer target.
iris_df = iris_bunch.data.assign(
    species=get_class(iris_bunch.target.to_numpy())
)

# Describe the iris dataset with per-column transforms (a LabelEncoder on the
# target, a StandardScaler on each feature) and flatten the description into a
# DatasetConfig with named steps.
iris_config = DatasetConfigWithPreprocessing.from_yaml_str(
    """
name: Iris
targetColumns:
  - name: species
    dataType:
      domainKind: categorical
      classes: {}
    transforms:
      - type: sklearn.preprocessing.LabelEncoder
featureColumns:
  - name: sepal length (cm)
    dataType:
      domainKind: numeric
      unit: cm
    transforms:
      - type: sklearn.preprocessing.StandardScaler
  - name: sepal width (cm)
    dataType:
      domainKind: numeric
      unit: cm
    transforms:
      - type: sklearn.preprocessing.StandardScaler
  - name: petal length (cm)
    dataType:
      domainKind: numeric
      unit: cm
    transforms:
      - type: sklearn.preprocessing.StandardScaler
  - name: petal width (cm)
    dataType:
      domainKind: numeric
      unit: cm
    transforms:
      - type: sklearn.preprocessing.StandardScaler
"""
).to_dataset_config()

# Show the flattened config, then summary statistics of the raw columns.
print(repr(iris_config))
iris_df.describe()

DatasetConfig(name='Iris', feature_columns=[ColumnConfig(name='sepal length (cm)', data_type=NumericDataType(domain_kind='numeric')), ColumnConfig(name='sepal width (cm)', data_type=NumericDataType(domain_kind='numeric')), ColumnConfig(name='petal length (cm)', data_type=NumericDataType(domain_kind='numeric')), ColumnConfig(name='petal width (cm)', data_type=NumericDataType(domain_kind='numeric'))], target_columns=[ColumnConfig(name='species', data_type=CategoricalDataType(domain_kind='categorical', classes={}))], featurizers=[], transforms=[StandardScalerConfig(type='sklearn.preprocessing.StandardScaler', constructor_args=StandardScalerConstructorArgs(with_mean=True, with_std=True), name='sepal length (cm)-feat-0', forward_args=['$sepal length (cm)']), StandardScalerConfig(type='sklearn.preprocessing.StandardScaler', constructor_args=StandardScalerConstructorArgs(with_mean=True, with_std=True), name='sepal width (cm)-feat-0', forward_args=['$sepal width (cm)']), StandardScalerConfig(t

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
class PreprocessingPipeline:
    """
    Fits and applies the featurizers and transforms of a dataset config.

    Every featurizer/transform config is instantiated once through its
    ``create()`` method and stored by step name. Steps are executed in the
    order produced by ``dataset_topo_sort``, featurizers before transforms.

    Args:
        dataset_config: The object describing the columns, featurizers and
            transforms of the dataset.
    """

    # Attributes captured by get_state() and restored by load_from_state().
    _state_attrs = ["featurizers", "transforms", "dataset_config", "_fitted"]

    def __init__(
        self,
        dataset_config: "DatasetConfig",
    ):
        """
        Creates the pipeline steps without executing them.

        Args:
            dataset_config: The object describing the pipeline.
        """
        self.dataset_config = dataset_config
        # Step configs, keyed by step name.
        self.featurizers_config = {
            feat.name: feat for feat in dataset_config.featurizers
        }
        self.transforms_config = {
            transform.name: transform
            for transform in dataset_config.transforms
        }
        # Instantiated (not yet fitted) step objects, keyed by step name.
        self.featurizers = {
            feat.name: feat.create() for feat in dataset_config.featurizers
        }
        self.transforms = {
            transform.name: transform.create()
            for transform in dataset_config.transforms
        }
        self._fitted = False

    def get_state(self):
        """
        Returns a dict with the attributes listed in ``_state_attrs``.

        The values are the live objects held by the pipeline, not copies.
        """
        return {
            state_attr: getattr(self, state_attr)
            for state_attr in self._state_attrs
        }

    @classmethod
    def load_from_state(cls, state):
        """
        Rebuilds a pipeline from a dict produced by :meth:`get_state`.

        A pipeline is first constructed from ``state["dataset_config"]`` and
        then every attribute in ``_state_attrs`` is replaced by the stored
        value, so the stored step objects are reused as they are.
        """
        instance = cls(state["dataset_config"])
        for state_attr in cls._state_attrs:
            setattr(instance, state_attr, state[state_attr])
        return instance

    def _prepare_transform(self, func: Any):
        """
        Normalizes ``func`` into a transformer object.

        Returns:
            ``func`` itself when it is a ``sklearn.base.TransformerMixin``,
            otherwise a ``FunctionTransformer`` wrapping the callable.

        Raises:
            ValueError: If ``func`` is neither a transformer nor callable.
        """
        if isinstance(func, sklearn.base.TransformerMixin):
            return func
        if callable(func):
            return sklearn.preprocessing.FunctionTransformer(func)
        raise ValueError(
            "func must be one of %r"
            % (["sklearn.base.TransformerMixin", "Callable"])
        )

    def get_X_and_y(
        self, df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Splits ``df`` into the configured feature and target columns.

        Returns:
            A ``(X, y)`` tuple of data frames selected by column name.
        """
        feature_columns = [
            col.name for col in self.dataset_config.feature_columns
        ]
        target_columns = [
            col.name for col in self.dataset_config.target_columns
        ]
        return df.loc[:, feature_columns], df.loc[:, target_columns]

    def _prepare_X_and_y(self, X, y):
        """
        Converts ``X`` and ``y`` to data frames named after the config columns.

        Inputs that already are data frames are returned unchanged. A ``y`` of
        None becomes an empty frame that only carries the target column names.

        Raises:
            TypeError: If building either data frame fails.
        """
        try:
            if not isinstance(X, pd.DataFrame):
                X = pd.DataFrame(
                    X,
                    columns=[
                        col.name for col in self.dataset_config.feature_columns
                    ],
                )

            if not isinstance(y, pd.DataFrame):
                y = pd.DataFrame(
                    y,
                    columns=[
                        col.name for col in self.dataset_config.target_columns
                    ],
                )
            return X, y
        except Exception as exc:
            # Keep the original failure attached as the cause.
            raise TypeError(
                "X and y must be pandas.DataFrame or numpy.ndarray"
            ) from exc

    def get_preprocess_steps(self):
        """
        Returns the ``(config, step object)`` pairs in execution order.

        Featurizers come first, then transforms; within each group the order
        is the one returned by ``dataset_topo_sort``.
        """
        feats, transforms = map(list, dataset_topo_sort(self.dataset_config))
        steps = [(config, self.featurizers[config.name]) for config in feats]
        steps.extend(
            (config, self.transforms[config.name]) for config in transforms
        )
        return steps

    def _run_step(self, method_name, data, config, transformer):
        """
        Calls ``transformer.<method_name>`` on the inputs of one step.

        Inputs are reshaped to a single column unless the step is a molfeat
        step (its ``type`` contains "molfeat"); this rule is shared by fit,
        fit_transform and transform so the three stay consistent.

        Raises:
            RuntimeError: If reshaping or the step itself fails; the original
                exception is attached as the cause.
        """
        # presumably resolves the step's forward_args against `data` — verify in fleet.utils.data
        args = get_args(data, config)
        # The error text names the action; fit_transform reports as "fit".
        action = "transform" if method_name == "transform" else "fit"
        try:
            if "molfeat" not in config.type:
                args = [arg.reshape(-1, 1) for arg in args]
            return getattr(transformer, method_name)(*args)
        except Exception as exc:
            raise RuntimeError(
                "Failed to %s %r" % (action, pformat(config))
            ) from exc

    def fit(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, np.ndarray, None] = None,
    ):
        """
        Fits the featurizers and transforms to the data.

        Step outputs are not written back to the working frame here.
        NOTE(review): a step that reads another step's output may therefore
        not find its input during fit — confirm against get_args.
        """
        X, y = self._prepare_X_and_y(X, y)

        data = pd.concat([X, y], axis=1)

        for config, transformer in self.get_preprocess_steps():
            self._run_step("fit", data, config, transformer)
        self._fitted = True

    def fit_transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, np.ndarray, None] = None,
    ):
        """
        Fits every step and stores its output as a new column.

        Returns:
            The concatenation of ``X`` and ``y`` with one extra column per
            step, named after the step.
        """
        X, y = self._prepare_X_and_y(X, y)

        data = pd.concat([X, y], axis=1)

        for config, transformer in self.get_preprocess_steps():
            data[config.name] = self._run_step(
                "fit_transform", data, config, transformer
            )

        self._fitted = True

        return data

    def transform(
        self,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.DataFrame, np.ndarray, None] = None,
    ):
        """
        Applies the steps and stores each output as a new column.

        Nothing is fitted here, so the fitted flag is left untouched.

        Returns:
            The concatenation of ``X`` and ``y`` with one extra column per
            step, named after the step.
        """
        X, y = self._prepare_X_and_y(X, y)

        data = pd.concat([X, y], axis=1)

        for config, transformer in self.get_preprocess_steps():
            data[config.name] = self._run_step(
                "transform", data, config, transformer
            )

        return data

In [10]:
# Fit and apply all steps in one pass; the result keeps the raw columns and
# adds one column per step.
iris_preprocessing = PreprocessingPipeline(iris_config)
X, y = iris_preprocessing.get_X_and_y(iris_df)
iris_preprocessing.fit_transform(X, y)

  y = column_or_1d(y, warn=True)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,sepal length (cm)-feat-0,sepal width (cm)-feat-0,petal length (cm)-feat-0,petal width (cm)-feat-0,species-feat-0
0,5.1,3.5,1.4,0.2,setosa,-0.900681,1.019004,-1.340227,-1.315444,0
1,4.9,3.0,1.4,0.2,setosa,-1.143017,-0.131979,-1.340227,-1.315444,0
2,4.7,3.2,1.3,0.2,setosa,-1.385353,0.328414,-1.397064,-1.315444,0
3,4.6,3.1,1.5,0.2,setosa,-1.506521,0.098217,-1.283389,-1.315444,0
4,5.0,3.6,1.4,0.2,setosa,-1.021849,1.249201,-1.340227,-1.315444,0
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,1.038005,-0.131979,0.819596,1.448832,2
146,6.3,2.5,5.0,1.9,virginica,0.553333,-1.282963,0.705921,0.922303,2
147,6.5,3.0,5.2,2.0,virginica,0.795669,-0.131979,0.819596,1.053935,2
148,6.2,3.4,5.4,2.3,virginica,0.432165,0.788808,0.933271,1.448832,2


In [11]:
# Same data through separate fit() and transform() calls on a fresh pipeline.
iris_preprocessing = PreprocessingPipeline(iris_config)
X, y = iris_preprocessing.get_X_and_y(iris_df)
iris_preprocessing.fit(X, y)
iris_preprocessing.transform(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,sepal length (cm)-feat-0,sepal width (cm)-feat-0,petal length (cm)-feat-0,petal width (cm)-feat-0,species-feat-0
0,5.1,3.5,1.4,0.2,setosa,-0.900681,1.019004,-1.340227,-1.315444,0
1,4.9,3.0,1.4,0.2,setosa,-1.143017,-0.131979,-1.340227,-1.315444,0
2,4.7,3.2,1.3,0.2,setosa,-1.385353,0.328414,-1.397064,-1.315444,0
3,4.6,3.1,1.5,0.2,setosa,-1.506521,0.098217,-1.283389,-1.315444,0
4,5.0,3.6,1.4,0.2,setosa,-1.021849,1.249201,-1.340227,-1.315444,0
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,1.038005,-0.131979,0.819596,1.448832,2
146,6.3,2.5,5.0,1.9,virginica,0.553333,-1.282963,0.705921,0.922303,2
147,6.5,3.0,5.2,2.0,virginica,0.795669,-0.131979,0.819596,1.053935,2
148,6.2,3.4,5.4,2.3,virginica,0.432165,0.788808,0.933271,1.448832,2


In [12]:
# Fresh pipeline, fitted only, so its state can be inspected below.
iris_preprocessing = PreprocessingPipeline(iris_config)
X, y = iris_preprocessing.get_X_and_y(iris_df)


iris_preprocessing.fit(X, y)


def describe_preprocessing(iris_preprocessing):
    """
    Summarizes the state of every step of a preprocessing pipeline.

    Args:
        iris_preprocessing: Any object whose ``get_state()`` returns a dict
            with "transforms" and "featurizers" mappings of step name to
            step object.

    Returns:
        A :class:`pd.DataFrame` with one row per step: its name, whether
        ``check_is_fitted`` accepts it and, for fitted steps only, the
        learned ``mean``/``var`` (StandardScaler) or ``classes``
        (LabelEncoder).
    """
    state = iris_preprocessing.get_state()
    steps = state["transforms"] | state["featurizers"]
    rows = []
    for step_name, step in steps.items():
        row = {"name": step_name}
        try:
            sklearn.utils.validation.check_is_fitted(step)
            row["fitted"] = True
        except sklearn.exceptions.NotFittedError:
            row["fitted"] = False
        # Learned attributes exist only after fitting; reading them on an
        # unfitted step would raise AttributeError.
        if row["fitted"]:
            if isinstance(step, sklearn.preprocessing.StandardScaler):
                row["mean"] = step.mean_
                row["var"] = step.var_
            elif isinstance(step, sklearn.preprocessing.LabelEncoder):
                row["classes"] = str(step.classes_)
        rows.append(row)
    return pd.DataFrame(rows)


# One row per step of the fitted iris pipeline.
describe_preprocessing(iris_preprocessing)

  y = column_or_1d(y, warn=True)


Unnamed: 0,name,fitted,mean,var,classes
0,sepal length (cm)-feat-0,True,[5.843333333333334],[0.6811222222222223],
1,sepal width (cm)-feat-0,True,[3.0573333333333337],[0.1887128888888889],
2,petal length (cm)-feat-0,True,[3.7580000000000005],[3.0955026666666665],
3,petal width (cm)-feat-0,True,[1.1993333333333336],[0.5771328888888888],
4,species-feat-0,True,,,['setosa' 'versicolor' 'virginica']


In [13]:
# Rebuild a pipeline from the state of the fitted one; the description should
# match the table above because the stored step objects are reused.
loaded_iris_preprocessing = PreprocessingPipeline.load_from_state(
    iris_preprocessing.get_state()
)
describe_preprocessing(loaded_iris_preprocessing)

Unnamed: 0,name,fitted,mean,var,classes
0,sepal length (cm)-feat-0,True,[5.843333333333334],[0.6811222222222223],
1,sepal width (cm)-feat-0,True,[3.0573333333333337],[0.1887128888888889],
2,petal length (cm)-feat-0,True,[3.7580000000000005],[3.0955026666666665],
3,petal width (cm)-feat-0,True,[1.1993333333333336],[0.5771328888888888],
4,species-feat-0,True,,,['setosa' 'versicolor' 'virginica']


In [14]:
# SAMPL dataset: a molfeat fingerprint featurizer on the SMILES column and a
# StandardScaler on the numeric target.
sampl_config = DatasetConfigWithPreprocessing.from_yaml_str(
    """
name: SAMPL
featureColumns:
  - name: smiles
    dataType:
      domainKind: smiles
    featurizers:
      - type: molfeat.trans.fp.FPVecFilteredTransformer
targetColumns:
  - name: expt
    dataType:
      domainKind: numeric
    transforms:
      - type: sklearn.preprocessing.StandardScaler
"""
).to_dataset_config()

sampl_df = pd.read_csv("SAMPL.csv")

# Compute the scaffold split only when the CSV does not carry one yet, and
# write it back right away so later runs reuse it. The write belongs to this
# branch: it must not depend on the state of the unrelated iris frame.
# presumably apply_split_indexes adds the "step" column in place — confirm
if "step" not in sampl_df.columns:
    splitters.apply_split_indexes(
        sampl_df,
        split_type="scaffold",
        split_column="smiles",
        split_target="80-10-10",
    )
    sampl_df.to_csv("SAMPL.csv", index=False)

if "step" not in iris_df.columns:
    splitters.apply_split_indexes(
        iris_df, split_type="random", split_target="60-20-20"
    )

# Fit the SAMPL pipeline and show the state of each step.
sampl_preprocessing = PreprocessingPipeline(sampl_config)
X, y = sampl_preprocessing.get_X_and_y(sampl_df)
sampl_preprocessing.fit(X, y)
describe_preprocessing(sampl_preprocessing)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,name,fitted,mean,var
0,expt-feat-0,True,[-3.8030062305295944],[14.782657785007911]
1,smiles-feat-0,False,,
