In [4]:
from dotenv import load_dotenv

# Load environment variables from ".env", then from ".env.secret"; the second
# call passes override=True so its values replace variables that are already set.
# NOTE(review): this runs before the project imports below, presumably so those
# modules can read the variables at import time; confirm before reordering.
load_dotenv(".env")
load_dotenv(".env.secret", override=True)
from fleet.scikit_.schemas import SklearnModelSpec
import pandas as pd

# YAML model specification: an sklearn RandomForestClassifier on the "HIV"
# dataset, with feature columns `smiles` (MolFPFeaturizer) and `activity`
# (OneHotEncoder) and the target column `HIV_active` (LabelEncoder).
# NOTE(review): `constructor_args` under the `smiles` column is indented as a
# sibling of `featurizers`, not as a key of the MolFPFeaturizer item. That looks
# unintended: `del_invariant` / `length` may never reach the featurizer.
# TODO confirm against the schema definition.
schema = """
name: HIV Random Forest Classifier
framework: sklearn
dataset:
  name: HIV
  strategy: pipeline
  featureColumns:
    - name: smiles
      dataType:
        domainKind: smiles
      featurizers:
        - name: MolFPFeaturizer
          type: molfeat.trans.fp.FPVecFilteredTransformer
      constructor_args:
        del_invariant: False
        length: 512
    - name: activity
      dataType:
        domainKind: categorical
        classes:
          CI: 1
          CM: 0
      featurizers:
        - name: OneHotEncoder
          type: sklearn.preprocessing.OneHotEncoder
  targetColumns:
    - name: HIV_active
      dataType:
        domainKind: categorical
        classes:
          positive: 1
          negative: 0
      featurizers:
        - name: LabelEncoder-Out
          type: sklearn.preprocessing.LabelEncoder
spec:
  model:
    type: sklearn.ensemble.RandomForestClassifier
"""

# Build the model spec object from the YAML text above.
spec = SklearnModelSpec.from_yaml_str(schema)

import pandas as pd
from pathlib import Path

def _activity_label(flag):
    """Return "positive" for a truthy activity flag and "negative" otherwise."""
    if flag:
        return "positive"
    return "negative"


# Directory with the CSV fixtures, relative to the notebook's working directory.
datasets = Path("..", "tests", "data", "csv")
df = pd.read_csv(datasets.joinpath("HIV.csv"))
# Replace the raw HIV_active values with the "positive"/"negative" labels
# declared for the target column in the schema.
df["HIV_active"] = df["HIV_active"].apply(_activity_label)
df["HIV_active"].unique()  # not the cell's last expression, so the result is discarded
# Save the relabelled frame next to the original file.
df.to_csv(datasets.joinpath("HIV2.csv"), index=False)

In [2]:
# Class label -> integer index.
classes = {"CI": 1, "CO": 0}
# (label, index) pairs ordered by ascending index.
entries = sorted(classes.items(), key=lambda pair: pair[1])
entries

[('CO', 0), ('CI', 1)]

In [3]:
from fleet import model_functions
from mariner.core import mlflowapi  # TODO: move to fleet

# A registered mlflow model has to exist so that every model is tracked.
# Only mlflow is touched here: mariner's own experiment and model records are
# deliberately not required, to avoid adding dependencies from fleet to mariner.
mlflow_model_name = "TEST"
existing_model = mlflowapi.get_registered_model(name=mlflow_model_name)
# Reuse the registered model when the lookup returns something truthy,
# otherwise create it.
reg_model = existing_model or mlflowapi.create_registered_model(
    name=mlflow_model_name,
    description="TEST Model description",
    tags={"stage": "test"},
)
print("Registered Mlflow Model: %r" % reg_model)

Registered Mlflow Model: <RegisteredModel: creation_timestamp=1697481420737, description='TEST Model description', last_updated_timestamp=1697543087939, latest_versions=[<ModelVersion: creation_timestamp=1697543087939, current_stage='None', description='', last_updated_timestamp=1697543087939, name='TEST', run_id='127d831cdfcc4681a7426e7cf9fd4774', run_link='', source='s3://dev-mariner-datasets/33/127d831cdfcc4681a7426e7cf9fd4774/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='17'>], name='TEST', tags={'stage': 'test'}>


In [4]:
import re

# Search mlflow experiments using "TEST-TRAIN-" as the query and keep the first hit.
# NOTE(review): indexing [0] assumes search_experiments returns at least one
# experiment and lists the most recent one first; confirm against mlflowapi.
mlflow_experiment_name = "TEST-TRAIN-"
experiments = mlflowapi.search_experiments(mlflow_experiment_name)
last_experiment = experiments[0]


def inc_name(name: str):
    """Return ``name`` with its trailing integer incremented by one.

    Only the run of digits at the very end of ``name`` is replaced. The same
    digits appearing earlier in the name are left untouched, e.g.
    ``"TEST-1-TRAIN-1"`` becomes ``"TEST-1-TRAIN-2"``.

    Args:
        name: A name ending in one or more digits, e.g. ``"TEST-TRAIN-31"``.

    Returns:
        The text preceding the trailing digits followed by the incremented
        number. Leading zeros are not preserved (``"run-009"`` -> ``"run-10"``).

    Raises:
        IndexError: If ``name`` does not end in a digit.
    """
    suffix = re.search(r"\d+$", name)
    if suffix is None:
        # IndexError (rather than ValueError) keeps the exception type that
        # indexing an empty match list would produce, but with a clear message.
        raise IndexError(f"{name!r} does not end in a number")
    # Slice at the match position: looking the digits up with str.index() would
    # find their first occurrence, which may be earlier in the name.
    return name[: suffix.start()] + str(int(suffix.group()) + 1)


# Derive the next experiment name by incrementing the numeric suffix of
# `last_experiment.name`, then show it as the cell output.
mlflow_experiment_name = inc_name(last_experiment.name)
mlflow_experiment_name

'TEST-TRAIN-32'

In [5]:
# Arguments of the training call, gathered in one place so they can be inspected.
fit_kwargs = dict(
    spec=spec,
    dataset=df,
    mlflow_model_name=mlflow_model_name,
    mlflow_experiment_name=mlflow_experiment_name,
    experiment_name="TEST-TRAIN-1",
    user_id=1,
    train_config=None,
)
# Train the model described by `spec` on the relabelled frame and show the result.
result = model_functions.fit(**fit_kwargs)
result

categories ['CM', 'CI']
['smiles-out', 'activity-out']
['HIV_active-out']


  from .autonotebook import tqdm as notebook_tqdm
2023/10/17 09:40:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: TEST, version 18
Failed metrics to http://localhost:8000 /api/v1/experiments/epoch_metrics. Make sure the env var "SERVER_HOST" is populated in the ray services, and that it points to the mariner backend


['smiles-out', 'activity-out']
['HIV_active-out']


Result(mlflow_experiment_id='34', mlflow_model_version=<ModelVersion: creation_timestamp=1697546442759, current_stage='None', description='', last_updated_timestamp=1697546442759, name='TEST', run_id='180d72786e3f4bad8b7ce65369799864', run_link='', source='s3://dev-mariner-datasets/34/180d72786e3f4bad8b7ce65369799864/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='18'>)

# The following code can be run standalone.

In [6]:
from dotenv import load_dotenv

# Load environment variables from ".env", then from ".env.secret"; the second
# call passes override=True so its values replace variables that are already set.
# NOTE(review): this runs before the project imports below, presumably so those
# modules can read the variables at import time; confirm before reordering.
load_dotenv(".env")
load_dotenv(".env.secret", override=True)
import pandas as pd
from pathlib import Path
from mariner.core import mlflowapi
from fleet import (
    mlflow as fleet_mlflowapi,
)  # Todo merge other api into this one
from fleet import model_functions
import mlflow
from fleet.scikit_.schemas import SklearnModelSpec

mlflow_model_name = "TEST"
# NOTE(review): the fixture directory is rooted at "." here but at ".." in the
# first cell; which one resolves depends on the working directory. Confirm.
datasets = Path(".", "tests", "data", "csv")
df = pd.read_csv(datasets.joinpath("HIV.csv"))
# Keep only the rows whose `step` column equals 3.
test_data = df.loc[df["step"] == 3]

# Take the last entry of the registered model's `latest_versions` and fetch the
# run, pipeline and model config stored for that version's run id.
reg_model = mlflowapi.get_registered_model(name=mlflow_model_name)
version = reg_model.latest_versions[-1]
print(version)
run_id = version.run_id
run = mlflowapi.get_run(run_id)
pipeline = fleet_mlflowapi.load_pipeline(run_id)
config_uri = run.info.artifact_uri + "/model_config.yaml"
schema = mlflow.artifacts.load_text(config_uri)
spec = SklearnModelSpec.from_yaml_str(schema)

<ModelVersion: creation_timestamp=1697546442759, current_stage='None', description='', last_updated_timestamp=1697546442759, name='TEST', run_id='180d72786e3f4bad8b7ce65369799864', run_link='', source='s3://dev-mariner-datasets/34/180d72786e3f4bad8b7ce65369799864/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='18'>
categories ['CM', 'CI']


In [7]:
# Show the last featurizer config in the loaded pipeline's dataset config.
pipeline.dataset_config.featurizers[-1]

LabelEncoderConfig(type='sklearn.preprocessing.LabelEncoder', constructor_args=None, name='HIV_active-out', forward_args={'X': '$HIV_active'})

In [8]:
import numpy as np

# The "HIV_active-out" entry unpacks into the featurizer and a second value,
# `adapt_args` (presumably a call-argument adapter; unused in this cell).
featurizer, adapt_args = pipeline.featurizers["HIV_active-out"]
# Decode encoded classes 0 and 1 back to their labels. 1-D arrays of shape
# (n_samples,) are passed on purpose: a (3, 1) column matrix makes
# inverse_transform emit a DataConversionWarning.
print(featurizer.inverse_transform(np.zeros(3, dtype=int)))
print(featurizer.inverse_transform(np.ones(3, dtype=int)))
featurizer.classes_

['avocado' 'avocado' 'avocado']
['potato' 'potato' 'potato']


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array(['avocado', 'potato'], dtype=object)

In [9]:
# Arguments of the prediction call, gathered in one place so they can be inspected.
predict_kwargs = dict(
    spec=spec,
    mlflow_model_name=mlflow_model_name,
    mlflow_model_version=version.version,
    input_=test_data,
    return_labels=True,
)
# Run the prediction on the held-out rows; the returned value is the cell output.
model_functions.predict(**predict_kwargs)

categories ['CM', 'CI']
skip_roots=%r ['HIV_active']
Adding config %r smiles
Adding config %r smiles-out
Adding config %r activity
Adding config %r activity-out
['smiles-out', 'activity-out']
skip_roots=%r ['smiles', 'activity']
Adding config %r HIV_active
Adding config %r HIV_active-out


{'HIV_active': ['potato', 'potato', 'potato', 'potato', 'potato', 'potato']}

In [10]:
# Example of using featurizers to do and undo data transformations

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def _show_round_trip(encoder, data):
    """Fit ``encoder`` on ``data``, print the encoding and then its inverse.

    Returns the encoded data.
    """
    encoded = encoder.fit_transform(data)
    print(encoded)
    print(encoder.inverse_transform(encoded))
    return encoded


featurizers = {"one-hot": OneHotEncoder(), "label-encoder": LabelEncoder()}
one_hot = featurizers["one-hot"]
label_encoder = featurizers["label-encoder"]

# A one-hot encoder only encodes matrices, so the whole frame is passed.
df = pd.DataFrame({"x1": ["A", "A", "B", "C"], "y": [0, 0, 1, 1]})
transformed = _show_round_trip(one_hot, df)

# A label encoder only encodes vectors, such as a single column.
transformed = _show_round_trip(label_encoder, df.y)

# To one-hot encode a vector, reshape it into a column matrix with .reshape(-1, 1).
x1_column = df.x1.to_numpy().reshape(-1, 1)
transformed = _show_round_trip(one_hot, x1_column)

# A label encoder given a matrix fails:
#   ValueError: y should be a 1d array, got an array of shape (4, 2) instead.
# transformed = featurizers['label-encoder'].fit_transform(df.to_numpy())
# print(transformed)
# print(featurizers['label-encoder'].inverse_transform(transformed))

# A label encoder given a column matrix works, but emits a warning:
#   sklearn/preprocessing/_label.py:114: DataConversionWarning: A column-vector y was passed when a 1d array was expected.
#   Please change the shape of y to (n_samples, ), for example using ravel().
y_column = df.y.to_numpy().reshape(-1, 1)
transformed = _show_round_trip(label_encoder, y_column)

  (0, 0)	1.0
  (0, 3)	1.0
  (1, 0)	1.0
  (1, 3)	1.0
  (2, 1)	1.0
  (2, 4)	1.0
  (3, 2)	1.0
  (3, 4)	1.0
[['A' 0]
 ['A' 0]
 ['B' 1]
 ['C' 1]]
[0 0 1 1]
[0 0 1 1]
  (0, 0)	1.0
  (1, 0)	1.0
  (2, 1)	1.0
  (3, 2)	1.0
[['A']
 ['A']
 ['B']
 ['C']]
[0 0 1 1]
[0 0 1 1]


  y = column_or_1d(y, warn=True)
