In [1]:
# Environment bootstrap for the notebook: read variables from the project's
# dotenv files, then pin the service endpoints to the local machine.
# NOTE(review): this presumably has to run before the `fleet` imports below so
# they observe these values at import time — confirm before reordering.
from dotenv import load_dotenv

# Two dotenv files are loaded in sequence: the general one first, then the
# secrets file.
load_dotenv("../.env")
load_dotenv("../.env.secret")
import os

# Explicitly set (overwriting whatever is already in the process environment)
# the MLflow tracking URI and the Postgres host so both point at localhost.
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
os.environ["POSTGRES_SERVER"] = "localhost"

import pandas as pd
from fleet.utils import data
from torch.utils.data import DataLoader
from torch import nn
import torch
import datetime
from fleet.model_builder import splitters
from fleet.model_builder.constants import TrainingStep
from fleet.base_schemas import TorchModelSpec
from fleet.model_functions import fit
from fleet.torch_.schemas import TorchTrainingConfig
from fleet.model_builder import optimizers
from fleet.model_builder.dataset import Collater

# Inputs for this walkthrough: a YAML model specification and the CSV
# dataset it is trained on (both taken from the test fixtures).
model_path = "../tests/data/yaml/multiclass_classification_model.yaml"
csv_path = "../tests/data/csv/iris.csv"

# Echo the raw YAML so the reader can see the spec that is about to be parsed.
with open(model_path, encoding="utf-8") as model_file:
    yaml_text = model_file.read()
print(yaml_text)


# Parse the spec and read the tabular data.
model_spec = TorchModelSpec.from_yaml(model_path)
df = pd.read_csv(csv_path)

# Adds the "step" column to the dataframe according to a random 90-5-5 split.
splitters.apply_split_indexes(
    df,
    split_type="random",
    split_target="90-5-5",
)

# Apply the featurizers and transforms declared in the dataset config,
# producing numpy-backed columns.
dataset_config = model_spec.dataset
df = data.build_columns_numpy(dataset_config=dataset_config, df=df)

# Wrap the preprocessed dataframe in a torch dataset and batch it with the
# project's collate function.
dataset = data.MarinerTorchDataset(data=df, dataset_config=dataset_config)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    collate_fn=Collater(),
)

# Display one batch as an example of what the loader yields.
first_batch = next(iter(dataloader))
first_batch

name: multiclass_classification_model
dataset:
  name: Iris
  targetColumns:
    - name: species
      dataType:
        domainKind: categorical
        classes:
          0: 0
          1: 1
          2: 2
      outModule: Linear1
  featureColumns:
    - name: sepal_length
      dataType:
        domainKind: numeric
        unit: cm
    - name: sepal_width
      dataType:
        domainKind: numeric
        unit: cm
    - name: petal_length
      dataType:
        domainKind: numeric
        unit: cm
    - name: petal_width
      dataType:
        domainKind: numeric
        unit: cm

spec:
  layers:
    - name: Concat
      type: fleet.model_builder.layers.Concat
      constructorArgs:
        dim: 1
      forwardArgs:
        xs:
          - $sepal_length
          - $sepal_width
          - $petal_length
          - $petal_width

    - name: Linear0
      type: torch.nn.Linear
      constructorArgs:
        in_features: 4
        out_features: 128
      forwardArgs:
        input: 

{'sepal_length': tensor([[6.9000],
         [7.7000],
         [6.3000],
         [5.2000]]),
 'sepal_width': tensor([[3.1000],
         [3.8000],
         [2.9000],
         [3.5000]]),
 'petal_length': tensor([[4.9000],
         [6.7000],
         [5.6000],
         [1.5000]]),
 'petal_width': tensor([[1.5000],
         [2.2000],
         [1.8000],
         [0.2000]]),
 'species': tensor([[1],
         [2],
         [2],
         [0]])}

In [2]:
# Timestamp interpolated into the MLflow model and experiment names below.
now = datetime.datetime.now()

# Training hyperparameters. There is no checkpoint_config here, so models are
# NOT logged — only metrics are uploaded to MLflow.
train_config = TorchTrainingConfig(
    epochs=10,
    batch_size=20,
    optimizer=optimizers.AdamOptimizer(),
)

# Split settings handed to the datamodule used by fit().
# NOTE(review): `df` already went through the split/featurize steps in the
# preprocessing cell — confirm fit() is meant to receive that frame and apply
# this 80-10-10 split on top of it.
datamodule_args = {
    "split_type": "random",
    "split_target": "80-10-10",
}

# Train the model described by `model_spec` on `df`.
result = fit(
    spec=model_spec,
    train_config=train_config,
    mlflow_model_name=f"Test-model-{now}",
    mlflow_experiment_name=f"Test-experiment-{now}",
    datamodule_args=datamodule_args,
    dataset=df,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name   | Type       | Params
--------------------------------------
0 | _model | ModuleDict | 1.0 K 
--------------------------------------
1.0 K     Trainable params
0         Non-trainable params
1.0 K     Total params
0.004     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
`Trainer.fit` stopped: `max_epochs=10` reached.


In [3]:
# Read back what the training run published to MLflow.
from mlflow.tracking.client import MlflowClient

client = MlflowClient()

# Search the experiment that fit() reported and show the first run returned.
experiment_id = result.mlflow_experiment_id
runs = client.search_runs(experiment_ids=[experiment_id])

first_run = runs[0]
first_run

<Run: data=<RunData: metrics={'epoch': 9.0,
 'train/accuracy/species': 0.8888888955116272,
 'train/f1/species': 0.8888888955116272,
 'train/loss/species': 0.5477851629257202,
 'train/precision/species': 0.8888888955116272,
 'train/recall/species': 0.8888888955116272,
 'val/accuracy/species': 0.8571428656578064,
 'val/f1/species': 0.8571428656578064,
 'val/loss/species': 0.6200952529907227,
 'val/precision/species': 0.8571428656578064,
 'val/recall/species': 0.8571428656578064}, params={'config': '{"layers": [{"type": "fleet.model_builder.layers.Concat", "name": '
           '"Concat", "constructor_args": {"dim": 1}, "forward_args": {"xs": '
           '["$sepal_length", "$sepal_width", "$petal_length", '
           '"$petal_width"]}}, {"type": "torch.nn.Linear", "name": "Linear0", '
           '"c',
 'dataset_config': '{"name": "Iris", "target_columns": [{"name": "species", '
                   '"data_type": {"domain_kind": "categorical", "classes": '
                   '{"0": 0, "1": 