In [1]:
# Load the nb_mypy IPython extension so that subsequently executed cells are
# type-checked (the `[type-var]` errors shown further down come from it).
%load_ext nb_mypy

Version 1.0.5


In [2]:
# Make imports and relative folder paths work by switching the working
# directory to the repository's `src` folder.
# NOTE(review): this requires the HOME environment variable to be set (a
# missing HOME raises KeyError) and assumes the repository is cloned at
# $HOME/repos/sagemaker-pipelines-abstraction.
# TODO: Instead, create a Python package and install it locally.
import os, sys
os.chdir(
    f'{os.environ["HOME"]}/repos/sagemaker-pipelines-abstraction/src'
)

In [3]:
from abc import ABC, abstractmethod

from pydantic_settings import BaseSettings

# Decision 1: Composition versus Inheritance
- Composition: separate config objects for the shared, the step-specific, and any additional config values
- Inheritance: a single config class, inheriting from a common step config
- Decision: Use composition
  - Generally, it is an OOP best practice to prefer composition over inheritance, because inheritance leads to tight coupling.
  - However, the downside of composition in our case is that it requires more advanced programming patterns that not all Python programmers may be familiar with, namely Generics (at least if you want type safety, which should be non-negotiable).
  - Furthermore, a small additional downside of composition is that it makes the configs a little more awkward, because the config object is now a wrapper composed of multiple different configs.
  - Nevertheless, I think these downsides are easily worth it for the better maintainability of composition. This decision has been reinforced by working with the different types of SageMaker steps, which I think are much more frustrating to work with than they should be as a result of their use of inheritance.

# Decision 2: Properties vs data classes
## Design chosen: Use data classes

In [4]:
# todo: check if we can use decorator pattern to avoid load_pydantic_config_from_file().
class StepConfig(BaseSettings):
    """
    Schema for the general step config.

    This is the config whose type does not depend on the step type; it is used
    as the fixed type of `StepConfigFacade.general_step_config`.
    """
    # Filenames of the step's input and output(s).
    input_filename: str
    output_filename: str
    # NOTE(review): the train/val/test output filenames are also declared on
    # ProcessingConfig — confirm which of the two declarations is authoritative.
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str
    # presumably the compute instance type/count the step runs on — TODO confirm
    instance_type: str
    instance_count: int
    step_name: str

class ProcessingConfig(BaseSettings):
    """
    This class provides the schema for the step-specific config file.
    It is passed to the step factory in the latter's constructor.
    """
    sklearn_framework_version: str
    # Override default field with more specific filenames
    # NOTE(review): this class derives from BaseSettings, not from StepConfig,
    # so there is no inherited `output_filename` field to override here. The
    # comment above and the `type: ignore` look like leftovers from an
    # inheritance-based design — confirm whether this field is still needed.
    output_filename: None = None  # type: ignore[assignment]
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str

class FrameworkProcessingConfig(BaseSettings):
    """
    Placeholder schema without any fields: so far no extra configs are needed.

    (It would be nice if we could set `estimator_cls=SKLearn` in the config file, but we
    would have to use `eval()` to construct a Python object from the string, which is a
    potential security vulnerability.)
    """
    ...

In [5]:
from typing import TypeVar, Generic, Type
from dataclasses import dataclass

# Each of the types must be a subclass of BaseSettings (expressed via `bound`,
# so a type checker rejects any other type argument).
StepSpecificConfigType = TypeVar("StepSpecificConfigType", bound=BaseSettings)
AdditionalConfigType = TypeVar("AdditionalConfigType", bound=BaseSettings)


@dataclass
class StepConfigFacade(
    Generic[StepSpecificConfigType, AdditionalConfigType]
):
    """
    Wrapper that composes the separate config objects of a step.

    The class is generic in the types of the step-specific and the additional
    config, so a type checker knows the concrete config class behind each
    attribute once the facade is parametrized.
    """
    # This config type is hard-coded, since it does not depend on step type.
    general_step_config: StepConfig
    # Typed by the first type argument of the facade.
    step_specific_config: StepSpecificConfigType
    # Typed by the second type argument of the facade.
    additional_config: AdditionalConfigType

# Facade parametrized with ProcessingConfig as the step-specific config type
# and FrameworkProcessingConfig as the additional config type.
FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    FrameworkProcessingConfig,
]

As desired, if we try to parametrize StepConfigFacade with a type argument that is not a subtype of BaseSettings – such as trying to use a dictionary for the AdditionalConfigType – the type checker catches this mistake:

In [6]:
# Intentionally invalid, for demonstration only: dict[str, str] is not a
# subtype of BaseSettings, so the type checker reports [type-var] errors for
# the second type argument.
FaultyConfigFacade = StepConfigFacade[
    ProcessingConfig,
    dict[str, str],
]

<cell>1: error: Type argument "dict[str, str]" of "StepConfigFacade" must be a subtype of "BaseSettings"  [type-var]
<cell>1: error: Value of type variable "AdditionalConfigType" of "StepConfigFacade" cannot be "dict[str, str]"  [type-var]


In [7]:
# Define example configs

# General step config (the part whose type does not depend on the step type).
_step_config = StepConfig(
    input_filename='input.parquet',
    output_filename='output.parquet',
    output_train_filename='output_train.parquet',
    output_val_filename='output_val.parquet',
    output_test_filename='output_test.parquet',
    instance_type='local',
    instance_count=1,
    step_name='processing',
)

# Step-specific config; `output_filename` is not passed here.
_processing_config = ProcessingConfig(
    sklearn_framework_version='0.23-1',
    output_train_filename='output_train.parquet',
    output_val_filename='output_val.parquet',
    output_test_filename='output_test.parquet',
)

# FrameworkProcessingConfig declares no fields, so no arguments are passed.
_framework_processing_config = FrameworkProcessingConfig()

# Compose the three config objects into a single facade object.
fw_proc_configs = FrameworkProcessingConfigFacade(
    general_step_config=_step_config,
    step_specific_config=_processing_config,
    additional_config=_framework_processing_config,
)

# Access config value (last expression of the cell, so its value is displayed)
fw_proc_configs.step_specific_config.sklearn_framework_version

'0.23-1'

## Design discarded: use properties
This works, but it is simpler to use data classes instead of properties:

In [8]:
from typing import TypeVar, Generic, Type

# Each of the types must be a subclass of BaseSettings
# (These repeat the TypeVar declarations of the data-class design, with the
# same names and the same bound.)
StepSpecificConfigType = TypeVar("StepSpecificConfigType", bound=BaseSettings)
AdditionalConfigType = TypeVar("AdditionalConfigType", bound=BaseSettings)

class StepConfigFacade(
    ABC,
    Generic[StepSpecificConfigType, AdditionalConfigType],
):
    """
    Abstract facade that exposes the config objects of a step as properties.

    This is the property-based alternative to the data-class design. Concrete
    subclasses must implement all three properties. The class derives from
    `ABC` because `@abstractmethod` is only enforced at runtime when the
    metaclass is `ABCMeta`; without it, this class (and incomplete subclasses)
    could be instantiated and the properties would silently return `None`.

    NOTE(review): this definition reuses the name of the data-class based
    `StepConfigFacade` and shadows it once this cell has been executed.
    """

    @property
    @abstractmethod
    def general_step_config(self) -> StepConfig:
        """This config type is hard-coded, since it does not depend on step type."""
        ...

    @property
    @abstractmethod
    def step_specific_config(self) -> StepSpecificConfigType:
        """Step-specific config; its type is the first type argument of the facade."""
        ...

    @property
    @abstractmethod
    def additional_config(self) -> AdditionalConfigType:
        """Additional config; its type is the second type argument of the facade."""
        ...

# Parametrize the property-based facade with the same two config types as in
# the data-class design. Note that this rebinds the name
# `FrameworkProcessingConfigFacade` defined for the data-class design.
FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    FrameworkProcessingConfig,
]

Like with data classes, static analysis catches the error here:

In [9]:
# Intentionally invalid, for demonstration only: dict[str, str] is not a
# subtype of BaseSettings, so the type checker reports [type-var] errors.
# NOTE(review): unlike the data-class demonstration, which binds the faulty
# type to `FaultyConfigFacade`, this rebinds `FrameworkProcessingConfigFacade`
# itself, so the valid alias is overwritten once this cell has run — consider
# using a separate name.
FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    dict[str, str],
]

<cell>1: error: Type argument "dict[str, str]" of "StepConfigFacade" must be a subtype of "BaseSettings"  [type-var]
<cell>1: error: Value of type variable "AdditionalConfigType" of "StepConfigFacade" cannot be "dict[str, str]"  [type-var]
