# Facade + Factory-Patterns

In [1]:
%load_ext nb_mypy

Version 1.0.5


In [2]:
# to make imports and folder paths work
# todo: Instead create python package and install locally
import os, sys
os.chdir(
    f'{os.environ["HOME"]}/repos/sagemaker-pipelines-abstraction/src'
)

In [3]:
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Literal, Callable, TypeAlias, Any
from pathlib import Path
from datetime import datetime
from typing import TypeVar, Generic
from dataclasses import dataclass

from pydantic_settings import BaseSettings
from loguru import logger
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import Step
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import FrameworkProcessor
from sagemaker.workflow.steps import ConfigurableRetryStep, ProcessingStep
from sm_pipelines_oo.shared_config_schema import Environment

from sm_pipelines_oo.shared_config_schema import SharedConfig, Environment
# from sm_pipelines_oo.steps.interfaces import StepFactoryInterface
from sm_pipelines_oo.aws_connector.interface import AWSConnectorInterface
from sm_pipelines_oo.utils import load_pydantic_config_from_file
from sm_pipelines_oo.aws_connector.interface import AWSConnectorInterface
from sm_pipelines_oo.aws_connector.implementation import AWSConnector, LocalAWSConnector, \
    create_aws_connector
from sm_pipelines_oo.pipeline_wrapper import PipelineWrapper


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/thomas-22/.config/sagemaker/config.yaml


## Config

In [4]:
# Components
# ==========

class ProcessingConfig(BaseSettings):
    """
    This class defines all the config values shared by any subtype of processing step.
    """
    input_filename: str
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str
    instance_type: str
    instance_count: int
    sklearn_framework_version: str
    code_filename: str

class FrameworkProcessingConfig(BaseSettings):
    """
    This class defines all the config values that are specific to a FrameworkProcessor.
    """
    # While it would be nice if we could set `estimator_cls=SKLearn`in the config file, we would
    # have to use `eval()` to construct a python object from the string, which is a potential
    # security vulnerability.)
    estimator_cls: Literal['SKLearn'] = 'SKLearn'

In [5]:
# Generic StepConfig FACADE
# =========================

# Each of the types must be a subclass of BaseSettings
StepSpecificConfigType = TypeVar("StepSpecificConfigType", bound=BaseSettings)
AdditionalConfigType = TypeVar("AdditionalConfigType", bound=BaseSettings)

@dataclass
class StepConfigFacade(
    Generic[StepSpecificConfigType, AdditionalConfigType]
):
    step_name: str
    steptype_specific_configs: StepSpecificConfigType
    additional_configs: AdditionalConfigType

FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    FrameworkProcessingConfig,
]

In [6]:
# Config FACTORY
# ===============
class ConfigFactoryInterface:
    """Abstraction layer that allows accessing both shared config and step configs."""
    @abstractmethod
    def get_step_configs(self, env: Environment, step_name: str) -> StepConfigFacade:
        ...

    @abstractmethod
    def get_shared_config(self, env: Environment) -> SharedConfig:
        ...

### Usage

In [7]:
# Define example configs
fw_proc_configs = FrameworkProcessingConfigFacade(
    step_name='pre_processing',
    steptype_specific_configs=ProcessingConfig(
        input_filename='input.parquet',
        output_train_filename='output_train.parquet',
        output_val_filename='output_val.parquet',
        output_test_filename='output_test.parquet',
        instance_type='local',
        instance_count=1,
        sklearn_framework_version='0.23-1',
        code_filename='pre_processing.py',
    ),
    additional_configs=FrameworkProcessingConfig(
        estimator_cls='SKLearn',
    ),
)


class MockFWPConfigFactory(ConfigFactoryInterface):
    """
    This class is used to create a mock config for testing purposes.
    """
    def __init__(self) -> None:
        pass

    def get_step_configs(self, env: Environment, step_name: str) -> FrameworkProcessingConfigFacade:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        if step_name == 'processing':
            return fw_proc_configs
        else:
            raise NotImplementedError(f'No config defined for step {step_name}')

    def get_shared_config(self, env: Environment) -> SharedConfig:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        return SharedConfig(
            project_name="test",
            project_version='v0.0',
            region='us-east-1',
            role_name='test_role',
            project_bucket_name='test-bucket',
        )


In [8]:
class DefaultConfigFactory(ConfigFactoryInterface):
    """
    The pipeline façade will usually use this class to load configs. Only explicitly specify a
    different class implementing the same interface for testing purposes (e.g., to directly define
    config rather than reading from file).
    """
    # todo: Put reading of configs from file, etc, here
    pass

In [9]:
StepType: TypeAlias = Literal['processing', 'training', 'model', 'evaluation']

class PathFactory:
    # todo: use general step config, once stable.
    # todo: Add interface (or rename to distinguish from actual factory patterns)
    def __init__(self, step_config_facade: StepConfigFacade, shared_config: SharedConfig):
        self._step_config_facade = step_config_facade
        self._shared_config = shared_config

    # S3 Folder Paths
    # ===============

    @cached_property
    def _default_s3_data_folder(self) -> str:
        default_bucket_name = self._shared_config.project_bucket_name
        project_version = self._shared_config.project_version
        step_name = self._step_config_facade.step_name
        return f"s3://{default_bucket_name}/{project_version}/{step_name}"

    @cached_property
    def s3_input_folder(self) -> str:
        """
        Returns custom s3 folder with input data, if provided. Otherwise returns default s3 input
        folder.
        """
        custom_s3_folder: str | None = self._step_config_facade.input_s3_dir
        default_s3_folder = f"{self._default_s3_data_folder}/input"
        return  custom_s3_folder or default_s3_folder

    @cached_property
    def s3_output_folder(self) -> str:
        """
        Returns custom s3 folder with output data, if provided. Otherwise returns default s3 output
        folder.
        """
        custom_s3_folder: str | None = self._step_config_facade.output_s3_dir
        default_s3_folder = f'{self._default_s3_data_folder}/output'
        return  custom_s3_folder or default_s3_folder

    # Local Folder Paths
    # =================

    def get_local_folderpath(self, step_type: StepType) -> str:
        # Note that `/opt/ml/${STEP_TYPE}/` is *required* by Sagemaker.
        # todo: Use more precise type annotation? (Create type for StepType)
        step_name: str = self._step_config_facade.step_name
        return f'/opt/ml/{step_type}/{step_name}'

    @property
    def source_dir(self) -> str:
        # Hard-code source_directory name to simplify configs.
        return f"code/{self._step_config_facade.step_name}/"

    @property
    def step_code_file(self) -> str:
        # Hard-code name of step's code file to simplify configs.
        return f"{self._step_config_facade.step_name}.py"


<cell>26: [1m[31merror:[m [m[1m"StepConfigFacade[Any, Any]"[m has no attribute [m[1m"input_s3_dir"[m  [m[33m[attr-defined][m
<cell>34: [1m[31merror:[m [m[1m"StepConfigFacade[Any, Any]"[m has no attribute [m[1m"output_s3_dir"[m  [m[33m[attr-defined][m


## Step

In [10]:
# *General* step FACTORY INTERFACE
# ===============================

class StepFactoryInterface(ABC):

    @abstractmethod
    def create_step(
        self,
        shared_config: SharedConfig,
        step_config_facade: StepConfigFacade,
        path_factory: PathFactory, # todo: create interface
        aws_connector: AWSConnectorInterface,

    ) -> ConfigurableRetryStep:
        ...

    @property
    @abstractmethod
    def step_name(self) -> str:
        ...

    @property
    @abstractmethod
    def step_type(self) -> StepType:
        """This tells us where Sagemaker stores local copy of data, e.g., /opt/ml/processing/."""
        ...


class ProcessingStepFactoryInterface(StepFactoryInterface):
    """
    This subclass is distinguished only by more specific return type for step (and step_type name).
    ."""
    @abstractmethod
    def create_step(
        self,
        shared_config: SharedConfig,
        step_config_facade: StepConfigFacade,
        path_factory: PathFactory, # todo: create interface
        aws_connector: AWSConnectorInterface,
    ) -> ProcessingStep:
         ...

    @property
    @abstractmethod
    def step_type(self) -> Literal['processing']:
        ...


class FrameworkProcessorFactory(ProcessingStepFactoryInterface):
    # todo: Check if there is an inbuilt type for fwp-step.
    @property
    def step_type(self) -> Literal['processing']:
        return 'processing'

In [11]:
from typing import TypedDict
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run Args
# ========
class ProcessorRunArgs(TypedDict):
    inputs: list[ProcessingInput]
    outputs: list[ProcessingOutput]
    arguments: list[str] | None

class FrameworkProcessorRunArgs(ProcessorRunArgs):
    # Additional args for FrameworkProcessor:
    source_dir: str
    code: str


In [12]:
from sagemaker.session import Session
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

# Step Factory IMPLEMENTATION
# ===========================

class FrameworkProcessingStepFactory(ProcessingStepFactoryInterface):
    """
    shared config etc will be passed during create_step().
    """

    def __init__(
        self,
        step_name: str,
    ):
        self._step_name = step_name

        # This determines how to construct the estimator object from the string in the config file, avoiding the
        # use of `eval`, which is a potential security vulnerability.
        self._str_to_cls_mapping: dict[str, Any] = {  # todo:  find supertype
            'SKLearn': SKLearn,
        }

    def support_additional_estimators(self, additional_estimator_mapping: dict[str, Any]) -> None:
        """
        Allow user to add additional estimators (following the open-closed principle).

        Note: We don't use composition, because passing this mapping to constructor would violate
        the shared interface for StepFactory's constructor. Instead, we add an addiitonal method
        to allow updating the mapping.
        """
        self._str_to_cls_mapping.update(additional_estimator_mapping)

    @property
    def step_name(self) -> str:
        """
        Facade calls this method before create_step(), so it can retrieve the right config first.
        """
        return self._step_name

    # todo: Generalize types to other processors
    def _processor(
        self,
        step_config_facade: FrameworkProcessingConfigFacade,
        sagemaker_session: PipelineSession | Session | LocalPipelineSession,
        role_arn: str,
    ) -> Processor:  # type: ignore
        """Instantiate processor."""

        # Get processor class from classname string in config file
        _processor_cls_name: str = step_config_facade.additional_configs.estimator_cls
        processor_cls = self._str_to_cls_mapping[_processor_cls_name]
        return processor_cls(
            framework_version=step_config_facade.steptype_specific_configs.sklearn_framework_version,
            instance_type=step_config_facade.steptype_specific_configs.instance_type,
            instance_count=step_config_facade.steptype_specific_configs.instance_count,
            base_job_name=step_config_facade.step_name,
            sagemaker_session=sagemaker_session,
            role=role_arn,
            # **self._processor_extra_kwargs,
        )  # type: ignore

    def _get_processor_run_args(
        self,
        step_config_facade: FrameworkProcessingConfigFacade,
    ) -> FrameworkProcessorRunArgs:

        skl_run_args = FrameworkProcessorRunArgs(
            inputs = [
                ProcessingInput(
                    source=step_config_facade.input_s3_dir,
                    destination=f'/opt/ml/{self.step_type}/{step_config_facade.step_name}/input/'
                ),
            ],
            outputs = [
                ProcessingOutput(
                    output_name="train",
                    source=f'/opt/ml/{self.step_type}/{step_config_facade.step_name}/train',
                    destination=f"{step_config_facade.s3_output_folder}/train",
                ),
                ProcessingOutput(
                    output_name="validation",
                    source=f"/opt/ml/{self.step_type}/{step_config_facade.step_name}/validation",
                    destination=f"{step_config_facade.s3_output_folder}/validation",
                ),
                ProcessingOutput(
                    output_name="test",
                    source=f"/opt/ml/{self.step_type}/{step_config_facade.step_name}/test",
                    destination=f"{step_config_facade.s3_output_folder}/test",
                ),
            ],
            source_dir=f"code/{_step_config_facade.step_name}/",
            code=f'{step_config_facade.step_name}.py', # Hard-code to avoid extra config value
            arguments=None, # Todo: Decide whether this should come from configuration. May depend on type of step.
        )
        return skl_run_args

    # todo: Add more specific return type (may have to create custom type, but check Sagemaker sdk code again)
    def create_step(
            self,
            shared_config: SharedConfig,
            step_config_facade: StepConfigFacade,
            path_factory: PathFactory,
            aws_connector: AWSConnectorInterface,
        ) -> ProcessingStep:
        # todo: think about how to create these here
        step_config_facade = fw_proc_configs

        return ProcessingStep(
            name=step_config_facade.step_name,
            processor=self._processor(
                step_config_facade=step_config_facade,
                sagemaker_session=aws_connector.sm_session,
                role_arn=aws_connector.role_arn,
            ),
            inputs=[
                ProcessingInput(
                    source=step_config_facade.steptype_specific_configs.input_filename,
                    destination=step_config_facade.steptype_specific_configs.output_data,
                    s3_data_type='S3Prefix',
                    s3_input_mode='File',
                )
            ],
            outputs=[
                ProcessingOutput(
                    source=step_config_facade.steptype_specific_configs.output_data,
                    destination=step_config_facade.steptype_specific_configs.output_data,
                    s3_upload_mode='EndOfJob',
                )
            ],
            code=step_config_facade.steptype_specific_configs.code_filename,
        )
fw_proc_step_factory = FrameworkProcessingStepFactory(step_name='preprocessing')

<cell>71: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"input_s3_dir"[m  [m[33m[attr-defined][m
<cell>79: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"s3_output_folder"[m  [m[33m[attr-defined][m
<cell>84: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"s3_output_folder"[m  [m[33m[attr-defined][m
<cell>89: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"s3_output_folder"[m  [m[33m[attr-defined][m
<cell>92: [1m[31merror:[m Name [m[1m"_step_config_facade"[m is not defined  [m[33m[name-defined][m
<cell>119: [1m[31merror:[m [m[1m"ProcessingConfig"[m has no attribute [m[1m"output_data"[m  [m[33m[attr-defined][m
<cell>126: [1m[31merror:[m [m[1m"ProcessingConfig"[m has no

NameError: name 'Processor' is not defined

## Pipeline facade

In [None]:
class PipelineFacade:
    def __init__(
        self,
        # Each step factory is instantiated with its step_name, thus identifying step_config
        step_factories: list[StepFactoryInterface],
        env: Environment,
        config_factory: ConfigFactoryInterface | None,
        path_factory: PathFactory,
        estimator_name_to_class_mapping: dict[str, Any] | None = None,
    ) -> None:
        self._env: Environment = env
        self._step_factories = step_factories
        self._config_factory = config_factory
        self._estimator_name_to_class_mapping = estimator_name_to_class_mapping
        self._path_factory = path_factory

        # Derived attributes
        # ==================
        # Note that we are using the config_factory *property*, which is always defined
        self._shared_config: SharedConfig = self.config_factory.get_shared_config(env=self._env)


    @property
    def estimator_name_to_class_mapping(self) -> dict[str, Any]:
        """
        This determines how to construct the estimator object from the string in the config file, avoiding the
        use of `eval`, which is a potential security vulnerability.
        """
        # Default mapping
        if self._estimator_name_to_class_mapping is None:
            return {'SKLearn': SKLearn} # todo: add more estimators

        # Allow user to override default to specify additional estimator classes
        else:
            return self._estimator_name_to_class_mapping

    @property
    def config_factory(self) -> ConfigFactoryInterface:
        # Generally, default is fine
        if self._config_factory is None:
            return DefaultConfigFactory()

        # Allow user to pass a custom factory, e.g. a mock factory for testing.
        else:
            return self._config_factory

    @cached_property
    def _steps(self) -> list[Step]:
        steps: list[Step] = []
        for step_factory in self._step_factories:
            step_configs: StepConfigFacade = self.config_factory.get_step_configs(
                env=self._env,
                step_name=step_factory.step_name,
            )
            step: Step = step_factory.create_step(
                shared_config=self._shared_config,
                step_configs=step_configs,
                path_factory=self._path_factory,
            )
            steps.append(step)
        return steps

    @cached_property
    def _aws_connector(self) -> AWSConnectorInterface:
        """
        This code makes connector.implementation.create_aws_connector() redundant, except for use
        outside of pipeline.
        Todo: decide where to put code for the latter case.
        """
        # todo: make this a factory, so we can move this logic out of facade?
        if self._env == 'local':
            return LocalAWSConnector()
        else:
            return AWSConnector(
                environment=self._env,
                # this error will resolve once we don't use SharedConfig from this notebook but
                # library's AWSConnector.
                shared_config=self._shared_config,  # type: ignore
                run_as_pipeline=True
            )

    @cached_property
    def _pipeline(self) -> Pipeline:
        """
        We could make this a private  method and call it in __init__(), but this is shorter.
        """
        pipeline_name = f'{self._shared_config.project_name}-{datetime.now():%Y-%m-%d-%H-%M-%S}'
        pipeline = Pipeline(
            name=pipeline_name,
            steps=self._steps,
            sagemaker_session=self._aws_connector.sm_session,
        )
        pipeline.create(role_arn=self._aws_connector.role_arn)
        return pipeline

    def run(self) -> None:
        try:
            logger.info(f"Starting pipeline run for project {self._shared_config.project_name}")
            execution = self._pipeline.start()
            execution.wait()
            execution.list_steps()

        except Exception as e:
            logger.error(e)


In [None]:
pipeline= PipelineFacade(
    step_factories=[fw_proc_step_factory],
    env='local',
    config_factory=MockFWPConfigFactory(),
)