# Facade + Factory-Patterns

In [1]:
%load_ext nb_mypy

Version 1.0.5


In [2]:
# to make imports and folder paths work
# todo: Instead create python package and install locally
import os, sys
os.chdir(
    f'{os.environ["HOME"]}/repos/sagemaker-pipelines-abstraction/src'
)

In [3]:
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Literal, Callable, TypeAlias, Any
from pathlib import Path
from datetime import datetime
from typing import TypeVar, Generic
from dataclasses import dataclass

from pydantic_settings import BaseSettings
from loguru import logger
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import Step
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import FrameworkProcessor
from sagemaker.workflow.steps import ConfigurableRetryStep, ProcessingStep
from sm_pipelines_oo.shared_config_schema import Environment

from sm_pipelines_oo.shared_config_schema import SharedConfig, Environment
# from sm_pipelines_oo.steps.interfaces import StepFactoryInterface
from sm_pipelines_oo.connector.interface import AWSConnectorInterface
from sm_pipelines_oo.utils import load_pydantic_config_from_file
from sm_pipelines_oo.connector.interface import AWSConnectorInterface
from sm_pipelines_oo.connector.implementation import AWSConnector, LocalAWSConnector, \
    create_aws_connector
from sm_pipelines_oo.pipeline_wrapper import PipelineWrapper


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/thomas-22/.config/sagemaker/config.yaml


## Config

In [12]:
# Components
# ==========

class ProcessingConfig(BaseSettings):
    """
    This class defines all the config values shared by any subtype of processing step.
    """
    input_filename: str
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str
    instance_type: str
    instance_count: int
    sklearn_framework_version: str

class FrameworkProcessingConfig(BaseSettings):
    """
    This class defines all the config values that are specific to a FrameworkProcessor.
    """
    # While it would be nice if we could set `estimator_cls=SKLearn`in the config file, we would
    # have to use `eval()` to construct a python object from the string, which is a potential
    # security vulnerability.)
    estimator_cls: Literal['SKLearn'] = 'SKLearn'

In [13]:
# Generic StepConfig FACADE
# =========================

# Each of the types must be a subclass of BaseSettings
StepSpecificConfigType = TypeVar("StepSpecificConfigType", bound=BaseSettings)
AdditionalConfigType = TypeVar("AdditionalConfigType", bound=BaseSettings)

@dataclass
class StepConfigFacade(
    Generic[StepSpecificConfigType, AdditionalConfigType]
):
    step_name: str
    step_specific_config: StepSpecificConfigType
    additional_config: AdditionalConfigType

FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    FrameworkProcessingConfig,
]

In [14]:
# Config FACTORY
# ===============
class ConfigFactoryInterface:
    """Abstraction layer that allows accessing both shared config and step configs."""
    @abstractmethod
    def get_step_configs(self, env: Environment, step_name: str) -> StepConfigFacade:
        ...

    @abstractmethod
    def get_shared_config(self, env: Environment) -> SharedConfig:
        ...

### Usage

In [15]:
# Define example configs
fw_proc_configs = FrameworkProcessingConfigFacade(
    step_name='pre_processing',
    step_specific_config=ProcessingConfig(
        input_filename='input.parquet',
        output_train_filename='output_train.parquet',
        output_val_filename='output_val.parquet',
        output_test_filename='output_test.parquet',
        instance_type='local',
        instance_count=1,
        sklearn_framework_version='0.23-1',
    ),
    additional_config=FrameworkProcessingConfig(
        estimator_cls='SKLearn',
    ),
)


class MockFWPConfigFactory(ConfigFactoryInterface):
    """
    This class is used to create a mock config for testing purposes.
    """
    def __init__(self) -> None:
        pass

    def get_step_configs(self, env: Environment, step_name: str) -> FrameworkProcessingConfigFacade:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        if step_name == 'processing':
            return fw_proc_configs
        else:
            raise NotImplementedError(f'No config defined for step {step_name}')

    def get_shared_config(self, env: Environment) -> SharedConfig:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        return SharedConfig(
            project_name="test",
            project_version='v0.0',
            region='us-east-1',
            role_name='test_role',
            project_bucket_name='test-bucket',
        )


In [8]:
class DefaultConfigFactory(ConfigFactoryInterface):
    """
    The pipeline façade will usually use this class to load configs. Only explicitly specify a
    different class implementing the same interface for testing purposes (e.g., to directly define
    config rather than reading from file).
    """
    # todo: Put reading of configs from file, etc, here
    raise NotImplementedError # type: ignore


NotImplementedError: 

## Step

In [9]:
# *General* step FACTORY INTERFACE
# ==============================
class StepFactoryInterface(ABC):

    @abstractmethod
    def create_step(
         self,
         shared_config: SharedConfig,
         step_configs: StepConfigFacade
    ) -> ConfigurableRetryStep:
        ...

    @property
    @abstractmethod
    def step_name(self) -> str:
        ...


class ProcessingStepFactoryInterface(StepFactoryInterface):
    """This subclass is distinguished only by more specific return type for step."""
    @abstractmethod
    def create_step(
        self,
        shared_config: SharedConfig,
        step_configs: StepConfigFacade
    ) -> ProcessingStep:
         ...

class FrameworkProcessorFactory(ProcessingStepFactoryInterface):
    # todo: Check if there is an inbuilt type for fwp-step.
    ...

In [None]:
from typing import TypedDict
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run Args
# ========
class ProcessorRunArgs(TypedDict):
    inputs: list[ProcessingInput]
    outputs: list[ProcessingOutput]
    arguments: list[str] | None

class FrameworkProcessorRunArgs(ProcessorRunArgs):
    # Additional args for FrameworkProcessor:
    source_dir: str
    code: str


In [19]:
from sagemaker.session import Session
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

from sm_pipelines_oo.steps.step_utils import PathFactory

# Step Factory IMPLEMENTATION
# ===========================

class FrameworkProcessingStepFactory(ProcessingStepFactoryInterface):
    """
    shared config etc will be passed during create_step().
    """

    def __init__(
        self,
        step_name: str,
    ):
        self._step_name = step_name

        # This determines how to construct the estimator object from the string in the config file, avoiding the
        # use of `eval`, which is a potential security vulnerability.
        self._str_to_cls_mapping: dict[str, Any] = {  # todo:  find supertype
            'SKLearn': SKLearn,
        }

    def support_additional_estimators(self, additional_estimator_mapping: dict[str, Any]) -> None:
        """
        Allow user to add additional estimators (following the open-closed principle).

        Note: We don't use composition, because passing this mapping to constructor would violate
        the shared interface for StepFactory's constructor. Instead, we add an addiitonal method
        to allow updating the mapping.
        """
        self._str_to_cls_mapping.update(additional_estimator_mapping)

    @property
    def step_name(self) -> str:
        """
        Facade calls this method before create_step(), so it can retrieve the right config first.
        """
        return self._step_name

    # todo: Generalize types to other processors
    def _processor(
        self,
        step_config_facade: FrameworkProcessingConfigFacade,
        sagemaker_session: PipelineSession | Session | LocalPipelineSession,
        role_arn: str,
    ) -> Processor:  # type: ignore
        """Instantiate processor."""
        # Get processor class from classname string in config file
        _processor_cls_name: str = step_config_facade.additional_config.estimator_cls
        processor_cls = self._str_to_cls_mapping[_processor_cls_name]
        return processor_cls(
            framework_version=step_config_facade.step_specific_config.sklearn_framework_version,
            instance_type=step_config_facade.step_specific_config.instance_type,
            instance_count=step_config_facade.step_specific_config.instance_count,
            base_job_name=step_config_facade.step_name,
            sagemaker_session=sagemaker_session,
            role=role_arn,
            # **self._processor_extra_kwargs,
        )  # type: ignore

    def _get_processor_run_args(self) -> ProcessorRunArgs:
        s3_input_folder: str = self.path_factory.s3_input_folder
        s3_output_folder: str = self.path_factory.s3_output_folder
        local_folderpath: str = self.path_factory.local_folderpath

        skl_run_args = ProcessorRunArgs(
            inputs = [
                ProcessingInput(
                    source=s3_input_folder,
                    destination=f"{local_folderpath}/input/"
                ),
            ],
            outputs = [
                ProcessingOutput(
                    output_name="train",
                    source=f"/{local_folderpath}/train",
                    destination=f"{s3_output_folder}/train",
                ),
                ProcessingOutput(
                    output_name="validation",
                    source=f"/{local_folderpath}/validation",
                    destination=f"{s3_output_folder}/validation",
                ),
                ProcessingOutput(
                    output_name="test",
                    source=f"/{local_folderpath}/test",
                    destination=f"{s3_output_folder}/test",
                ),
            ],
            source_dir=self.path_factory.source_dir,
            code=self.path_factory.step_code_file,
            arguments=None # Todo: Decide whether this should come from configuration. May depend on type of step.
        )
        return skl_run_args

    # todo: Add more specific return type (may have to create custom type, but check Sagemaker sdk code again)
    def create_step(self, shared_config: SharedConfig, step_config_facade: StepConfigFacade) -> ProcessingStep:
        # todo: think about how to create these here
        step_config_facade = fw_proc_configs

        return ProcessingStep(
            name=step_config_facade.step_name,
            processor=FrameworkProcessor(
                estimator_cls=step_config_facade,
                framework_version=step_config.sklearn_framework_version,
                instance_type=step_config.instance_type,
                instance_count=step_config.instance_count,
                role=step_config.role_name,
            ),
            inputs=[ProcessingInput(
                source=step_config.input_data,
                destination=step_config.output_data,
                s3_data_type='S3Prefix',
                s3_input_mode='File',
            )],
            outputs=[ProcessingOutput(
                source=step_config.output_data,
                destination=step_config.output_data,
                s3_upload_mode='EndOfJob',
            )],
            code=step_config.code,
        )
fw_proc_step_factory = FrameworkProcessingStepFactory(step_name='preprocessing')

<cell>44: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"estimator_cls"[m  [m[33m[attr-defined][m
<cell>48: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"sklearn_framework_version"[m  [m[33m[attr-defined][m
<cell>49: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"instance_type"[m  [m[33m[attr-defined][m
<cell>50: [1m[31merror:[m [m[1m"StepConfigFacade[ProcessingConfig, FrameworkProcessingConfig]"[m has no attribute [m[1m"instance_count"[m  [m[33m[attr-defined][m
<cell>52: [1m[31merror:[m [m[1m"FrameworkProcessingStepFactory"[m has no attribute [m[1m"aws_connector"[m  [m[33m[attr-defined][m
<cell>53: [1m[31merror:[m [m[1m"FrameworkProcessingStepFactory"[m has no attribute [m[1m"aws_connector"[m  [m[33m[attr-defined][m
<cell>54: 

NameError: name 'Processor' is not defined

## Pipeline facade

In [17]:
class PipelineFacade:
    def __init__(
        self,
        # Each step factory is instantiated with its step_name, thus identifying step_config
        step_factories: list[StepFactoryInterface],
        env: Environment,
        config_factory: ConfigFactoryInterface | None,
        estimator_name_to_class_mapping: dict[str, Any] | None,
    ) -> None:
        self._env: Environment = env
        self._step_factories = step_factories
        self._config_factory = config_factory
        self._estimator_name_to_class_mapping = estimator_name_to_class_mapping

        # Derived attributes
        # ==================
        # Note that we are using the config_factory *property*, which is always defined
        self._shared_config: SharedConfig = self.config_factory.get_shared_config(env=self._env)


    @property
    def estimator_name_to_class_mapping(self) -> dict[str, Any]:
        """
        This determines how to construct the estimator object from the string in the config file, avoiding the
        use of `eval`, which is a potential security vulnerability.
        """
        # Default mapping
        if self._estimator_name_to_class_mapping is None:
            return {'SKLearn': SKLearn} # todo: add more estimators

        # Allow user to override default to specify additional estimator classes
        else:
            return self._estimator_name_to_class_mapping

    @property
    def config_factory(self) -> ConfigFactoryInterface:
        # Generally, default is fine
        if self._config_factory is None:
            return DefaultConfigFactory()

        # Allow user to pass a custom factory, e.g. a mock factory for testing.
        else:
            return self._config_factory

    @cached_property
    def _steps(self) -> list[Step]:
        steps: list[Step] = []
        for step_factory in self._step_factories:
            step_configs: StepConfigFacade = self.config_factory.get_step_configs(
                env=self._env,
                step_name=step_factory.step_name,
            )
            step: Step = step_factory.create_step(
                shared_config=self._shared_config,
                step_configs=step_configs,
            )
            steps.append(step)
        return steps

    @cached_property
    def _aws_connector(self) -> AWSConnectorInterface:
        """
        This code makes connector.implementation.create_aws_connector() redundant, except for use
        outside of pipeline.
        Todo: decide where to put code for the latter case.
        """
        # todo: make this a factory, so we can move this logic out of facade?
        if self._env == 'local':
            return LocalAWSConnector()
        else:
            return AWSConnector(
                environment=self._env,
                # this error will resolve once we don't use SharedConfig from this notebook but
                # library's AWSConnector.
                shared_config=self._shared_config,  # type: ignore
                run_as_pipeline=True
            )

    @cached_property
    def _pipeline(self) -> Pipeline:
        """
        We could make this a private  method and call it in __init__(), but this is shorter.
        """
        pipeline_name = f'{self._shared_config.project_name}-{datetime.now():%Y-%m-%d-%H-%M-%S}'
        pipeline = Pipeline(
            name=pipeline_name,
            steps=self._steps,
            sagemaker_session=self._aws_connector.sm_session,
        )
        pipeline.create(role_arn=self._aws_connector.role_arn)
        return pipeline

    def run(self) -> None:
        try:
            logger.info(f"Starting pipeline run for project {self._shared_config.project_name}")
            execution = self._pipeline.start()
            execution.wait()
            execution.list_steps()

        except Exception as e:
            logger.error(e)


<cell>39: [1m[31merror:[m Cannot instantiate abstract class [m[1m"DefaultConfigFactory"[m with abstract attributes [m[1m"get_shared_config"[m and [m[1m"get_step_configs"[m  [m[33m[abstract][m
<cell>49: [1m[31merror:[m Item [m[1m"None"[m of [m[1m"ConfigFactoryInterface | None"[m has no attribute [m[1m"get_step_configs"[m  [m[33m[union-attr][m


In [None]:
pipeline= PipelineFacade(
    step_factories=[fw_proc_step_factory],
    env='local',
    config_factory=MockFWPConfigFactory(),
)

NameError: name 'fw_proc_step_factory' is not defined