# Facade + Factory-Patterns

In [40]:
%load_ext nb_mypy

The nb_mypy extension is already loaded. To reload it, use:
  %reload_ext nb_mypy


In [41]:
# to make imports and folder paths work
# todo: Instead create python package and install locally
import os, sys
os.chdir(
    f'{os.environ["HOME"]}/repos/sagemaker-pipelines-abstraction/src'
)

In [54]:
from abc import ABC, abstractmethod
from functools import cached_property
from typing import Literal, Callable, TypeAlias, Any
from pathlib import Path
from datetime import datetime
from typing import TypeVar, Generic
from dataclasses import dataclass

from pydantic_settings import BaseSettings
from loguru import logger
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import Step
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import FrameworkProcessor
from sagemaker.workflow.steps import ConfigurableRetryStep, ProcessingStep
from sm_pipelines_oo.shared_config_schema import Environment

from sm_pipelines_oo.shared_config_schema import SharedConfig, Environment
# from sm_pipelines_oo.steps.interfaces import StepFactoryInterface
from sm_pipelines_oo.connector.interface import AWSConnectorInterface
from sm_pipelines_oo.utils import load_pydantic_config_from_file
from sm_pipelines_oo.connector.interface import AWSConnectorInterface
from sm_pipelines_oo.connector.implementation import AWSConnector, LocalAWSConnector, \
    create_aws_connector
from sm_pipelines_oo.pipeline_wrapper import PipelineWrapper


## Config

In [43]:
# Components
# ==========

class StepConfig(BaseSettings):
    input_filename: str
    output_filename: str
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str
    instance_type: str
    instance_count: int
    step_name: str

class ProcessingConfig(BaseSettings):
    """
    This class provides the schema for the step-specific config file.
    It is passed to step factory in the latter's constructor.
    """
    sklearn_framework_version: str
    # Override default field with more specific filenames
    output_filename: None = None  # type: ignore[assignment]
    output_train_filename: str
    output_val_filename: str
    output_test_filename: str

class FrameworkProcessingConfig(BaseSettings):
    """
    So far no extra configs needed. (While it would be nice if we could set `estimator_cls=SKLearn`
    in the config file, but we would have to use `eval()` to construct a python object from the
    string, which is a potential security vulnerability.)
    """
    estimator_cls: Literal['SKLearn'] = 'SKLearn'

In [44]:
# Generic StepConfig FACADE
# =========================

# Each of the types must be a subclass of BaseSettings
StepSpecificConfigType = TypeVar("StepSpecificConfigType", bound=BaseSettings)
AdditionalConfigType = TypeVar("AdditionalConfigType", bound=BaseSettings)

@dataclass
class StepConfigFacade(
    Generic[StepSpecificConfigType, AdditionalConfigType]
):
    # This config type is hard-coded, since it does not depend on step type.
    general_step_config: StepConfig
    step_specific_config: StepSpecificConfigType
    additional_config: AdditionalConfigType

FrameworkProcessingConfigFacade = StepConfigFacade[
    ProcessingConfig,
    FrameworkProcessingConfig,
]

In [45]:
# Config FACTORY
# ===============
class ConfigFactoryInterface:
    """Abstraction layer that allows accessing both shared config and step configs."""
    @abstractmethod
    def get_step_configs(self, env: Environment, step_name: str) -> StepConfigFacade:
        ...

    @abstractmethod
    def get_shared_config(self, env: Environment) -> SharedConfig:
        ...

### Usage

In [46]:
# Define example configs
_step_config = StepConfig(
    input_filename='input.parquet',
    output_filename='output.parquet',
    output_train_filename='output_train.parquet',
    output_val_filename='output_val.parquet',
    output_test_filename='output_test.parquet',
    instance_type='local',
    instance_count=1,
    step_name='processing',
)

_processing_config = ProcessingConfig(
    sklearn_framework_version='0.23-1',
    output_train_filename='output_train.parquet',
    output_val_filename='output_val.parquet',
    output_test_filename='output_test.parquet',
)

_framework_processing_config = FrameworkProcessingConfig(
    estimator_cls='SKLearn',
)

fw_proc_configs = FrameworkProcessingConfigFacade(
    general_step_config=_step_config,
    step_specific_config=_processing_config,
    additional_config=_framework_processing_config,
)


class MockFWPConfigFactory(ConfigFactoryInterface):
    """
    This class is used to create a mock config for testing purposes.
    """
    def __init__(self) -> None:
        pass

    def get_step_configs(self, env: Environment, step_name: str) -> FrameworkProcessingConfigFacade:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        if step_name == 'processing':
            return fw_proc_configs
        else:
            raise NotImplementedError(f'No config defined for step {step_name}')

    def get_shared_config(self, env: Environment) -> SharedConfig:
        if env != 'test':
            raise ValueError(f'env must be "test", but got {env}')

        return SharedConfig(
            project_name="test",
            project_version='v0.0',
            region='us-east-1',
            role_name='test_role',
            project_bucket_name='test-bucket',
        )


In [57]:
class DefaultConfigFactory(ConfigFactoryInterface):
    """
    The pipeline façade will usually use this class to load configs. Only explicitly specify a
    different class implementing the same interface for testing purposes (e.g., to directly define
    config rather than reading from file).
    """
    # todo: Put reading of configs from file, etc, here
    raise NotImplementedError # type: ignore


NotImplementedError: 

## Step

In [50]:
# *General* step FACTORY INTERFACE
# ==============================
class StepFactoryInterface(ABC):

    @abstractmethod
    def create_step(
         self,
         shared_config: SharedConfig,
         step_configs: StepConfigFacade
    ) -> ConfigurableRetryStep:
        ...

    @property
    @abstractmethod
    def step_name(self) -> str:
        ...


class ProcessingStepFactoryInterface(StepFactoryInterface):
    """This subclass is distinguished only by more specific return type for step."""
    @abstractmethod
    def create_step(
        self,
        shared_config: SharedConfig,
        step_configs: StepConfigFacade
    ) -> ProcessingStep:
         ...

class FrameworkProcessorFactory(ProcessingStepFactoryInterface):
    # todo: Check if there is an inbuilt type for fwp-step.
    ...

In [51]:
# Step Factory IMPLEMENTATION
# ===========================

class FrameworkProcessingStepFactory(ProcessingStepFactoryInterface):
    """
    shared config etc will be passed during create_step().
    """

    def __init__(
        self,
        step_name: str,
    ):
        self._step_name = step_name

        # This determines how to construct the estimator object from the string in the config file, avoiding the
        # use of `eval`, which is a potential security vulnerability.
        self._str_to_cls_mapping: dict[str, Any] = {  # todo:  find supertype
            'SKLearn': SKLearn,
        }

    @property
    def step_name(self) -> str:
        return self._step_name

    def support_additional_estimators(self, additional_estimator_mapping: dict[str, Any]) -> None:
        """Allow user to add additional estimators (following the open-closed principle)."""
        self._str_to_cls_mapping.update(additional_estimator_mapping)

    def _construct_step_configs(self, env: Environment, step_name: str) -> FrameworkProcessingConfigFacade:
        """Load configs from file and return them as a single wrapper object."""
        # todo: use composition, e.g. configfactory, to ensure testability w/o mocking
        return get_mock_fwp_configs()


    # todo: Generalize types to other processors
    def _processor(self, step_configs: FrameworkProcessingConfig) -> Processor:  # type: ignore
        """
        Instantiate processor, combining step-specific configs with configs from AWS connector.

        Note that we could technically run this in __init__() now, because we do no longer use
        anything from the shared_config. However, leaving it here keeps the option open to make it
        a separate method that accepts outside configs as arguments, if necessary in the future.
        """
        return self._processor_cls(
            framework_version=step_configs.sklearn_framework_version,
            instance_type=step_configs.instance_type,
            instance_count=step_configs.instance_count,
            base_job_name=step_configs.step_name,
            sagemaker_session=self.aws_connector.sm_session,
            role=self.aws_connector.role_arn,
            **self._processor_extra_kwargs,
        )  # type: ignore

    def _get_processor_run_args(self) -> ProcessorRunArgs:
        s3_input_folder: str = self.path_factory.s3_input_folder
        s3_output_folder: str = self.path_factory.s3_output_folder
        local_folderpath: str = self.path_factory.local_folderpath

        skl_run_args = ProcessorRunArgs(
            inputs = [
                ProcessingInput(
                    source=s3_input_folder,
                    destination=f"{local_folderpath}/input/"
                ),
            ],
            outputs = [
                ProcessingOutput(
                    output_name="train",
                    source=f"/{local_folderpath}/train",
                    destination=f"{s3_output_folder}/train",
                ),
                ProcessingOutput(
                    output_name="validation",
                    source=f"/{local_folderpath}/validation",
                    destination=f"{s3_output_folder}/validation",
                ),
                ProcessingOutput(
                    output_name="test",
                    source=f"/{local_folderpath}/test",
                    destination=f"{s3_output_folder}/test",
                ),
            ],
            source_dir=self.path_factory.source_dir,
            code=self.path_factory.step_code_file,
            arguments=None # Todo: Decide whether this should come from configuration. May depend on type of step.
        )
        return skl_run_args

    # todo: Add more specific return type (may have to create custom type, but check Sagemaker sdk code again)
    def create_step(self, env: Environment, shared_config: SharedConfig) -> ProcessingStep:
        # todo: think about how to create these here
        step_configs = fw_proc_configs

        return ProcessingStep(
            name=step_config.step_name,
            processor=FrameworkProcessor(
                estimator_cls=step_configs,
                framework_version=step_config.sklearn_framework_version,
                instance_type=step_config.instance_type,
                instance_count=step_config.instance_count,
                role=step_config.role_name,
            ),
            inputs=[ProcessingInput(
                source=step_config.input_data,
                destination=step_config.output_data,
                s3_data_type='S3Prefix',
                s3_input_mode='File',
            )],
            outputs=[ProcessingOutput(
                source=step_config.output_data,
                destination=step_config.output_data,
                s3_upload_mode='EndOfJob',
            )],
            code=step_config.code,
        )
fw_proc_step_factory = FrameworkProcessingStepFactory(step_name='preprocessing')

<cell>32: [1m[31merror:[m Name [m[1m"get_mock_fwp_configs"[m is not defined  [m[33m[name-defined][m
<cell>44: [1m[31merror:[m [m[1m"FrameworkProcessingStepFactory"[m has no attribute [m[1m"_processor_cls"[m; maybe [m[1m"_processor"[m?  [m[33m[attr-defined][m
<cell>45: [1m[31merror:[m [m[1m"FrameworkProcessingConfig"[m has no attribute [m[1m"sklearn_framework_version"[m  [m[33m[attr-defined][m
<cell>46: [1m[31merror:[m [m[1m"FrameworkProcessingConfig"[m has no attribute [m[1m"instance_type"[m  [m[33m[attr-defined][m
<cell>47: [1m[31merror:[m [m[1m"FrameworkProcessingConfig"[m has no attribute [m[1m"instance_count"[m  [m[33m[attr-defined][m
<cell>48: [1m[31merror:[m [m[1m"FrameworkProcessingConfig"[m has no attribute [m[1m"step_name"[m  [m[33m[attr-defined][m
<cell>49: [1m[31merror:[m [m[1m"FrameworkProcessingStepFactory"[m has no attribute [m[1m"aws_connector"[m  [m[33m[attr-defined][m
<cell>50: [1m[31

NameError: name 'Processor' is not defined

## Pipeline facade

In [58]:
class PipelineFacade:
    def __init__(
        self,
        # Each step factory is instantiated with its step_name, thus identifying step_config
        step_factories: list[StepFactoryInterface],
        env: Environment,
        config_factory: ConfigFactoryInterface | None,
    ) -> None:
        self._env: Environment = env
        self._step_factories = step_factories

        # Generally, we can simply use the default StepConfigFactory.
        # However, we want to be able to pass a custom factory for testing purposes.
        self._config_factory: ConfigFactoryInterface = (
            config_factory if config_factory is not None
                else DefaultConfigFactory()  # type: ignore
        )
                # Now that we know config factory is defined, we can use it to load shared config
        self._shared_config: SharedConfig = self._config_factory.get_shared_config(
            env=self._env
        )

    @cached_property
    def _steps(self) -> list[Step]:
        steps: list[Step] = []
        for step_factory in self._step_factories:
            step_configs: StepConfigFacade = self._config_factory.get_step_configs(
                env=self._env,
                step_name=step_factory.step_name,
            )
            step: Step = step_factory.create_step(
                shared_config=self._shared_config,
                step_configs=step_configs,
            )
            steps.append(step)
        return steps

    @cached_property
    def _aws_connector(self) -> AWSConnectorInterface:
        """
        This code makes connector.implementation.create_aws_connector() redundant, except for use
        outside of pipeline.
        Todo: decide where to put code for the latter case.
        """
        # todo: make this a factory, and move it out of facade?
        if self._env == 'local':
            return LocalAWSConnector()
        else:
            return AWSConnector(
                environment=self._env,
                # this error will resolve once we don't use SharedConfig from this notebook but
                # library's AWSConnector.
                shared_config=self._shared_config,  # type: ignore
                run_as_pipeline=True
            )

    @cached_property
    def _pipeline(self) -> Pipeline:
        """
        We could make this a private  method and call it in __init__(), but this is shorter.
        """
        pipeline_name = f'{self._shared_config.project_name}-{datetime.now():%Y-%m-%d-%H-%M-%S}'
        pipeline = Pipeline(
            name=pipeline_name,
            steps=self._steps,
            sagemaker_session=self._aws_connector.sm_session,
        )
        pipeline.create(role_arn=self._aws_connector.role_arn)
        return pipeline

    def run(self) -> None:
        try:
            logger.info(f"Starting pipeline run for project {self._shared_config.project_name}")
            execution = self._pipeline.start()
            execution.wait()
            execution.list_steps()

        except Exception as e:
            logger.error(e)


<cell>16: [1m[31merror:[m Cannot instantiate abstract class [m[1m"DefaultConfigFactory"[m with abstract attributes [m[1m"get_shared_config"[m and [m[1m"get_step_configs"[m  [m[33m[abstract][m


In [53]:
pipeline= PipelineFacade(
    step_factories=[fw_proc_step_factory],
    env='local',
    config_factory=MockFWPConfigFactory(),
)

NameError: name 'fw_proc_step_factory' is not defined