This notebook uses the [penguins dataset](https://www.kaggle.com/parulpandey/palmer-archipelago-antarctica-penguin-data) and installs the `keras-tuner` library.

<img src='https://imgur.com/orZWHly.png' alt='Penguins dataset' width="1024">

In [19]:
import os
import sagemaker
import numpy as np
import boto3
import json
import urllib.request

from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import FileSystemInput
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterInteger, ParameterString
from sagemaker.workflow.pipeline import Pipeline


# Execution role used for every SageMaker resource created in this notebook:
# it is passed as `role` to the processor and the estimator, and as `role_arn`
# when the pipeline is upserted.
role = sagemaker.get_execution_role()

### Download the Dataset

In [32]:
DATA_FILEPATH = "penguins/data.csv"
S3_FILEPATH = "s3://mlschool/penguins"

# Make sure the local target folder exists before downloading; otherwise
# `urlretrieve` fails with FileNotFoundError on a fresh checkout / kernel.
os.makedirs(os.path.dirname(DATA_FILEPATH), exist_ok=True)

# Download the official Penguins dataset and store it locally.
urllib.request.urlretrieve(
    "https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins_size.csv", 
    DATA_FILEPATH
)

# Upload the dataset to S3. We need to do this to make it available to 
# the preprocessing step.
INPUT_DATA_URI = sagemaker.s3.S3Uploader.upload(
    local_path=DATA_FILEPATH, 
    desired_s3_uri=S3_FILEPATH,
)

# Display the resulting S3 URI as the cell output.
INPUT_DATA_URI

's3://mlschool/penguins/data.csv'

### Define the Pipeline Parameters

These are the parameters that will be used by the Pipeline:

* `dataset_location`: This parameter represents the location of the dataset in S3. We'll use this parameter during the Preprocessing Step to perform feature engineering on the original data.

In [33]:
# Pipeline parameter holding the S3 location of the raw dataset. It defaults
# to the URI of the file uploaded above and can be overridden per execution.
dataset_location = ParameterString("DatasetLocation", default_value=INPUT_DATA_URI)

### Create the Preprocessing Step

This script is responsible for doing feature engineering on the original dataset and splitting the data into a train and a validation set.

In [36]:
%%writefile penguins/preprocessor.py

import os
import numpy as np
import pandas as pd
import tempfile

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler


# Root folder of the processing job; the input dataset is read from
# `<BASE_DIR>/input/data.csv`.
BASE_DIR = "/opt/ml/processing"
INPUT_PATH = Path(BASE_DIR, "input")
DATA_FILEPATH = INPUT_PATH.joinpath("data.csv")


if __name__ == "__main__":
    # Load the raw dataset that the processing job mounted under the input folder.
    df = pd.read_csv(DATA_FILEPATH)

    # Every int64/float64 column is treated as a numerical feature.
    numerical_columns = [column for column in df.columns if df[column].dtype in ["int64", "float64"]]

    # Numerical features: fill missing values with the mean, then standardize.
    numerical_preprocessor = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])

    # Categorical features: fill missing values with the mode, then one-hot encode.
    categorical_preprocessor = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # `sparse_threshold=0` forces a dense ndarray output so the features can be
    # concatenated with the labels below regardless of the one-hot density.
    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numerical_preprocessor, numerical_columns),
            ("categorical", categorical_preprocessor, ["island"]),
        ],
        sparse_threshold=0,
    )

    # `species` is the target; `sex` is dropped and not used as a feature.
    y = df.species
    X = df.drop(["species", "sex"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42
    )

    # Fit the transformations on the train split only, then apply them to the
    # held-out split.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    label_encoder = LabelEncoder()

    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    # The encoded labels are 1-D while the features are 2-D. Reshape the labels
    # into a single column so both arrays can be joined along axis 1; the label
    # ends up in the first column of each output row.
    train = np.concatenate((y_train.reshape(-1, 1), X_train), axis=1)
    test = np.concatenate((y_test.reshape(-1, 1), X_test), axis=1)

    # Create the output folders explicitly instead of relying on them existing.
    train_directory = Path(BASE_DIR) / "train"
    validation_directory = Path(BASE_DIR) / "validation"
    train_directory.mkdir(parents=True, exist_ok=True)
    validation_directory.mkdir(parents=True, exist_ok=True)

    pd.DataFrame(train).to_csv(train_directory / "train.csv", header=False, index=False)
    pd.DataFrame(test).to_csv(validation_directory / "validation.csv", header=False, index=False)


Overwriting penguins/preprocessor.py


In [37]:
# Processor that runs the preprocessing script on a single instance.
sklearn_processor = SKLearnProcessor(
    role=role,
    base_job_name="penguins-sklearn-preprocess",
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.t3.medium",
)

# Pipeline step wrapping the processor. The dataset referenced by the
# `dataset_location` parameter is made available under the input folder, and
# the `train` and `validation` folders written by the script are exposed as
# named outputs of the step.
preprocess_step = ProcessingStep(
    name="PenguinsPreprocessor",
    code="penguins/preprocessor.py",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            source=dataset_location,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/train",
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/validation",
        ),
    ],
)

### Define the Pipeline

In [38]:
# Assemble the pipeline: one parameter (the dataset location) and, for now,
# only the preprocessing step.
pipeline = Pipeline(
    name="PenguinsPipeline",
    steps=[preprocess_step],
    parameters=[dataset_location],
)

### Run the Pipeline

In [39]:
# Register (or update) the pipeline definition using the notebook's execution
# role. This has to happen before an execution can be started.
pipeline.upsert(role_arn=role)
# Kick off a run of the pipeline; `execution` is the handle returned by the SDK.
execution = pipeline.start()


In [3]:
%%writefile penguins-train.py


import os
import argparse

import numpy as np
import random
import tensorflow as tf

from pathlib import Path
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Conv2D, Dense, MaxPooling2D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Input, Dropout


# def train(base_directory, epochs=10, batch_size=32):
#     (X_train, y_train), (X_test, y_test) = mnist.load_data()

#     X_train = (X_train - 0.0) / (255.0 - 0.0)
#     X_test = (X_test - 0.0) / (255.0 - 0.0)

#     X_train = X_train.reshape((X_train.shape + (1,)))
#     X_test = X_test.reshape((X_test.shape + (1,)))

#     model = Sequential([
#         Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
#         MaxPooling2D((2, 2)),
#         Flatten(),
#         Dense(100, activation="relu"),
#         Dense(10, activation="softmax")
#     ])

#     optimizer = SGD(learning_rate=0.01, momentum=0.9)
#     model.compile(
#         optimizer=optimizer, 
#         loss="sparse_categorical_crossentropy", 
#         metrics=["accuracy"]
#     )

#     model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

#     predictions = np.argmax(model.predict(X_test), axis=-1)
#     print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    
#     model_filepath = os.path.join(base_directory, "model", "0001")
#     model.save(model_filepath)
    
    
def load_dataset(folder, filename):
    """Load a headerless CSV whose first column is the label.

    Args:
        folder: Directory that contains the CSV file.
        filename: Name of the CSV file inside `folder`.

    Returns:
        A `(features, labels)` tuple: `features` is a 2-D float array with every
        column but the first, and `labels` is a 1-D int32 array built from the
        first column.
    """
    # `ndmin=2` keeps the array 2-D even when the file has a single row.
    data = np.loadtxt(os.path.join(folder, filename), delimiter=",", ndmin=2)
    return data[:, 1:], data[:, 0].astype("int32")


def train(base_directory, epochs=50, batch_size=32, train_folder=None, validation_folder=None):
    """Train the penguins classifier and save it under `<base_directory>/model/0001`.

    Args:
        base_directory: Root directory under which the trained model is saved.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
        train_folder: Directory containing `train.csv`. Falls back to the
            `SM_CHANNEL_TRAIN` environment variable when not provided.
        validation_folder: Optional directory containing `validation.csv`. When
            the file exists it is used as validation data while fitting.

    Raises:
        ValueError: If no training folder is provided and `SM_CHANNEL_TRAIN`
            is not set.
    """
    if train_folder is None:
        train_folder = os.environ.get("SM_CHANNEL_TRAIN", None)
    if train_folder is None:
        raise ValueError("The location of the training data was not provided.")

    X_train, y_train = load_dataset(train_folder, "train.csv")

    # Validation data is optional: only use it when the file is actually there.
    validation_data = None
    if validation_folder is not None and os.path.exists(os.path.join(validation_folder, "validation.csv")):
        validation_data = load_dataset(validation_folder, "validation.csv")

    model = Sequential([
        Dense(10, input_shape=(X_train.shape[1],), activation="relu"),
        Dense(3, activation="softmax"),
    ])

    # Labels are integer class ids (not one-hot vectors), so the sparse variant
    # of the cross-entropy loss is the one that matches them.
    model.compile(
        optimizer=SGD(learning_rate=0.01),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    model.fit(
        X_train,
        y_train,
        validation_data=validation_data,
        epochs=epochs,
        batch_size=batch_size
    )

    model_filepath = os.path.join(base_directory, "model", "0001")
    model.save(model_filepath)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_directory", type=str, default="/opt/ml/")
    parser.add_argument("--train_folder", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", None))
    parser.add_argument("--validation_folder", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION", None))
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=32)
    # Ignore any extra arguments that are passed to the script.
    args, _ = parser.parse_known_args()

    # args.train_folder is where the train data is
    # args.validation_folder is where the validation data is
    train(
        base_directory=args.base_directory,
        epochs=args.epochs,
        batch_size=args.batch_size,
        train_folder=args.train_folder,
        validation_folder=args.validation_folder
    )

Writing penguins-train.py


In [5]:
# Folder that is handed to the estimator as its `source_dir`.
CONFIGURATION_DIRECTORY = "/tmp/training"

# Create the folder and copy the training script into it as `train.py`, which
# is the name the estimator uses as its `entry_point`. The final `ls` only
# lists the folder content to confirm the copy.
!mkdir -p $CONFIGURATION_DIRECTORY
!cp penguins-train.py $CONFIGURATION_DIRECTORY/train.py
!ls $CONFIGURATION_DIRECTORY

train.py


In [None]:
# Values forwarded to the training script as hyperparameters; the names match
# the `--epochs` and `--batch_size` arguments parsed by `train.py`.
hyperparameters = dict(epochs=10, batch_size=32)

# TensorFlow estimator that runs `train.py` from the configuration folder on a
# single instance.
estimator = TensorFlow(
    entry_point="train.py",
    source_dir=CONFIGURATION_DIRECTORY,
    hyperparameters=hyperparameters,
    role=role,
    framework_version="2.4",
    py_version="py37",
    script_mode=True,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=5,
)


# Training step fed by the outputs of the preprocessing step. The step defined
# earlier in this notebook is named `preprocess_step`; its `train` and
# `validation` outputs become the two input channels of the training job.
training_step = TrainingStep(
    name="PenguinsTraining",
    estimator=estimator,
    inputs={
        "train": TrainingInput(
            s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs[
                "train"
            ].S3Output.S3Uri,
            content_type="text/csv"
        ),
        "validation": TrainingInput(
            s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs[
                "validation"
            ].S3Output.S3Uri,
            content_type="text/csv"
        )
    },
)