In [24]:
import os
import sagemaker
from sagemaker.transformer import Transformer
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel

SETTINGS

In [17]:
# Specify the S3 path where the trained model is stored
model_data = 's3://sagemaker-eu-west-1-211125740051/trainin-job-simple-03-2024-07-01-13-12-35-409/output/model.tar.gz' # This path can be retrieved from the training job
# S3 prefix passed as `data` to the batch transform job
input_path = 's3://sagemaker-bucket-ds/training-jobs/data/inference_input/'
# S3 prefix passed as `output_path` to the transformer (where the results are saved)
output_path = 's3://sagemaker-bucket-ds/training-jobs/data/inference_output/'

DELETE OUTPUT DATA

In [18]:
!aws s3 rm s3://sagemaker-bucket-ds/training-jobs/data/inference_output/ --recursive

CREATE BASIC OBJECTS

In [19]:
# SageMaker session and execution role; both are passed to the SKLearnModel below.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

CREATE ENTRY POINT

The scikit-learn Docker containers do not provide a default function for loading the model.
That is why we have to define this function (`model_fn`) ourselves. Its signature must match the one below!

In [25]:
os.makedirs("07_batch_inference", exist_ok=True) # Create folder for the inference entry-point script

In [30]:
%%writefile 07_batch_inference/start_file.py

from __future__ import print_function

import argparse
import joblib
import os
import pandas as pd

from sklearn.linear_model import LogisticRegression

# There is no default function to load the model
# Without this function the job will fail!
def model_fn(model_dir):
    """Load and return the fitted model stored in ``model_dir``.

    The model is read with joblib from the file ``model.joblib`` inside
    ``model_dir``. That file name has to be the same one that was used when
    the model was serialized.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

# There is a default function to calculate the predictions, but it
# returns the predicted class (0/1) instead of the probability.
# That is why we override it with a custom function.
def predict_fn(input_data, model):
    """Return the output of ``model.predict_proba`` for ``input_data``.

    The result of ``predict_proba`` is passed back unchanged.
    """
    return model.predict_proba(input_data)

Overwriting 07_batch_inference/start_file.py


CREATE SKLEARN MODEL

In [31]:
# Wrap the trained model artifact together with the inference script
sklearn_model = SKLearnModel(
    role=role,
    sagemaker_session=sagemaker_session,
    model_data=model_data,
    framework_version='1.2-1',  # Replace with the appropriate sklearn version
    source_dir='07_batch_inference', # The folder with the inference code
    entry_point='start_file.py' # The script that defines model_fn and predict_fn
)

CREATE TRANSFORMER

In [32]:
# Build the batch transform object from the model
transformer = sklearn_model.transformer(
    output_path=output_path, # Where the results will be saved
    instance_type='ml.m5.large',
    instance_count=1
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-07-01-14-42-23-782


CALCULATE PREDICTION

In [33]:
# Launch the batch transform job
transformer.transform(
    split_type='Line', # Each line equals one observation
    content_type='text/csv', # Necessary because csv is not the default format
    data=input_path # Path where the input is stored
)

# Block until the transform job has completed
transformer.wait()

INFO:sagemaker:Creating transform job with name: sagemaker-scikit-learn-2024-07-01-14-42-25-502


...............................[34m2024-07-01 14:47:40,788 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-01 14:47:40,792 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-01 14:47:40,793 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;