# Train

Based on the AWS example notebook [Inference Pipeline with Scikit-learn and Linear Learner](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_inference_pipeline/Inference%20Pipeline%20with%20Scikit-learn%20and%20Linear%20Learner.ipynb).

In [1]:
import sagemaker
from sagemaker import get_execution_role

# Bucket and key prefix used for the uploaded training data and for the
# Linear Learner training output.
bucket = "task-bucket-2021"
prefix = "sagemaker-sklearn-linearlearner"

# Session handed to the S3 upload, the estimators and the predictor below.
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role attached to this Notebook Instance.
role = get_execution_role()

# Upload the training data

In [2]:
# Local directory holding train.csv (and the test.csv read in the endpoint test).
WORK_DIRECTORY = "../data"

# Upload the training CSV under <prefix>/train in the bucket; the returned
# value is what the later fit/transform calls receive as their input.
train_input = sagemaker_session.upload_data(
    f"{WORK_DIRECTORY}/train.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/train",
)

# Create a SageMaker Scikit-learn Estimator for preprocessing

In [41]:
from sagemaker.sklearn.estimator import SKLearn

# Scikit-learn framework version requested from SageMaker, and the
# entry-point script that the estimator runs.
FRAMEWORK_VERSION = "0.23-1"
script_path = "../preprocess/preprocess.py"

# Estimator wrapping the preprocessing script; fitted in the next cell.
sklearn_preprocessor = SKLearn(
    sagemaker_session=sagemaker_session,
    role=role,
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.m4.xlarge",
)

In [42]:
# Fit the preprocessor on the uploaded training CSV, exposed to the
# script as the "train" channel.
sklearn_preprocessor.fit(inputs={"train": train_input})

2021-07-18 22:01:58 Starting - Starting the training job...
2021-07-18 22:02:21 Starting - Launching requested ML instancesProfilerReport-1626645718: InProgress
...
2021-07-18 22:02:49 Starting - Preparing the instances for training.........
2021-07-18 22:04:22 Downloading - Downloading input data...
2021-07-18 22:04:42 Training - Downloading the training image...
2021-07-18 22:05:22 Uploading - Uploading generated training model
2021-07-18 22:05:22 Completed - Training job completed
[34m2021-07-18 22:05:10,193 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-07-18 22:05:10,197 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-18 22:05:10,208 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-07-18 22:05:10,563 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-18 22:05:12,003 sagemaker-training-to

In [43]:
# Build a batch Transformer from the fitted SKLearn estimator: a single
# ml.m4.xlarge instance, one record per request, CSV output joined by line.
transformer = sklearn_preprocessor.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    strategy="SingleRecord",
    assemble_with="Line",
    accept="text/csv",
    max_payload=20,
)

In [44]:
# Run the fitted preprocessor over the uploaded training CSV as a batch
# transform job and block until it has finished.
transformer.transform(data=train_input, content_type="text/csv")
print(f"Waiting for transform job: {transformer.latest_transform_job.job_name}")
transformer.wait()

# Output location of the transform job; used as the Linear Learner
# training input further down.
preprocessed_train = transformer.output_path

...............................[34m2021-07-18 22:10:38,875 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-18 22:10:38,878 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-07-18 22:10:38,879 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redire

UnexpectedStatusException: Error for Transform job sagemaker-scikit-learn-2021-07-18-22-05-40-395: Failed. Reason: AlgorithmError: See job logs for more information

# Fit a LinearLearner Model with the preprocessed data

In [None]:
import boto3
from sagemaker.image_uris import retrieve

# Look up the Linear Learner container image for the region of the
# current boto3 session.
ll_image = retrieve(framework="linear-learner", region=boto3.Session().region_name)

In [None]:
# Linear Learner output goes to s3://<bucket>/<prefix>/ll_training_output/ll_model.
s3_ll_output_key_prefix = "ll_training_output"
s3_ll_output_location = f"s3://{bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_model"

# Generic Estimator running the Linear Learner image on one ml.m4.xlarge instance.
ll_estimator = sagemaker.estimator.Estimator(
    image_uri=ll_image,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    max_run=3600,
    input_mode="File",
    output_path=s3_ll_output_location,
    sagemaker_session=sagemaker_session,
)

# To reduce overfitting due to multicollinearity, set L2 regularization (wd).
# NOTE(review): feature_dim=171 presumably equals the number of columns the
# preprocessor emits -- confirm against preprocess.py.
ll_estimator.set_hyperparameters(
    predictor_type="binary_classifier",
    feature_dim=171,
    wd=1.0,
)

# Training channel pointing at the batch-transform output, read as CSV.
ll_train_data = sagemaker.inputs.TrainingInput(
    s3_data=preprocessed_train,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)

data_channels = {"train": ll_train_data}
ll_estimator.fit(inputs=data_channels, logs=True)

# Create an Inference Pipeline with the Scikit-learn preprocessor and Linear Learner

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime, strftime

# A single UTC timestamp is shared by the model name and the endpoint name.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"inference-pipeline-{timestamp_prefix}"
endpoint_name = f"inference-pipeline-ep-{timestamp_prefix}"

# One model per pipeline stage, created from the two fitted estimators.
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

# Chain the stages in order: preprocessor first, Linear Learner second.
sm_model = PipelineModel(
    name=model_name,
    role=role,
    models=[scikit_learn_inferencee_model, linear_learner_model],
)

# Deploy the pipeline to a single-instance endpoint.
sm_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

# Test the endpoint

In [None]:
import json

import numpy as np
import pandas as pd
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

# Client for the deployed pipeline endpoint. Requests are sent as CSV; no
# deserializer is configured, so responses are decoded explicitly below.
predictor = Predictor(
    endpoint_name=endpoint_name, sagemaker_session=sagemaker_session, serializer=CSVSerializer()
)

# Load the held-out data and separate the features from the label. The
# 'uuid' column and the label itself are excluded from the model input.
test = pd.read_csv("{}/{}".format(WORK_DIRECTORY, "test.csv"))
label = 'default'
variables = [x for x in test.columns if x != 'uuid' and x != label]
test_X, test_y = test[variables], test[label].values

# Score the test set in up to 100 chunks, one request per chunk.
predictions = []
for array in np.array_split(test_X, 100):
    if len(array) == 0:
        # array_split yields empty chunks when there are fewer than 100 rows.
        continue
    # Send plain CSV text: CSVSerializer probes data[0], which on a DataFrame
    # is a column lookup rather than a row access, so the frame itself must
    # not be passed. NOTE(review): header is omitted -- confirm preprocess.py
    # expects header-less rows at inference time.
    payload = array.to_csv(header=False, index=False)
    raw = predictor.predict(payload)
    # Without a deserializer the response arrives undecoded; parse the JSON
    # body before indexing into it.
    result = json.loads(raw) if isinstance(raw, (bytes, bytearray, str)) else raw
    predictions.extend(r["predicted_label"] for r in result["predictions"])

predictions = np.array(predictions)

In [None]:
from sklearn.metrics import classification_report

# Compare the endpoint's predicted labels with the true test labels.
print(classification_report(y_true=test_y, y_pred=predictions))