# Package SKLearn Model for SageMaker MME with DJL
Example 1 : Train, evaluate, and package a scikit-learn
Linear Regression model for deployment on a SageMaker Multi-Model Endpoint with DJL.

Read in essential static variables used across notebooks from the store. These values are set in notebook 00.

In [None]:
%store -r

## Step 1: Define and train the example model

### Import required modules for model training and evaluation

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

### Generate a synthetic dataset for training and evaluation

In [None]:
# Synthetic data: 100 points on the line y = 2x + 1 plus Gaussian noise scaled by 0.1
np.random.seed(0)  # fixed seed keeps the generated data reproducible
X = np.random.rand(100, 1)
noise = 0.1 * np.random.randn(100, 1)
y = 2 * X + 1 + noise

# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Take a look at the example data 

In [None]:
# Peek at the first two feature rows of the test split
X_test[:2]

In [None]:
# Peek at the first two target values of the test split
y_test[:2]

### Create the model
In this case we're using a Linear Regression model.

In [None]:
# Unfitted ordinary least-squares estimator created with scikit-learn's default settings
model = LinearRegression()

### Train the model

In [None]:
# Fit on the training split only; the held-out test split is used afterwards for evaluation
model.fit(X_train, y_train)

### Evaluate the model

In [None]:
y_pred = model.predict(X_test)

# accuracy_score is a classification metric and rejects continuous targets,
# so report regression metrics instead: mean squared error and R^2.
mse = float(np.mean((y_test - y_pred) ** 2))
r2 = model.score(X_test, y_test)  # coefficient of determination on the test split
print("Mean squared error:", mse)
print("R^2 score:", r2)

### Run prediction on the first two test rows
And output the raw results - for quick comparison when we deploy the model to a SageMaker endpoint

In [None]:
# Predict on the first two held-out rows and display the raw result
sample_rows = X_test[:2]
y_pred = model.predict(sample_rows)
y_pred

## Step 2: Export the trained model

Export the model to joblib format.
To make sure it was saved correctly, we then reload the model and run inference on a couple of test cases.

In [None]:
import os
import joblib

### Make sure we're starting from a known place in the filesystem

In [None]:
%cd ~/SageMaker

### Export the model

In [None]:
# Create ./models/linear-regression (relative to the current directory) if it is missing
target_dir = "linear-regression"
target_path = f"./models/{target_dir}"

# exist_ok=True makes the call idempotent and removes the check-then-create race
os.makedirs(target_path, exist_ok=True)

In [None]:
%cd $target_path

In [None]:
# Serialize the fitted estimator into the current directory
model_filename = "model.joblib"
joblib.dump(model, filename=model_filename)

### Test the exported model
Create a new instance of the model from the saved file and test it with a subset of the test data

In [None]:
# Reload the estimator from the file written by joblib.dump to confirm it round-trips
serialized_model = joblib.load(model_filename)

In [None]:
# Sample inference: run the reloaded model on the first two test rows
payload = X_test[:2]
reloaded_predictions = serialized_model.predict(payload)
sm_y_pred = reloaded_predictions.tolist()
sm_y_pred

In [None]:
# Compare within floating-point tolerance and reduce to a single True/False;
# `list == ndarray` would instead yield an element-wise array of exact-equality results.
np.allclose(sm_y_pred, y_pred)

## Step 3: Deep Java Library (DJL) artifact creation

We now have our model artifact, but we need the following for our DJL Serving Engine

model.py: Inference script with custom model loading and pre/post-processing code

requirements.txt: Additional dependencies; in this case we need to install joblib and scikit-learn (NumPy is pulled in as a scikit-learn dependency)

serving.properties: Environment variables for DJL Serving, can adjust number of workers here

In [None]:
%%writefile model.py
#!/usr/bin/env python
#
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file
# except in compliance with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.

import logging
import numpy as np
import time
import os
import joblib
from djl_python import Input
from djl_python import Output


class SKLearnRegressor(object):
    """
    DJL Serving handler that loads a joblib-serialized scikit-learn model
    and answers JSON prediction requests with it.
    """

    # File name of the serialized estimator expected next to this script.
    MODEL_FILENAME = "model.joblib"

    def __init__(self):
        # Set to True by initialize() once the model has been loaded.
        self.initialized = False

    @staticmethod
    def _locate_artifact(properties):
        """
        Return the path of the model artifact, or None when it cannot be found.

        The current working directory is checked first, then the directory
        named by the optional "model_dir" property.

        :param properties: dict of serving properties, or None
        :return: path to the artifact, or None
        """
        candidates = [SKLearnRegressor.MODEL_FILENAME]
        # NOTE(review): assumes the serving engine supplies the model folder
        # under the "model_dir" key - confirm against the DJL Serving version in use.
        model_dir = (properties or {}).get("model_dir")
        if model_dir:
            candidates.append(os.path.join(model_dir, SKLearnRegressor.MODEL_FILENAME))
        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate
        return None

    def initialize(self, properties: dict):
        """
        Initialize model.

        :param properties: serving properties; only the optional "model_dir"
            entry is read
        :raises ValueError: if no model.joblib artifact can be found
        """
        print("SKLearnRegressor: initialize: " + str(os.listdir()))
        model_path = self._locate_artifact(properties)
        if model_path is None:
            raise ValueError("Expecting a model.joblib artifact for SKLearn Model Loading")
        # joblib.load unpickles the file and can therefore execute arbitrary
        # code; only package artifacts that come from a trusted source.
        self.model = joblib.load(model_path)
        print("SKLearnRegressor: model loaded during initialization")
        self.initialized = True

    def inference(self, inputs):
        """
        Custom service entry point function.

        The request body is decoded as JSON and handed to the model's
        predict(); sample input: [[0.5]]

        :param inputs: the Input object whose JSON payload holds the feature rows
        :return: the Output object to be sent back; on failure it carries the
            error message instead of a prediction
        """
        try:
            data = inputs.get_as_json()
            print(f"SKLearnRegressor: inference input data type : {type(data)}")
            print(f"SKLearnRegressor: inference input data: {data}")

            # NOTE(review): only the first element of the prediction list is
            # returned, so extra rows in a multi-row request are dropped.
            res = self.model.predict(data).tolist()[0]
            outputs = Output()
            outputs.add_as_json(res)

        except Exception as e:
            # Boundary handler: report the failure to the caller rather than
            # letting the exception escape into the serving engine.
            print(f"SKLearnRegressor: inference failed: {str(e)}")
            logging.exception("SKLearnRegressor: inference failed: %s", e)
            outputs = Output().error(str(e))

        print(f"SKLearnRegressor: inference output data type: {type(outputs)}")
        print(f"SKLearnRegressor: inference output data: {outputs}")

        return outputs


# Module-level handler instance reused by every handle() call, so initialize() only runs on the first one
_service = SKLearnRegressor()


def handle(inputs: Input):
    """
    Default handler function

    Loads the model on the first call, then delegates to the service.

    :param inputs: the Input object for this request
    :return: None for an empty request, otherwise the Output produced by inference
    """
    # Lazy one-time setup: the loaded model lives on the module-level service (stateful model)
    if not _service.initialized:
        _service.initialize(inputs.get_properties())

    # An empty request carries nothing to predict on
    return None if inputs.is_empty() else _service.inference(inputs)

In [None]:
%%writefile requirements.txt
joblib
scikit-learn==1.5.0

In [None]:
%%writefile serving.properties
engine=Python
# idle time in seconds before the worker thread is scaled down, the default is 60 seconds
max_idle_time=600

### Tarball Creation

In [None]:
# Build tar file with model data + inference code, replace this cell with your model.joblib
import subprocess

bashCommand = "tar -cvpzf model.tar.gz model.joblib requirements.txt model.py serving.properties"
# check=True raises CalledProcessError when tar exits non-zero, so a missing input
# file stops the notebook here instead of letting a broken archive be uploaded.
process = subprocess.run(bashCommand.split(), stdout=subprocess.PIPE, check=True)
# stderr is not captured (it goes straight to the notebook output), so error stays None
output, error = process.stdout, process.stderr

### Upload the tarball to target location on Amazon S3

In [None]:
from boto3 import client as boto3_client

# S3 client created with only the service name; no explicit region or credentials are passed here
s3_client = boto3_client("s3")

In [None]:
# Upload the tarball to Amazon S3, where it will be used to back model requests
object_key = f"{s3_model_prefix}/{lr_model_reference_name}"
with open("model.tar.gz", "rb") as tarball:
    s3_client.upload_fileobj(Fileobj=tarball, Bucket=bucket, Key=object_key)

In [None]:
# Verify that the tarball was saved to the target location
!aws s3 ls {mme_artifacts}