In [25]:
%%sh
pip -q install pandas scikit-learn joblib

# Vanilla code

In [26]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import sagemaker

data = pd.read_csv('housing.csv')
labels = data[['medv']]
samples = data.drop(['medv'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(samples, labels, test_size=0.1, random_state=123)
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))
joblib.dump(regr, 'model.joblib')

Mean squared error: 41.82
Coefficient of determination: 0.63


['model.joblib']

In [None]:
%%sh
export SM_CHANNEL_TRAINING=training
export SM_MODEL_DIR=model
python sklearn-boston-housing.py --normalize True --test-size 0.1 --training . --model-dir .
ls -l model.joblib

## Run with SageMaker Local Mode

In [None]:
%%sh
pip install -q -t . --upgrade joblib

In [2]:
import sagemaker
from sagemaker.sklearn import SKLearn

training = 'file://.'
output = 'file://.'

role = sagemaker.get_execution_role()
sk = SKLearn(entry_point='sklearn-boston-housing.py',
             framework_version='0.20.0',
             role=role,
             instance_count=1, 
             instance_type='local',
             output_path=output,
             hyperparameters={
                  'normalize': True,
                  'test-size': 0.1
              }
)

sk.fit({'training':training})

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


FileNotFoundError: [Errno 2] No such file or directory: 'sklearn-boston-housing.py'

In [None]:
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='local')

In [None]:
data = pd.read_csv('housing.csv')
payload = data[:10].drop(['medv'], axis=1) 
payload = payload.to_csv(header=False, index=False)
print(payload)

In [None]:
sk_predictor = sk.deploy(initial_instance_count=1, 
                         instance_type='local')

In [None]:
data = pd.read_csv('housing.csv')
payload = data[:10].drop(['medv'], axis=1) 
payload = payload.to_csv(header=False, index=False)

sk_predictor.serializer = sagemaker.serializers.CSVSerializer()
sk_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

response = sk_predictor.predict(payload)

print(response)

In [None]:
sk_predictor.delete_endpoint()

## Run with SageMaker managed infrastructure

In [3]:
import sagemaker
from sagemaker.sklearn import SKLearn

role = sagemaker.get_execution_role()
sess   = sagemaker.Session()
bucket = sess.default_bucket()                     
prefix = 'sklearn-boston-housing'

training = sess.upload_data(path='housing.csv', key_prefix=prefix + "/training")
output   = 's3://{}/{}/output/'.format(bucket,prefix)
print(training)
print(output)

s3://sagemaker-us-east-2-392551634434/sklearn-boston-housing/training/housing.csv
s3://sagemaker-us-east-2-392551634434/sklearn-boston-housing/output/


In [12]:
sk = SKLearn(entry_point='sklearn-boston-housing.py',
             source_dir="./src",
             role=sagemaker.get_execution_role(),
             framework_version='0.20.0',
             instance_count=1, 
             instance_type='ml.m5.large',
             output_path=output,
             hyperparameters={
                  'normalize': True,
                  'test-size': 0.1
              }
)

sk.fit({'training':training})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-01-13-15-29-43-474


2024-01-13 15:29:43 Starting - Starting the training job...
2024-01-13 15:29:57 Starting - Preparing the instances for training......
2024-01-13 15:30:54 Downloading - Downloading input data...
2024-01-13 15:31:24 Downloading - Downloading the training image...
2024-01-13 15:32:00 Training - Training image download completed. Training in progress.[34m2024-01-13 15:32:03,877 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-01-13 15:32:03,881 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-01-13 15:32:03,890 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-01-13 15:32:04,096 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting transformers@ git+https://github.com/abajorat/transformers@main
  Cloning https://github.com/abajorat/trans

In [30]:
sk_predictor = sk.deploy(initial_instance_count=1, instance_type='ml.t2.medium')




INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-01-13-02-03-03-508
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-01-13-02-03-03-508
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-01-13-02-03-03-508


-------!

You can reuse the cells above for prediction.

In [31]:
data = pd.read_csv('housing.csv')
payload = data[:5].drop(['medv'], axis=1) 
payload = payload.to_csv(header=False, index=False)

sk_predictor.serializer = sagemaker.serializers.CSVSerializer()
sk_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

response = sk_predictor.predict(payload)
response

[['29.801388899699845'],
 ['24.990809475886074'],
 ['30.737965445555197'],
 ['28.786967125316536'],
 ['28.14215019919609']]

In [32]:
sk_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-scikit-learn-2024-01-13-02-03-03-508
INFO:sagemaker:Deleting endpoint with name: sagemaker-scikit-learn-2024-01-13-02-03-03-508
