# renfe-guru sagemaker example

## 0. python general imports

In [1]:
import pandas as pd
import logging
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def get_logger():
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    return logger

logger = get_logger()

logger.info("hi!")

2019-09-26 13:20:28,799 - root - INFO - hi!


## 1. data loading

the dataset can be downloaded here: https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing

In [3]:
renfe = pd.read_parquet('../data/raw/renfe.parquet')  # about 60MB in .parquet file, but 3.5Gb in memory, be careful!

In [4]:
renfe.head()

Unnamed: 0,insert_date,origin,destination,start_date,end_date,train_type,price,train_class,fare
0,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 13:40:00,2019-08-29 16:10:00,AVE,47.3,Turista,Promo
1,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 14:45:00,2019-08-29 17:15:00,AVE,53.4,Turista,Promo
2,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 14:58:00,2019-08-29 17:50:00,ALVIA,,Preferente,Promo
3,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 15:45:00,2019-08-29 18:15:00,AVE,61.45,Preferente,Promo
4,2019-08-21 03:42:10,SEVILLA,MADRID,2019-08-29 16:45:00,2019-08-29 19:17:00,AVE,60.3,Turista,Promo


In [5]:
renfe.dtypes

insert_date    datetime64[ns]
origin                 object
destination            object
start_date     datetime64[ns]
end_date       datetime64[ns]
train_type             object
price                 float64
train_class            object
fare                   object
dtype: object

In [6]:
renfe.info()  # with deep memory usage will take a while...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10800510 entries, 0 to 10800509
Data columns (total 9 columns):
insert_date    datetime64[ns]
origin         object
destination    object
start_date     datetime64[ns]
end_date       datetime64[ns]
train_type     object
price          float64
train_class    object
fare           object
dtypes: datetime64[ns](3), float64(1), object(5)
memory usage: 741.6+ MB


## 2. data wrangling

first of all, null values will be dropped. all null values are due to:
- scrapping errors, specially at the beggining of the process
- trains with no ticket available (usually full, canceled, etc.)

In [7]:
# filtering null values, inplace to modify original df

renfe.dropna(inplace=True)

as the goal is to predict ticket price in advance, some interesting features can be derived:
- trip duration (in hours)
- time to departure (in days)
- hour of departure (24h)
- week day of departure

In [8]:
# feature engineering / generation

def add_features(renfe_df):

    renfe_df['duration'] = (renfe_df['end_date'] - renfe_df['start_date']).dt.seconds / 3600
    renfe_df['time_to_departure'] = (renfe_df['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \
                                   - renfe_df['insert_date'].dt.tz_localize('UTC')).dt.days
    renfe_df['hour'] = renfe_df['start_date'].dt.hour
    renfe_df['weekday'] = renfe_df['start_date'].dt.dayofweek

add_features(renfe)

following, perform train - validation - test splits:

In [9]:
# train - test split

from sklearn.model_selection import train_test_split

renfe_train_validation, renfe_test = train_test_split(renfe)
renfe_train, renfe_validation = train_test_split(renfe_train_validation)

# to avoid chained assignment 'pandas warning'

renfe_train = renfe_train.copy()
renfe_validation = renfe_validation.copy()
renfe_test = renfe_test.copy()

In [10]:
logger.info(f'n obs in training set are: {renfe_train.shape[0]}')
logger.info(f'n obs in validation set are: {renfe_validation.shape[0]}')
logger.info(f'n obs in test set are: {renfe_test.shape[0]}')

2019-09-26 13:21:06,323 - root - INFO - n obs in training set are: 5699503
2019-09-26 13:21:06,325 - root - INFO - n obs in validation set are: 1899835
2019-09-26 13:21:06,326 - root - INFO - n obs in test set are: 2533113


data looks like this so far:

In [11]:
renfe_train.head().T

Unnamed: 0,4014038,1500573,4233318,6871655,10472288
insert_date,2019-04-17 13:23:18,2019-09-05 13:02:06,2019-04-19 05:04:09,2019-05-21 21:50:01,2019-08-16 19:45:56
origin,VALENCIA,MADRID,MADRID,BARCELONA,PONFERRADA
destination,MADRID,VALENCIA,SEVILLA,MADRID,MADRID
start_date,2019-05-24 10:40:00,2019-10-28 16:05:00,2019-05-08 19:30:00,2019-07-14 16:25:00,2019-09-04 06:11:00
end_date,2019-05-24 12:32:00,2019-10-28 22:47:00,2019-05-08 22:05:00,2019-07-14 18:55:00,2019-09-04 10:15:00
train_type,AVE,REGIONAL,AVE,AVE,ALVIA
price,33.65,28.35,53.4,88.95,33.5
train_class,Turista,Turista,Turista,Turista,Turista
fare,Promo,Adulto ida,Promo,Promo,Promo
duration,1.86667,6.7,2.58333,2.5,4.06667


following, there are some categorical columns that have to be encoded (most ML algorithms will need that):

In [12]:
# preprocessing

from sklearn.preprocessing import OrdinalEncoder
import joblib

encode_cols = ['train_type', 'train_class', 'fare', 'origin', 'destination']
encoder = OrdinalEncoder()
encoder.fit(renfe[encode_cols])  # warning, it should be fit only on training data!
joblib.dump(encoder, '../output/pickle_data/encoder.joblib')

for split, df in {'training': renfe_train, 
                  'validation': renfe_validation, 
                  'test': renfe_test}.items():
    logger.info(f'transforming {split} set...')
    df.loc[:,encode_cols] = encoder.transform(df.loc[:,encode_cols])

2019-09-26 13:21:09,363 - root - INFO - transforming training set...
2019-09-26 13:21:27,345 - root - INFO - transforming validation set...
2019-09-26 13:21:32,909 - root - INFO - transforming test set...


with those columns encoded, data looks like this:

In [13]:
renfe_train.head().T

Unnamed: 0,4014038,1500573,4233318,6871655,10472288
insert_date,2019-04-17 13:23:18,2019-09-05 13:02:06,2019-04-19 05:04:09,2019-05-21 21:50:01,2019-08-16 19:45:56
origin,5,2,2,0,3
destination,2,5,4,2,2
start_date,2019-05-24 10:40:00,2019-10-28 16:05:00,2019-05-08 19:30:00,2019-07-14 16:25:00,2019-09-04 06:11:00
end_date,2019-05-24 12:32:00,2019-10-28 22:47:00,2019-05-08 22:05:00,2019-07-14 18:55:00,2019-09-04 10:15:00
train_type,2,13,2,2,0
price,33.65,28.35,53.4,88.95,33.5
train_class,4,4,4,4,4
fare,8,1,8,8,8
duration,1.86667,6.7,2.58333,2.5,4.06667


## 3. upload data to S3

to use sagemaker using aws apis (sagemaker or boto) data must be formated in a particular way and stored in aws s3

In [14]:
# target must be in the first position of csv columns for xgboost via sagemaker API

target = 'price'
features = ['train_type', 'train_class', 'fare', 'duration', 'time_to_departure', 'hour', 'weekday']

renfe_train[[target] + features].head()

Unnamed: 0,price,train_type,train_class,fare,duration,time_to_departure,hour,weekday
4014038,33.65,2.0,4.0,8.0,1.866667,36,10,4
1500573,28.35,13.0,4.0,1.0,6.7,53,16,0
4233318,53.4,2.0,4.0,8.0,2.583333,19,19,2
6871655,88.95,2.0,4.0,8.0,2.5,53,16,6
10472288,33.5,0.0,4.0,8.0,4.066667,18,6,2


data must be pushed to s3, aws credentials must be properly set for this purpose (`/home/user/.aws/credentials`).

In [15]:
BUCKET = 'ml-in-production-madrid-sagemaker'

s3_train = 's3://' + BUCKET + '/' + 'train'
s3_validation = 's3://' + BUCKET + '/' + 'validation'
s3_model_output = 's3://' + BUCKET + '/' + 'model'

renfe_train[[target] + features].to_csv(s3_train + '/train.csv', index=False, header=False)  # .csv file without header
renfe_validation[[target] + features].to_csv(s3_validation + '/validation.csv', index=False, header=False)

## 4. train model with sagemaker api

first, using sagemaker api for xgboost image, can launch a model training job using the following code. please note that `role` and `region` information must be specified. can be fetched from aws programatically or hardcoded. __please check your aws console while training__

In [19]:
from datetime import datetime
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator

ROLE = 'arn:aws:iam::090554204572:role/service-role/AmazonSageMaker-ExecutionRole-20190610T164237'
REGION = 'eu-west-1'
TRAINING_JOB_NAME = 'ml-in-production-madrid-sgemaker-api'

train_channel = sagemaker.session.s3_input(s3_train, content_type='text/csv')
valid_channel = sagemaker.session.s3_input(s3_validation, content_type='text/csv')

data_channels = {'train': train_channel, 
                 'validation': valid_channel}

container = get_image_uri(REGION, 'xgboost', '0.90-1')

xgb_model = Estimator(container,
                      ROLE, 
                      train_instance_count=1, 
                      train_instance_type='ml.m4.xlarge',
                      train_volume_size = 5,
                      output_path=s3_model_output,
                      sagemaker_session=sagemaker.Session()
                     )

xgb_model.set_hyperparameters(max_depth = 4,
                              eta = .2,
                              gamma = 4,
                              min_child_weight = 8,
                              silent = 0,
                              objective = "reg:linear",
                              num_round = 8)

xgb_model.fit(inputs=data_channels, 
              logs=True, 
              job_name=TRAINING_JOB_NAME + \
              '-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
)

2019-09-26 13:47:03,272 - sagemaker - INFO - Creating training-job with name: ml-in-production-madrid-sgemaker-api-2019-09-26-13-47-03


2019-09-26 13:47:03 Starting - Starting the training job...
2019-09-26 13:47:06 Starting - Launching requested ML instances...
2019-09-26 13:48:01 Starting - Preparing the instances for training......
2019-09-26 13:49:05 Downloading - Downloading input data
2019-09-26 13:49:05 Training - Downloading the training image...
2019-09-26 13:49:24 Training - Training image download completed. Training in progress.[31mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[31mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[31mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31

## 5. train model with boto3 api

a training job can be created using boto3 __and not sagemaker api__. a dictionary with all details must be specified:

In [23]:
import boto3
import time

TRAINING_JOB_NAME = 'ml-in-production-madrid-boto3-api' \
                    + '-' + datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": ROLE,
    "OutputDataConfig": {
        "S3OutputPath": s3_model_output
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": TRAINING_JOB_NAME,
    "HyperParameters": {
        "max_depth":"4",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"4",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"8"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_train,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_validation,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None"
        }
    ]
}

client = boto3.client('sagemaker', region_name=REGION)
client.create_training_job(**create_training_params)
status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']

# this loop will query status until completed, there is no more info available, go to aws console for more...
while status !='Completed' and status!='Failed':
    time.sleep(16)
    status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']
    logger.info('training job created with boto3 api is:' + status)

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


## 6. serving model with sagemaker api

to deploy a model and create an endpoint using sagemaker api, `deploy` method of sagemaker estimator can be used. takes some time... go grab some cofee!

In [26]:
ENDPOINT_NAME = 'ml-in-production-madrid-sagemaker-api-endpoint'
MODEL_NAME = 'ml-in-production-madrid-sagemaker-api-model'

xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium',
                                 endpoint_name=ENDPOINT_NAME,
                                 model_name=MODEL_NAME)

## 7. serving model with boto3 api

not straightforward, involves 3 steps, too low level to explain here, does not worth the pain having sagemaker api and mlflow... let's try mlflow with an sklearn model instead :-D

## 8. invoking endpoint (just boto3 api option)

In [27]:
runtime = boto3.client('runtime.sagemaker')

logger.info('getting a sample of 100 elements from test split')
test_sample = renfe_test.sample(100)

logger.info('calling endpoint...')
response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                   Body=test_sample[features].to_csv(header=False, index=False))  # data must be passed as .csv (string)
display(response)
y_pred = list(map(lambda x: float(x), response['Body'].read().decode().split(',')))  # result is a string and must be parsed

2019-09-26 14:08:13,386 - root - INFO - getting a sample of 100 elements from test split
2019-09-26 14:08:13,498 - root - INFO - calling endpoint...


{'ResponseMetadata': {'RequestId': '047786de-f29f-484e-8b60-556d2f5c72d5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '047786de-f29f-484e-8b60-556d2f5c72d5',
   'x-amzn-invoked-production-variant': 'AllTraffic',
   'date': 'Thu, 26 Sep 2019 14:08:14 GMT',
   'content-type': 'text/csv; charset=utf-8',
   'content-length': '1842'},
  'RetryAttempts': 0},
 'ContentType': 'text/csv; charset=utf-8',
 'InvokedProductionVariant': 'AllTraffic',
 'Body': <botocore.response.StreamingBody at 0x7f8cfc9ac690>}

let's compare result with reality:

In [32]:
from sklearn.metrics import mean_absolute_error
y_true = test_sample['price']
logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

2019-09-26 14:10:12,971 - root - INFO - mae for xbgoost model is: 11.515720779418944


Unnamed: 0,y_true,y_pred
668489,49.55,62.541649
9449386,45.80,38.042965
586448,85.10,62.541649
1431351,45.30,34.568508
2338501,85.10,62.541649
...,...,...
5341749,28.35,24.286119
354498,100.40,61.160225
8479041,85.10,62.541649
6837888,76.30,67.757019


## 9. deploy model using mlflow the easy way (sklearn version)

In [54]:
from sklearn.ensemble import RandomForestRegressor

X = renfe_train[features]
y = renfe_train[target]

rf = RandomForestRegressor(n_estimators=256, 
                           n_jobs=32,  # adapt to your processor(s)
                           verbose=1,
                           max_depth=8)  # limit max depth to keep serialized model under 100MB (or it will be unable to deploy in AWS)
rf.fit(X, y)

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:  2.7min
[Parallel(n_jobs=32)]: Done 256 out of 256 | elapsed:  4.4min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=256, n_jobs=32,
                      oob_score=False, random_state=None, verbose=1,
                      warm_start=False)

check results for this model:

In [57]:
y_pred=rf.predict(test_sample[features])
y_true=test_sample['price']

logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 256 out of 256 | elapsed:    0.1s finished
2019-09-26 14:55:48,768 - root - INFO - mae for xbgoost model is: 6.751278555302093


Unnamed: 0,y_true,y_pred
668489,49.55,75.643374
9449386,45.80,45.800000
586448,85.10,75.643374
1431351,45.30,42.568147
2338501,85.10,75.643374
...,...,...
5341749,28.35,28.350000
354498,100.40,94.133290
8479041,85.10,75.643374
6837888,76.30,76.984304


In [39]:
import mlflow.sklearn

MODEL_PATH = '../output/price_pred_model'
!rm -rf $MODEL_PATH  # '!' can be used to execute bash commands in jupyter cells

mlflow.sklearn.save_model(sk_model=rf, path=MODEL_PATH)
logger.info("model saved!")

2019-09-26 14:23:43,825 - root - INFO - model saved!


using `mlflow.sagemaker` module, this model can be deployed directly in AWS, with just one line of code...

In [46]:
import mlflow.sagemaker

ENDPOINT_NAME = 'ml-in-prod-mad-mlf-api-ep'

mlflow.sagemaker.deploy(app_name=ENDPOINT_NAME, 
                         model_uri=MODEL_PATH, 
                         execution_role_arn=ROLE, 
                         bucket=BUCKET,
                         region_name=REGION, 
                         mode='create',  # try 'replace'
                         instance_type='ml.t2.medium', 
                         instance_count=1)

2019/09/26 14:30:53 INFO mlflow.sagemaker: Using the python_function flavor for deployment!
2019/09/26 14:30:54 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': 'D677046E82A7F687', 'HostId': '/Pz7uHFsEwkCO7dIjL+VZie+kqgRZBZ3mtseIIj8RCzShyVD8+o8rlyspi8Ka4+LvQQdnpRq9ew=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '/Pz7uHFsEwkCO7dIjL+VZie+kqgRZBZ3mtseIIj8RCzShyVD8+o8rlyspi8Ka4+LvQQdnpRq9ew=', 'x-amz-request-id': 'D677046E82A7F687', 'date': 'Thu, 26 Sep 2019 14:30:55 GMT', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}
2019/09/26 14:30:54 INFO mlflow.sagemaker: Creating new endpoint with name: ml-in-prod-mad-mlf-api-ep ...
2019/09/26 14:30:54 INFO mlflow.sagemaker: Created model with arn: arn:aws:sagemaker:eu-west-1:090554204572:model/ml-in-prod-mad-mlf-api-ep-model-tkd7gkgbsj6tshwhvjp2cw
2019/09/26 14:30:54 INFO mlflow.sagemaker: Created endpoint configuration with arn: arn:aws:sagemaker:eu-west-1:090554204572:endpoint-config/ml-in-pro

endpoint can be invoked the usual way, using boto3. authenticating against aws using plain curl or requests is much harder than just store its credentials and using aws sdk:

In [None]:
import json
response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                   ContentType='application/json',
                                   Body=test_sample[features].to_json(orient='split'))

y_pred = json.loads(response['Body'].read().decode())
y_true = test_sample[target]

logger.info(f"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}")

display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))

## 10. references

- https://www.mlflow.org/docs/latest/models.html#built-in-deployment-tools
- https://aws.amazon.com/sagemaker/features/