In [1]:
import pandas as pd

# S3 bucket and key prefix under which the input CSV files are stored
bucket = 'titanic-sagemaker'
data_key = 'input'

# Both files share the same prefix, so build it once and append the file names
input_prefix = 's3://{}/{}'.format(bucket, data_key)
train_data_location = input_prefix + '/train.csv'
test_data_location = input_prefix + '/test.csv'

# Load the training and test sets into DataFrames
train_data = pd.read_csv(train_data_location)
test_data = pd.read_csv(test_data_location)

# Show the columns of each frame first, then the first rows of each
for frame in (train_data, test_data):
    print(frame.columns)

for frame in (train_data, test_data):
    print(frame.head())

# Variable Notes

# PassengerId: Unique identifier for the passenger
# Survived: 1 if passenger survived 0 otherwise
# Pclass: 1 = Upper, 2 = Middle, 3 = Lower
# Name: Passenger's name
# Sex: Gender of the passenger denoted by 'male', 'female'
# Age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
# SibSp: No of siblings / spouses aboard the Titanic
# Parch: No of parents / children aboard the Titanic
# Ticket: Ticket number
# Fare: Passenger fare
# Cabin: Passenger cabin
# Embarked: Port of Embarkation; C = Cherbourg, Q = Queenstown, S = Southampton

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket

In [2]:
# Notes on data processing
# 1. SageMaker requires the label to be the first column in the csv training file.
# 2. Since the goal is to illustrate how to use Sagemaker to generate predictions, we will keep things simple
#    and use 'Pclass', and 'Sex' as our predictor variables.

# Step 1: keep only the label (first column, training set only) and the two predictors
train_selected = train_data[['Survived', 'Pclass', 'Sex']]
test_selected = test_data[['Pclass', 'Sex']]

for frame in (train_selected, test_selected):
    print(frame.head())

# Step 2: expand the string-valued 'Sex' column into indicator (dummy) columns
train_encoded = pd.get_dummies(train_selected)
test_encoded = pd.get_dummies(test_selected)

for frame in (train_encoded, test_encoded):
    print(frame.head())

# Step 3: 'Sex_male' is fully determined by 'Sex_female', so drop it to avoid collinearity
processed_train_data = train_encoded.drop(columns=['Sex_male'])
processed_test_data = test_encoded.drop(columns=['Sex_male'])

for frame in (processed_train_data, processed_test_data):
    print(frame.head())




   Survived  Pclass     Sex
0         0       3    male
1         1       1  female
2         1       3  female
3         1       1  female
4         0       3    male
   Pclass     Sex
0       3    male
1       3  female
2       2    male
3       3    male
4       3  female
   Survived  Pclass  Sex_female  Sex_male
0         0       3           0         1
1         1       1           1         0
2         1       3           1         0
3         1       1           1         0
4         0       3           0         1
   Pclass  Sex_female  Sex_male
0       3           0         1
1       3           1         0
2       2           0         1
3       3           0         1
4       3           1         0
   Survived  Pclass  Sex_female
0         0       3           0
1         1       1           1
2         1       3           1
3         1       1           1
4         0       3           0
   Pclass  Sex_female
0       3           0
1       3           1
2       2           0


In [29]:
# Convert data to recordIO-wrapped protobuf

import io
import sagemaker.amazon.common as smac

# The first column holds the label; every remaining column is a feature
label_column = processed_train_data.iloc[:, 0]
feature_columns = processed_train_data.iloc[:, 1:]

# Cast both parts to float32 arrays before serialising them
labels = label_column.values.astype('float32')
features = feature_columns.values.astype('float32')

# Serialise features and labels into an in-memory buffer, then rewind it
# so the next reader starts from the first byte
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, features, labels)
buf.seek(0)

0

In [30]:
import boto3
import os

# Name of the uploaded training object, stored under the same prefix as the CSV inputs
key = 'recordio-titanic-data'

# S3 object keys always use '/' as separator, so build the key with string
# formatting rather than os.path.join (which would emit '\\' on Windows).
# The same key is reused for the URI below so the two can never disagree.
s3_object_key = '{}/{}'.format(data_key, key)

# upload_fileobj reads from the buffer's current position; rewind first so
# re-running this cell does not upload an empty object.
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(s3_object_key).upload_fileobj(buf)

s3_train_data = 's3://{}/{}'.format(bucket, s3_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

# Location passed to the estimator as its output path
output_location = 's3://{}/output'.format(bucket)
print('training artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://titanic-sagemaker/input/recordio-titanic-data
training artifacts will be uploaded to: s3://titanic-sagemaker/output


In [31]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the 'linear-learner' container image for the region of the default boto3 session
region_name = boto3.Session().region_name
container = get_image_uri(region_name, 'linear-learner')

In [32]:
import sagemaker
from sagemaker import get_execution_role

# Execution role and SageMaker session used to launch the training job
role = get_execution_role()
session = sagemaker.Session()

# Estimator for the linear-learner container: one ml.m4.xlarge training
# instance, with artifacts written to output_location
linear_estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session)

# feature_dim must match the number of feature columns written to the
# training file. Derive it from the feature matrix instead of hard-coding 2,
# so changing the predictor columns cannot silently desynchronise the two.
linear_estimator.set_hyperparameters(
    feature_dim=features.shape[1],
    predictor_type='binary_classifier',
    mini_batch_size=200)

# Start training on the recordIO file uploaded to S3
linear_estimator.fit({'train': s3_train_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2018-12-04-15-39-10-680


2018-12-04 15:39:10 Starting - Starting the training job...
2018-12-04 15:39:12 Starting - Launching requested ML instances......
2018-12-04 15:40:16 Starting - Preparing the instances for training......
2018-12-04 15:41:22 Downloading - Downloading input data...
2018-12-04 15:42:07 Training - Training image download completed. Training in progress.
2018-12-04 15:42:07 Uploading - Uploading generated training model
2018-12-04 15:42:07 Completed - Training job completed

[31mDocker entrypoint called with argument(s): train[0m
[31m[12/04/2018 15:41:56 INFO 140680870025024] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_m