In [1]:
import pandas as pd
import boto3

# Where the raw Titanic CSV files live in S3.
bucket = 'titanic-sagemaker'
data_key = 'input'

# Full s3:// URIs of the two raw files (same bucket, same key prefix).
train_data_location = 's3://%s/%s/%s' % (bucket, data_key, 'train.csv')
test_data_location = 's3://%s/%s/%s' % (bucket, data_key, 'test.csv')

# Read the training file first, then the test file, straight from S3.
train_data, test_data = (
    pd.read_csv(location)
    for location in (train_data_location, test_data_location)
)

# Column index of each frame, training frame first.
print(train_data.columns, test_data.columns, sep='\n')

# First five rows of each frame, training frame first.
print(train_data.head(), test_data.head(), sep='\n')

# Variable Notes

# PassengerId: Unique identifier for the passenger
# Survived: 1 if passenger survived 0 otherwise
# Pclass: 1 = Upper, 2 = Middle, 3 = Lower
# Name: Passenger's name
# Sex: Gender of the passenger denoted by 'male', 'female'
# Age: Age is fractional if less than 1. If the age is estimated, it is in the form xx.5
# SibSp: Number of siblings / spouses aboard the Titanic
# Parch: Number of parents / children aboard the Titanic
# Ticket: Ticket number
# Fare: Passenger fare
# Cabin: Passenger cabin
# Embarked: Port of Embarkation; C = Cherbourg, Q = Queenstown, S = Southampton

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket

In [29]:
# Notes on data processing
# 1. SageMaker requires the label to be the first column in the CSV training file.
# 2. Since the goal is to illustrate how to use Sagemaker to generate predictions, we will keep things simple
#    and use 'Pclass', and 'Sex' as our predictor variables.

# Extract relevant columns ('Survived', the label, stays first in the training frame)
processed_train_data = train_data[['Survived', 'Pclass', 'Sex']]
processed_test_data = test_data[['Pclass', 'Sex']]

print(processed_train_data.head())
print(processed_test_data.head())

# Code 'Sex' into dummy variables.
# get_dummies returns boolean columns on pandas >= 2.0 (uint8 on older releases); cast the
# indicator columns to int so they are always printed and written to CSV as 0/1 rather
# than True/False. Only the dummy columns are cast; 'Survived' and 'Pclass' are untouched.
sex_dummy_dtypes = {'Sex_female': int, 'Sex_male': int}
processed_train_data = pd.get_dummies(processed_train_data).astype(sex_dummy_dtypes)
processed_test_data = pd.get_dummies(processed_test_data).astype(sex_dummy_dtypes)

print(processed_train_data.head())
print(processed_test_data.head())

# Omit 'Sex_male' to avoid collinearity ('Sex_female' alone carries the same information).
# Reassign rather than drop in place so each step yields a new frame from its input.
processed_train_data = processed_train_data.drop(['Sex_male'], axis=1)
processed_test_data = processed_test_data.drop(['Sex_male'], axis=1)

print(processed_train_data.head())
print(processed_test_data.head())


def upload_to_s3(file_name, data, prefix='processed_input/', header=True):
    """Serialise a DataFrame to CSV and store it in the notebook's S3 bucket.

    The target bucket is taken from the module-level ``bucket`` variable.

    Parameters
    ----------
    file_name : str
        Object name; appended to ``prefix`` to form the S3 key.
    data : pandas.DataFrame
        Frame to upload. The index is never written.
    prefix : str, optional
        Key prefix inside the bucket. Defaults to ``'processed_input/'``.
    header : bool, optional
        Whether the column names are written as the first CSV row.
        Defaults to ``True``.

    Returns
    -------
    None
    """
    # to_csv() without a path returns the CSV text; encode it explicitly because
    # put_object documents Body as bytes (or a file-like object).
    csv_body = data.to_csv(index=False, header=header).encode('utf-8')
    boto3.resource('s3').Bucket(bucket).put_object(Key=prefix + file_name, Body=csv_body)

# NOTE(review): both files are uploaded with a header row. If they are passed directly to a
# SageMaker built-in algorithm as text/csv, a header may not be accepted - confirm against
# the downstream cells and pass header=False if so.
upload_to_s3('processed_train_data.csv', processed_train_data)
upload_to_s3('processed_test_data.csv', processed_test_data)


   Survived  Pclass     Sex
0         0       3    male
1         1       1  female
2         1       3  female
3         1       1  female
4         0       3    male
   Pclass     Sex
0       3    male
1       3  female
2       2    male
3       3    male
4       3  female
   Survived  Pclass  Sex_female  Sex_male
0         0       3           0         1
1         1       1           1         0
2         1       3           1         0
3         1       1           1         0
4         0       3           0         1
   Pclass  Sex_female  Sex_male
0       3           0         1
1       3           1         0
2       2           0         1
3       3           0         1
4       3           1         0
   Survived  Pclass  Sex_female
0         0       3           0
1         1       1           1
2         1       3           1
3         1       1           1
4         0       3           0
   Pclass  Sex_female
0       3           0
1       3           1
2       2           0
