In [12]:
import numpy as np
import boto3
import sagemaker
import io
import sagemaker.amazon.common as smac
import os
import pandas as pd

# Load the source CSV from S3 into a pandas DataFrame.
# Place the CSV file named by `object_key` in a bucket in your own account
# and set `bucket_name` accordingly.
from io import StringIO

bucket_name = 'machine-learning-exam'
object_key = 'Real_estate_valuation_data_set.csv'

s3 = boto3.resource('s3')
csv_obj = s3.Object(bucket_name, object_key)

# Read the whole object body into memory and decode it as UTF-8 text.
csv_response = csv_obj.get()
csv_bytes = csv_response['Body'].read()
csv_string = csv_bytes.decode('utf-8')

# Parse the CSV text; the trailing expression shows the first rows as the cell output.
dataset = pd.read_csv(StringIO(csv_string))
dataset.head()



In [None]:
# Randomize the data and split it between train and test datasets on a 70% / 30% split respectively.
# The shuffle is seeded (random_state=1729) so the split is reproducible across runs.
shuffled_dataset = dataset.sample(frac=1, random_state=1729)
train_row_count = int(0.7 * len(dataset))

# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
train_data = shuffled_dataset.iloc[:train_row_count]
test_data = shuffled_dataset.iloc[train_row_count:]

# Every row must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == len(dataset)
print(train_data.shape, test_data.shape)

In [None]:
# Determine the features and labels.
# They are taken from `train_data` (the 70% training split), not from the full
# `dataset`; otherwise the held-out test rows would also be uploaded as
# training data and the split above would have no effect.
feature_columns = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
                   'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
label_column = 'Y house price of unit area'

feature_dataset = train_data[feature_columns]
features = np.array(feature_dataset.values).astype('float32')

# Selecting with a one-element list keeps the label as a single-column frame,
# so `labels` is 2-D with one column.
label_dataset = train_data[[label_column]]
labels = np.array(label_dataset.values).astype('float32')

# Flatten to a 1-D vector. reshape(-1) is used rather than np.squeeze because
# squeeze would collapse a single-row input to a 0-d array.
labels_vec = labels.reshape(-1)

In [14]:
# `bucket` and `prefix` are not defined by any visible cell, so this cell fails
# with NameError on a fresh kernel. The values below reproduce the S3 location
# shown in the recorded output of the output-location cell
# (s3://machine-learning-exam/sagemaker/xgboost-regression/output).
bucket = bucket_name
prefix = 'sagemaker/xgboost-regression'

# S3 object keys always use '/' as the separator, so the key is built with
# explicit formatting rather than os.path.join (which is platform dependent).
key = 'linearregression'
train_object_key = '{}/train/{}'.format(prefix, key)

# Serialize the feature matrix and label vector into an in-memory buffer,
# then rewind it so the upload starts from the first byte.
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, features, labels_vec)
buffer.seek(0)

boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buffer)
s3_training_data_location = 's3://{}/{}'.format(bucket, train_object_key)

In [17]:
# S3 location where the training job writes its model artifacts.
output_location_template = 's3://{}/{}/output'
output_location = output_location_template.format(bucket, prefix)

output_message = 'training artifacts will be uploaded to: {}'.format(output_location)
print(output_message)

training artifacts will be uploaded to: s3://machine-learning-exam/sagemaker/xgboost-regression/output


In [15]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'linear-learner' container image for the region of the current boto3 session.
current_region = boto3.Session().region_name
linear_container = get_image_uri(current_region, 'linear-learner')

In [None]:
from sagemaker import get_execution_role

# Execution role and SageMaker session used to launch the training job.
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# Estimator configuration: role, instance count/type and the S3 location for
# model output. The container image is passed positionally, as before.
estimator_settings = dict(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker_session,
)
linear = sagemaker.estimator.Estimator(linear_container, **estimator_settings)

# feature_dim must equal the number of feature columns selected during data
# preparation (six); predictor_type selects a regression model.
linear_hyperparameters = dict(
    feature_dim=6,
    mini_batch_size=4,
    predictor_type='regressor',
)
linear.set_hyperparameters(**linear_hyperparameters)

# Start training on the data uploaded to S3. Only a 'train' channel is
# supplied here; no validation channel is passed to fit().
training_channels = {'train': s3_training_data_location}
linear.fit(training_channels)


2020-02-14 03:27:55 Starting - Starting the training job...
2020-02-14 03:27:56 Starting - Launching requested ML instances......
2020-02-14 03:29:03 Starting - Preparing the instances for training......
2020-02-14 03:30:19 Downloading - Downloading input data..