# Test

Dette er en test til beregning af huspriser

In [7]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import boto3
import io
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
plt.ion()   # interactive mode

Load the data

In [8]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='ikea-orders', Key='housing_train.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [9]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Shape of data

In [13]:
df.shape

(1460, 81)

Encode categorical features.

In [14]:
categorical_features = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", 
                        "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType",
                        "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrArea","MasVnrType",
                        "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
                        "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical", 
                        "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
                        "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]
output_feature = "SalePrice"

label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col].astype(str))


In [15]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000


Remove Id column and remove null vals

In [16]:
df = df.dropna()
df = df.drop(columns=['Id'])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,2,3,3,0,4,...,0,3,4,4,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,2,3,3,0,2,...,0,3,4,4,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,2,0,3,0,4,...,0,3,4,4,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,2,0,3,0,0,...,0,3,4,4,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,2,0,3,0,2,...,0,3,4,4,0,12,2008,8,4,250000


Scale data

In [33]:
scaler = MinMaxScaler()
# Fit on training set only.
scaler.fit(df)
train_housing = scaler.transform(df)
df2 = pd.DataFrame(train_housing, index=df.index, columns=df.columns)

In [34]:
df2.head()

Unnamed: 0,SaleCondition,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType
0,0.8,0.240644,0.235294,0.75,0.150685,0.03342,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.090909,0.5,1.0
1,0.8,0.203128,0.0,0.75,0.202055,0.038795,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.363636,0.25,1.0
2,0.8,0.261487,0.235294,0.75,0.160959,0.046507,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.727273,0.5,1.0
3,0.0,0.145464,0.294118,0.75,0.133562,0.038561,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.090909,0.0,1.0
4,0.8,0.298308,0.235294,0.75,0.215753,0.060576,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.5,1.0


Move SalePrice to first column

In [36]:
df2 = df2[ ['SalePrice'] + [ col for col in df2.columns if col != 'SalePrice' ] ]
df2.head()

Unnamed: 0,SalePrice,SaleCondition,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType
0,0.240644,0.8,0.235294,0.75,0.150685,0.03342,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.090909,0.5,1.0
1,0.203128,0.8,0.0,0.75,0.202055,0.038795,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.363636,0.25,1.0
2,0.261487,0.8,0.235294,0.75,0.160959,0.046507,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.727273,0.5,1.0
3,0.145464,0.0,0.294118,0.75,0.133562,0.038561,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.090909,0.0,1.0
4,0.298308,0.8,0.235294,0.75,0.215753,0.060576,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.5,1.0


Save training dataset

In [37]:
from io import StringIO # python3; python2: BytesIO 

csv_buffer = StringIO()
df.to_csv(csv_buffer, header=False, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object('ikea-orders', 'train/df2.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'B7784D46016F3EDA',
  'HostId': 'ZCU5xh9R7L29RtLQEDE6cwFnN+Bt4jOYyu7kg72OFkPFyZpdeN2tSp3PkznEUBqPW9FOnmgNFMM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'ZCU5xh9R7L29RtLQEDE6cwFnN+Bt4jOYyu7kg72OFkPFyZpdeN2tSp3PkznEUBqPW9FOnmgNFMM=',
   'x-amz-request-id': 'B7784D46016F3EDA',
   'date': 'Wed, 26 Feb 2020 09:35:32 GMT',
   'etag': '"f49d6e73a5d317f652a436d7407a335d"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"f49d6e73a5d317f652a436d7407a335d"'}

Reduce parameters with PCA

In [38]:
pca = PCA(n_components=10)
pca.fit_transform(df2.dropna())
df3 = pd.DataFrame(pca.components_,columns=df2.columns)

In [39]:
df3.shape

(10, 80)

In [97]:
df3

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,-0.006665,-5.6e-05,-0.054937,0.026487,0.006726,0.003679,0.051802,-0.260226,0.018278,-3.936235e-21,...,0.002918,-0.002655,0.092903,0.013613,-0.000919,0.018311,-0.010788,-0.012915,0.073591,0.112322
1,0.048827,0.410257,-0.032531,-0.072163,-0.022601,0.003193,-0.029648,0.555587,0.080253,-3.8932139999999994e-19,...,-0.004327,0.005642,0.070158,0.009397,-0.000914,-0.022594,0.033824,-0.015689,0.003562,-0.026229
2,-0.021946,-0.003658,0.000178,-0.009838,-0.005872,0.000632,-0.156585,0.223877,-0.02783,8.825573e-19,...,-0.008562,0.005301,0.100182,0.002063,-0.000241,0.024988,-0.018109,-0.005207,0.036339,0.00903
3,-0.016408,0.47161,0.018562,-0.018101,0.002005,-0.000821,-0.05636,-0.416665,-0.083519,1.305136e-18,...,0.008137,-0.009978,-0.001177,0.013407,-0.00079,0.030409,-0.081241,0.008678,-0.02156,0.022118
4,-0.06012,0.02173,-0.011153,0.029869,0.000368,0.000409,0.030803,0.460683,0.116935,2.282466e-18,...,0.009045,-0.009348,-0.115436,-0.006335,0.0008,-0.085284,0.293259,-0.011821,-0.018679,0.02643
5,0.011073,-0.084076,0.026713,0.057614,0.023649,-0.003815,-0.001994,0.291275,-0.069362,-5.300608e-18,...,0.017246,-0.014983,-0.099889,-0.00101,0.000951,0.067854,-0.070887,-0.004889,0.046452,0.110133
6,0.050353,0.110775,0.015881,-0.043631,-0.001769,-0.00887,-0.063609,-0.211713,-0.106796,-1.6577809999999997e-19,...,-0.008136,0.008286,0.053758,-0.009549,0.001014,-0.11419,0.581617,0.026537,-0.000882,0.011601
7,-0.068742,0.119702,0.038751,-0.037809,-0.006987,-0.003061,-0.054587,0.053889,-0.024636,1.957361e-17,...,-0.002087,0.001031,0.118048,0.008142,-0.001462,0.118984,-0.60521,-0.062148,-0.003828,-0.007602
8,-0.06027,-0.039047,-0.018286,0.00263,-0.001697,-0.014531,0.041463,-0.076971,-0.038206,-1.1994869999999998e-19,...,-0.01357,0.013816,0.139564,0.010561,-0.002162,0.004951,0.229454,-0.122121,0.021337,0.001404
9,-0.128266,0.060331,0.045085,-0.017539,-0.010585,0.020538,0.002578,-0.141107,0.121601,3.0419310000000004e-17,...,-0.000498,-0.00099,-0.209856,-0.00125,0.001875,0.001313,0.045212,0.015354,-0.065899,-0.034612


Train the model

In [40]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [42]:
import sagemaker
s3train = sagemaker.session.s3_input(s3_data='s3://ikea-orders/train', distribution='FullyReplicated', compression=None, content_type='text/csv', record_wrapping=None, s3_data_type='S3Prefix', input_mode=None, attribute_names=None, shuffle_config=None)

In [43]:
from sagemaker import get_execution_role

role = get_execution_role()

sess = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path='s3://ikea-orders/llout',
                                       sagemaker_session=sess)
linear.set_hyperparameters(feature_dim=79,
                           predictor_type='regressor',
                           mini_batch_size=200)

linear.fit({'train' : s3train})

2020-02-26 09:39:06 Starting - Starting the training job...
2020-02-26 09:39:08 Starting - Launching requested ML instances...
2020-02-26 09:40:05 Starting - Preparing the instances for training.........
2020-02-26 09:41:32 Downloading - Downloading input data
2020-02-26 09:41:32 Training - Downloading the training image...
2020-02-26 09:42:02 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34m[02/26/2020 09:41:55 INFO 140423649015616] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'


2020-02-26 09:42:09 Completed - Training job completed
Training seconds: 60
Billable seconds: 60


Setup endpoint

In [None]:
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

------------

Validate 

In [111]:
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [162]:
test1 = '20,3,80.0,9600,1,2,3,3,0,2,0,24,1,2,0,2,6,8,1976,1976,1,1,8,8,2,0,3,4,1,2,3,1,0,978,5,0,284,1262,1,0,1,4,1262,0,0,1262,0,1,2,0,3,1,3,6,6,1,4,1,1976.0,1,2,460,4,4,2,298,0,0,0,0,0,3,4,4,0,5,2007,8,4'
test2 = '190,3,50.0,7420,1,2,3,3,0,0,0,3,0,0,1,1,5,6,1939,1950,1,1,8,8,2,0,3,4,0,3,3,3,2,851,5,0,140,991,1,0,1,4,1077,0,0,1077,1,0,1,0,2,2,3,5,6,2,4,1,1939.0,1,1,205,2,4,2,0,4,0,0,0,0,3,4,4,0,1,2008,8,4'
test3 = '20,2,80.0,11622,1,2,3,3,0,4,0,12,1,2,0,2,5,6,1961,1961,1,0,10,12,2,0,3,4,1,3,3,3,4,468.0,3,144.0,270.0,882.0,0,4,1,3,896,0,0,896,0.0,0.0,1,0,2,1,3,5,6,0,5,1,1961.0,2,1.0,730.0,3,4,2,140,0,0,0,120,0,2,2,3,0,6,2010,8,4'
result = linear_predictor.predict(test3)
print(result)

{'predictions': [{'score': 123734.125}]}


In [126]:
obj = s3.get_object(Bucket='ikea-orders', Key='housing_test.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [136]:
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col].astype(str))

In [137]:
df = df.dropna()
df = df.drop(columns=['Id'])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,2,80.0,11622,1,2,3,3,0,4,...,120,0,2,2,3,0,6,2010,8,4
1,20,3,81.0,14267,1,2,0,3,0,0,...,0,0,2,4,0,12500,6,2010,8,4
2,60,3,74.0,13830,1,2,0,3,0,4,...,0,0,2,2,3,0,3,2010,8,4
3,60,3,78.0,9978,1,2,0,3,0,4,...,0,0,2,4,3,0,6,2010,8,4
4,120,3,43.0,5005,1,2,0,1,0,4,...,144,0,2,4,3,0,1,2010,8,4


In [164]:
csv_buffer2 = StringIO()
df.to_csv(csv_buffer2, header=False, index=False)
csv_buffer2.seek(0)

predictions = []
for line in csv_buffer2.readlines():
    result = linear_predictor.predict(line)
    predictions += [r['score'] for r in result['predictions']]

predictions = np.array(predictions)

In [165]:
predictions

array([123734.125,  69217.75 , 172103.125, ...,  64796.375, 176687.125,
       237871.   ])

## Delete endpoint

In [166]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)