# Test

Dette er en test til beregning af huspriser

In [644]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import boto3
import io
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
plt.ion()   # interactive mode

### Load the data

In [659]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='ikea-orders', Key='housing_train.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

Split data

In [660]:
train, validate = np.split(df.sample(frac=1), [int(.75*len(df))])

In [646]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
208,209,60,RL,,14364,Pave,,IR1,Low,AllPub,...,0,,,,0,4,2007,WD,Normal,277000
280,281,60,RL,82.0,11287,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2007,WD,Normal,228500
1451,1452,20,RL,78.0,9262,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2009,New,Partial,287090
1430,1431,60,RL,60.0,21930,Pave,,IR3,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,192140
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500


Shape of data

In [647]:
train.shape

(1095, 81)

### Feature engineering
Encode categorical features.

In [648]:
categorical_features = ["MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", 
                        "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType",
                        "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrArea","MasVnrType",
                        "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
                        "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical", 
                        "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual",
                        "GarageCond", "PavedDrive", "PoolQC", "Fence", "MiscFeature", "SaleType", "SaleCondition"]
output_feature = "SalePrice"

label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    train[cat_col] = label_encoders[cat_col].fit_transform(train[cat_col].astype(str))
    validate[cat_col] = label_encoders[cat_col].fit_transform(validate[cat_col].astype(str))    

In [649]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
208,209,60,3,,14364,1,2,0,2,0,...,0,3,4,4,0,4,2007,8,4,277000
280,281,60,3,82.0,11287,1,2,3,3,0,...,0,3,4,4,0,1,2007,8,4,228500
1451,1452,20,3,78.0,9262,1,2,3,3,0,...,0,3,4,4,0,5,2009,6,5,287090
1430,1431,60,3,60.0,21930,1,2,2,3,0,...,0,3,4,4,0,7,2006,8,4,192140
1457,1458,70,3,66.0,9042,1,2,3,3,0,...,0,3,0,2,2500,5,2010,8,4,266500


Remove Id column and remove null vals

In [650]:
train = train.dropna()
train = train.drop(columns=['Id'])
validate = validate.dropna()
validate = validate.drop(columns=['Id'])
org_prices_train = train[output_feature]
org_prices_validate = validate[output_feature]
train.shape

(839, 80)

Scale data

In [651]:
scaler = MinMaxScaler()
# Fit on training set only.
scaler.fit(train)
train_housing = scaler.transform(train)
validate_housing = scaler.transform(validate)
train_df = pd.DataFrame(train_housing, index=train.index, columns=train.columns)
validate_df = pd.DataFrame(validate_housing, index=validate.index, columns=validate.columns)
train_df.shape

(839, 80)

In [652]:
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
280,0.235294,0.75,0.208904,0.045891,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.25,1.0,0.8,0.268434
1451,0.0,0.75,0.195205,0.036418,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.363636,0.75,0.75,1.0,0.349844
1430,0.235294,0.75,0.133562,0.095678,1.0,1.0,0.666667,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.545455,0.0,1.0,0.8,0.217912
1457,0.294118,0.75,0.15411,0.035389,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.333333,1.0,0.363636,1.0,1.0,0.8,0.321235
1023,0.588235,0.75,0.075342,0.007976,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.363636,0.5,1.0,0.8,0.216328


Reduce parameters with PCA

In [603]:
train_df.drop(columns=['SalePrice'])
validate_df.drop(columns=['SalePrice'])
pca = PCA(0.95)
pca.fit(train_df)
pca_train = pca.transform(train_df)
pca_validate = pca.transform(validate_df)
train_df = pd.DataFrame(pca_train)
validate_df = pd.DataFrame(pca_validate)

def nans(df): 
    return df[df.isnull().any(axis=1)]

nans(train_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44


In [604]:
train_df.shape

(841, 45)

In [605]:
validate_df.shape

(286, 45)

In [606]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,-0.785391,0.373385,-0.487657,-0.622042,0.022302,0.175893,0.087164,-0.001736,0.166758,0.087439,...,-0.661152,0.498147,0.077116,0.395815,0.27297,-0.040347,-0.208516,0.006502,-0.028392,0.045124
1,-0.356516,-0.453984,0.109462,-0.20156,0.176195,-0.027073,0.868746,0.06272,-0.07794,-0.102422,...,-0.027847,-0.097057,-0.225212,-0.035168,-0.210541,-0.043243,-0.112855,0.174194,0.005056,-0.087006
2,0.690081,0.680488,-0.024727,0.379027,-0.257438,0.602984,-0.159207,-0.596979,0.011148,0.225569,...,-0.056321,0.019222,0.053287,-0.015544,0.008883,-0.002412,-0.09408,-0.041632,0.048577,-0.002384
3,-0.803458,0.248397,-0.227141,-0.623802,-0.036949,-0.09203,0.227493,0.08434,-0.309406,0.157788,...,-0.063003,0.01227,-0.033162,-0.024464,-0.083416,0.009556,0.109424,-0.016227,0.010969,0.066273
4,-0.427058,0.564574,-0.756511,-0.163938,0.062099,-0.01924,0.131167,0.053382,0.341875,-0.552249,...,0.091668,-0.038159,0.307979,0.071838,0.038418,-0.123034,0.030513,-0.082601,-0.188044,-0.03989


Move SalePrice to first column

In [653]:
train_df['SalePrice'] = np.log(train['SalePrice'])
train_df = train_df[ ['SalePrice'] + [ col for col in train_df.columns if col != 'SalePrice' ] ]
validate_df['SalePrice'] = np.log(validate['SalePrice'])
validate_df = validate_df[ ['SalePrice'] + [ col for col in validate_df.columns if col != 'SalePrice' ] ]
train_df.head()

Unnamed: 0,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
280,12.339291,0.235294,0.75,0.208904,0.045891,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.25,1.0,0.8
1451,12.567551,0.0,0.75,0.195205,0.036418,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.363636,0.75,0.75,1.0
1430,12.16598,0.235294,0.75,0.133562,0.095678,1.0,1.0,0.666667,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.545455,0.0,1.0,0.8
1457,12.49313,0.294118,0.75,0.15411,0.035389,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.333333,1.0,0.363636,1.0,1.0,0.8
1023,12.160029,0.588235,0.75,0.075342,0.007976,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.363636,0.5,1.0,0.8


Save training dataset

In [654]:
from io import StringIO # python3; python2: BytesIO 

csv_buffer = StringIO()
train_df.to_csv(csv_buffer, header=False, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object('ikea-orders', 'train/train.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '661D4E131D652BF7',
  'HostId': 'TXUFZfZEEi2JbO0qlooN4dnFoqv+kK1k6rdEl3WuksFTqrydn1ILsqgmbs4QTIUcgS+298JP/OM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'TXUFZfZEEi2JbO0qlooN4dnFoqv+kK1k6rdEl3WuksFTqrydn1ILsqgmbs4QTIUcgS+298JP/OM=',
   'x-amz-request-id': '661D4E131D652BF7',
   'date': 'Wed, 26 Feb 2020 16:44:43 GMT',
   'etag': '"ccf44d98fa975ad1421aed5886cb0079"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ccf44d98fa975ad1421aed5886cb0079"'}

### Train the model

In [655]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [656]:
import sagemaker
s3train = sagemaker.session.s3_input(s3_data='s3://ikea-orders/train', distribution='FullyReplicated', compression=None, content_type='text/csv', record_wrapping=None, s3_data_type='S3Prefix', input_mode=None, attribute_names=None, shuffle_config=None)

In [664]:
from sagemaker import get_execution_role

role = get_execution_role()

sess = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path='s3://ikea-orders/llout',
                                       sagemaker_session=sess)
linear.set_hyperparameters(feature_dim=train_df.shape[1] - 1,
                           predictor_type='regressor',
                           mini_batch_size=20)

linear.fit({'train' : s3train})

2020-02-26 17:00:22 Starting - Starting the training job...
2020-02-26 17:00:23 Starting - Launching requested ML instances...
2020-02-26 17:01:22 Starting - Preparing the instances for training.........
2020-02-26 17:02:49 Downloading - Downloading input data
2020-02-26 17:02:49 Training - Downloading the training image...
2020-02-26 17:03:09 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34m[02/26/2020 17:03:12 INFO 140049981327168] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_schedul

[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.12308747950123577, "sum": 0.12308747950123577, "min": 0.12308747950123577}}, "EndTime": 1582736597.682463, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1582736597.682401}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.12558896686972643, "sum": 0.12558896686972643, "min": 0.12558896686972643}}, "EndTime": 1582736597.682543, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1582736597.682531}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.12452295066379919, "sum": 0.12452295066379919, "min": 0.12452295066379919}}, "EndTime": 1582736597.682598, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1582736597.682582}
[0m
[34m#metr


2020-02-26 17:03:37 Uploading - Uploading generated training model
2020-02-26 17:03:37 Completed - Training job completed
Training seconds: 61
Billable seconds: 61


### Setup endpoint

In [665]:
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

-------------!

### Validate 

In [666]:
from sagemaker.predictor import csv_serializer, json_deserializer
import math

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [667]:
csv_buffer_val = StringIO()
val = validate_df.drop(columns=['SalePrice'])
val.to_csv(csv_buffer_val, header=False, index=False)
csv_buffer_val.seek(0)

predictions = []
for line in csv_buffer_val.readlines():  
    result = linear_predictor.predict(line)   
    predictions += [r['score'] for r in result['predictions']]

predictions = np.array(predictions)
res = np.round(np.exp(predictions))
val_array = org_prices_validate.values
diff = []
for i in range(len(val_array)):
    diff.append([val_array[i], res[i], val_array[i] - res[i]])
    
td = 0
for d in diff:
    td = td + abs(d[2])**2
print(math.sqrt(td/len(res)))
diff

42183.91136559879


[[225000, 226868.0, -1868.0],
 [165400, 201223.0, -35823.0],
 [162000, 176120.0, -14120.0],
 [258000, 218242.0, 39758.0],
 [179665, 227031.0, -47366.0],
 [228000, 256359.0, -28359.0],
 [175900, 200384.0, -24484.0],
 [150900, 168479.0, -17579.0],
 [164000, 157830.0, 6170.0],
 [142000, 156675.0, -14675.0],
 [475000, 482947.0, -7947.0],
 [147000, 163712.0, -16712.0],
 [173900, 170654.0, 3246.0],
 [93000, 94452.0, -1452.0],
 [119500, 128273.0, -8773.0],
 [146000, 157486.0, -11486.0],
 [262500, 244212.0, 18288.0],
 [153500, 145228.0, 8272.0],
 [179600, 189763.0, -10163.0],
 [239000, 210100.0, 28900.0],
 [113000, 103560.0, 9440.0],
 [265900, 241532.0, 24368.0],
 [174000, 163206.0, 10794.0],
 [171000, 176460.0, -5460.0],
 [151000, 163107.0, -12107.0],
 [137500, 115657.0, 21843.0],
 [226000, 221757.0, 4243.0],
 [320000, 321810.0, -1810.0],
 [66500, 110526.0, -44026.0],
 [163000, 179767.0, -16767.0],
 [155000, 147809.0, 7191.0],
 [241500, 213747.0, 27753.0],
 [210000, 217568.0, -7568.0],
 [1460

In [668]:
obj = s3.get_object(Bucket='ikea-orders', Key='housing_test.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [669]:
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col].astype(str))

In [670]:
df = df.dropna()
df = df.drop(columns=['Id'])
scaler.fit(df)
test_housing = scaler.transform(df)
df = pd.DataFrame(test_housing, index=df.index, columns=df.columns)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.0,0.4,0.329609,0.200792,1.0,1.0,1.0,1.0,0.0,1.0,...,0.208333,0.0,1.0,0.5,1.0,0.0,0.454545,1.0,0.888889,0.8
1,0.0,0.6,0.335196,0.253179,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.735294,0.454545,1.0,0.888889,0.8
2,0.235294,0.6,0.296089,0.244524,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.5,1.0,0.0,0.181818,1.0,0.888889,0.8
3,0.235294,0.6,0.318436,0.168231,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.454545,1.0,0.888889,0.8
4,0.588235,0.6,0.122905,0.069737,1.0,1.0,0.0,0.333333,0.0,1.0,...,0.25,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.888889,0.8


In [671]:
csv_buffer2 = StringIO()
df.to_csv(csv_buffer2, header=False, index=False)
csv_buffer2.seek(0)

predictions = []
for line in csv_buffer2.readlines():
    result = linear_predictor.predict(line)   
    predictions += [r['score'] for r in result['predictions']]

predictions = np.array(predictions)

Scale back up

In [672]:
np.round(np.exp(predictions))

array([125212., 154095., 182045., ...,  97428., 163151., 233645.])

## Delete endpoint

In [643]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)