## Predict House Price in Melbourne
ML practice based on Kaggle Course:
    Intermediate Machine Learning Tutorials

Data source: https://www.kaggle.com/dansbecker/melbourne-housing-snapshot/home



In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


## Step 0. Prepare Training and Validation data

In [2]:
# Load the data
data = pd.read_csv('melb_data.csv')

In [3]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# Select the target and predictors
y = data.Price
X = data.drop(['Price'], axis=1)

In [5]:
# Split data into Training and Validation subsets
from sklearn.model_selection import train_test_split

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size = 0.8,
                                                test_size = 0.2,random_state=0)
X_train_full.shape

(10864, 20)

In [6]:
# Select categorical columns with relatively low cardinality 
# (the number of unique values in a column)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype =='object']

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64','float64']]
          
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [36]:
X_train.head(10)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0
547,h,S,Southern Metropolitan,5,9.7,3103.0,5.0,2.0,2.0,611.0,,,-37.8116,145.0789,5682.0
4655,h,SP,Northern Metropolitan,4,9.9,3044.0,4.0,2.0,2.0,250.0,194.0,1983.0,-37.7319,144.9461,7485.0
6082,h,S,Western Metropolitan,3,13.5,3020.0,3.0,1.0,4.0,700.0,,,-37.7845,144.8131,6763.0
5519,h,S,Western Metropolitan,3,6.6,3011.0,3.0,1.0,1.0,283.0,,1940.0,-37.8064,144.8944,2417.0
8571,h,S,Western Metropolitan,4,13.8,3018.0,3.0,1.0,2.0,725.0,,,-37.8678,144.816,5301.0


In [8]:
X_train.shape

(10864, 15)

## Step 1. Define Preprocessing Steps


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical data: 'constant' - replace missing values with fill_value
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical data: missing values with most_frequent
#                 : oneHotEncoder 
categorical_transformer = Pipeline(steps =[
          ('imputer', SimpleImputer(strategy='most_frequent')),
          ('onehot',OneHotEncoder(handle_unknown='ignore'))
        ])

# Combine Numerical and Categorical data
preprocessor = ColumnTransformer(
    transformers = [
          ('num', numerical_transformer, numerical_cols),
          ('cat', categorical_transformer, categorical_cols)       
        ])

## Step 2. Define the Model

In [10]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators = 100, random_state = 0)

## Step 3. Create and Evalute the Pipline

In [11]:
from sklearn.metrics import mean_absolute_error

# Combine preprossing and modeling code in a pipeline
my_pipeline = Pipeline(steps = [
                      ('preprocessor', preprocessor),
                      ('model',model)
        ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# preprocessing of validadtion data, predict
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE: ', score)

MAE:  160679.18917034855


## Step 3-1. Cross-Validation

In [18]:
from sklearn.model_selection import cross_val_score

# Multipy by -1 since sklearn calculates "Negative" MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv = 10,
                              scoring = 'neg_mean_absolute_error')

print("MAE score:\n", scores)

MAE score:
 [221040.50693983 187170.03680483 218804.12689985 165889.626255
 193403.21301915 177767.07567747 151259.33058665 146150.184089
 142483.58600533 168060.53515709]


In [19]:
scores.mean()

177202.8221434182

## Step 3-2. XGBoost

In [34]:
from xgboost import XGBRegressor
my_model = XGBRegressor(n_estimators=1000,n_jobs=4)

my_pipeline1 = Pipeline(steps = [
                ('preprocessor', preprocessor),
                ('model',my_model)
        ])

my_pipeline1.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
 

In [35]:
predictions = my_pipeline1.predict(X_valid)
print(mean_absolute_error(predictions, y_valid))

160588.77314409977
