___
# Import Library
___

In [110]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

import category_encoders as ce

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, RobustScaler, \
PowerTransformer, FunctionTransformer

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE, SelectPercentile
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

from sklearn.metrics import mean_squared_error

from dotenv import load_dotenv, find_dotenv, set_key
import sqlalchemy as db
import os
import pickle

In [72]:
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [73]:
load_dotenv()

True

In [74]:
RANDOM_STATE = 202102
SCORING = 'neg_root_mean_squared_error'

___
# Business Problems
___

Who doesn't like to travel? Traveling is a such fun activity, coming into new place and experience many new things. It is a part of crucial aspect in life as it's the best way to escape from hectic and busy schedule. It also improves the mental and physical health while it's a good remedy for stress, anxiety and depression. As part of travelling, deciding accomodation or where you should sleep and charge your body is definitely important thing to decide. One of the choice to spend your night, apart from conventional hotel is Airbnb.

Airbnb is an American vacation rental online marketplace company based in San Francisco, California. Airbnb maintains and hosts a marketplace, accessible to consumers on its website or via an app. Through the service, users can arrange lodging, primarily homestays, and tourism experiences or list their properties for rental. Airbnb does not own any of the listed properties; instead, it profits by receiving commission from each booking. 

Singapore, one of developed country in Asia, make 4.1% of its national GDP from tourism industry alone and has been one of the most visited country in Asia resulting in thousand of Airbnb listings in Singapore (around 4000+ listings). It can be troublesome to choose one befitting your needs and budget. This also becomes problem for owner property who wants to register their new property in Airbnb since they may find it hard to price their property.

Here's where machine learning help to solve those problems. Predicting price of property, or in this case Airbnb listing, could be quite challenging since there are various factors need to be measured and calculated in order to get price accurately. In this project, we will do end-to-end machine learning project, starting from cleaning the dataset, do exploratory data analysis to get some insights, use machine learning model to predict Airbnb listing price and deploy our best model by creating dashboard.   

___
# Load Dataset
___

In [75]:
engine = db.create_engine(os.getenv('db-uri'))

In [76]:
meta = db.MetaData()
meta.reflect(engine)

In [77]:
with engine.connect() as con:
    query = db.select([meta.tables['listings']])
    result = con.execute(query).fetchall()
df_raw = pd.DataFrame(result, columns=meta.tables['listings'].c.keys())

In [78]:
listings_dropped_columns = os.getenv('listings_dropped_columns')
listings_dropped_columns

"['id', 'listing_url', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count', 'availability_60', 'availability_90', 'availability_365', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'host_response_time', 'has_availability', 'h

In [79]:
df = df_raw.drop(eval(listings_dropped_columns), axis=1)
display(df)

Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,availability_30,instant_bookable,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,total_bathrooms,bathrooms_type,Air Conditioning,BBQ Utensils,Baby and Children Equipments,Bathtub,Beach Essentials,Breakfast,Building Staff,Cleaning Before Checkout,Cleaning Equipments,Clothing Equipments,Coffee Maker,Cutlery,Dedicated Workspace,Door Lock,Dryer,EV Charger,Elevator,Entertainment,Essentials,Ethernet Connection,"Extra pillows, blankets or bed-linen",Fan,Fire Safety Equipments,Fireplace,First Aid Kit,Free Parking,Gym,Heating,Host Greets You,Hot Tub,Hot Water,Kitchen Utensils,Kitchen/Dining Area,Laundromat Nearby,Living Room,Lockbox,Long Term Stays Allowed,Luggage Dropoff Allowed,Outdoor Space,Paid Parking,Pool,Private Entrance,Refrigerator/Freezer,Sauna,Single level Home,Ski-in/Ski-out,TV,Toilet Equipments,Washer,Water Body Access
0,Woodlands,North Region,1.44255,103.79580,Private room in apartment,Private room,1,1.0,1.0,79,180,360,30,False,0,2,0,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,Bukit Timah,Central Region,1.33235,103.78521,Private room in apartment,Private room,2,1.0,1.0,80,90,730,30,False,0,1,0,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
2,Woodlands,North Region,1.44246,103.79667,Private room in apartment,Private room,1,1.0,1.0,66,6,14,30,False,0,2,0,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,Tampines,East Region,1.34541,103.95712,Private room in villa,Private room,6,2.0,3.0,174,90,1125,30,False,0,8,0,1.0,private bath,1,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,1,0
4,Tampines,East Region,1.34567,103.95963,Private room in house,Private room,3,1.0,1.0,93,90,1125,30,False,0,8,0,1.0,shared half-bath,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4367,Bukit Timah,Central Region,1.32414,103.80956,Private room in condominium,Private room,2,,1.0,25,10,30,0,True,0,2,0,1.0,shared bath,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4368,Marine Parade,Central Region,1.30862,103.90297,Private room in house,Private room,1,1.0,1.0,57,90,182,29,True,0,6,0,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
4369,Geylang,Central Region,1.31044,103.90275,Private room in house,Private room,4,1.0,1.0,62,90,182,29,True,0,6,0,1.0,bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
4370,Marine Parade,Central Region,1.30889,103.90296,Private room in house,Private room,1,1.0,1.0,47,90,182,29,True,0,6,0,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


___
# Data Splitting
___

In [80]:
X = df.drop(['price'], axis=1).fillna(np.nan)
y = df.price

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)

In [82]:
for x in [X_train, X_test, y_train, y_test]:
    print(x.shape)

(3497, 68)
(875, 68)
(3497,)
(875,)


In [83]:
list_column_input = X_train.columns.tolist()
if os.getenv('list_column_input') != list_column_input:
    set_key(find_dotenv(), 'list_column_input', str(list_column_input))

___
# Choose Evaluation Metric
___

In this project, one of our goal is to predict Airbnb listing as accurately as possible (regression problem). Hence, I choose RMSE (Root Mean Squared Error) as evaluation metric since it shows how far our set of predictions to the actual price

___
# Data Transformer
___

In [84]:
encode_df = pd.DataFrame()
encode_df['Unique Values'] = df.apply(pd.unique)
encode_df['Count'] = encode_df['Unique Values'].apply(len)
encode_df['Data Type'] = [df[t].dtype for t in encode_df.index]
encode_df['%Missing'] = df.isnull().sum()/len(df)*100
encode_df.sort_values('Count')

Unnamed: 0,Unique Values,Count,Data Type,%Missing
EV Charger,"[0, 1]",2,int64,0.0
Entertainment,"[0, 1]",2,int64,0.0
Essentials,"[0, 1]",2,int64,0.0
Ethernet Connection,"[1, 0]",2,int64,0.0
"Extra pillows, blankets or bed-linen","[0, 1]",2,int64,0.0
Fan,"[0, 1]",2,int64,0.0
Fire Safety Equipments,"[0, 1]",2,int64,0.0
Fireplace,"[0, 1]",2,int64,0.0
First Aid Kit,"[0, 1]",2,int64,0.0
Free Parking,"[0, 1]",2,int64,0.0


In [85]:
missing_numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan))
])

In [86]:
missing_onehot_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('onehot', OneHotEncoder(drop='first'))
])

In [87]:
scaled_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('pf', PowerTransformer(method='yeo-johnson'))
])

In [88]:
transformer = ColumnTransformer([
    ('missing_numerical', missing_numerical_pipeline, ['bedrooms', 'total_bathrooms']),
    ('missing_onehot', missing_onehot_pipeline, ['bathrooms_type']), 
    ('nonmissing_onehot', OneHotEncoder(drop='first'), ['instant_bookable', 
                                                        'room_type', 'neighbourhood_group_cleansed']),
    ('nonmissing_binary', ce.BinaryEncoder(drop_invariant=True), ['neighbourhood_cleansed', 
                                                                  'property_type']),
    ('scaled_cols', scaled_pipeline, ['beds', 'minimum_nights'])
], remainder='passthrough')

___
# Model Building
___

In [89]:
model_result = dict()

In [90]:
kf = KFold(shuffle=True, random_state=RANDOM_STATE)

In [91]:
ridge_params = {
    'regressor__regressor__alpha': np.geomspace(0.001, 1, 4), 
    'regressor__regressor__solver': ['auto', 'svd', 'cholesky', 'lsqr', 
                                     'sparse_cg', 'sag', 'saga'],
}

In [92]:
lasso_params = {
    'regressor__regressor__alpha': np.geomspace(0.001, 1, 4), 
    'regressor__regressor__selection': ['cyclic', 'random']
}

In [93]:
elastic_params = {
    'regressor__regressor__alpha': np.geomspace(0.001, 1, 4), 
    'regressor__regressor__l1_ratio': np.linspace(0.1, 0.5, 5),
    'regressor__regressor__selection': ['cyclic', 'random']
}

In [94]:
dtr_params = {
    'regressor__regressor__criterion': ['mse', 'mae'], 
    'regressor__regressor__max_depth': [5, 10, 20, 40, 50],
    'regressor__regressor__min_samples_split': [2, 5, 10, 20],
    'regressor__regressor__min_samples_leaf': [1, 2, 5, 10],
    'regressor__regressor__max_features': ['auto', 'sqrt']
}

In [95]:
knn_params = {
    'regressor__regressor__n_neighbors': [5, 10, 15, 20],
    'regressor__regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'regressor__regressor__weights': ['uniform', 'distance'],
    'regressor__regressor__p': [1 ,2]
}

In [96]:
rfr_params = {
    'regressor__regressor__criterion': ['mse', 'mae'], 
    'regressor__regressor__max_depth': [5, 10, 20, 40, 50],
    'regressor__regressor__min_samples_split': [2, 5, 10, 20],
    'regressor__regressor__min_samples_leaf': [1, 2, 5, 10],
    'regressor__regressor__max_features': ['auto', 'sqrt']
}

In [97]:
xgb_params = {
    'regressor__regressor__n_estimators': [10, 20, 50, 100],
    'regressor__regressor__max_depth': [2, 5, 10, 20],
    'regressor__regressor__learning_rate': np.geomspace(0.001, 1, 4)
}

In [98]:
lgb_params = {
    'regressor__regressor__boosting_type': ['gbdt', 'dart', 'goss'],
    'regressor__regressor__num_leaves': [11, 21, 31, 51],
    'regressor__regressor__max_depth': [-1, 5, 10, 20],
    'regressor__regressor__learning_rate': np.geomspace(0.001, 1, 4),
    'regressor__regressor__n_estimators': [10, 20, 50, 100],
}

In [99]:
model_pool = [
    ('Ridge', Ridge(random_state=RANDOM_STATE, max_iter=100000), ridge_params),
    ('Lasso', Lasso(random_state=RANDOM_STATE), lasso_params),
    ('Elastic Net', ElasticNet(random_state=RANDOM_STATE), elastic_params),
    ('Decision Tree', DecisionTreeRegressor(random_state=RANDOM_STATE), dtr_params),
    ('KNN', KNeighborsRegressor(), knn_params),
    ('Random Forest', RandomForestRegressor(random_state=RANDOM_STATE), rfr_params),
    ('XGB', XGBRegressor(random_state=RANDOM_STATE), xgb_params),
    ('LightGBM', LGBMRegressor(random_state=RANDOM_STATE))
]

In [100]:
def model_test(model_pool):
    for name, model, params in model_pool:
        benchmark_noscale_pipe = Pipeline([
            ('transformer', transformer),
            ('regressor', model)
        ])
        
        regr_noscale = TransformedTargetRegressor(regressor=benchmark_noscale_pipe, 
                                                  transformer=PowerTransformer(method='box-cox'))
        
        benchmark_scale_pipe = Pipeline([
            ('transformer', transformer),
            ('scaler', RobustScaler()),
            ('regressor', model)
        ])
        
        regr_scale = TransformedTargetRegressor(regressor=benchmark_scale_pipe, 
                                                transformer=PowerTransformer(method='box-cox'))
        
        # Benchmark Model on Cross Val
        print(f'Fitting {name} Benchmark No Scale')
        benchmark_noscale_scores = cross_val_score(regr_noscale, X_train, y_train, scoring=SCORING, 
                                                   cv=kf, n_jobs=-1, verbose=10)
        print(f'Fitting {name} Benchmark Scale')
        benchmark_scale_scores = cross_val_score(regr_scale, X_train, y_train, scoring=SCORING, 
                                                 cv=kf, n_jobs=-1, verbose=10)
        
        # Tuned Model on Cross Val
        grid_noscale = RandomizedSearchCV(regr_noscale, params, n_iter=50, scoring=SCORING, n_jobs=-1, 
                                          cv=kf, verbose=10, random_state=RANDOM_STATE)
        grid_scale = RandomizedSearchCV(regr_scale, params, n_iter=50, scoring=SCORING, n_jobs=-1, 
                                        cv=kf, verbose=10, random_state=RANDOM_STATE)
        
        print(f'Fitting {name} Tuned No Scale')
        grid_noscale.fit(X_train, y_train)
        print(f'Fitting {name} Tuned Scale')
        grid_scale.fit(X_train, y_train)
        
        model_result[name] = {'Benchmark_NoScale': abs(benchmark_noscale_scores.mean()), 
                              'Benchmark_Scale': abs(benchmark_scale_scores.mean()), 
                              'Tuned_NoScale': abs(grid_noscale.best_score_), 
                              'Tuned_Scale': abs(grid_scale.best_score_), 
                              'Raw_Benchmark_NoScale': benchmark_noscale_scores, 
                              'Raw_Benchmark_Scale': benchmark_scale_scores, 
                              'Raw_Tuned_NoScale': grid_noscale.cv_results_, 
                              'Raw_Tuned_Scale': grid_scale.cv_results_}
        
        # Save Model
        for c, m in [('Benchmark_NoScale', regr_noscale), ('Benchmark_Scale', regr_scale), 
                     ('Tuned_NoScale', grid_noscale.best_estimator_), 
                     ('Tuned_Scale', grid_scale.best_estimator_)]:
            filename = f'{name}_{c}.pkl'
            with open('Models/'+filename, 'wb') as f:
                try:
                    pickle.dump(m, f)
                    print(f'Success saving {filename}')
                except:
                    print(f'Error saving {filename}')
        

___
# Model Cross Validation
___

In [101]:
model_test(model_pool)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting Ridge Benchmark No Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   39.7s remaining:   59.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   39.7s remaining:   26.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting Ridge Benchmark Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Fitting Ridge Tuned No Scale
Fitting 5 folds for each of 28 candidates, totalling 140 fits


  elif pd.api.types.is_categorical(cols):


Fitting Ridge Tuned Scale
Fitting 5 folds for each of 28 candidates, totalling 140 fits


  elif pd.api.types.is_categorical(cols):


Success saving Ridge_Benchmark_NoScale.pkl
Success saving Ridge_Benchmark_Scale.pkl
Success saving Ridge_Tuned_NoScale.pkl
Success saving Ridge_Tuned_Scale.pkl
Fitting Lasso Benchmark No Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting Lasso Benchmark Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Fitting Lasso Tuned No Scale
Fitting 5 folds for each of 8 candidates, totalling 40 fits


  elif pd.api.types.is_categorical(cols):


Fitting Lasso Tuned Scale
Fitting 5 folds for each of 8 candidates, totalling 40 fits


  elif pd.api.types.is_categorical(cols):


Success saving Lasso_Benchmark_NoScale.pkl
Success saving Lasso_Benchmark_Scale.pkl
Success saving Lasso_Tuned_NoScale.pkl
Success saving Lasso_Tuned_Scale.pkl
Fitting Elastic Net Benchmark No Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished


Fitting Elastic Net Benchmark Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


Fitting Elastic Net Tuned No Scale
Fitting 5 folds for each of 40 candidates, totalling 200 fits


  elif pd.api.types.is_categorical(cols):


Fitting Elastic Net Tuned Scale
Fitting 5 folds for each of 40 candidates, totalling 200 fits


  elif pd.api.types.is_categorical(cols):
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Success saving Elastic Net_Benchmark_NoScale.pkl
Success saving Elastic Net_Benchmark_Scale.pkl
Success saving Elastic Net_Tuned_NoScale.pkl
Success saving Elastic Net_Tuned_Scale.pkl
Fitting Decision Tree Benchmark No Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting Decision Tree Benchmark Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


Fitting Decision Tree Tuned No Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Fitting Decision Tree Tuned Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Success saving Decision Tree_Benchmark_NoScale.pkl
Success saving Decision Tree_Benchmark_Scale.pkl
Success saving Decision Tree_Tuned_NoScale.pkl
Success saving Decision Tree_Tuned_Scale.pkl
Fitting KNN Benchmark No Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting KNN Benchmark Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


Fitting KNN Tuned No Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Fitting KNN Tuned Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Success saving KNN_Benchmark_NoScale.pkl
Success saving KNN_Benchmark_Scale.pkl
Success saving KNN_Tuned_NoScale.pkl
Success saving KNN_Tuned_Scale.pkl
Fitting Random Forest Benchmark No Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.8s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.8s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting Random Forest Benchmark Scale


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.5s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.6s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.0s finished


Fitting Random Forest Tuned No Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Fitting Random Forest Tuned Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Success saving Random Forest_Benchmark_NoScale.pkl
Success saving Random Forest_Benchmark_Scale.pkl
Success saving Random Forest_Tuned_NoScale.pkl
Success saving Random Forest_Tuned_Scale.pkl
Fitting XGB Benchmark No Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.0s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.0s finished


Fitting XGB Benchmark Scale


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.9s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.0s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.0s finished


Fitting XGB Tuned No Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Fitting XGB Tuned Scale
Fitting 5 folds for each of 50 candidates, totalling 250 fits


  elif pd.api.types.is_categorical(cols):


Success saving XGB_Benchmark_NoScale.pkl
Success saving XGB_Benchmark_Scale.pkl
Success saving XGB_Tuned_NoScale.pkl
Success saving XGB_Tuned_Scale.pkl


ValueError: not enough values to unpack (expected 3, got 2)

In [107]:
model_result_df = pd.DataFrame(model_result).loc[['Benchmark_NoScale', 'Benchmark_Scale', 
                                                  'Tuned_NoScale', 'Tuned_Scale']]

In [119]:
# By Model
model_result_df.astype(float).style.background_gradient(cmap='magma_r', axis=0)

Unnamed: 0,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Benchmark_NoScale,153.949783,186.734405,184.544254,177.997795,159.456351,145.183703,143.46226
Benchmark_Scale,152.794429,193.252952,190.019028,177.999686,150.664259,145.142675,143.466311
Tuned_NoScale,152.593847,154.860982,154.621594,155.512006,153.575697,145.960197,141.993254
Tuned_Scale,152.591324,153.611014,153.012793,155.51094,146.111794,145.957072,141.988715


In [118]:
# By Variation
model_result_df.astype(float).style.background_gradient(cmap='magma_r', axis=1)

Unnamed: 0,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Benchmark_NoScale,153.949783,186.734405,184.544254,177.997795,159.456351,145.183703,143.46226
Benchmark_Scale,152.794429,193.252952,190.019028,177.999686,150.664259,145.142675,143.466311
Tuned_NoScale,152.593847,154.860982,154.621594,155.512006,153.575697,145.960197,141.993254
Tuned_Scale,152.591324,153.611014,153.012793,155.51094,146.111794,145.957072,141.988715


As shown the table above, we can see the RMSE value for each model on cross-validation of train dataset. We can conclude that XGB so far is the best model based on the most minimum RMSE value of price. Now, let's train those model on predicting train and test dataset 

___
# Choosing Best Model
___

In [130]:
model_pool_trained = []
for file in os.listdir('Models/'):
    if '.pkl' in file:
        model = file.split('_')[0]
        m_type = file.split('_')[1] + '_' + file.split('_')[-1][:-4]
        with open('Models/'+file, 'rb') as f:
            model_og = pickle.load(f)
        model_pool_trained.append( (model, m_type, model_og) )

In [131]:
model_pool_trained

[('Decision Tree',
  'Benchmark_NoScale',
  TransformedTargetRegressor(regressor=Pipeline(steps=[('transformer',
                                                        ColumnTransformer(remainder='passthrough',
                                                                          transformers=[('missing_numerical',
                                                                                         Pipeline(steps=[('imputer',
                                                                                                          SimpleImputer(strategy='most_frequent'))]),
                                                                                         ['bedrooms',
                                                                                          'total_bathrooms']),
                                                                                        ('missing_onehot',
                                                                                         Pi

In [153]:
trained_model_result_train = []
trained_model_result_test = []

In [154]:
def choose_best(model_pool_trained):
    for name, variation, model in model_pool_trained:
        model.fit(X_train, y_train)
        
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        
        rmse_score_train = mean_squared_error(y_train, pred_train, squared=False)
        rmse_score_test = mean_squared_error(y_test, pred_test, squared=False)
        
        trained_model_result_train.append( (name, variation, rmse_score_train) )
        trained_model_result_test.append( (name, variation, rmse_score_test) )

In [155]:
choose_best(model_pool_trained)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

In [160]:
result_train = pd.DataFrame(trained_model_result_train, columns=['Model', 'Variation', 'Score'])
result_test = pd.DataFrame(trained_model_result_test, columns=['Model', 'Variation', 'Score'])

## Train Dataset Prediction Result

In [164]:
model_col = ['Ridge', 'Lasso', 'Elastic Net', 'Decision Tree', 'KNN', 'Random Forest', 'XGB']

In [165]:
# By Model
(result_train.pivot_table(values='Score', index='Variation', columns='Model')
 [model_col].style.background_gradient('magma_r'))

Model,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark_NoScale,150.702283,187.389928,185.197649,0.058581,144.233912,92.267116,40.443399
Benchmark_Scale,149.548602,193.961983,190.878637,0.058581,137.875329,92.266141,40.443399
Tuned_NoScale,149.309731,152.313516,151.661226,131.024848,0.058581,104.137009,24.264043
Tuned_Scale,149.307406,151.004787,149.981613,131.024848,0.058581,104.143985,24.264043


In [167]:
# By Variant
(result_train.pivot_table(values='Score', index='Variation', columns='Model')
 [model_col].style.background_gradient('magma_r', axis=1))

Model,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark_NoScale,150.702283,187.389928,185.197649,0.058581,144.233912,92.267116,40.443399
Benchmark_Scale,149.548602,193.961983,190.878637,0.058581,137.875329,92.266141,40.443399
Tuned_NoScale,149.309731,152.313516,151.661226,131.024848,0.058581,104.137009,24.264043
Tuned_Scale,149.307406,151.004787,149.981613,131.024848,0.058581,104.143985,24.264043


## Test Dataset Prediction Result

In [166]:
# By Model
(result_test.pivot_table(values='Score', index='Variation', columns='Model')
 [model_col].style.background_gradient('magma_r'))

Model,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark_NoScale,159.498346,201.180518,199.812053,188.900339,167.376372,144.288902,134.65187
Benchmark_Scale,160.560307,205.929145,203.527709,188.900339,156.939328,144.288721,134.634701
Tuned_NoScale,159.693977,161.462637,160.035033,164.104908,143.968356,148.335935,133.756034
Tuned_Scale,159.697172,161.87757,161.313851,164.104908,145.625439,148.327971,133.753524


In [169]:
# By Variant
(result_test.pivot_table(values='Score', index='Variation', columns='Model')
 [model_col].style.background_gradient('magma_r', axis=1))

Model,Ridge,Lasso,Elastic Net,Decision Tree,KNN,Random Forest,XGB
Variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benchmark_NoScale,159.498346,201.180518,199.812053,188.900339,167.376372,144.288902,134.65187
Benchmark_Scale,160.560307,205.929145,203.527709,188.900339,156.939328,144.288721,134.634701
Tuned_NoScale,159.693977,161.462637,160.035033,164.104908,143.968356,148.335935,133.756034
Tuned_Scale,159.697172,161.87757,161.313851,164.104908,145.625439,148.327971,133.753524


## Conclusion

By looking at the RMSE score table above, both for training dataset and test dataset, we can see that Decision Tree and KNN model have exquisite performance with RMSE value less than 1. But, it did worse on training dataset which could indicate overfitting. Hence, the best model would be XGB model that has good performance both in training dataset and testing dataset. From for variation of model, the Tuned XGB with Scaler had the best performance

___
# Save Best Model
___

In [170]:
with open('Models/'+'XGB_Tuned_Scale.pkl', 'rb') as f:
    best_model = pickle.load(f)

In [171]:
best_model.fit(X, y)

  elif pd.api.types.is_categorical(cols):


TransformedTargetRegressor(regressor=Pipeline(steps=[('transformer',
                                                      ColumnTransformer(remainder='passthrough',
                                                                        transformers=[('missing_numerical',
                                                                                       Pipeline(steps=[('imputer',
                                                                                                        SimpleImputer(strategy='most_frequent'))]),
                                                                                       ['bedrooms',
                                                                                        'total_bathrooms']),
                                                                                      ('missing_onehot',
                                                                                       Pipeline(steps=[('imputer',
                                  

In [174]:
with open('Models/'+'best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)