___
# Import Library
___

In [34]:
import ast
import os
import pickle

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce
import sqlalchemy as db
from dotenv import load_dotenv, find_dotenv, set_key

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error

In [35]:
# Let wide/long frames render in full (up to 500 columns / 500 rows)
# instead of being elided in cell output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [36]:
# Load the project's .env file so the os.getenv(...) lookups further down
# (database URI, dropped-column list, saved input columns) can see its values.
load_dotenv()

True

In [37]:
# Single seed reused by train_test_split, KFold and the tree regressors.
RANDOM_STATE = 202102
# Scorer name passed to cross_val_score / GridSearchCV. It is the *negated*
# RMSE, which is why the scores printed below are negative.
SCORING = 'neg_root_mean_squared_error'

___
# Business Problems
___

Who doesn't like to travel? Traveling is such a fun activity: you arrive in a new place and experience many new things. It is a crucial part of life, as it is one of the best ways to escape from a hectic and busy schedule. It also improves mental and physical health and is a good remedy for stress, anxiety and depression. When travelling, deciding on accommodation, i.e. where you will sleep and recharge, is definitely an important decision. One option for spending the night, apart from a conventional hotel, is Airbnb.

Airbnb is an American vacation rental online marketplace company based in San Francisco, California. Airbnb maintains and hosts a marketplace, accessible to consumers on its website or via an app. Through the service, users can arrange lodging, primarily homestays, and tourism experiences or list their properties for rental. Airbnb does not own any of the listed properties; instead, it profits by receiving commission from each booking. 

Singapore, one of the developed countries in Asia, derives 4.1% of its national GDP from the tourism industry alone and is one of the most visited countries in Asia, resulting in thousands of Airbnb listings in Singapore (around 4000+ listings). It can be troublesome to choose one that fits your needs and budget. This is also a problem for property owners who want to register a new property on Airbnb, since they may find it hard to price their property.

This is where machine learning helps to solve those problems. Predicting the price of a property, or in this case an Airbnb listing, can be quite challenging since various factors need to be measured and calculated in order to get the price right. In this project, we will build an end-to-end machine learning project: cleaning the dataset, doing exploratory data analysis to get some insights, using machine learning models to predict Airbnb listing prices, and deploying our best model by creating a dashboard.

___
# Load Dataset
___

In [38]:
# Build the SQLAlchemy engine from the connection string stored under the
# 'db-uri' environment variable. os.getenv returns None when the variable is
# unset -- NOTE(review): confirm the .env file defines 'db-uri'.
engine = db.create_engine(os.getenv('db-uri'))

In [39]:
# Reflect the existing database schema so tables can be looked up by name
# through meta.tables[...].
meta = db.MetaData()
meta.reflect(bind=engine)

In [40]:
# Read the whole reflected `listings` table into a DataFrame.
listings_table = meta.tables['listings']
# Table.select() builds the same "SELECT <all columns> FROM listings" as the
# legacy select([table]) list form, but is accepted by both SQLAlchemy 1.x
# and 2.x (the list form was removed in 2.0).
query = listings_table.select()
with engine.connect() as con:
    result = con.execute(query).fetchall()
# Column labels come from the reflected table, in table order.
df_raw = pd.DataFrame(result, columns=listings_table.c.keys())

In [41]:
# The columns to discard are stored in the environment as the *text* of a
# Python list (see the value displayed below); it is parsed in the next cell.
listings_dropped_columns = os.getenv('listings_dropped_columns')
listings_dropped_columns

"['id', 'listing_url', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'availability_60', 'availability_90', 'availability_365', 'host_i

In [42]:
# The env var holds a list literal as text. ast.literal_eval parses plain
# literals only, so a tampered .env cannot execute arbitrary code the way
# eval() would.
dropped_columns = ast.literal_eval(listings_dropped_columns)
# Working frame: raw listings minus the columns excluded from modelling.
df = df_raw.drop(columns=dropped_columns)
display(df)

Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,availability_30,instant_bookable,calculated_host_listings_count,total_bathrooms,bathrooms_type,Air Conditioning,BBQ Utensils,Baby and Children Equipments,Bathtub,Beach Essentials,Breakfast,Building Staff,Cleaning Before Checkout,Cleaning Equipments,Clothing Equipments,Coffee Maker,Cutlery,Dedicated Workspace,Door Lock,Dryer,EV Charger,Elevator,Entertainment,Essentials,Ethernet Connection,"Extra pillows, blankets or bed-linen",Fan,Fire Safety Equipments,Fireplace,First Aid Kit,Free Parking,Gym,Heating,Host Greets You,Hot Tub,Hot Water,Kitchen Utensils,Kitchen/Dining Area,Laundromat Nearby,Living Room,Lockbox,Long Term Stays Allowed,Luggage Dropoff Allowed,Outdoor Space,Paid Parking,Pool,Private Entrance,Refrigerator/Freezer,Sauna,Single level Home,Ski-in/Ski-out,TV,Toilet Equipments,Washer,Water Body Access
0,Woodlands,North Region,Private room in apartment,Private room,1,1.0,1.0,79,180,360,30,False,2,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,Bukit Timah,Central Region,Private room in apartment,Private room,2,1.0,1.0,80,90,730,30,False,1,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
2,Woodlands,North Region,Private room in apartment,Private room,1,1.0,1.0,66,6,14,30,False,2,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
3,Tampines,East Region,Private room in villa,Private room,6,2.0,3.0,174,90,1125,30,False,8,1.0,private bath,1,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,1,0
4,Tampines,East Region,Private room in house,Private room,3,1.0,1.0,93,90,1125,30,False,8,1.0,shared half-bath,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382,Bukit Timah,Central Region,Private room in condominium,Private room,2,,1.0,25,10,30,0,True,2,1.0,shared bath,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4383,Marine Parade,Central Region,Private room in house,Private room,1,1.0,1.0,57,90,182,29,True,6,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
4384,Geylang,Central Region,Private room in house,Private room,4,1.0,1.0,62,90,182,29,True,6,1.0,bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
4385,Marine Parade,Central Region,Private room in house,Private room,1,1.0,1.0,47,90,182,29,True,6,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


___
# Data Splitting
___

In [43]:
# Target: the listing price. Predictors: every other column.
y = df['price']
# fillna(np.nan) is kept from the original pipeline -- presumably it
# normalises None placeholders to NaN for the imputers; TODO confirm.
X = df.drop(columns='price').fillna(np.nan)

In [44]:
# 70/30 hold-out split; seeded so the same rows land in each part on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=RANDOM_STATE)

In [45]:
# Sanity check: print the shape of each part of the split, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(3070, 64)
(1317, 64)
(3070,)
(1317,)


In [46]:
# Persist the model's input column order to .env so other code can rebuild
# an input frame with the same layout.
list_column_input = X_train.columns.tolist()
# Environment values are always strings, so compare against the serialized
# list. Comparing a str to the list itself is never equal and would rewrite
# the .env file on every run.
serialized_column_input = str(list_column_input)
if os.getenv('list_column_input') != serialized_column_input:
    set_key(find_dotenv(), 'list_column_input', serialized_column_input)

___
# Choose Evaluation Metric
___

In this project, one of our goals is to predict Airbnb listing prices as accurately as possible (a regression problem). Hence, I chose RMSE (Root Mean Squared Error) as the evaluation metric, since it shows how far our predictions are from the actual prices and penalizes large differences between the predicted and actual price more heavily.

___
# Data Transformer
___

In [47]:
# Per-column profile used to choose encoders/imputers: distinct values,
# how many there are, the dtype, and the percentage of missing entries.
encode_df = df.apply(pd.unique).to_frame('Unique Values')
encode_df['Count'] = encode_df['Unique Values'].map(len)
encode_df['Data Type'] = [df[column_name].dtype for column_name in encode_df.index]
encode_df['%Missing'] = df.isnull().sum() / len(df) * 100
# Lowest-cardinality columns first.
encode_df.sort_values(by='Count')

Unnamed: 0,Unique Values,Count,Data Type,%Missing
Entertainment,"[0, 1]",2,int64,0.0
Ethernet Connection,"[1, 0]",2,int64,0.0
"Extra pillows, blankets or bed-linen","[0, 1]",2,int64,0.0
Fan,"[0, 1]",2,int64,0.0
Fire Safety Equipments,"[0, 1]",2,int64,0.0
Fireplace,"[0, 1]",2,int64,0.0
First Aid Kit,"[0, 1]",2,int64,0.0
Free Parking,"[0, 1]",2,int64,0.0
Gym,"[0, 1]",2,int64,0.0
Heating,"[0, 1]",2,int64,0.0


In [48]:
# Numeric columns with gaps: fill NaN with the column's most frequent value.
missing_numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
])

In [49]:
# Categorical columns with gaps: fill NaN with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
missing_onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

In [50]:
# Column-wise preprocessing. Columns not named here (the remaining numeric
# columns and the 0/1 amenity flags) are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        # impute only
        ('missing_numerical', missing_numerical_pipeline,
         ['bedrooms', 'beds', 'total_bathrooms']),
        # impute, then one-hot
        ('missing_onehot', missing_onehot_pipeline,
         ['bathrooms_type']),
        # complete low-cardinality categoricals: one-hot
        ('nonmissing_onehot', OneHotEncoder(drop='first'),
         ['instant_bookable', 'room_type', 'neighbourhood_group_cleansed']),
        # complete categoricals with many levels: binary encoding
        ('nonmissing_binary', ce.BinaryEncoder(drop_invariant=True),
         ['neighbourhood_cleansed', 'property_type']),
    ],
    remainder='passthrough',
)

___
# Model Building
___

In [51]:
# Accumulator for the comparison table: one row per model, value = CV RMSE.
model_result = pd.DataFrame()

In [52]:
# Shared, seeded, shuffled K-fold splitter so every model is scored on the
# same folds. n_splits is left at its default (the GridSearchCV repr printed
# further down shows n_splits=5).
kf = KFold(shuffle=True, random_state=RANDOM_STATE)

## Decision Tree Regression

### Benchmark Decision Tree Regression

In [53]:
# Untuned decision tree (library defaults apart from the seed) as baseline.
dtr = DecisionTreeRegressor(random_state=RANDOM_STATE)

In [54]:
# Baseline pipeline: preprocessing followed by the default decision tree.
# Keeping the transformer inside the pipeline means it is re-fitted on each
# CV training fold rather than on the full data.
benchmark_dtr = Pipeline(steps=[('transformer', transformer), ('regressor', dtr)])

In [55]:
# Cross-validated score of the baseline on the training split only; the
# hold-out test split is not touched here.
benchmark_dtr_scores = cross_val_score(
    estimator=benchmark_dtr,
    X=X_train,
    y=y_train,
    scoring=SCORING,
    cv=kf,
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.6s remaining:   10.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.7s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.9s finished


In [56]:
# Per-fold scores. They are negated RMSE values (see SCORING), so the figure
# closest to zero is the best fold.
benchmark_dtr_scores

array([-282.58591357, -448.45301758, -557.78601577, -421.17058352,
       -585.83820493])

In [57]:
# Mean negated RMSE across the folds.
benchmark_dtr_scores.mean()

-459.16674707394196

In [58]:
# Fit the baseline on the full training split so that it can be pickled in
# the next cell.
benchmark_dtr.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('missing_numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['bedrooms', 'beds',
                                                   'total_bathrooms']),
                                                 ('missing_onehot',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['bathrooms_type']),
   

In [59]:
# Persist the fitted baseline pipeline. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename = 'benchmark_dtr.pkl'
with open('Models/Trained Models/'+filename, 'wb') as model_file:
    pickle.dump(benchmark_dtr, model_file)

In [60]:
# Record the baseline's CV RMSE (sign flipped back to positive).
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x; concat works on old and new versions alike.
benchmark_row = pd.DataFrame.from_dict({'Benchmark DTR': -benchmark_dtr_scores.mean()},
                                       orient='index')
model_result = pd.concat([model_result, benchmark_row])
model_result

Unnamed: 0,0
Benchmark DTR,459.166747


### Tuned Decision Tree Regression

In [61]:
# Fresh, unfitted tree for the grid search (rebinds the name `dtr`).
dtr = DecisionTreeRegressor(random_state=RANDOM_STATE)

In [62]:
# Unfitted pipeline used as the base estimator for the grid search below.
# NOTE(review): this rebinds `benchmark_dtr`, so from here on the name no
# longer refers to the fitted baseline from the previous section. Later cells
# that mention `benchmark_dtr` get this unfitted object; a distinct name
# would be safer.
benchmark_dtr = Pipeline([
    ('transformer', transformer),
    ('regressor', dtr)
])

In [63]:
# Search space for the tree step of the pipeline (the "regressor__" prefix
# routes each value to that step): 4 * 6 * 3 * 3 * 3 = 648 candidates.
# NOTE(review): every min_samples_leaf here is >= 20, so a node needs at
# least 40 samples to be split at all; the min_samples_split values (<= 40)
# therefore look non-binding and multiply the grid sixfold -- consider
# dropping that axis after confirming against the CV results.
# NOTE(review): "mse"/"mae" are the criterion names of older scikit-learn
# releases -- confirm against the installed version before upgrading.
tuned_dtr_params = dict(
    regressor__criterion=["mse", "friedman_mse", "mae", "poisson"],
    regressor__min_samples_split=[2, 5, 10, 20, 30, 40],
    regressor__max_depth=[2, 6, 8],
    regressor__min_samples_leaf=[20, 40, 100],
    regressor__max_leaf_nodes=[5, 20, 100],
)

In [64]:
# Exhaustive search over tuned_dtr_params, scored with the same metric and
# folds as the baseline so the two results are directly comparable.
grid_dtr = GridSearchCV(
    estimator=benchmark_dtr,
    param_grid=tuned_dtr_params,
    scoring=SCORING,
    cv=kf,
    n_jobs=-1,
    verbose=10,
)

In [65]:
# Run the search on the training split (648 candidates x 5 folds, per the
# log line printed below).
grid_dtr.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


  elif pd.api.types.is_categorical(cols):


GridSearchCV(cv=KFold(n_splits=5, random_state=202102, shuffle=True),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('missing_numerical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent'))]),
                                                                         ['bedrooms',
                                                                          'beds',
                                                                          'total_bathrooms']),
                                                                        ('missing_onehot',
                                                                         Pipeline(steps=[('imputer',
        

In [66]:
# Best mean CV score found by the search; negated RMSE, so closer to zero is
# better.
grid_dtr.best_score_

-319.1091790505964

In [67]:
# Full search log as a table, best-ranked candidates first.
grid_cv_results = pd.DataFrame(grid_dtr.cv_results_)
tuned_dtr_scores = grid_cv_results.sort_values(by='rank_test_score')
tuned_dtr_scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__criterion,param_regressor__max_depth,param_regressor__max_leaf_nodes,param_regressor__min_samples_leaf,param_regressor__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
479,1.334236,0.036328,0.029784,0.001599,mae,8,100,40,40,"{'regressor__criterion': 'mae', 'regressor__ma...",-264.87149,-181.713942,-183.219552,-442.518594,-523.222316,-319.109179,139.411065,1
478,1.345829,0.036632,0.034781,0.009320,mae,8,100,40,30,"{'regressor__criterion': 'mae', 'regressor__ma...",-264.87149,-181.713942,-183.219552,-442.518594,-523.222316,-319.109179,139.411065,1
477,1.341032,0.033634,0.034582,0.001625,mae,8,100,40,20,"{'regressor__criterion': 'mae', 'regressor__ma...",-264.87149,-181.713942,-183.219552,-442.518594,-523.222316,-319.109179,139.411065,1
476,1.326040,0.022855,0.030784,0.004258,mae,8,100,40,10,"{'regressor__criterion': 'mae', 'regressor__ma...",-264.87149,-181.713942,-183.219552,-442.518594,-523.222316,-319.109179,139.411065,1
475,1.347230,0.038960,0.033380,0.004836,mae,8,100,40,5,"{'regressor__criterion': 'mae', 'regressor__ma...",-264.87149,-181.713942,-183.219552,-442.518594,-523.222316,-319.109179,139.411065,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,0.149113,0.004663,0.043777,0.009364,poisson,6,5,20,40,"{'regressor__criterion': 'poisson', 'regressor...",-275.02315,-226.093160,-235.821756,-458.724119,-533.063209,-345.745079,125.899686,637
544,0.154712,0.004441,0.037780,0.004163,poisson,6,5,20,30,"{'regressor__criterion': 'poisson', 'regressor...",-275.02315,-226.093160,-235.821756,-458.724119,-533.063209,-345.745079,125.899686,637
543,0.170499,0.013357,0.044376,0.007496,poisson,6,5,20,20,"{'regressor__criterion': 'poisson', 'regressor...",-275.02315,-226.093160,-235.821756,-458.724119,-533.063209,-345.745079,125.899686,637
599,0.171903,0.014428,0.044573,0.006829,poisson,8,5,20,40,"{'regressor__criterion': 'poisson', 'regressor...",-275.02315,-226.093160,-235.821756,-458.724119,-533.063209,-345.745079,125.899686,637


In [68]:
# Pipeline built with the best parameter combination found by the search.
tuned_dtr = grid_dtr.best_estimator_

In [69]:
# Fit the tuned pipeline on the full training split.
# NOTE(review): grid_dtr was built without a refit= argument; with
# scikit-learn's default (refit=True) best_estimator_ is already fitted on
# X_train, so this call likely repeats that work -- confirm before removing.
tuned_dtr.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('missing_numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['bedrooms', 'beds',
                                                   'total_bathrooms']),
                                                 ('missing_onehot',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['bath...type']),
      

In [70]:
# Persist the fitted *tuned* pipeline. The original dumped `benchmark_dtr`
# here, which at this point is the unfitted base pipeline handed to the grid
# search, so tuned_dtr.pkl did not contain the tuned model.
filename = 'tuned_dtr.pkl'
with open('Models/Trained Models/'+filename, 'wb') as model_file:
    pickle.dump(tuned_dtr, model_file)

In [71]:
# Record the tuned tree's best CV RMSE (sign flipped back to positive).
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x; concat works on old and new versions alike.
tuned_row = pd.DataFrame.from_dict({'Tuned DTR': -grid_dtr.best_score_},
                                   orient='index')
model_result = pd.concat([model_result, tuned_row])
model_result

Unnamed: 0,0
Benchmark DTR,459.166747
Tuned DTR,319.109179


### Save Model Result

In [72]:
# Persist the comparison table. The context manager guarantees the file is
# flushed and closed even if pickling raises.
filename = 'model_result_dtr.pkl'
with open('Models/Trained Models/'+filename, 'wb') as result_file:
    pickle.dump(model_result, result_file)