___
# Import Libraries
___

In [1]:
import ast
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce
import sqlalchemy as db
from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score,\
precision_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Raise pandas' display limits so frames with up to 500 columns / 500 rows
# are rendered in full instead of being truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
# Load variables from a .env file into the process environment; later cells
# read 'db-uri' and 'listings_dropped_columns' through os.getenv().
load_dotenv()

True

In [4]:
# Seed for the stochastic steps; passed to train_test_split so the split is
# reproducible across runs.
RANDOM_STATE = 202102

___
# Business Problems
___

___
# Load Dataset
___

In [5]:
# Build the SQLAlchemy engine from the connection URI stored in the 'db-uri'
# environment variable. os.getenv() returns None when the variable is missing,
# so fail fast with an explicit message instead of handing None to
# create_engine().
db_uri = os.getenv('db-uri')
if not db_uri:
    raise RuntimeError(
        "Environment variable 'db-uri' is not set; define it (e.g. in the .env file) "
        "before creating the database engine."
    )
engine = db.create_engine(db_uri)

In [6]:
# Reflect the existing database schema so tables can be looked up by name
# through meta.tables.
meta = db.MetaData()
meta.reflect(bind=engine)

In [7]:
# Read every row of the reflected `listings` table into a DataFrame whose
# columns are named after the table's columns.
# NOTE(review): select([table]) is the SQLAlchemy 1.x calling style; 2.0 expects
# select(table) -- confirm the installed version before upgrading.
listings_table = meta.tables['listings']
with engine.connect() as con:
    query = db.select([listings_table])
    result = con.execute(query).fetchall()
df_raw = pd.DataFrame(result, columns=listings_table.c.keys())

In [8]:
# Raw text of the columns-to-drop setting, read from the environment
# (None when the variable is not defined). Displayed for inspection.
listings_dropped_columns = os.environ.get('listings_dropped_columns')
listings_dropped_columns

"['id', 'listing_url', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'availability_60', 'availability_90', 'availability_365']"

In [9]:
# `listings_dropped_columns` is the text of a Python list literal taken from the
# environment. Parse it with ast.literal_eval, which accepts literals only,
# instead of eval(), which would execute any code placed in that variable.
dropped_columns = ast.literal_eval(listings_dropped_columns)

# Working frame: the raw listings without the excluded columns.
df = df_raw.drop(columns=dropped_columns)
display(df)

Unnamed: 0,host_response_time,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,has_availability,availability_30,instant_bookable,calculated_host_listings_count,total_bathrooms,bathrooms_type,Air Conditioning,BBQ Utensils,Baby and Children Equipments,Bathtub,Beach Essentials,Breakfast,Building Staff,Cleaning Before Checkout,Cleaning Equipments,Clothing Equipments,Coffee Maker,Cutlery,Dedicated Workspace,Door Lock,Dryer,EV Charger,Elevator,Entertainment,Essentials,Ethernet Connection,"Extra pillows, blankets or bed-linen",Fan,Fire Safety Equipments,Fireplace,First Aid Kit,Free Parking,Gym,Heating,Host Greets You,Hot Tub,Hot Water,Kitchen Utensils,Kitchen/Dining Area,Laundromat Nearby,Living Room,Lockbox,Long Term Stays Allowed,Luggage Dropoff Allowed,Outdoor Space,Paid Parking,Pool,Private Entrance,Refrigerator/Freezer,Sauna,Single level Home,Ski-in/Ski-out,TV,Toilet Equipments,Washer,Water Body Access,email,facebook,google,government_id,identity_manual,jumio,kba,manual_offline,manual_online,offline_government_id,phone,reviews,selfie,weibo,work_email,zhima_selfie
0,within a day,False,True,True,Woodlands,North Region,1.44255,103.79580,Private room in apartment,Private room,1,1.0,1.0,79,180,360,True,30,False,2,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,0,0,1,1,1,1,0,0,0
1,a few days or more,False,True,True,Bukit Timah,Central Region,1.33235,103.78521,Private room in apartment,Private room,2,1.0,1.0,80,90,730,True,30,False,1,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,0,1,1,1,1,0,1,0
2,within a day,False,True,True,Woodlands,North Region,1.44246,103.79667,Private room in apartment,Private room,1,1.0,1.0,66,6,14,True,30,False,2,1.0,bath,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,1,1,0,0,0,1,1,1,1,0,0,0
3,within a few hours,False,True,True,Tampines,East Region,1.34541,103.95712,Private room in villa,Private room,6,2.0,3.0,174,90,1125,True,30,False,8,1.0,private bath,1,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0
4,within a few hours,False,True,True,Tampines,East Region,1.34567,103.95963,Private room in house,Private room,3,1.0,1.0,93,90,1125,True,30,False,8,1.0,shared half-bath,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382,within a day,False,True,False,Bukit Timah,Central Region,1.32414,103.80956,Private room in condominium,Private room,2,,1.0,25,10,30,True,0,True,2,1.0,shared bath,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
4383,within a day,False,True,False,Marine Parade,Central Region,1.30862,103.90297,Private room in house,Private room,1,1.0,1.0,57,90,182,True,29,True,6,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4384,within a day,False,True,False,Geylang,Central Region,1.31044,103.90275,Private room in house,Private room,4,1.0,1.0,62,90,182,True,29,True,6,1.0,bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4385,within a day,False,True,False,Marine Parade,Central Region,1.30889,103.90296,Private room in house,Private room,1,1.0,1.0,47,90,182,True,29,True,6,1.0,half-bath,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


___
# Data Splitting
___

In [61]:
# Target is the `price` column; the features are all remaining columns.
y = df['price']
# fillna(np.nan) presumably normalizes None placeholders to NaN so the imputers
# configured with missing_values=np.nan recognize them -- TODO confirm.
X = df.drop(columns='price').fillna(np.nan)

In [62]:
# Reproducible split of features and target.
# NOTE(review): test_size=0.7 places 70% of the rows in the *test* split, leaving
# only 30% for training -- confirm that 0.3 was not intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=RANDOM_STATE,
)

In [63]:
# Sanity check: print the shape of each split, one per line.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1316, 87)
(3071, 87)
(1316,)
(3071,)


___
# Choose Evaluation Metric
___

___
# Data Transformer
___

In [88]:
# Per-column summary used to choose an encoding/imputation strategy:
# distinct values, their count, the dtype and the percentage of missing rows.
encode_df = pd.DataFrame()
encode_df['Unique Values'] = df.apply(pd.unique)
encode_df['Count'] = encode_df['Unique Values'].map(len)
encode_df['Data Type'] = df.dtypes
encode_df['%Missing'] = df.isna().sum().div(len(df)).mul(100)

# Show the lowest-cardinality columns first.
encode_df.sort_values(by='Count')

Unnamed: 0,Unique Values,Count,Data Type,%Missing
Fan,"[0, 1]",2,int64,0.0
Private Entrance,"[0, 1]",2,int64,0.0
Pool,"[0, 1]",2,int64,0.0
Paid Parking,"[0, 1]",2,int64,0.0
Outdoor Space,"[0, 1]",2,int64,0.0
Luggage Dropoff Allowed,"[0, 1]",2,int64,0.0
Long Term Stays Allowed,"[0, 1]",2,int64,0.0
Lockbox,"[0, 1]",2,int64,0.0
Living Room,"[0, 1]",2,int64,0.0
Refrigerator/Freezer,"[0, 1]",2,int64,0.0


In [91]:
# Imputation for numeric columns with gaps: replace NaN with the column's
# most frequent value.
missing_numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
])

In [92]:
# Categorical columns with gaps: fill NaN with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
missing_onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

In [96]:
# Column-wise preprocessing with imputation:
#   - numeric columns with gaps      -> most-frequent imputation
#   - categorical columns with gaps  -> most-frequent imputation + one-hot
#   - complete categorical columns   -> one-hot (first level dropped)
#   - neighbourhood / property type  -> binary encoding
# All other columns are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        (
            'missing_numerical',
            missing_numerical_pipeline,
            ['bedrooms', 'beds', 'total_bathrooms'],
        ),
        (
            'missing_onehot',
            missing_onehot_pipeline,
            [
                'host_is_superhost',
                'host_has_profile_pic',
                'host_identity_verified',
                'host_response_time',
                'bathrooms_type',
            ],
        ),
        (
            'nonmissing_onehot',
            OneHotEncoder(drop='first'),
            [
                'has_availability',
                'instant_bookable',
                'room_type',
                'neighbourhood_group_cleansed',
            ],
        ),
        (
            'nonmissing_binary',
            ce.BinaryEncoder(drop_invariant=True),
            ['neighbourhood_cleansed', 'property_type'],
        ),
    ],
    remainder='passthrough',
)

In [74]:
# Encoding-only preprocessing (no imputation): one-hot for the low-cardinality
# categoricals, binary encoding for neighbourhood and property type, everything
# else passed through.
# NOTE(review): this cell rebinds `transformer`, so on a top-to-bottom run it
# replaces the imputing ColumnTransformer defined in the previous cell. Several
# columns one-hot encoded here contain missing values (see X.isnull().sum()) and
# nothing imputes them -- confirm which definition is intended and remove the other.
transformer = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(drop='first'),
            [
                'has_availability',
                'instant_bookable',
                'host_is_superhost',
                'host_has_profile_pic',
                'host_identity_verified',
                'room_type',
                'neighbourhood_group_cleansed',
                'bathrooms_type',
                'host_response_time',
            ],
        ),
        (
            'binary',
            ce.BinaryEncoder(drop_invariant=True),
            ['neighbourhood_cleansed', 'property_type'],
        ),
    ],
    remainder='passthrough',
)

___
# Model Building
___

In [85]:
# Number of missing values in each feature column.
X.isna().sum()

host_response_time                      581
host_is_superhost                        11
host_has_profile_pic                     11
host_identity_verified                   11
neighbourhood_cleansed                    0
neighbourhood_group_cleansed              0
latitude                                  0
longitude                                 0
property_type                             0
room_type                                 0
accommodates                              0
bedrooms                                393
beds                                     59
minimum_nights                            0
maximum_nights                            0
has_availability                          0
availability_30                           0
instant_bookable                          0
calculated_host_listings_count            0
total_bathrooms                          15
bathrooms_type                           15
Air Conditioning                          0
BBQ Utensils                    

In [69]:
# Empty frame, presumably meant to collect per-model evaluation results;
# nothing is written to it in the cells shown -- TODO confirm intended schema.
model_result = pd.DataFrame()

## Linear Regression

### Build Model

In [67]:
# Ordinary least-squares estimator with default settings; the benchmark pipeline
# uses it both as the RFE base estimator and as the final regressor.
linreg = LinearRegression()

In [101]:
# Benchmark pipeline: preprocess -> polynomial expansion -> recursive feature
# elimination -> linear regression.
# NOTE(review): with default settings RFE removes one feature per iteration and
# refits the estimator each time; after PolynomialFeatures the feature count is
# large, which is a likely cause of the interrupted fit -- consider a larger
# `step` or a smaller feature set.
benchmark_linreg = Pipeline(steps=[
    ('transformer', transformer),
    ('pf', PolynomialFeatures()),
    ('rfe', RFE(estimator=linreg)),
    ('regressor', linreg),
])

### Benchmark Linear Regression

In [104]:
# Fit the benchmark on the training split. The test split must stay unseen
# during fitting so it can give an unbiased estimate of generalization error.
benchmark_linreg.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


KeyboardInterrupt: 

### Tuned Linear Regression

## Ridge Regression

### Benchmark Ridge Regression

### Tuned Ridge Regression

## Lasso Regression

### Benchmark Lasso Regression

### Tuned Lasso Regression

## Elastic Net Regression

### Benchmark Elastic Net Regression

### Tuned Elastic Net Regression

## Decision Tree Regression

### Benchmark Decision Tree Regression

### Tuned Decision Tree Regression

## KNN Regression

### Benchmark KNN Regression

### Tuned KNN Regression

## Random Forest Regression

### Benchmark Random Forest Regression

### Tuned Random Forest Regression

## Gradient Boost Regression

### Benchmark Gradient Boost Regression

### Tuned Gradient Boost Regression

## Adaptive Boost Regression

### Benchmark Adaptive Boost Regression

### Tuned Adaptive Boost Regression

## XGB Regression

### Benchmark XGB Regression

### Tuned XGB Regression

## Light GBM Regression

### Benchmark Light GBM Regression

### Tuned Light GBM Regression