In [661]:
import numpy as np
import pandas as pd
# Notebook-wide pandas settings: widen the default display limits and
# switch off the chained-assignment warning.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 80)
pd.set_option('mode.chained_assignment', None)
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, ShuffleSplit
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from collections import defaultdict
import re
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

In [715]:
def display_all(df):
    """Display ``df`` with the pandas row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; ``display`` is
    the notebook-provided rich display function.
    """
    wide_limits = ('display.max_rows', 1000, 'display.max_columns', 1000)
    with pd.option_context(*wide_limits):
        display(df)

In [785]:
# Load the Airbnb listings export (gzip-compressed CSV).
df_raw = pd.read_csv('../data/amsterdam_2021-02-08_listings.csv.gz', compression='gzip')

# Columns removed up front: identifiers, scrape metadata, URLs, free-text
# fields and the raw coordinates.
unused_columns = [
    'id', 'host_id', 'last_scraped', 'scrape_id', 'name', 'host_name',
    'host_about', 'listing_url', 'description', 'neighborhood_overview',
    'picture_url', 'host_url', 'host_thumbnail_url', 'host_picture_url',
    'host_verifications', 'calendar_last_scraped', 'latitude', 'longitude',
]
df_raw = df_raw.drop(columns=unused_columns)

In [786]:
# Show the last rows with columns laid out vertically so every field is visible.
display_all(df_raw.tail().T)

Unnamed: 0,18286,18287,18288,18289,18290
host_since,2021-02-02,2015-05-25,2015-07-23,2015-12-01,2018-06-27
host_location,NL,"Amsterdam, North Holland, Netherlands","Amsterdam, North Holland, Netherlands","Amsterdam, North Holland, Netherlands","Amsterdam, North Holland, Netherlands"
host_response_time,,,,,within a few hours
host_response_rate,,,,,100%
host_acceptance_rate,,,,,97%
host_is_superhost,f,f,f,f,t
host_neighbourhood,,,,,Grachtengordel
host_listings_count,1,0,1,1,9
host_total_listings_count,1,0,1,1,9
host_has_profile_pic,f,t,t,t,t


In [787]:
# Retain only the columns whose non-null count reaches 80% of the row count.
thresh = 0.80 * len(df_raw)
df_raw = df_raw.dropna(axis='columns', thresh=thresh)

In [788]:
# bathrooms_text is left untouched here (no numeric parsing is applied).
# host_since is parsed to datetime, and host_listings_count is dropped because,
# per the original author's note, host_total_listings_count holds the same value.
df_raw = (
    df_raw
    .assign(host_since=pd.to_datetime(df_raw['host_since']))
    .drop(columns='host_listings_count')
)

In [789]:
# Map the 't'/'f' flag columns to 1/0.
# The result is assigned back to the frame instead of calling
# `df_raw[col].replace(..., inplace=True)`: that form mutates a temporary
# Series and leaves df_raw unchanged when pandas Copy-on-Write is active.
# Values other than 't'/'f' (including missing values) pass through unchanged.
for flag_col in ['host_is_superhost', 'host_identity_verified', 'instant_bookable']:
    df_raw[flag_col] = df_raw[flag_col].replace({'t': 1, 'f': 0})

In [790]:
def _amenity_tokens(raw_amenities):
    """Split one raw amenities string into cleaned, lower-cased amenity names.

    Every character other than ASCII letters, digits, commas, newlines and
    dots is removed before splitting on commas. Empty tokens are discarded,
    and a non-string value (e.g. a missing entry) yields an empty list.
    """
    if not isinstance(raw_amenities, str):
        return []
    cleaned = re.sub(r'[^a-zA-Z0-9,\n.]', '', raw_amenities)
    tokens = (token.strip().lower() for token in cleaned.split(','))
    return [token for token in tokens if token]


# Create one 1.0/0.0 indicator column per amenity (`amenity_<name>`).
amenity_sets = [set(_amenity_tokens(raw)) for raw in df_raw['amenities']]
# dict.fromkeys de-duplicates while keeping first-appearance order.
amenity_names = list(dict.fromkeys(name for names in amenity_sets for name in names))
amenity_flags = pd.DataFrame(
    {
        f'amenity_{name}': [float(name in names) for names in amenity_sets]
        for name in amenity_names
    },
    index=df_raw.index,
)

# NOTE(review): the fillna(0) below zero-fills missing values in *every*
# remaining column, not only amenity indicators. This matches the previous
# behaviour of this cell and makes the later row-wise dropna a no-op -
# confirm that zero-imputation is intended.
df_raw = pd.concat(
    [df_raw.drop(columns='amenities').fillna(0), amenity_flags],
    axis=1,
)

In [791]:
# One-hot encode the neighbourhood (first level dropped as the reference).
# get_dummies removes the source column itself, so no separate drop is needed.
# dtype is pinned to an integer type: pandas >= 2.0 returns bool dummies by
# default, and the later `select_dtypes(exclude=[np.number])` encoding step
# would otherwise treat them as categorical and re-encode them.
df1 = pd.get_dummies(
    df_raw,
    columns=['neighbourhood_cleansed'],
    prefix='neighbourhood',
    drop_first=True,
    dtype=np.uint8,
)

In [792]:
# Convert price to numeric by removing the '$' sign and thousands separators,
# e.g. "$1,250.00" -> 1250.0. The previous slice `value[1:-1]` also removed
# the last character, which is only harmless when that character is a
# trailing zero.
df1['price'] = pd.to_numeric(df1['price'].str.replace(r'[$,]', '', regex=True))

# Label-encode the remaining non-numeric columns as integer codes assigned in
# order of first appearance (pd.factorize; missing values receive -1).
# Boolean columns are already model-ready and are left alone.
# NOTE(review): this also encodes the datetime column host_since, so its codes
# do not follow chronological order - confirm that is acceptable.
categorical_cols = df1.select_dtypes(exclude=[np.number, 'bool']).columns
for col in categorical_cols:
    df1[f'{col}_encoded'] = pd.factorize(df1[col])[0]
df1 = df1.drop(columns=categorical_cols)

In [793]:
# Drop rows that still contain any null value.
# (The unused `thresh` assignment that used to sit here was dead code.)
df1 = df1.dropna()

In [794]:
# Keep only listings priced in the interval (0, 500].
valid_price = df1['price'].gt(0) & df1['price'].le(500)
df_ready = df1.loc[valid_price].copy()

In [795]:
# Separate the target (price) from the feature matrix.
target_col = 'price'
y = df_ready[target_col]
x = df_ready.drop(columns=[target_col])

In [796]:
# Hold out part of the data for validation; random_state pins the shuffle so
# the split is the same on every run.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, random_state=0)

In [797]:
def print_score(model):
    """Print ``model.score`` for the training and validation splits.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables, so it reports on whichever split was assigned last.
    ``model`` must provide a ``score(X, y)`` method; the two values are
    printed with two decimals under "R^2" labels.
    """
    train_score = model.score(x_train, y_train)
    validation_score = model.score(x_test, y_test)
    template = 'R^2 Training Score: {:.2f} \nR^2 Validation Score: {:.2f}'
    print(template.format(train_score, validation_score))
    

In [798]:
# Baseline: ordinary least squares fitted on every available feature.
lr = LinearRegression().fit(x_train, y_train)
print_score(lr)

R^2 Training Score: 0.45 
R^2 Validation Score: -336375863.77


In [799]:
# Random forest with at least 5 samples per leaf and sqrt(n_features)
# candidate features per split.
# random_state is fixed so that the scores and the feature importances used
# for feature selection further down are reproducible between runs.
rf = RandomForestRegressor(
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=0,
)
rf.fit(x_train, y_train)
print_score(rf)

R^2 Training Score: 0.59 
R^2 Validation Score: 0.43


In [800]:
# Rank features by the forest's impurity-based importance (rounded to four
# decimals, ties broken by feature name) and keep those scoring at least 0.01.
features = x.columns
features_sorted = sorted(
    (
        (round(importance, 4), name)
        for importance, name in zip(rf.feature_importances_, features)
    ),
    reverse=True,
)

top_feature = [name for importance, name in features_sorted if importance >= 0.01]
feature_subset = x[top_feature]

In [801]:
# Permutation importance over 10 random 70/30 splits: for each split the
# forest is refitted, then each feature column of the held-out part is
# permuted in turn and the relative drop in R^2 is recorded.
features = feature_subset.columns
scores = defaultdict(list)
rs = ShuffleSplit(n_splits = 10, random_state = 100, test_size = 0.3)
# Seeded generator so the permutations (and the selected features) are
# reproducible.
rng = np.random.default_rng(100)

# NOTE: this loop refits the shared `rf` estimator and rebinds the
# notebook-level x_train / x_test / y_train / y_test on every iteration.
for train_idx, test_idx in rs.split(y):
    x_train, x_test = feature_subset.iloc[train_idx], feature_subset.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    rf.fit(x_train, y_train)
    accuracy = r2_score(y_test, rf.predict(x_test))

    for feature in features:
        x_t = x_test.copy()
        # Write the permuted values back explicitly. The earlier
        # `np.random.shuffle(x_t.iloc[:, i])` shuffled a temporary Series, so
        # whether x_t changed depended on pandas view/copy semantics (it does
        # not change under Copy-on-Write).
        x_t[feature] = rng.permutation(x_t[feature].to_numpy())
        shuff_acc = r2_score(y_test, rf.predict(x_t))
        # Relative drop in R^2; assumes the baseline R^2 is non-zero.
        scores[feature].append((accuracy - shuff_acc) / accuracy)

# Rank features by mean relative drop and keep those at or above 0.01.
top_feature_validated = sorted(
    [(round(np.mean(drops), 4), name) for name, drops in scores.items()],
    reverse = True,
)
top_feature_validated = [name for mean_drop, name in top_feature_validated if mean_drop >= 0.01]
top_feature_validated

['accommodates',
 'bedrooms',
 'review_scores_location',
 'bathrooms_text_encoded',
 'room_type_encoded',
 'beds',
 'property_type_encoded',
 'amenity_dryer',
 'calculated_host_listings_count_entire_homes',
 'availability_365',
 'reviews_per_month',
 'number_of_reviews_ltm',
 'minimum_nights_avg_ntm',
 'availability_60',
 'amenity_tv',
 'calculated_host_listings_count_private_rooms',
 'availability_90',
 'host_since_encoded',
 'number_of_reviews']

In [802]:
# Refit the forest using only the features that passed the permutation check.
feature_subset = x[top_feature_validated]
(x_train, x_test,
 y_train, y_test) = train_test_split(feature_subset, y, random_state=0)
rf.fit(x_train, y_train)
print_score(rf)

R^2 Training Score: 0.62 
R^2 Validation Score: 0.45


In [803]:
# Gradient-boosted trees with library defaults, fitted on the same feature subset.
xgb = XGBRegressor().fit(x_train, y_train)
print_score(xgb)

R^2 Training Score: 0.74 
R^2 Validation Score: 0.45


In [735]:
# Summarise the target distribution, with extra resolution in the upper tail.
price_quantiles = [.1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .975, .99, .995, .9975, .999]
y.describe(percentiles=price_quantiles)

count     18083.000000
mean        144.376265
std          77.185802
min           4.000000
10%          69.000000
20%          85.000000
30%         100.000000
40%         112.000000
50%         125.000000
60%         145.000000
70%         160.000000
80%         196.000000
90%         250.000000
95%         300.000000
97.5%       350.000000
99%         425.000000
99.5%       468.590000
99.75%      500.000000
99.9%       500.000000
max         500.000000
Name: price, dtype: float64