In [98]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error



import ast

In [73]:
listings = pd.read_csv('https://raw.githubusercontent.com/szilvasipeter2000/Data-Analysis-3/main/assignment-2/data/listings.csv')

In [74]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23185 entries, 0 to 23184
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            23185 non-null  int64  
 1   listing_url                                   23185 non-null  object 
 2   scrape_id                                     23185 non-null  int64  
 3   last_scraped                                  23185 non-null  object 
 4   source                                        23185 non-null  object 
 5   name                                          23185 non-null  object 
 6   description                                   22822 non-null  object 
 7   neighborhood_overview                         13256 non-null  object 
 8   picture_url                                   23185 non-null  object 
 9   host_id                                       23185 non-null 

In [75]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [76]:
# drop some unnecessary columns
# these are mainly NAs,empty lists, urls, ids, or simpy variables that are just non relevant for the analysis
drop = ['listing_url','scrape_id','last_scraped','source','name','description','neighborhood_overview','picture_url','host_id','host_url',
                                  'host_name','host_since','host_location','host_about','host_thumbnail_url','host_picture_url','calendar_updated','calendar_last_scraped',
                                  'first_review','last_review','neighbourhood_group_cleansed','bathrooms','license','host_verifications','host_neighbourhood','has_availability']

listings = listings.drop(columns=drop)

In [77]:
# filter for local governments: melbourne, Port Philip, Stonnington, Yarra
# these neighborhoods are the central ones, where we would also have our apartments to price
listings = listings[listings['neighbourhood_cleansed'].isin(['Melbourne','Port Phillip','Stonnington','Yarra'])]

## CLEANING THE DATA

#### Cleaning numeric columns

In [78]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    listings[binary] = listings[binary].map({"t": 1, "f": 0})

# formatting columns with percentages
for perc in ["host_response_rate", "host_acceptance_rate"]:
    listings[perc] = listings[perc].replace("%", "", regex=True)
    listings[perc] = pd.to_numeric(listings[perc], errors="coerce")


# formatting price column
listings['price'] = pd.to_numeric(listings['price'].replace('[\$,]', '', regex=True), errors='coerce')
# drop where price is missing
listings = listings.dropna(subset=['price'])

#### Cleaning categorical/string columns

In [91]:
# clean amenities column









# create n_amenities column
listings['n_amenities'] = listings['amentities'].apply(lambda x: len(x.split(',')))

len(listings['amenities'][1])

394

1        ["Microwave", "Lockbox", "Hot water", "Smoke a...
5        ["Smoke alarm", "Wifi", "Kitchen", "Iron", "Ha...
6        ["Drying rack for clothing", "Free street park...
7        ["Private living room", "Carbon monoxide alarm...
9        ["Microwave", "Coffee maker", "Free street par...
                               ...                        
23178    ["Wifi", "Smoke alarm", "Kitchen", "TV", "Air ...
23179    ["Shared gym in building", "Microwave", "Hot w...
23180    ["Microwave", "Hot water kettle", "Harbor view...
23181    ["Microwave", "Hot water kettle", "Lockbox", "...
23182    ["Wifi", "Kitchen", "Free parking on premises"...
Name: amenities, Length: 13315, dtype: object

In [103]:
listings['new_amenities'] = listings['amenities'].apply(lambda x:list(ast.literal_eval(x))) # ast.literal_eval() to converts string to dictionary so that way i can just get the values as a list

In [115]:
amentities = []

for list in listings['new_amenities']:
    for amenity in list:
        amentities.append(amenity)

In [124]:
unique_amenities = set(amentities)
unique_amenities

len(unique_amenities)


2876

In [125]:
unique_amenities

{'AEG electric stove',
 'Bose sound system sound system with Bluetooth and aux',
 'Whirlpool gas stove',
 'TV with Chromecast, standard cable',
 '65" HDTV with Amazon Prime Video, Disney+, Netflix',
 'Bar fridge refrigerator',
 'Breville  induction stove',
 'Samsung  refrigerator',
 '65" HDTV with Netflix, Disney+, Amazon Prime Video',
 'Generic  conditioner',
 'Only for the first two days body soap',
 'Wifi – 36 Mbps',
 'Beauty and the Bees Tasmania conditioner',
 'Aseop shampoo',
 '43" HDTV with premium cable, Apple TV, Amazon Prime Video, Netflix, Disney+, standard cable, Chromecast',
 'Outdoor dining area',
 'Wifi – 25 Mbps',
 'Sauna',
 'Outdoor kitchen with sink',
 '55" HDTV with Amazon Prime Video, Netflix, premium cable',
 'A nice one -  has ice maker refrigerator',
 'Shared outdoor pool - available all year, open specific hours, pool cover',
 '50" HDTV with Roku, premium cable, HBO Max, Amazon Prime Video, Fire TV, Netflix, Disney+, Apple TV, Chromecast, Hulu',
 'LG Meridian so

In [80]:
listings['host_response_time'].value_counts()

within an hour        6666
within a few hours    1255
within a day           568
a few days or more     178
Name: host_response_time, dtype: int64

In [87]:
listings.T

Unnamed: 0,1,5,6,7,9,11,13,14,15,16,...,23172,23173,23174,23176,23177,23178,23179,23180,23181,23182
id,12936,1181117,1188388,603007,1192689,1201338,1233439,621155,628156,628370,...,971364279376710218,971385848818575605,969280954896268255,971570547679229207,969308498306934752,971588505523400983,971599030604344292,971604763527045723,969314524632337156,971613881418926837
host_response_time,,within a day,within an hour,,within an hour,within an hour,within an hour,,within a few hours,within an hour,...,within an hour,within an hour,within an hour,,within an hour,,within an hour,within an hour,within an hour,
host_response_rate,,100.0,100.0,,100.0,100.0,97.0,,100.0,100.0,...,93.0,100.0,100.0,,100.0,,100.0,100.0,100.0,
host_acceptance_rate,,0.0,100.0,,85.0,100.0,98.0,,89.0,100.0,...,99.0,96.0,100.0,,100.0,,99.0,91.0,100.0,
host_is_superhost,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
host_listings_count,10.0,1.0,1.0,1.0,2.0,2.0,73.0,3.0,1.0,2.0,...,14.0,80.0,1.0,38.0,1.0,1.0,35.0,6.0,7.0,1.0
host_total_listings_count,20.0,2.0,3.0,1.0,5.0,3.0,136.0,3.0,3.0,2.0,...,30.0,80.0,1.0,42.0,1.0,1.0,47.0,6.0,7.0,1.0
host_has_profile_pic,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
host_identity_verified,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
neighbourhood,"St Kilda, Victoria, Australia","Ripponlea, Victoria, Australia","Port Melbourne, Victoria, Australia","South Yarra, Victoria, Australia","St Kilda, Victoria, Australia","South Melbourne, Victoria, Australia","Melbourne, Victoria, Australia",,"Port Melbourne, Victoria, Australia","Port Melbourne, Victoria, Australia",...,,,"Southbank, Victoria, Australia",,"South Yarra, Victoria, Australia",,"Melbourne, Victoria, Australia",,,
