In [153]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [154]:
listings = pd.read_csv('https://raw.githubusercontent.com/szilvasipeter2000/Data-Analysis-3/main/assignment-2/data/listings.csv')

In [166]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14123 entries, 1 to 24651
Data columns (total 47 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            14123 non-null  int64  
 1   host_response_time                            9361 non-null   object 
 2   host_response_rate                            9361 non-null   float64
 3   host_acceptance_rate                          9978 non-null   float64
 4   host_is_superhost                             0 non-null      float64
 5   host_listings_count                           14122 non-null  float64
 6   host_total_listings_count                     14122 non-null  float64
 7   host_has_profile_pic                          0 non-null      float64
 8   host_identity_verified                        0 non-null      float64
 9   neighbourhood                                 8329 non-null  

In [156]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [157]:
# drop some unnecessary columns
# these are mainly empty, url, id, or simpy variables that are just non relevant for the analysis
listings = listings.drop(columns=['listing_url','scrape_id','last_scraped','source','name','description','neighborhood_overview','picture_url','host_id','host_url',
                                  'host_name','host_since','host_location','host_about','host_thumbnail_url','host_picture_url','calendar_updated','calendar_last_scraped',
                                  'first_review','last_review','neighbourhood_group_cleansed','bathrooms','bedrooms','license','amenities','host_verifications','host_neighbourhood','has_availability'])

In [167]:
# filter for local governments: melbourne, Port Philip, Stonnington, Yarra
# these neighborhoods are the central ones, where we would also have our apartments to price
listings = listings[listings['neighbourhood_cleansed'].isin(['Melbourne','Port Phillip','Stonnington','Yarra'])]

## CLEANING THE DATA

#### Cleaning numeric columns

In [171]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    listings[binary] = listings[binary].map({"t": 1, "f": 0})

# formatting columns with percentages
for perc in ["host_response_rate", "host_acceptance_rate"]:
    listings[perc] = listings[perc].replace("%", "", regex=True)
    listings[perc] = pd.to_numeric(listings[perc], errors="coerce")


# formatting price column
listings['price'] = pd.to_numeric(listings['price'].replace('[\$,]', '', regex=True), errors='coerce')
# drop where price is missing
listings = listings.dropna(subset=['price'])

#### Cleaning categorical/string columns

In [170]:
# cleaning
listings = listings.dropna(subset=['price'])

106

In [168]:
listings['host_response_time'].value_counts()

within an hour        7194
within a few hours    1273
within a day           666
a few days or more     228
Name: host_response_time, dtype: int64

In [172]:
listings.T

Unnamed: 0,1,5,6,7,8,10,12,13,14,16,...,24641,24642,24643,24644,24645,24646,24647,24648,24650,24651
id,12936,44699,47100,51592,66754,74324,628156,78143,628370,80986,...,1044815963363985780,1043364527010144962,1045171321437975789,1045217869918317655,1045315396175936734,1045319917817316696,1043389360362135958,1043389838073692212,1043663513425643808,1045498302464009295
host_response_time,,within an hour,within a few hours,within an hour,,within an hour,within an hour,within a few hours,within a few hours,,...,within an hour,,within an hour,within an hour,within an hour,within an hour,,,within a few hours,within an hour
host_response_rate,,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,,...,100.0,,97.0,100.0,100.0,100.0,,,98.0,100.0
host_acceptance_rate,,88.0,100.0,99.0,,100.0,87.0,88.0,83.0,,...,100.0,,96.0,100.0,97.0,97.0,100.0,,67.0,99.0
host_is_superhost,,,,,,,,,,,...,,,,,,,,,,
host_listings_count,10.0,3.0,1.0,2.0,10.0,1.0,1.0,1.0,2.0,10.0,...,6.0,1.0,19.0,70.0,15.0,15.0,4.0,55.0,33.0,14.0
host_total_listings_count,20.0,13.0,5.0,2.0,20.0,2.0,3.0,1.0,2.0,20.0,...,6.0,2.0,21.0,146.0,17.0,17.0,4.0,58.0,74.0,35.0
host_has_profile_pic,,,,,,,,,,,...,,,,,,,,,,
host_identity_verified,,,,,,,,,,,...,,,,,,,,,,
neighbourhood,"St Kilda, Victoria, Australia","South Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Fitzroy, Victoria, Australia","Port Melbourne, Victoria, Australia","Prahran, Victoria, Australia","Port Melbourne, Victoria, Australia","Richmond, Victoria, Australia",...,,,,"Southbank, Victoria, Australia",,,,"Melbourne, Victoria, Australia",,"Melbourne, Victoria, Australia"


In [None]:
listings['host_is_superhost'].value_counts()

0.0    10499
1.0     3595
Name: host_is_superhost, dtype: int64

In [None]:
listings['price'].head(25)

1      $95.00
5      $92.00
6     $125.00
7     $269.00
8      $94.00
10    $290.00
12     $90.00
13     $96.00
14    $240.00
15        NaN
16     $84.00
18    $182.00
20    $200.00
21    $120.00
24    $190.00
27        NaN
28    $123.00
32    $182.00
33     $99.00
35        NaN
37     $89.00
41    $118.00
45        NaN
49    $133.00
50        NaN
Name: price, dtype: object