In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [2]:
listings = pd.read_csv('https://raw.githubusercontent.com/szilvasipeter2000/Data-Analysis-3/main/assignment-2/data/listings.csv')

In [3]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23185 entries, 0 to 23184
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            23185 non-null  int64  
 1   listing_url                                   23185 non-null  object 
 2   scrape_id                                     23185 non-null  int64  
 3   last_scraped                                  23185 non-null  object 
 4   source                                        23185 non-null  object 
 5   name                                          23185 non-null  object 
 6   description                                   22822 non-null  object 
 7   neighborhood_overview                         13256 non-null  object 
 8   picture_url                                   23185 non-null  object 
 9   host_id                                       23185 non-null 

In [4]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [5]:
# drop some unnecessary columns
# these are mainly empty, url, id, or simpy variables that are just non relevant for the analysis
drop = ['listing_url','scrape_id','last_scraped','source','name','description','neighborhood_overview','picture_url','host_id','host_url',
                                  'host_name','host_since','host_location','host_about','host_thumbnail_url','host_picture_url','calendar_updated','calendar_last_scraped',
                                  'first_review','last_review','neighbourhood_group_cleansed','bathrooms','bedrooms','license','host_verifications','host_neighbourhood','has_availability']

# listings = listings.drop(columns=drop)

In [7]:
listings[drop].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23175,23176,23177,23178,23179,23180,23181,23182,23183,23184
listing_url,https://www.airbnb.com/rooms/10803,https://www.airbnb.com/rooms/12936,https://www.airbnb.com/rooms/38271,https://www.airbnb.com/rooms/41836,https://www.airbnb.com/rooms/43429,https://www.airbnb.com/rooms/1181117,https://www.airbnb.com/rooms/1188388,https://www.airbnb.com/rooms/603007,https://www.airbnb.com/rooms/607674,https://www.airbnb.com/rooms/1192689,...,https://www.airbnb.com/rooms/971417197079787974,https://www.airbnb.com/rooms/971570547679229207,https://www.airbnb.com/rooms/969308498306934752,https://www.airbnb.com/rooms/971588505523400983,https://www.airbnb.com/rooms/971599030604344292,https://www.airbnb.com/rooms/971604763527045723,https://www.airbnb.com/rooms/969314524632337156,https://www.airbnb.com/rooms/971613881418926837,https://www.airbnb.com/rooms/971616602219608699,https://www.airbnb.com/rooms/971632586652679242
scrape_id,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,...,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306,20230904181306
last_scraped,2023-09-05,2023-09-05,2023-09-04,2023-09-05,2023-09-04,2023-09-05,2023-09-05,2023-09-05,2023-09-04,2023-09-05,...,2023-09-05,2023-09-05,2023-09-05,2023-09-05,2023-09-05,2023-09-05,2023-09-05,2023-09-05,2023-09-04,2023-09-04
source,city scrape,previous scrape,city scrape,previous scrape,city scrape,city scrape,city scrape,previous scrape,city scrape,city scrape,...,city scrape,city scrape,city scrape,city scrape,city scrape,city scrape,city scrape,city scrape,city scrape,city scrape
name,Rental unit in Brunswick East · ★4.49 · 1 bedr...,Rental unit in St Kilda · ★4.68 · 1 bedroom · ...,Rental unit in Berwick · ★4.86 · 3 bedrooms · ...,Home in Reservoir · ★4.71 · 1 bedroom · 1 bed ...,Rental unit in Oakleigh East · ★4.87 · 1 bedro...,Home in Ripponlea · 4 bedrooms · 5 beds · 2 baths,Home in Port Melbourne · ★New · 2 bedrooms · 3...,Rental unit in South Yarra · ★4.76 · 1 bedroom...,Guesthouse in Sherbrooke · ★4.06 · 7 bedrooms ...,Home in St Kilda · ★4.92 · 4 bedrooms · 5 beds...,...,Guesthouse in Box Hill South · ★New · 2 bedroo...,Rental unit in Melbourne · ★New · 2 bedrooms ·...,Rental unit in South Yarra · ★New · 2 bedrooms...,Rental unit in Carlton · ★New · 1 bedroom · 1 ...,Rental unit in Melbourne · ★New · 3 bedrooms ·...,Rental unit in Southbank · ★New · 1 bedroom · ...,Rental unit in Melbourne · ★New · 2 bedrooms ·...,Home in Abbotsford · ★New · 4 bedrooms · 4 bed...,Rental unit in Box Hill · ★New · 2 bedrooms · ...,Home in Springvale · ★New · Studio · 1 bed · 0...
description,A large air conditioned room with firm queen s...,RIGHT IN THE HEART OF ST KILDA! It doesn't get...,No children under 13 will be accepted in your ...,Easy to travel from and to the Airport; quiet ...,,Architect-renovated art-deco house in Ripponle...,"Located on a quiet street in Port Melbourne, t...","Well appointed South Yarra apartment, close to...",Edna Walling cottage is a quaint full-sized do...,"Stunning, spacious 4 bedroom house 5 minutes f...",...,Make your own family brekky & brunch 🥐🍳☕️🥑in t...,入住這間位處中心地段的房源，全家到哪都方便。,A luxury apartment in the bustling Chapel prec...,Forget your worries in this spacious and seren...,"Located on premier St. Kilda Road, superb luxu...",Come and enjoy the Dockland & Southbank & Melb...,入住这处位于市中心的房源，让家人享受一切近在咫尺的便利。<br /><br /><b>Gue...,Cả nhóm sẽ dễ dàng tiếp cận mọi địa điểm từ ch...,Enjoy easy access to everything from this perf...,The room is spacious with built in robe and a ...
neighborhood_overview,This hip area is a crossroads between two grea...,A stay at our apartment means you can enjoy so...,Our street is quiet & secluded but within walk...,"The neighbours are quiet and friendly, please...",Oakleigh is one of the most convenient and div...,"Located on a quiet cul-de-sac, access to trans...",Port Melbourne is a vibrant and trendy suburb ...,"Right near the famous Royal Botanic Gardens, a...",Sherbrooke is located in a rainforest with tow...,What isn't there to love about St.Kilda? It's...,...,,,South Yarra has some of Melbourne’s best resta...,,ST.KILDA ROAD<br /><br />St. Kilda Road is a M...,,,,,
picture_url,https://a0.muscache.com/pictures/e5f30dd1-ac57...,https://a0.muscache.com/pictures/59701/2e8cdaf...,https://a0.muscache.com/pictures/1182791/3bf4b...,https://a0.muscache.com/pictures/569696dd-1ad0...,https://a0.muscache.com/pictures/8c6284de-36de...,https://a0.muscache.com/pictures/17760874/6d3f...,https://a0.muscache.com/pictures/miso/Hosting-...,https://a0.muscache.com/pictures/cbbf545f-fe7b...,https://a0.muscache.com/pictures/11626259/9430...,https://a0.muscache.com/pictures/3d3241f4-1a82...,...,https://a0.muscache.com/pictures/e2e0ed22-733c...,https://a0.muscache.com/pictures/miso/Hosting-...,https://a0.muscache.com/pictures/179cb640-50c0...,https://a0.muscache.com/pictures/2545022d-cf94...,https://a0.muscache.com/pictures/miso/Hosting-...,https://a0.muscache.com/pictures/miso/Hosting-...,https://a0.muscache.com/pictures/e234fd5f-dde8...,https://a0.muscache.com/pictures/hosting/Hosti...,https://a0.muscache.com/pictures/hosting/Hosti...,https://a0.muscache.com/pictures/hosting/Hosti...
host_id,38901,50121,164193,182833,189684,4517777,2279736,2705870,2558288,3117312,...,23177103,477181521,18172117,117277729,2136236,526603805,519753539,535127590,163554327,395737294
host_url,https://www.airbnb.com/users/show/38901,https://www.airbnb.com/users/show/50121,https://www.airbnb.com/users/show/164193,https://www.airbnb.com/users/show/182833,https://www.airbnb.com/users/show/189684,https://www.airbnb.com/users/show/4517777,https://www.airbnb.com/users/show/2279736,https://www.airbnb.com/users/show/2705870,https://www.airbnb.com/users/show/2558288,https://www.airbnb.com/users/show/3117312,...,https://www.airbnb.com/users/show/23177103,https://www.airbnb.com/users/show/477181521,https://www.airbnb.com/users/show/18172117,https://www.airbnb.com/users/show/117277729,https://www.airbnb.com/users/show/2136236,https://www.airbnb.com/users/show/526603805,https://www.airbnb.com/users/show/519753539,https://www.airbnb.com/users/show/535127590,https://www.airbnb.com/users/show/163554327,https://www.airbnb.com/users/show/395737294


In [167]:
# filter for local governments: melbourne, Port Philip, Stonnington, Yarra
# these neighborhoods are the central ones, where we would also have our apartments to price
listings = listings[listings['neighbourhood_cleansed'].isin(['Melbourne','Port Phillip','Stonnington','Yarra'])]

In [10]:
listings['bedrooms'].value_counts()

2.0     6508
1.0     6491
3.0     2436
4.0     1022
5.0      282
6.0       92
7.0       31
10.0       8
8.0        6
9.0        5
14.0       4
11.0       3
Name: bedrooms, dtype: int64

In [12]:
listings["bathrooms"].value_counts()

Series([], Name: bathrooms, dtype: int64)

## CLEANING THE DATA

#### Cleaning numeric columns

In [171]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    listings[binary] = listings[binary].map({"t": 1, "f": 0})

# formatting columns with percentages
for perc in ["host_response_rate", "host_acceptance_rate"]:
    listings[perc] = listings[perc].replace("%", "", regex=True)
    listings[perc] = pd.to_numeric(listings[perc], errors="coerce")


# formatting price column
listings['price'] = pd.to_numeric(listings['price'].replace('[\$,]', '', regex=True), errors='coerce')
# drop where price is missing
listings = listings.dropna(subset=['price'])

#### Cleaning categorical/string columns

In [170]:
# cleaning
listings = listings.dropna(subset=['price'])

106

In [168]:
listings['host_response_time'].value_counts()

within an hour        7194
within a few hours    1273
within a day           666
a few days or more     228
Name: host_response_time, dtype: int64

In [172]:
listings.T

Unnamed: 0,1,5,6,7,8,10,12,13,14,16,...,24641,24642,24643,24644,24645,24646,24647,24648,24650,24651
id,12936,44699,47100,51592,66754,74324,628156,78143,628370,80986,...,1044815963363985780,1043364527010144962,1045171321437975789,1045217869918317655,1045315396175936734,1045319917817316696,1043389360362135958,1043389838073692212,1043663513425643808,1045498302464009295
host_response_time,,within an hour,within a few hours,within an hour,,within an hour,within an hour,within a few hours,within a few hours,,...,within an hour,,within an hour,within an hour,within an hour,within an hour,,,within a few hours,within an hour
host_response_rate,,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,,...,100.0,,97.0,100.0,100.0,100.0,,,98.0,100.0
host_acceptance_rate,,88.0,100.0,99.0,,100.0,87.0,88.0,83.0,,...,100.0,,96.0,100.0,97.0,97.0,100.0,,67.0,99.0
host_is_superhost,,,,,,,,,,,...,,,,,,,,,,
host_listings_count,10.0,3.0,1.0,2.0,10.0,1.0,1.0,1.0,2.0,10.0,...,6.0,1.0,19.0,70.0,15.0,15.0,4.0,55.0,33.0,14.0
host_total_listings_count,20.0,13.0,5.0,2.0,20.0,2.0,3.0,1.0,2.0,20.0,...,6.0,2.0,21.0,146.0,17.0,17.0,4.0,58.0,74.0,35.0
host_has_profile_pic,,,,,,,,,,,...,,,,,,,,,,
host_identity_verified,,,,,,,,,,,...,,,,,,,,,,
neighbourhood,"St Kilda, Victoria, Australia","South Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Fitzroy, Victoria, Australia","Port Melbourne, Victoria, Australia","Prahran, Victoria, Australia","Port Melbourne, Victoria, Australia","Richmond, Victoria, Australia",...,,,,"Southbank, Victoria, Australia",,,,"Melbourne, Victoria, Australia",,"Melbourne, Victoria, Australia"


In [None]:
listings['host_is_superhost'].value_counts()

0.0    10499
1.0     3595
Name: host_is_superhost, dtype: int64

In [None]:
listings['price'].head(25)

1      $95.00
5      $92.00
6     $125.00
7     $269.00
8      $94.00
10    $290.00
12     $90.00
13     $96.00
14    $240.00
15        NaN
16     $84.00
18    $182.00
20    $200.00
21    $120.00
24    $190.00
27        NaN
28    $123.00
32    $182.00
33     $99.00
35        NaN
37     $89.00
41    $118.00
45        NaN
49    $133.00
50        NaN
Name: price, dtype: object