In [10]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas / scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [49]:
# Read the raw listings table from the CSV hosted on GitHub.
listings_csv_url = (
    'https://raw.githubusercontent.com/szilvasipeter2000/'
    'Data-Analysis-3/main/assignment-2/data/listings.csv'
)
listings = pd.read_csv(listings_csv_url)

In [50]:
# Summarise the frame: number of entries, per-column non-null counts and dtypes.
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24652 entries, 0 to 24651
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            24652 non-null  int64  
 1   listing_url                                   24652 non-null  object 
 2   scrape_id                                     24652 non-null  int64  
 3   last_scraped                                  24652 non-null  object 
 4   source                                        24652 non-null  object 
 5   name                                          24652 non-null  object 
 6   description                                   0 non-null      float64
 7   neighborhood_overview                         13876 non-null  object 
 8   picture_url                                   24652 non-null  object 
 9   host_id                                       24652 non-null 

In [51]:
# Show the full set of column names in the raw table.
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [53]:
# Drop columns that are not needed for the analysis: these are mainly empty
# fields, URLs, ids, or simply variables that are not relevant here.
columns_to_drop = [
    'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
    'description', 'neighborhood_overview', 'picture_url', 'host_id',
    'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'calendar_updated',
    'calendar_last_scraped', 'first_review', 'last_review',
    'neighbourhood_group_cleansed', 'bathrooms', 'bedrooms', 'license',
    'amenities', 'host_verifications', 'host_neighbourhood',
    'host_has_profile_pic', 'host_identity_verified',
]
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
# Caveat: a misspelled column name in the list above is skipped silently.
listings = listings.drop(columns=columns_to_drop, errors='ignore')

In [57]:
# Keep only listings in the central local government areas — Melbourne,
# Port Phillip, Stonnington and Yarra — which is also where the apartments
# to be priced would be located.
central_areas = ['Melbourne', 'Port Phillip', 'Stonnington', 'Yarra']
in_central_area = listings['neighbourhood_cleansed'].isin(central_areas)
listings = listings[in_central_area]
# Transposed view: one row per variable, one column per listing.
listings.T

Unnamed: 0,1,5,6,7,8,10,12,13,14,15,...,24641,24642,24643,24644,24645,24646,24647,24648,24650,24651
id,12936,44699,47100,51592,66754,74324,628156,78143,628370,633258,...,1044815963363985780,1043364527010144962,1045171321437975789,1045217869918317655,1045315396175936734,1045319917817316696,1043389360362135958,1043389838073692212,1043663513425643808,1045498302464009295
host_response_time,,within an hour,within a few hours,within an hour,,within an hour,within an hour,within a few hours,within a few hours,,...,within an hour,,within an hour,within an hour,within an hour,within an hour,,,within a few hours,within an hour
host_response_rate,,100%,100%,100%,,100%,100%,100%,100%,,...,100%,,97%,100%,100%,100%,,,98%,100%
host_acceptance_rate,,88%,100%,99%,,100%,87%,88%,83%,,...,100%,,96%,100%,97%,97%,100%,,67%,99%
host_is_superhost,f,t,f,t,f,f,t,t,t,f,...,f,f,f,f,f,f,f,f,f,f
host_listings_count,10.0,3.0,1.0,2.0,10.0,1.0,1.0,1.0,2.0,1.0,...,6.0,1.0,19.0,70.0,15.0,15.0,4.0,55.0,33.0,14.0
host_total_listings_count,20.0,13.0,5.0,2.0,20.0,2.0,3.0,1.0,2.0,1.0,...,6.0,2.0,21.0,146.0,17.0,17.0,4.0,58.0,74.0,35.0
neighbourhood,"St Kilda, Victoria, Australia","South Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Melbourne, Victoria, Australia","Richmond, Victoria, Australia","Fitzroy, Victoria, Australia","Port Melbourne, Victoria, Australia","Prahran, Victoria, Australia","Port Melbourne, Victoria, Australia",,...,,,,"Southbank, Victoria, Australia",,,,"Melbourne, Victoria, Australia",,"Melbourne, Victoria, Australia"
neighbourhood_cleansed,Port Phillip,Port Phillip,Yarra,Melbourne,Yarra,Yarra,Port Phillip,Stonnington,Port Phillip,Stonnington,...,Melbourne,Yarra,Melbourne,Melbourne,Melbourne,Melbourne,Yarra,Melbourne,Melbourne,Melbourne
latitude,-37.85999,-37.831557,-37.818371,-37.81266,-37.82127,-37.80415,-37.84158,-37.85162,-37.84168,-37.85437,...,-37.80986,-37.804661,-37.822373,-37.830048,-37.820565,-37.820909,-37.809946,-37.809853,-37.805,-37.817799


In [None]:
# NOTE(review): bare name in a cell that has not been executed; no definition
# of f_property_type is visible in this part of the notebook — confirm it is
# defined before this cell runs, or remove the cell.
f_property_type