# Assignment 2

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [9]:
# DATA IMPORT - the listings CSV is read directly from its raw GitHub URL.
LISTINGS_CSV_URL = 'https://raw.githubusercontent.com/thimaipham/2.CEU_DA3_Assignment2/main/listings.csv'
data_raw = pd.read_csv(LISTINGS_CSV_URL)

In [None]:
# Recover the bedroom count from the free-text listing title, e.g.
# "Home in Mosman · 3 bedrooms · 4 beds · 2 baths" -> 3.
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a plain string literal '\d' is an invalid escape sequence and triggers a
# warning on recent Python versions. Matching "<digits> bedroom" also covers
# the plural "bedrooms".
bedroom_pattern = r'(\d+) bedroom'

# expand=False yields a Series for the single capture group; titles that do not
# match the pattern become NaN.
bedroom_text = data_raw['name'].str.extract(bedroom_pattern, expand=False)

# Convert the captured digit strings to a numeric dtype; unparsable values are
# coerced to NaN instead of raising.
data_raw['bedroom_extract'] = pd.to_numeric(bedroom_text, errors='coerce')

In [76]:
# Columns that are not needed for this assignment and are removed up front.
columns_to_drop = [
    'id',
    'listing_url',
    'scrape_id',
    'name',
    'last_scraped',
    'description',
    'neighborhood_overview',
    'picture_url',
    'host_id',
    'host_url',
    'bedrooms',
    'host_name',
    'host_thumbnail_url',
    'host_picture_url',
    'host_about',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'neighbourhood',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'calendar_last_scraped',
    'license',
    'host_location',
    'source',
    'first_review',
    'last_review',
    'host_verifications',
    'calendar_updated',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'amenities',  # amenities column is empty (it only holds the [] symbol)
]

# Build the working frame from the raw data; errors='ignore' skips any listed
# column that is not present in data_raw.
data_cleaned = data_raw.drop(columns_to_drop, axis=1, errors='ignore')
data_cleaned.info()

# Keep listings that accommodate 2 to 6 guests (both bounds inclusive), then
# discard rows whose 'price' is missing so the target is always available.
data_cleaned = (
    data_cleaned
    .loc[data_cleaned['accommodates'].between(2, 6)]
    .dropna(subset=['price'])
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         25480 non-null  object 
 1   host_since                   25478 non-null  object 
 2   host_response_time           13920 non-null  object 
 3   host_response_rate           13920 non-null  object 
 4   host_acceptance_rate         14451 non-null  object 
 5   host_is_superhost            25458 non-null  object 
 6   host_has_profile_pic         25478 non-null  object 
 7   host_identity_verified       25478 non-null  object 
 8   latitude                     25480 non-null  float64
 9   longitude                    25480 non-null  float64
 10  property_type                25480 non-null  object 
 11  room_type                    25480 non-null  object 
 12  accommodates                 25480 non-null  int64  
 13  bathrooms       

In [67]:
# Preview the first rows, transposed so each column of the frame is shown as a row.
data_cleaned.head().transpose()

Unnamed: 0,0,1,2,3,5
name,Camper/RV in Blacktown · 1 bedroom · 2 beds · ...,Guesthouse in Bundeena · ★4.94 · 1 bedroom · 1...,Home in Mosman · 3 bedrooms · 4 beds · 2 baths,Rental unit in Homebush · ★New · 1 bedroom · 2...,Guesthouse in Dee Why · 2 bedrooms · 3 beds · ...
host_since,2016-02-20,2015-07-22,2014-06-04,2016-08-26,2017-10-17
host_response_time,,within an hour,within an hour,within an hour,
host_response_rate,,100%,100%,99%,
host_acceptance_rate,,100%,51%,100%,
host_is_superhost,f,t,f,f,f
host_has_profile_pic,t,t,t,t,t
host_identity_verified,t,t,t,t,t
latitude,-33.77872,-34.08265,-33.83407,-33.867527,-33.75838
longitude,150.92234,151.14855,151.23442,151.084352,151.28742


#### Dealing with missing values

In [68]:
# Derive a numeric 'bathrooms' column from the free-text 'bathrooms_text' column.
# The pattern accepts an optional decimal part, so a value such as "1.5 baths"
# yields 1.5 instead of being truncated to 1 by a digits-only match. Entries
# that contain no number at all become NaN.
# Raw strings are used for both patterns so the backslashes reach the regex
# engine as written rather than relying on invalid string escapes.
data_cleaned['bathrooms'] = (
    data_cleaned['bathrooms_text']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

# Convert 'price' to float after stripping the '$' sign and ',' separators.
# ('$' is literal inside a character class, so no escaping is required.)
data_cleaned['price'] = data_cleaned['price'].replace(r'[$,]', '', regex=True).astype(float)

# 'bathrooms_text' is redundant once the numeric column exists.
data_cleaned = data_cleaned.drop(columns='bathrooms_text')



In [73]:
# Share of missing entries per column, expressed as a percentage.
missing_percentage = data_cleaned.isna().mean().mul(100)

# Columns with more than 40% missing values are considered too sparse to keep.
columns_to_drop_missing = missing_percentage.loc[missing_percentage.gt(40)].index

# Remove the sparse columns from the working frame.
data_cleaned = data_cleaned.drop(columns_to_drop_missing, axis=1)

# Show the first rows of the updated frame (transposed for readability).
data_cleaned.head().transpose()



Unnamed: 0,0,1,2,3,5
name,Camper/RV in Blacktown · 1 bedroom · 2 beds · ...,Guesthouse in Bundeena · ★4.94 · 1 bedroom · 1...,Home in Mosman · 3 bedrooms · 4 beds · 2 baths,Rental unit in Homebush · ★New · 1 bedroom · 2...,Guesthouse in Dee Why · 2 bedrooms · 3 beds · ...
host_since,2016-02-20,2015-07-22,2014-06-04,2016-08-26,2017-10-17
host_is_superhost,f,t,f,f,f
host_has_profile_pic,t,t,t,t,t
host_identity_verified,t,t,t,t,t
latitude,-33.77872,-34.08265,-33.83407,-33.867527,-33.75838
longitude,150.92234,151.14855,151.23442,151.084352,151.28742
property_type,Camper/RV,Private room in guesthouse,Entire home,Entire rental unit,Entire guesthouse
room_type,Entire home/apt,Private room,Entire home/apt,Entire home/apt,Entire home/apt
accommodates,3,2,4,3,5


In [74]:
# Keep only fully populated rows: a row is removed if any of its values is missing.
data_cleaned = data_cleaned.dropna(axis=0, how='any')

# Print the frame summary (entry count, columns, non-null counts, dtypes) to
# confirm the rows were dropped.
data_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14746 entries, 1 to 25479
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         14746 non-null  object 
 1   host_since                   14746 non-null  object 
 2   host_is_superhost            14746 non-null  object 
 3   host_has_profile_pic         14746 non-null  object 
 4   host_identity_verified       14746 non-null  object 
 5   latitude                     14746 non-null  float64
 6   longitude                    14746 non-null  float64
 7   property_type                14746 non-null  object 
 8   room_type                    14746 non-null  object 
 9   accommodates                 14746 non-null  int64  
 10  bathrooms                    14746 non-null  float64
 11  beds                         14746 non-null  float64
 12  price                        14746 non-null  float64
 13  minimum_nights  

name                            0.000000
host_since                      0.000000
host_response_time             43.725024
host_response_rate             43.725024
host_acceptance_rate           41.519551
host_is_superhost               0.096110
host_has_profile_pic            0.000000
host_identity_verified          0.000000
latitude                        0.000000
longitude                       0.000000
property_type                   0.000000
room_type                       0.000000
accommodates                    0.000000
bathrooms                       0.212454
bedrooms                       99.979766
beds                            1.219080
price                           0.000000
minimum_nights                  0.000000
maximum_nights                  0.000000
minimum_minimum_nights          0.000000
maximum_minimum_nights          0.000000
minimum_maximum_nights          0.000000
maximum_maximum_nights          0.000000
minimum_nights_avg_ntm          0.000000
maximum_nights_a

In [84]:
# Inspect how sparsely the original 'bedrooms' column is populated.
# This reads from data_raw on purpose: 'bedrooms' is listed in columns_to_drop,
# so it is no longer a column of data_cleaned and indexing it there raises
# KeyError when the notebook is run top to bottom on a fresh kernel.
data_raw['bedrooms'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 25480 entries, 0 to 25479
Series name: bedrooms
Non-Null Count  Dtype  
--------------  -----  
20 non-null     float64
dtypes: float64(1)
memory usage: 199.2 KB
