In [2]:
# Analysis lib
import numpy as np 
import pandas as pd 

In [3]:
# Load the raw listings data.
# Forward slashes are used so the relative path works on every operating system
# and the string no longer contains the invalid "\d" / "\l" escape sequences of
# the backslash form.
listings_dataFilter = pd.read_csv('../dataset_origin/listings.csv')
listings_dataFilter.shape

(3818, 92)

# There are two types of data: categorical and numerical. Numerical data includes the int64 and float64 columns.

In [4]:
# Check column information (non-null count and dtype of every column)
listings_dataFilter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 92 columns):
id                                  3818 non-null int64
listing_url                         3818 non-null object
scrape_id                           3818 non-null float64
last_scraped                        3818 non-null object
name                                3818 non-null object
summary                             3641 non-null object
space                               3249 non-null object
description                         3818 non-null object
experiences_offered                 3818 non-null object
neighborhood_overview               2786 non-null object
notes                               2212 non-null object
transit                             2884 non-null object
thumbnail_url                       3498 non-null object
medium_url                          3498 non-null object
picture_url                         3818 non-null object
xl_picture_url                      34

# Drop "id", "host_id" and "scrape_id": these numerical columns contain very large values.

In [5]:
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising KeyError.
listings_dataFilter = listings_dataFilter.drop(
    columns=['id', 'host_id', 'scrape_id'], errors='ignore'
)

# Remove columns that contain only one value
## These columns won’t be useful for the model since they don’t add any information. In addition, removing these columns will reduce the number of columns we’ll need to explore further in the next stage.

In [6]:
# Keep every column whose number of distinct non-null values is not exactly 1.
distinct_counts = listings_dataFilter.nunique()
listings_dataFilter = listings_dataFilter.loc[:, distinct_counts != 1]
listings_dataFilter.shape

(3818, 80)

# Check the missing values in each column.


In [7]:
# Function to show the columns with missing values in descending order.
def show_missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name and sorted by missing count in descending order, with:

        * ``amount`` - number of missing values in the column
        * ``ratio``  - ``amount / len(df)``, rounded to 2 decimals
    """
    amount = df.isnull().sum().sort_values(ascending=False)
    missing_vals = pd.DataFrame({
        'amount': amount,
        'ratio': (amount / len(df)).round(2),
    })
    # Only columns that actually have missing values are reported.
    return missing_vals[missing_vals['amount'] > 0].dropna()

In [8]:
# Display the columns that contain missing values, most-missing first.
show_missing_values(listings_dataFilter)

Unnamed: 0,amount,ratio
license,3818,1.0
square_feet,3721,0.97
monthly_price,2301,0.6
security_deposit,1952,0.51
weekly_price,1809,0.47
notes,1606,0.42
neighborhood_overview,1032,0.27
cleaning_fee,1030,0.27
transit,934,0.24
host_about,859,0.22


# Our dataset has only 3818 rows, which is quite small. To preserve as much data as possible, we only drop the columns with more than 30% missing values.

In [9]:
# Collect the columns whose missing ratio exceeds the threshold, for later use.
MISSING_RATIO_THRESHOLD = 0.30
# Select the ratio column by label rather than by position so the lookup does
# not depend on the column order of the summary table.
series = show_missing_values(listings_dataFilter)['ratio']
many_missings = [x for x, y in series.items() if y > MISSING_RATIO_THRESHOLD]
many_missings

['license',
 'square_feet',
 'monthly_price',
 'security_deposit',
 'weekly_price',
 'notes']

In [10]:
# Remove the columns previously flagged as having too many missing values.
listings_dataFilter = listings_dataFilter.drop(labels=many_missings, axis='columns')

# Collect all categorical data into a list called categorical_data.
# Collect all numerical data into a list called numerical_data.

In [11]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, every other dtype as numerical.
column_dtypes = listings_dataFilter.dtypes
categorical_data = [name for name, dtype in column_dtypes.items() if dtype == "object"]
numerical_data = [name for name, dtype in column_dtypes.items() if not dtype == "object"]

In [12]:
# Show the names of the columns collected as categorical (object dtype).
print("categorical data has:")
categorical_data

categorical data has:


['listing_url',
 'name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'smart_location',
 'is_location_exact',
 'property_type',
 'room_type',
 'bed_type',
 'amenities',
 'price',
 'cleaning_fee',
 'extra_people',
 'calendar_updated',
 'first_review',
 'last_review',
 'instant_bookable',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification']

In [13]:
#Check the distribution of categorical data
#listings_dataFilter.describe(include=["O"])

# Convert the columns that contain "$" from categorical data into numerical data,
# such as price, cleaning_fee and extra_people.

In [14]:
# Strip the "$" signs and commas, then cast each currency column to float.
# The pattern is a raw string so that "\$" reaches the regex engine unchanged
# instead of being parsed as an invalid string escape sequence.
# NOTE(review): categorical_data / numerical_data were built before this
# conversion, so they still classify these columns as categorical - confirm
# whether the lists should be rebuilt after this cell.
for currency_column in ['price', 'cleaning_fee', 'extra_people']:
    listings_dataFilter[currency_column] = (
        listings_dataFilter[currency_column]
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )

In [15]:
# NOTE(review): numerical_data was collected before price, cleaning_fee and
# extra_people were converted to float, so those columns are not listed here.
print("Numerical data has:")
numerical_data

Numerical data has:


['host_listings_count',
 'host_total_listings_count',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'calculated_host_listings_count',
 'reviews_per_month']

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
listings_dataFilter.describe()

Unnamed: 0,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,cleaning_fee,...,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month
count,3816.0,3816.0,3818.0,3818.0,3818.0,3802.0,3812.0,3817.0,3818.0,2788.0,...,3818.0,3171.0,3160.0,3165.0,3160.0,3167.0,3163.0,3162.0,3818.0,3191.0
mean,7.157757,7.157757,47.628961,-122.333103,3.349398,1.259469,1.307712,1.735394,127.976166,61.710904,...,22.223415,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.946307,2.078919
std,28.628149,28.628149,0.043052,0.031745,1.977599,0.590369,0.883395,1.13948,90.250022,48.830341,...,37.730892,6.606083,0.698031,0.797274,0.595499,0.568211,0.629053,0.750259,5.893029,1.822348
min,1.0,1.0,47.505088,-122.417219,1.0,0.0,0.0,1.0,20.0,5.0,...,0.0,20.0,2.0,3.0,2.0,2.0,4.0,2.0,1.0,0.02
25%,1.0,1.0,47.609418,-122.354321,2.0,1.0,1.0,1.0,75.0,25.0,...,2.0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,1.0,0.695
50%,1.0,1.0,47.623601,-122.328874,3.0,1.0,1.0,1.0,100.0,50.0,...,9.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,1.54
75%,3.0,3.0,47.662694,-122.3108,4.0,1.0,2.0,2.0,150.0,83.0,...,26.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,2.0,3.0
max,502.0,502.0,47.733358,-122.240607,16.0,8.0,7.0,15.0,1000.0,300.0,...,474.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,37.0,12.15


# Filling in the missing values

In [17]:
# List the columns that remain after the filtering steps above.
listings_dataFilter.columns

Index(['listing_url', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'transit', 'thumbnail_url', 'medium_url',
       'picture_url', 'xl_picture_url', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode',
       'smart_location', 'latitude', 'longitude', 'is_location_exact',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'amenities', 'price', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
       'calendar_upd