In [84]:
# Import libraries.
# NOTE(review): fetch_california_housing, LinearRegression, StratifiedKFold,
# mean_squared_error, sqrt and numpy are not referenced in the cells visible
# here; confirm they are used further down before pruning them.
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import numpy as np
# Seeds Python's built-in `random` module (NumPy's generator is not seeded here).
random.seed(0)

# pandas is used to load and manipulate the listings data.
import pandas as pd

In [85]:
# Read the filtered listings CSV into a DataFrame and show its (rows, columns).
# The path uses forward slashes: they resolve on Windows as well as POSIX, and
# they avoid the invalid "\d" / "\l" escape sequences the backslash form relied on.
listings_1 = pd.read_csv('../dataset_filter/listings_1.csv')
listings_1.shape

(3818, 64)

In [86]:
# Fraction of missing entries per column; display only the columns that
# actually contain missing values.
null_value_stats = listings_1.isnull().sum().div(listings_1.shape[0])
null_value_stats.loc[null_value_stats.ne(0)]

summary                        0.046359
space                          0.149031
neighborhood_overview          0.270299
notes                          0.420639
transit                        0.244631
host_since                     0.000524
host_response_time             0.136983
host_response_rate             0.136983
host_acceptance_rate           0.202462
host_is_superhost              0.000524
host_neighbourhood             0.078575
host_listings_count            0.000524
host_total_listings_count      0.000524
host_identity_verified         0.000524
neighbourhood                  0.108958
zipcode                        0.001833
property_type                  0.000262
bathrooms                      0.004191
bedrooms                       0.001572
beds                           0.000262
weekly_price                   0.473808
monthly_price                  0.602672
security_deposit               0.511262
cleaning_fee                   0.269775
first_review                   0.164222


In [87]:
# Remove the sparsely populated columns (each is missing in more than 30% of
# the rows according to the ratios computed above).
listings_2 = listings_1.drop(
    ['notes', 'weekly_price', 'monthly_price', 'security_deposit'],
    axis='columns',
)

In [88]:
## Convert currency-formatted columns to float (currently disabled: the lines below are commented out)

#listings_2['cleaning_fee'] = listings_2['cleaning_fee'].replace('[\$,]', '', regex=True).astype(float)
#listings_2['price'] = listings_2['price'].replace('[\$,]', '', regex=True).astype(float)
#listings_2['extra_people'] = listings_2['extra_people'].replace('[\$,]', '', regex=True).astype(float)

In [89]:
# Partition the columns of listings_2 by dtype, keeping the original column
# order: float columns go into numerical_data, every other column into other.
numerical_data = [
    name for name in listings_2.columns
    if listings_2[name].dtype == "float"
]
other = [
    name for name in listings_2.columns
    if listings_2[name].dtype != "float"
]

In [90]:
# Print a label, then display the list of float-typed column names.
print("Numerical data has:")
numerical_data


Numerical data has:


['host_listings_count',
 'host_total_listings_count',
 'bathrooms',
 'bedrooms',
 'beds',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'reviews_per_month']

In [91]:
# Print a label, then display the list of non-float column names.
print("Remaining data has:")
other

Remaining data has:


['id',
 'name',
 'summary',
 'space',
 'description',
 'neighborhood_overview',
 'transit',
 'host_id',
 'host_since',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_neighbourhood',
 'host_verifications',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'zipcode',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bed_type',
 'amenities',
 'price',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'first_review',
 'last_review',
 'instant_bookable',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification',
 'calculated_host_listings_count']

In [92]:
# Select the float columns listed in numerical_data into their own DataFrame.
# The columns are taken from listings_2, the frame numerical_data was derived
# from; the previous lookup used `listings`, a name that is never assigned in
# the cells shown here and therefore fails with NameError on a fresh kernel.
numerical_df = listings_2[numerical_data]
numerical_df

Unnamed: 0,host_listings_count,host_total_listings_count,bathrooms,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,3.0,3.0,1.0,1.0,1.0,95.0,10.0,10.0,10.0,10.0,9.0,10.0,4.07
1,6.0,6.0,1.0,1.0,1.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,1.48
2,2.0,2.0,4.5,5.0,7.0,97.0,10.0,10.0,10.0,10.0,10.0,10.0,1.15
3,1.0,1.0,1.0,0.0,2.0,,,,,,,,
4,2.0,2.0,2.0,3.0,3.0,92.0,9.0,9.0,10.0,10.0,9.0,9.0,0.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,354.0,354.0,2.0,3.0,3.0,80.0,8.0,10.0,4.0,8.0,10.0,8.0,0.30
3814,1.0,1.0,1.0,1.0,2.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,2.00
3815,1.0,1.0,1.0,1.0,1.0,,,,,,,,
3816,1.0,1.0,1.0,0.0,1.0,,,,,,,,


In [93]:
# Select the non-float columns listed in other into their own DataFrame.
# The columns are taken from listings_2, the frame `other` was derived from;
# the previous lookup used `listings`, a name that is never assigned in the
# cells shown here and therefore fails with NameError on a fresh kernel.
other_df = listings_2[other]
other_df

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,transit,host_id,host_since,host_response_time,...,availability_90,availability_365,number_of_reviews,first_review,last_review,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,,,956883,11/8/2011,within a few hours,...,71,346,207,1/11/2011,2/1/2016,f,moderate,f,f,2
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,"Queen Anne is a wonderful, truly functional vi...","Convenient bus stops are just down the block, ...",5177328,21/2/2013,within an hour,...,16,291,43,19/8/2013,29/12/2015,f,strict,t,t,6
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,Upper Queen Anne is a charming neighborhood fu...,A bus stop is just 2 blocks away. Easy bus a...,16708587,12/6/2014,within a few hours,...,17,220,20,30/7/2014,3/9/2015,f,strict,f,f,2
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,,,9851441,6/11/2013,,...,0,143,0,,,f,flexible,f,f,1
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,We are in the beautiful neighborhood of Queen ...,The nearest public transit bus (D Line) is 2 b...,1452570,29/11/2011,within an hour,...,90,365,38,10/7/2012,24/10/2015,f,strict,f,f,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,3BR Mountain View House in Seattle,Our 3BR/2BA house boasts incredible views of t...,"Our 3BR/2BA house bright, stylish, and wheelch...",Our 3BR/2BA house boasts incredible views of t...,We're located near lots of family fun. Woodlan...,,31148752,13/4/2015,within a few hours,...,32,32,1,27/9/2015,27/9/2015,f,strict,f,f,8
3814,8902327,Portage Bay View!-One Bedroom Apt,800 square foot 1 bedroom basement apartment w...,This space has a great view of Portage Bay wit...,800 square foot 1 bedroom basement apartment w...,The neighborhood is a quiet oasis that is clos...,Uber and Car2go are good options in Seattle. T...,46566046,14/10/2015,within an hour,...,44,273,2,18/12/2015,24/12/2015,f,moderate,f,f,1
3815,10267360,Private apartment view of Lake WA,"Very comfortable lower unit. Quiet, charming m...",,"Very comfortable lower unit. Quiet, charming m...",,,52791370,30/12/2015,,...,88,88,0,,,f,moderate,f,f,1
3816,9604740,Amazing View with Modern Comfort!,Cozy studio condo in the heart on Madison Park...,Fully furnished unit to accommodate most needs...,Cozy studio condo in the heart on Madison Park...,Madison Park offers a peaceful slow pace upsca...,Yes,25522052,3/1/2015,within an hour,...,90,179,0,,,f,moderate,f,f,1


In [94]:
# Function to show the columns with missing values in descending order.
def show_missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``amount`` (number of
        missing entries) and ``ratio`` (``amount / len(df)`` rounded to two
        decimals). Rows are sorted by ``amount`` in descending order and only
        columns with at least one missing entry are included, so the result
        is empty when nothing is missing.
    """
    amount = df.isnull().sum().sort_values(ascending=False)
    missing_vals = amount.to_frame('amount')
    missing_vals['ratio'] = (missing_vals['amount'] / len(df)).round(2)
    # amount > 0 implies len(df) > 0, so ratio is never NaN in the kept rows
    # and no additional dropna() is required.
    return missing_vals[missing_vals['amount'] > 0]

In [95]:
# Missing-value summary of the float columns before imputation.
show_missing_values(numerical_df)

Unnamed: 0,amount,ratio
review_scores_checkin,658,0.17
review_scores_accuracy,658,0.17
review_scores_value,656,0.17
review_scores_location,655,0.17
review_scores_cleanliness,653,0.17
review_scores_communication,651,0.17
review_scores_rating,647,0.17
reviews_per_month,627,0.16
bathrooms,16,0.0
bedrooms,6,0.0


In [96]:
# Fill missing numeric values with the per-column mean using scikit-learn's
# SimpleImputer (pass strategy='median' for median imputation instead).
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(strategy='mean')
# Learn the column means and apply them in a single step.
imputed_numerical_df = imp_mean.fit_transform(numerical_df)

In [97]:
# Display the imputer output (a NumPy array, not a DataFrame).
imputed_numerical_df

array([[ 3.        ,  3.        ,  1.        , ...,  9.        ,
        10.        ,  4.07      ],
       [ 6.        ,  6.        ,  1.        , ..., 10.        ,
        10.        ,  1.48      ],
       [ 2.        ,  2.        ,  4.5       , ..., 10.        ,
        10.        ,  1.15      ],
       ...,
       [ 1.        ,  1.        ,  1.        , ...,  9.60891559,
         9.45224541,  2.07891883],
       [ 1.        ,  1.        ,  1.        , ...,  9.60891559,
         9.45224541,  2.07891883],
       [ 1.        ,  1.        ,  1.5       , ...,  9.60891559,
         9.45224541,  2.07891883]])

In [98]:
# Wrap the imputed array in a DataFrame with the original column names.
# The row index of numerical_df is reused so that the later index-based join
# with other_df lines up row for row even if the source frame does not carry a
# default 0..n-1 index (omitting it would silently create a fresh RangeIndex).
imputed_data = pd.DataFrame(
    data=imputed_numerical_df,
    columns=numerical_data,
    index=numerical_df.index,
)
imputed_data

Unnamed: 0,host_listings_count,host_total_listings_count,bathrooms,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,3.0,3.0,1.0,1.0,1.0,95.000000,10.000000,10.000000,10.000000,10.000000,9.000000,10.000000,4.070000
1,6.0,6.0,1.0,1.0,1.0,96.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,1.480000
2,2.0,2.0,4.5,5.0,7.0,97.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,1.150000
3,1.0,1.0,1.0,0.0,2.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919
4,2.0,2.0,2.0,3.0,3.0,92.000000,9.000000,9.000000,10.000000,10.000000,9.000000,9.000000,0.890000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,354.0,354.0,2.0,3.0,3.0,80.000000,8.000000,10.000000,4.000000,8.000000,10.000000,8.000000,0.300000
3814,1.0,1.0,1.0,1.0,2.0,100.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,2.000000
3815,1.0,1.0,1.0,1.0,1.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919
3816,1.0,1.0,1.0,0.0,1.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919


In [99]:
# Missing-value summary of the imputed numeric frame.
show_missing_values(imputed_data)

Unnamed: 0,amount,ratio


In [100]:
# No missing values remain in the imputed numeric columns.

In [101]:
# Recombine the non-float columns (other_df) with the imputed float columns
# (imputed_data). DataFrame.join matches rows on the index; how="inner" keeps
# only index labels present in both frames.
listings_joined=other_df.join(imputed_data,how="inner")
listings_joined

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,transit,host_id,host_since,host_response_time,...,bedrooms,beds,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,,,956883,11/8/2011,within a few hours,...,1.0,1.0,95.000000,10.000000,10.000000,10.000000,10.000000,9.000000,10.000000,4.070000
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,"Queen Anne is a wonderful, truly functional vi...","Convenient bus stops are just down the block, ...",5177328,21/2/2013,within an hour,...,1.0,1.0,96.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,1.480000
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,Upper Queen Anne is a charming neighborhood fu...,A bus stop is just 2 blocks away. Easy bus a...,16708587,12/6/2014,within a few hours,...,5.0,7.0,97.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,1.150000
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,,,9851441,6/11/2013,,...,0.0,2.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,We are in the beautiful neighborhood of Queen ...,The nearest public transit bus (D Line) is 2 b...,1452570,29/11/2011,within an hour,...,3.0,3.0,92.000000,9.000000,9.000000,10.000000,10.000000,9.000000,9.000000,0.890000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3813,8101950,3BR Mountain View House in Seattle,Our 3BR/2BA house boasts incredible views of t...,"Our 3BR/2BA house bright, stylish, and wheelch...",Our 3BR/2BA house boasts incredible views of t...,We're located near lots of family fun. Woodlan...,,31148752,13/4/2015,within a few hours,...,3.0,3.0,80.000000,8.000000,10.000000,4.000000,8.000000,10.000000,8.000000,0.300000
3814,8902327,Portage Bay View!-One Bedroom Apt,800 square foot 1 bedroom basement apartment w...,This space has a great view of Portage Bay wit...,800 square foot 1 bedroom basement apartment w...,The neighborhood is a quiet oasis that is clos...,Uber and Car2go are good options in Seattle. T...,46566046,14/10/2015,within an hour,...,1.0,2.0,100.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,2.000000
3815,10267360,Private apartment view of Lake WA,"Very comfortable lower unit. Quiet, charming m...",,"Very comfortable lower unit. Quiet, charming m...",,,52791370,30/12/2015,,...,1.0,1.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919
3816,9604740,Amazing View with Modern Comfort!,Cozy studio condo in the heart on Madison Park...,Fully furnished unit to accommodate most needs...,Cozy studio condo in the heart on Madison Park...,Madison Park offers a peaceful slow pace upsca...,Yes,25522052,3/1/2015,within an hour,...,0.0,1.0,94.539262,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,2.078919


In [102]:
# Missing-value summary of the recombined frame.
show_missing_values(listings_joined)

Unnamed: 0,amount,ratio
neighborhood_overview,1032,0.27
cleaning_fee,1030,0.27
transit,934,0.24
host_acceptance_rate,773,0.2
first_review,627,0.16
last_review,627,0.16
space,569,0.15
host_response_time,523,0.14
host_response_rate,523,0.14
neighbourhood,416,0.11


In [103]:
# Write the recombined frame to CSV without the index column.
# The path uses forward slashes: they resolve on Windows as well as POSIX, and
# they avoid the invalid "\d" / "\l" escape sequences the backslash form relied on.
listings_joined.to_csv('../dataset_filter/listings_2_new.csv', index=False)