In [90]:
import pandas as pd
import csv
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
file_url = '/Users/xzhou/github/project_archives/files_airbnb/sf_airbnb/{}.csv'

listings_file = file_url.format('listings_details')
reviews_file = file_url.format('reviews_details')
calendar_file = file_url.format('calendar_details')

In [4]:
df_listings = pd.read_csv(listings_file)

print (df_listings.shape)
df_listings.head()

(6633, 96)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,958,https://www.airbnb.com/rooms/958,20180800000000.0,8/6/18,"Bright, Modern Garden Unit - 1BR/1B",Our bright garden unit overlooks a grassy back...,"Newly remodeled, modern, and bright garden uni...",Our bright garden unit overlooks a grassy back...,none,*Quiet cul de sac in friendly neighborhood *St...,...,t,STR-0001256,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38
1,5858,https://www.airbnb.com/rooms/5858,20180800000000.0,8/6/18,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,none,I love how our neighborhood feels quiet but is...,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99
2,7918,https://www.airbnb.com/rooms/7918,20180800000000.0,8/6/18,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,Room rental-sunny view room/sink/Wi Fi (inner ...,Nice and good public transportation. 7 minute...,none,"Shopping old town, restaurants, McDonald, Whol...",...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16
3,8142,https://www.airbnb.com/rooms/8142,20180800000000.0,8/6/18,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,Room rental Sunny view Rm/Wi-Fi/TV/sink/large ...,Nice and good public transportation. 7 minute...,none,,...,t,,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15
4,8339,https://www.airbnb.com/rooms/8339,20180800000000.0,8/6/18,Historic Alamo Square Victorian,Pls email before booking. Interior featured i...,Please send us a quick message before booking ...,Pls email before booking. Interior featured i...,none,,...,t,STR-0000264,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24


In [5]:
df_listings.isnull().sum()

id                                     0
listing_url                            0
scrape_id                              0
last_scraped                           0
name                                   0
summary                              228
space                               1041
description                           24
experiences_offered                    0
neighborhood_overview               1917
notes                               2587
transit                             1851
access                              2057
interaction                         2182
house_rules                         1778
thumbnail_url                       6633
medium_url                          6633
picture_url                            0
xl_picture_url                      6633
host_id                                0
host_url                               0
host_name                              0
host_since                             0
host_location                          7
host_about      

In [6]:
df_listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [7]:
# Noted >90% null value in 'experience_offered', 'host_acceptance_rate', 'neighbourhood_group_cleansed'
# removed 'neighbourhood',  kept'neighbourhood_cleansed',
# both city and 'smart location' provides city level information, and 'smart_location can be dropped
# country code and country can be dropped, as all data located within US
columns_to_drop =['listing_url', 'scrape_id', 'last_scraped', 'experiences_offered', 
                  'thumbnail_url', 'medium_url', 'xl_picture_url', 'host_acceptance_rate','street',
                  'neighbourhood', 
       'neighbourhood_group_cleansed','zipcode','smart_location','country_code', 
                  'country', 'square_feet','weekly_price', 'monthly_price','calendar_updated', 
                  'calendar_last_scraped', 'requires_license', 'license']

In [8]:
# 'host_since', 'host_location',

host_related_info_to_drop = ['host_id', 'host_url', 'host_name', 'host_since',
                             'host_location', 'host_about', 'host_thumbnail_url',
                             'host_picture_url', 'host_neighbourhood']

In [9]:
# The information is useful for now, but can be dropped during modeling

extra_info_to_drop = []

In [10]:
df_smaller_listings = df_listings.drop (columns_to_drop, axis=1)

In [11]:
df_smaller_listings

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,...,review_scores_location,review_scores_value,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,958,"Bright, Modern Garden Unit - 1BR/1B",Our bright garden unit overlooks a grassy back...,"Newly remodeled, modern, and bright garden uni...",Our bright garden unit overlooks a grassy back...,*Quiet cul de sac in friendly neighborhood *St...,Due to the fact that we have children and a do...,*Public Transportation is 1/2 block away. *Ce...,*Full access to patio and backyard (shared wit...,A family of 4 lives upstairs with their dog. N...,...,10.0,10.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38
1,5858,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,I love how our neighborhood feels quiet but is...,All the furniture in the house was handmade so...,The train is two blocks away and you can stop ...,"Our deck, garden, gourmet kitchen and extensiv...",,...,10.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99
2,7918,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,Room rental-sunny view room/sink/Wi Fi (inner ...,Nice and good public transportation. 7 minute...,"Shopping old town, restaurants, McDonald, Whol...",Please email your picture id with print name (...,N Juda Muni and bus stop. Street parking.,,,...,9.0,8.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16
3,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,Room rental Sunny view Rm/Wi-Fi/TV/sink/large ...,Nice and good public transportation. 7 minute...,,Please email your picture id with print name (...,"N Juda Muni, Bus and UCSF Shuttle. small shopp...",,,...,9.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15
4,8339,Historic Alamo Square Victorian,Pls email before booking. Interior featured i...,Please send us a quick message before booking ...,Pls email before booking. Interior featured i...,,tax ID on file tax ID on file,,Guests have access to everything listed and sh...,,...,10.0,10.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24
5,8567,Lovely 2BR flat Great Location,,"Fully furnished 2BR, 1BA flat in beautiful Vic...","Fully furnished 2BR, 1BA flat in beautiful Vic...","The neighborhood is very centrally located, cl...",,We're 2 blocks from several bus lines that can...,,"I am generally here to welcome you, and if not...",...,9.0,9.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,0.27
6,8739,"Mission Sunshine, with Private Bath","Welcome to San Francisco's Mission District, t...","Your sunny room has a queen size bed, and look...","Welcome to San Francisco's Mission District, t...",Located between Valencia Street and Dolores Pa...,"We live in a dense, urban neighborhood, and ou...",It's an easy and lovely 10 minute stroll to th...,Guests have their own bedroom and private bath...,"We are experienced Airbnb hosts, and have had ...",...,10.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,2,5.41
7,9225,Artful Potrero Separate Floor with Garden,A Unique Guest Suite! A Spacious Art Filled ...,"It is unique, very spacious [800 sq. ft. with...",A Unique Guest Suite! A Spacious Art Filled ...,"It is a distinct neighborhood, with easy acces...",,There are buses within a block to connect to a...,"Laundry privileges, Spacious Garden with City ...",We are available to make suggestions to enhanc...,...,10.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,3.84
8,10251,Victorian Suite in Inner Mission,,Please read this before you book! Second floor...,Please read this before you book! Second floor...,"Neighborhood is safe, sunny, lively and fun. ...",San Francisco Office of Short-Term Rentals Bus...,"SF Muni line 12, bus stop is a block away from...",You are welcome to relax and enjoy our backyar...,Feel free to knock on the downstairs door if y...,...,9.0,9.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,2.69
9,10578,Classic Nob Hill Studio - Roof Deck,A cute studio with nice street views and lots ...,"Features: Elevator, Wood floors, TV, DVD Play...",A cute studio with nice street views and lots ...,"Very centrally located. Fishermans Wharf, Chi...",,"The California 1 bus, which runs from downtown...",Shared access to the roof deck. The studio is ...,"Usually zero, as the place is set up with door...",...,10.0,10.0,"{""SAN FRANCISCO""}",f,f,moderate,f,t,1,0.22


In [12]:
# Not used at this moement

In [13]:
df_reviews = pd.read_csv(reviews_file)

print (df_reviews.shape)
df_reviews.head()

(278884, 6)


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,958,5977,2009-07-23,15695,Edmund C,"Our experience was, without a doubt, a five st..."
1,958,6660,2009-08-03,26145,Simon,Returning to San Francisco is a rejuvenating t...
2,958,11519,2009-09-27,25839,Denis,We were very pleased with the accommodations a...
3,958,16282,2009-11-05,33750,Anna,We highly recommend this accomodation and agre...
4,958,26008,2010-02-13,15416,Venetia,Holly's place was great. It was exactly what I...


In [14]:
df_calendar =pd.read_csv(calendar_file)

print (df_calendar.shape)
df_calendar.head()

(2420680, 4)


Unnamed: 0,listing_id,date,available,price
0,958,2019-08-05,f,
1,958,2019-08-04,f,
2,958,2019-08-03,f,
3,958,2019-08-02,f,
4,958,2019-08-01,f,


In [15]:
df_smaller_listings.columns

Index(['id', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
       'house_rules', 'picture_url', 'host_id', 'host_url', 'host_name',
       'host_since', 'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'market', 'latitude',
       'longitude', 'is_location_exact', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
       'amenities', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
       'has_availability', 'availability_30', 'availability_60',
       'availability_90', 'availability_36

In [16]:
df_smaller_listings.head()

Unnamed: 0,id,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,...,review_scores_location,review_scores_value,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,958,"Bright, Modern Garden Unit - 1BR/1B",Our bright garden unit overlooks a grassy back...,"Newly remodeled, modern, and bright garden uni...",Our bright garden unit overlooks a grassy back...,*Quiet cul de sac in friendly neighborhood *St...,Due to the fact that we have children and a do...,*Public Transportation is 1/2 block away. *Ce...,*Full access to patio and backyard (shared wit...,A family of 4 lives upstairs with their dog. N...,...,10.0,10.0,"{""SAN FRANCISCO""}",f,f,moderate,f,f,1,1.38
1,5858,Creative Sanctuary,,We live in a large Victorian house on a quiet ...,We live in a large Victorian house on a quiet ...,I love how our neighborhood feels quiet but is...,All the furniture in the house was handmade so...,The train is two blocks away and you can stop ...,"Our deck, garden, gourmet kitchen and extensiv...",,...,10.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,1,0.99
2,7918,A Friendly Room - UCSF/USF - San Francisco,Nice and good public transportation. 7 minute...,Room rental-sunny view room/sink/Wi Fi (inner ...,Nice and good public transportation. 7 minute...,"Shopping old town, restaurants, McDonald, Whol...",Please email your picture id with print name (...,N Juda Muni and bus stop. Street parking.,,,...,9.0,8.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.16
3,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,Nice and good public transportation. 7 minute...,Room rental Sunny view Rm/Wi-Fi/TV/sink/large ...,Nice and good public transportation. 7 minute...,,Please email your picture id with print name (...,"N Juda Muni, Bus and UCSF Shuttle. small shopp...",,,...,9.0,9.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,f,f,9,0.15
4,8339,Historic Alamo Square Victorian,Pls email before booking. Interior featured i...,Please send us a quick message before booking ...,Pls email before booking. Interior featured i...,,tax ID on file tax ID on file,,Guests have access to everything listed and sh...,,...,10.0,10.0,"{""SAN FRANCISCO""}",f,f,strict_14_with_grace_period,t,t,2,0.24


In [17]:
drop_for_simple_model=['id', 'name', 'summary', 'space', 'description', 
                       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
                       'house_rules', 'picture_url', 'host_id', 'host_url', 'host_name','host_about',
                      'host_thumbnail_url', 'host_picture_url', 'latitude', 'longitude',]

In [18]:
df_simple_model = df_smaller_listings.drop(drop_for_simple_model, axis =1)

In [19]:
df_simple_model.columns

Index(['host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'market',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'jurisdiction_names', 'ins

In [20]:
df_simple_model['host_since']   = pd.to_datetime(df_smaller_listings['host_since'])
df_simple_model['first_review'] = pd.to_datetime(df_smaller_listings['first_review'])
df_simple_model['last_review']  = pd.to_datetime(df_smaller_listings['last_review'])

In [21]:
def duration(starting_time):
    """
    Calculate duration or number of years between current year and input years
    """
    starting_year = starting_time.year
    current_year = datetime.now().year
    duration = current_year - starting_year + 1
    
    return duration

In [22]:
df_simple_model['host_duration'] = df_simple_model['host_since'].apply(lambda x: duration(x))
df_simple_model['yrs_since_first_review'] = df_simple_model['first_review'].apply(lambda x: duration(x))
df_simple_model['yrs_since_last_review'] = df_simple_model['last_review'].apply(lambda x: duration(x))

In [23]:
time_stamps =['host_since', 'first_review', 'last_review']
df_simple_model.drop(time_stamps, axis=1, inplace=True)

In [24]:
# Replace special characters in columns

df_simple_model.host_verifications = df_simple_model.host_verifications.str.replace('[^\w\s]', '')
df_simple_model.jurisdiction_names = df_simple_model.jurisdiction_names.str.replace('[^\w\s]', '')
df_simple_model.amenities = df_simple_model.amenities.str.replace('[^\w\s]', '')
df_simple_model.security_deposit = df_simple_model.security_deposit.str.replace('[^\w\s]', '')
df_simple_model.cleaning_fee = df_simple_model.cleaning_fee.str.replace('[^\w\s]', '')
df_simple_model.price = df_simple_model.price.str.replace('[^\w\s]', '')

In [25]:
df_simple_model.shape

(6633, 54)

In [26]:
df_simple_model.columns

Index(['host_location', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'market',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'jurisdiction_names',
       'instant_bookable', 'is_business_travel_ready', '

In [27]:
df_simple_model.isnull().sum()

host_location                          7
host_response_time                   575
host_response_rate                   575
host_is_superhost                      0
host_neighbourhood                   374
host_listings_count                    0
host_total_listings_count              0
host_verifications                     0
host_has_profile_pic                   0
host_identity_verified                 0
neighbourhood_cleansed                 0
city                                  12
state                                  2
market                                20
is_location_exact                      0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                             26
bedrooms                               1
beds                                   5
bed_type                               0
amenities                              0
price                                  0
security_deposit

In [28]:
# Drop null values for a few columns

null_values_to_drop = ['host_location','host_response_rate','host_neighbourhood', 
                       'state', 'market', 'bathrooms', 'bedrooms','review_scores_rating', 
                       'review_scores_value', 'jurisdiction_names']
df_simple_model_cleaned = df_simple_model.dropna(subset=null_values_to_drop)

In [29]:
df_simple_model_cleaned.shape

(4685, 54)

In [30]:
df_simple_model_cleaned.isnull().sum()

host_location                         0
host_response_time                    0
host_response_rate                    0
host_is_superhost                     0
host_neighbourhood                    0
host_listings_count                   0
host_total_listings_count             0
host_verifications                    0
host_has_profile_pic                  0
host_identity_verified                0
neighbourhood_cleansed                0
city                                  0
state                                 0
market                                0
is_location_exact                     0
property_type                         0
room_type                             0
accommodates                          0
bathrooms                             0
bedrooms                              0
beds                                  0
bed_type                              0
amenities                             0
price                                 0
security_deposit                    885


In [31]:
# Convert column to numeric value. Prepare for filling null values with average

df_simple_model_cleaned['security_deposit'] = df_simple_model_cleaned['security_deposit']  \
                                              .apply(pd.to_numeric, errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
df_simple_model_cleaned.security_deposit

0        10000.0
1            NaN
2        20000.0
3        20000.0
4            0.0
5            0.0
6            0.0
7            NaN
8        50000.0
9        50000.0
10      300000.0
11      250000.0
12      300000.0
13      250000.0
14       20000.0
15       20000.0
16           NaN
17       89900.0
18           NaN
19      500000.0
20           NaN
21       13000.0
22       33300.0
23       50000.0
24       20000.0
25       50000.0
26       50000.0
27       20000.0
28       50000.0
29       25000.0
          ...   
6372         NaN
6373     50000.0
6376         0.0
6377         0.0
6381     30000.0
6382     30000.0
6406         0.0
6407         0.0
6422         NaN
6425         0.0
6426     30000.0
6448         0.0
6449     40000.0
6453         0.0
6454         0.0
6455     35000.0
6464     80000.0
6466     50000.0
6468     20000.0
6469     30000.0
6484     25000.0
6485         0.0
6488     10000.0
6491     40000.0
6495         0.0
6507         NaN
6540     25000.0
6542         N

In [33]:
# Filled null values with average value

df_simple_model_cleaned['security_deposit'].fillna(
    value=df_simple_model_cleaned['security_deposit'].mean(),
    inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [34]:
df_simple_model_cleaned['security_deposit'] = (df_simple_model_cleaned['security_deposit']>0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
df_simple_model_cleaned['price'] = df_simple_model_cleaned['price']  \
                                   .apply(pd.to_numeric, errors='coerce')

In [35]:
df_simple_model_cleaned.security_deposit

0       1
1       1
2       1
3       1
4       0
5       0
6       0
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
6372    1
6373    1
6376    0
6377    0
6381    1
6382    1
6406    0
6407    0
6422    1
6425    0
6426    1
6448    0
6449    1
6453    0
6454    0
6455    1
6464    1
6466    1
6468    1
6469    1
6484    1
6485    0
6488    1
6491    1
6495    0
6507    1
6540    1
6542    1
6564    1
6615    0
Name: security_deposit, Length: 4685, dtype: int64

In [36]:
# Convert column to numeric value. Prepare for filling null values with average

df_simple_model_cleaned['cleaning_fee'] = df_simple_model_cleaned['cleaning_fee']  \
                                              .apply(pd.to_numeric, errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
# Filled null values with average value

df_simple_model_cleaned['cleaning_fee'].fillna(
    value=df_simple_model_cleaned['cleaning_fee'].mean(),
    inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [38]:
df_simple_model_cleaned['cleaning_fee'] = (df_simple_model_cleaned['cleaning_fee']>0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
df_simple_model_cleaned['cleaning_fee']

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
       ..
6372    1
6373    1
6376    1
6377    1
6381    1
6382    1
6406    1
6407    1
6422    1
6425    1
6426    1
6448    1
6449    1
6453    1
6454    1
6455    1
6464    1
6466    1
6468    1
6469    1
6484    1
6485    1
6488    1
6491    1
6495    1
6507    1
6540    1
6542    1
6564    1
6615    1
Name: cleaning_fee, Length: 4685, dtype: int64

In [40]:
df_simple_model_cleaned.isnull().sum()

host_location                       0
host_response_time                  0
host_response_rate                  0
host_is_superhost                   0
host_neighbourhood                  0
host_listings_count                 0
host_total_listings_count           0
host_verifications                  0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
city                                0
state                               0
market                              0
is_location_exact                   0
property_type                       0
room_type                           0
accommodates                        0
bathrooms                           0
bedrooms                            0
beds                                0
bed_type                            0
amenities                           0
price                               0
security_deposit                    0
cleaning_fee                        0
guests_inclu

In [41]:
df_simple_model_cleaned.head()

Unnamed: 0,host_location,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,host_duration,yrs_since_first_review,yrs_since_last_review
0,"San Francisco, California, United States",within an hour,92%,t,Duboce Triangle,1,1,email phone facebook reviews kba,t,t,...,f,f,moderate,f,f,1,1.38,11,10.0,1.0
1,"San Francisco, California, United States",within an hour,100%,f,Bernal Heights,2,2,email phone reviews kba work_email,t,t,...,f,f,strict_14_with_grace_period,f,f,1,0.99,10,10.0,2.0
2,"San Francisco, California, United States",within a few hours,100%,f,Cole Valley,10,10,email phone reviews jumio government_id,t,t,...,f,f,strict_14_with_grace_period,f,f,9,0.16,10,10.0,3.0
3,"San Francisco, California, United States",within a few hours,100%,f,Cole Valley,10,10,email phone reviews jumio government_id,t,t,...,f,f,strict_14_with_grace_period,f,f,9,0.15,10,5.0,2.0
4,"San Francisco, California, United States",within an hour,100%,f,Alamo Square,2,2,email phone reviews kba,t,t,...,f,f,strict_14_with_grace_period,t,t,2,0.24,10,10.0,2.0


In [42]:
df_simple_model_cleaned.columns

Index(['host_location', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'market',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'jurisdiction_names',
       'instant_bookable', 'is_business_travel_ready', '

In [57]:
################## drop for simplicity purpose

df_test_model = df_simple_model_cleaned.drop(['amenities', 'host_verifications'], axis=1)

In [58]:
# 'host_verifications','amenities',

categorical_data = ['host_location', 'host_response_time', 
       'host_is_superhost', 'host_neighbourhood', 
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'city', 'state', 'market',
       'is_location_exact', 'property_type', 'room_type',  'bed_type', 'has_availability',
        'jurisdiction_names',
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       ]

In [59]:
df_test_model = pd.get_dummies(df_test_model, prefix = categorical_data, columns = categorical_data )

In [60]:
df_test_model.columns

Index(['host_response_rate', 'host_listings_count',
       'host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee',
       ...
       'cancellation_policy_flexible', 'cancellation_policy_moderate',
       'cancellation_policy_strict',
       'cancellation_policy_strict_14_with_grace_period',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',
       'require_guest_profile_picture_f', 'require_guest_profile_picture_t',
       'require_guest_phone_verification_f',
       'require_guest_phone_verification_t'],
      dtype='object', length=346)

In [62]:
df_test_model.price = df_test_model.price.str.replace('[^\w\s]', '')

In [63]:
df_test_model['price'] = df_test_model['price']  \
                                   .apply(pd.to_numeric, errors='coerce')

In [64]:
df_test_model.price

0       17000
1       23500
2        6500
3        6500
4       67500
5       25500
6       13900
7       13500
8       26500
9       12000
10      21800
11      17700
12      19400
13      13900
14       8500
15       8500
16       7900
17      13500
18      21500
19      45000
20      10700
21      11000
22      19800
23      15000
24       6500
25      22500
26       9500
27      15500
28      17800
29      19900
        ...  
6372     7000
6373    27500
6376    20000
6377    16600
6381     9900
6382     8900
6406    10800
6407    18000
6422    18500
6425    33400
6426    17500
6448     8900
6449    15000
6453     9000
6454     9000
6455    49900
6464    49900
6466    25000
6468    15000
6469    16000
6484    15000
6485    12900
6488    21700
6491    30000
6495    45000
6507     9500
6540    25600
6542     4500
6564    17900
6615     7400
Name: price, Length: 4685, dtype: int64

In [70]:
X

Unnamed: 0,host_response_rate,host_listings_count,host_total_listings_count,accommodates,bathrooms,bedrooms,beds,security_deposit,cleaning_fee,guests_included,...,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,require_guest_profile_picture_f,require_guest_profile_picture_t,require_guest_phone_verification_f,require_guest_phone_verification_t
0,92%,1,1,3,1.0,1.0,2.0,1,1,2,...,0,1,0,0,0,0,1,0,1,0
1,100%,2,2,5,1.0,2.0,3.0,1,1,2,...,0,0,0,1,0,0,1,0,1,0
2,100%,10,10,2,4.0,1.0,1.0,1,1,1,...,0,0,0,1,0,0,1,0,1,0
3,100%,10,10,2,4.0,1.0,1.0,1,1,1,...,0,0,0,1,0,0,1,0,1,0
4,100%,2,2,5,1.5,2.0,2.0,0,1,2,...,0,0,0,1,0,0,0,1,0,1
5,80%,1,1,6,1.0,2.0,3.0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
6,100%,2,2,3,1.0,1.0,2.0,0,1,2,...,0,0,0,1,0,0,1,0,1,0
7,100%,1,1,2,1.0,1.0,1.0,1,1,1,...,0,0,0,1,0,0,1,0,1,0
8,100%,1,1,6,1.0,2.0,3.0,1,1,4,...,0,1,0,0,0,0,1,0,1,0
9,100%,1,1,2,1.0,0.0,1.0,1,1,2,...,0,1,0,0,0,0,1,0,0,1


In [107]:
X = df_test_model.drop(['price', 'host_response_rate', 'extra_people'], axis=1)
y = df_test_model['price'][0]

In [105]:
y

0       17000
1       23500
2        6500
3        6500
4       67500
5       25500
6       13900
7       13500
8       26500
9       12000
10      21800
11      17700
12      19400
13      13900
14       8500
15       8500
16       7900
17      13500
18      21500
19      45000
20      10700
21      11000
22      19800
23      15000
24       6500
25      22500
26       9500
27      15500
28      17800
29      19900
        ...  
6372     7000
6373    27500
6376    20000
6377    16600
6381     9900
6382     8900
6406    10800
6407    18000
6422    18500
6425    33400
6426    17500
6448     8900
6449    15000
6453     9000
6454     9000
6455    49900
6464    49900
6466    25000
6468    15000
6469    16000
6484    15000
6485    12900
6488    21700
6491    30000
6495    45000
6507     9500
6540    25600
6542     4500
6564    17900
6615     7400
Name: price, Length: 4685, dtype: int64

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state=42)

In [92]:
X.dtypes

host_listings_count                                   int64
host_total_listings_count                             int64
accommodates                                          int64
bathrooms                                           float64
bedrooms                                            float64
beds                                                float64
security_deposit                                      int64
cleaning_fee                                          int64
guests_included                                       int64
minimum_nights                                        int64
maximum_nights                                        int64
availability_30                                       int64
availability_60                                       int64
availability_90                                       int64
availability_365                                      int64
number_of_reviews                                     int64
review_scores_rating                    

In [93]:
ss = StandardScaler()
X_normalized = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [97]:
lr = linear_model.LinearRegression()
lr.fit(X_normalized, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [104]:
lr.score(X_test_trans, y_test)

-1.2395302867625127e+24

In [98]:
y_predict = lr.predict(X_test_trans)

In [99]:
r2_score(y_predict, y_test)

-0.005466636072483766

In [101]:
mean_squared_error(y_predict, y_test)

4.948787169398556e+32