In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [2]:
df = pd.read_csv('listings 2020 full.csv')
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2318,https://www.airbnb.com/rooms/2318,20200300000000.0,17/03/2020,Casa Madrona - Urban Oasis 1 block from the park!,"Gorgeous, architect remodeled, Dutch Colonial ...","This beautiful, gracious home has been complet...","Gorgeous, architect remodeled, Dutch Colonial ...",none,Madrona is a hidden gem of a neighborhood. It ...,...,t,f,strict_14_with_grace_period,f,f,2,2,0,0,0.23
1,6606,https://www.airbnb.com/rooms/6606,20200300000000.0,18/03/2020,"Fab, private seattle urban cottage!","This tiny cottage is only 15x10, but it has ev...","Soo centrally located, this is a little house ...","This tiny cottage is only 15x10, but it has ev...",none,"A peaceful yet highly accessible neighborhood,...",...,f,f,strict_14_with_grace_period,f,f,3,3,0,0,1.15
2,9419,https://www.airbnb.com/rooms/9419,20200300000000.0,18/03/2020,Glorious sun room w/ memory foambed,This beautiful double room features a magical ...,Our new Sunny space has a private room from th...,This beautiful double room features a magical ...,none,"Lots of restaurants (see our guide book) bars,...",...,f,f,moderate,t,t,8,0,8,0,1.26
3,9460,https://www.airbnb.com/rooms/9460,20200300000000.0,18/03/2020,Downtown Convention Center B&B -- Free Minibar,Take up a glass of wine and unwind on one of t...,Greetings from Seattle. Thanks for considering...,Take up a glass of wine and unwind on one of t...,none,This is where Downtown meets First Hill and hi...,...,t,f,moderate,f,f,4,3,1,0,3.63
4,9531,https://www.airbnb.com/rooms/9531,20200300000000.0,18/03/2020,The Adorable Sweet Orange Craftsman,The Sweet Orange is a delightful and spacious ...,"The Sweet Orange invites you to stay and play,...",The Sweet Orange is a delightful and spacious ...,none,The neighborhood is awesome! Just far enough ...,...,f,f,strict_14_with_grace_period,f,t,2,2,0,0,0.4


Extract the desired variables to be utilised (some may be dropped later; not all will be in the model)

In [3]:
df=df[['price','amenities','accommodates','security_deposit','cleaning_fee','availability_365',
       'calculated_host_listings_count','reviews_per_month','minimum_nights',
       'host_is_superhost','host_since','last_review','first_review',
       "host_verifications", 'last_scraped', 'calendar_updated','number_of_reviews','host_response_time','space',
       'neighborhood_overview','notes','host_about']]



In [4]:
df.shape

(7505, 22)

In [5]:
df.dtypes

price                              object
amenities                          object
accommodates                        int64
security_deposit                   object
cleaning_fee                       object
availability_365                    int64
calculated_host_listings_count      int64
reviews_per_month                 float64
minimum_nights                      int64
host_is_superhost                  object
host_since                         object
last_review                        object
first_review                       object
host_verifications                 object
last_scraped                       object
calendar_updated                   object
number_of_reviews                   int64
host_response_time                 object
space                              object
neighborhood_overview              object
notes                              object
host_about                         object
dtype: object

In [6]:
df.isna().sum()/len(df)*100

price                              0.000000
amenities                          0.000000
accommodates                       0.000000
security_deposit                  15.389740
cleaning_fee                       6.795470
availability_365                   0.000000
calculated_host_listings_count     0.000000
reviews_per_month                 13.111259
minimum_nights                     0.000000
host_is_superhost                  0.000000
host_since                         0.000000
last_review                       13.111259
first_review                      13.111259
host_verifications                 0.000000
last_scraped                       0.000000
calendar_updated                   0.000000
number_of_reviews                  0.000000
host_response_time                20.013324
space                             17.175217
neighborhood_overview             27.315123
notes                             38.441039
host_about                        24.956696
dtype: float64

For security_deposit and cleaning_fee, I'll assume that null value means there is no such fee for that listing. Thus I'll fill with 0. 

In [7]:
df['security_deposit'] = df['security_deposit'].fillna('$0.0')
df['cleaning_fee'] = df['cleaning_fee'].fillna('$0.0')

In [8]:
df.isna().sum()/len(df)*100

price                              0.000000
amenities                          0.000000
accommodates                       0.000000
security_deposit                   0.000000
cleaning_fee                       0.000000
availability_365                   0.000000
calculated_host_listings_count     0.000000
reviews_per_month                 13.111259
minimum_nights                     0.000000
host_is_superhost                  0.000000
host_since                         0.000000
last_review                       13.111259
first_review                      13.111259
host_verifications                 0.000000
last_scraped                       0.000000
calendar_updated                   0.000000
number_of_reviews                  0.000000
host_response_time                20.013324
space                             17.175217
neighborhood_overview             27.315123
notes                             38.441039
host_about                        24.956696
dtype: float64

Before I drop NaN values for the other columns, I'll first obtain character count for each of the textual variables 

In [9]:
df['space'] = df['space'].astype(str)

In [10]:
for i in df["space"].index:
    if (df.at[i,"space"] == 'nan'):
        df.at[i,"space_char_count"] = 0
    else:
        df.at[i,"space_char_count"] = len(df.at[i,"space"])

In [11]:
df['space_char_count'].head()

0    1000.0
1     999.0
2    1000.0
3    1000.0
4    1000.0
Name: space_char_count, dtype: float64

In [13]:
df['neighborhood_overview'] = df['neighborhood_overview'].astype(str)

In [14]:
for i in df["neighborhood_overview"].index:
    if (df.at[i,"neighborhood_overview"] == 'nan'):
        df.at[i,"neighborhood_overview_char_count"] = 0
    else:
        df.at[i,"neighborhood_overview_char_count"] = len(df.at[i,"neighborhood_overview"])

In [15]:
df['neighborhood_overview_char_count'].head()

0    404.0
1    180.0
2    669.0
3    308.0
4    248.0
Name: neighborhood_overview_char_count, dtype: float64

In [16]:
df['notes'] = df['notes'].astype(str)

In [17]:
for i in df["notes"].index:
    if (df.at[i,"notes"] == 'nan'):
        df.at[i,"notes_char_count"] = 0
    else:
        df.at[i,"notes_char_count"] = len(df.at[i,"notes"])

In [18]:
df['notes_char_count'].head()

0     200.0
1     382.0
2     205.0
3    1000.0
4     193.0
Name: notes_char_count, dtype: float64

In [19]:
df['host_about'] = df['host_about'].astype(str)

In [20]:
for i in df["host_about"].index:
    if (df.at[i,"host_about"] == 'nan'):
        df.at[i,"host_about_char_count"] = 0
    else:
        df.at[i,"host_about_char_count"] = len(df.at[i,"host_about"])

In [21]:
df['host_about_char_count'].head()

0      59.0
1     192.0
2    2633.0
3     281.0
4     805.0
Name: host_about_char_count, dtype: float64

Now we can drop the textual columns

In [22]:
df = df.drop(['space', 'neighborhood_overview', 'notes', 'host_about'], axis = 1)

Next, drop the remaining rows containing NaN values

In [23]:
df.isna().sum()/len(df)*100

price                                0.000000
amenities                            0.000000
accommodates                         0.000000
security_deposit                     0.000000
cleaning_fee                         0.000000
availability_365                     0.000000
calculated_host_listings_count       0.000000
reviews_per_month                   13.111259
minimum_nights                       0.000000
host_is_superhost                    0.000000
host_since                           0.000000
last_review                         13.111259
first_review                        13.111259
host_verifications                   0.000000
last_scraped                         0.000000
calendar_updated                     0.000000
number_of_reviews                    0.000000
host_response_time                  20.013324
space_char_count                     0.000000
neighborhood_overview_char_count     0.000000
notes_char_count                     0.000000
host_about_char_count             

In [24]:
df = df.dropna()

Final check for Null values

In [25]:
df.isnull().any()

price                               False
amenities                           False
accommodates                        False
security_deposit                    False
cleaning_fee                        False
availability_365                    False
calculated_host_listings_count      False
reviews_per_month                   False
minimum_nights                      False
host_is_superhost                   False
host_since                          False
last_review                         False
first_review                        False
host_verifications                  False
last_scraped                        False
calendar_updated                    False
number_of_reviews                   False
host_response_time                  False
space_char_count                    False
neighborhood_overview_char_count    False
notes_char_count                    False
host_about_char_count               False
dtype: bool

Next up is data processing

In [26]:
df.dtypes

price                                object
amenities                            object
accommodates                          int64
security_deposit                     object
cleaning_fee                         object
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                    object
host_since                           object
last_review                          object
first_review                         object
host_verifications                   object
last_scraped                         object
calendar_updated                     object
number_of_reviews                     int64
host_response_time                   object
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
dtype: object

In [28]:
df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)

Create a new column, 'price_per_person' obtained by 'price' / 'accommodates'

In [29]:
df['price_per_person'] = df['price'] / df['accommodates']

Drop 'accommodates' as we won't need it anymore

In [30]:
df = df.drop(['accommodates'], axis = 1)

In [31]:
df.dtypes

price                               float64
amenities                            object
security_deposit                     object
cleaning_fee                         object
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                    object
host_since                           object
last_review                          object
first_review                         object
host_verifications                   object
last_scraped                         object
calendar_updated                     object
number_of_reviews                     int64
host_response_time                   object
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
dtype: object

In [32]:
df['security_deposit'] = df['security_deposit'].str.replace('$', '').str.replace(',', '').astype(float)

In [33]:
df['cleaning_fee'] = df['cleaning_fee'].str.replace('$', '').str.replace(',', '').astype(float)

In [34]:
df.dtypes

price                               float64
amenities                            object
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                    object
host_since                           object
last_review                          object
first_review                         object
host_verifications                   object
last_scraped                         object
calendar_updated                     object
number_of_reviews                     int64
host_response_time                   object
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
dtype: object

For the Boolean variables, assign True to 1 and False to 0 for usage in models later

In [35]:
dict_2 = {'t' : 1, 'f' : 0}

In [36]:
df['host_is_superhost'] = df['host_is_superhost'].map(dict_2)

In [37]:
df.dtypes

price                               float64
amenities                            object
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                     int64
host_since                           object
last_review                          object
first_review                         object
host_verifications                   object
last_scraped                         object
calendar_updated                     object
number_of_reviews                     int64
host_response_time                   object
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
dtype: object

In [38]:
import datetime

#change the dates into a recognisable date format

df['host_since']=pd.to_datetime(df['host_since'])
df['first_review']=pd.to_datetime(df['first_review'])
df['last_review']=pd.to_datetime(df['last_review'])
df['last_scraped']=pd.to_datetime(df['last_scraped'])

Create a column 'listing_duration_days' as an indicator of how long the listing has been listed

In [39]:
df['listing_duration_days'] = df['last_review'] - df['first_review']
df['listing_duration_days'].head()

0   4126 days
1   3725 days
2   3437 days
3   3959 days
4   2585 days
Name: listing_duration_days, dtype: timedelta64[ns]

Convert to int type

In [40]:
df["listing_duration_days"] = df["listing_duration_days"].dt.days
df['listing_duration_days'].head()

0    4126
1    3725
2    3437
3    3959
4    2585
Name: listing_duration_days, dtype: int64

Create a column 'hosting_duration_days' as a relative indicator of how long the host has been hosting (experience)

In [41]:
df = df.assign(hosting_duration_days = df['last_review'] - df['host_since'])

In [42]:
df['hosting_duration_days'].head()

0   4146 days
1   3807 days
2   3762 days
3   3921 days
4   3791 days
Name: hosting_duration_days, dtype: timedelta64[ns]

In [43]:
df["hosting_duration_days"] = df["hosting_duration_days"].dt.days
df['hosting_duration_days'].head()

0    4146
1    3807
2    3762
3    3921
4    3791
Name: hosting_duration_days, dtype: int64

In [44]:
df.dtypes

price                                      float64
amenities                                   object
security_deposit                           float64
cleaning_fee                               float64
availability_365                             int64
calculated_host_listings_count               int64
reviews_per_month                          float64
minimum_nights                               int64
host_is_superhost                            int64
host_since                          datetime64[ns]
last_review                         datetime64[ns]
first_review                        datetime64[ns]
host_verifications                          object
last_scraped                        datetime64[ns]
calendar_updated                            object
number_of_reviews                            int64
host_response_time                          object
space_char_count                           float64
neighborhood_overview_char_count           float64
notes_char_count               

For 'calendar_updated', it is to be converted into a useable form

In [45]:
df["calendar_updated"] = df["calendar_updated"].str.replace('a ', '1 ')

In [46]:
from dateutil.relativedelta import relativedelta

In [47]:
def get_past_date(str_days_ago):
    day_scraped = df.at[0,"last_scraped"]
    splitted = str_days_ago.split()
    if len(splitted) == 1 and splitted[0].lower() == 'today':
        return str(day_scraped.isoformat())
    elif len(splitted) == 1 and splitted[0].lower() == 'yesterday':
        date = day_scraped - relativedelta(days=1)
        return str(date.isoformat())
    elif len(splitted) == 1 and splitted[0].lower() == 'never':
        return "never"
    elif splitted[1].lower() in ['day', 'days', 'd']:
        date = day_scraped - relativedelta(days=int(splitted[0]))
        return str(date.isoformat())
    elif splitted[1].lower() in ['wk', 'wks', 'week', 'weeks', 'w']:
        date = day_scraped - relativedelta(weeks=int(splitted[0]))
        return str(date.isoformat())
    elif splitted[1].lower() in ['mon', 'mons', 'month', 'months', 'm']:
        date = day_scraped - relativedelta(months=int(splitted[0]))
        return str(date.isoformat())
    elif splitted[1].lower() in ['yrs', 'yr', 'years', 'year', 'y']:
        date = day_scraped - relativedelta(years=int(splitted[0]))
        return str(date.isoformat())
    else:
        return "Wrong Argument format"

In [48]:
for i in df["calendar_updated"].index:
    df.at[i,"calendar_updated_temp"] = get_past_date(df.at[i,"calendar_updated"])

Drop rows containing 'never'

In [49]:
df = df[df.calendar_updated_temp != 'never']

In [50]:
df["calendar_updated_temp"].value_counts()

2020-03-17T00:00:00    1179
2020-03-10T00:00:00     553
2020-03-03T00:00:00     506
2020-02-25T00:00:00     351
2020-01-17T00:00:00     315
2020-02-18T00:00:00     259
2020-03-16T00:00:00     238
2019-12-17T00:00:00     197
2020-03-12T00:00:00     179
2020-03-14T00:00:00     176
2020-03-13T00:00:00     171
2020-02-11T00:00:00     157
2020-02-04T00:00:00     156
2019-09-17T00:00:00     138
2019-11-17T00:00:00     130
2019-10-17T00:00:00     106
2020-01-28T00:00:00     105
2020-03-15T00:00:00      93
2020-03-11T00:00:00      64
2019-08-17T00:00:00      43
2019-07-17T00:00:00      32
2019-05-17T00:00:00      29
2019-06-17T00:00:00      24
2019-04-17T00:00:00      15
2019-01-17T00:00:00       6
2019-02-17T00:00:00       6
2019-03-17T00:00:00       5
2018-10-17T00:00:00       5
2018-07-17T00:00:00       5
2018-09-17T00:00:00       4
2018-12-17T00:00:00       3
2018-08-17T00:00:00       3
2018-11-17T00:00:00       2
2018-03-17T00:00:00       2
2017-05-17T00:00:00       1
2017-04-17T00:00:00 

In [51]:
df["calendar_updated_temp"]=pd.to_datetime(df["calendar_updated_temp"])

Create a new column, 'days_since_calendar_updated', to replace 'calendar_updated', where the data is in a more usable form

In [52]:
df['days_since_calendar_updated'] = df['last_scraped'] - df["calendar_updated_temp"]

In [53]:
df['days_since_calendar_updated'] = df['days_since_calendar_updated'].dt.days

In [54]:
df['days_since_calendar_updated'].head()

0    21
1    92
2    15
3     5
4     2
Name: days_since_calendar_updated, dtype: int64

In [55]:
df.dtypes

price                                      float64
amenities                                   object
security_deposit                           float64
cleaning_fee                               float64
availability_365                             int64
calculated_host_listings_count               int64
reviews_per_month                          float64
minimum_nights                               int64
host_is_superhost                            int64
host_since                          datetime64[ns]
last_review                         datetime64[ns]
first_review                        datetime64[ns]
host_verifications                          object
last_scraped                        datetime64[ns]
calendar_updated                            object
number_of_reviews                            int64
host_response_time                          object
space_char_count                           float64
neighborhood_overview_char_count           float64
notes_char_count               

Drop the unneeded columns:

In [56]:
df = df.drop(['calendar_updated_temp','calendar_updated'], axis = 1)

In [57]:
df.dtypes

price                                      float64
amenities                                   object
security_deposit                           float64
cleaning_fee                               float64
availability_365                             int64
calculated_host_listings_count               int64
reviews_per_month                          float64
minimum_nights                               int64
host_is_superhost                            int64
host_since                          datetime64[ns]
last_review                         datetime64[ns]
first_review                        datetime64[ns]
host_verifications                          object
last_scraped                        datetime64[ns]
number_of_reviews                            int64
host_response_time                          object
space_char_count                           float64
neighborhood_overview_char_count           float64
notes_char_count                           float64
host_about_char_count          

Since 'host_verifications' is a list of sorts, count the number in each 'list'

In [58]:
for i in df["host_verifications"].index:
    if len(df.at[i,"host_verifications"]) > 2 :
        count = 1
        for char in df.at[i,"host_verifications"]:
            if char == ',':
                count+=1
    else:
        count = 0
    df.at[i,"no_of_host_verifications"] = count
    count = 0

In [59]:
df["no_of_host_verifications"].value_counts()

6.0     935
5.0     802
4.0     748
7.0     636
8.0     545
3.0     470
9.0     362
2.0     326
10.0    274
11.0    111
1.0      32
12.0     23
Name: no_of_host_verifications, dtype: int64

In [60]:
df = df.drop(['host_verifications'], axis = 1)

Next, check for and remove any anomalies because **at least 100 days have to elapse before superhost can be awarded**, so I will be excluding those with new accounts as they may not be an accurate indicator about whether it meets the superhost requirement.

In [61]:
df['account_duration_days'] = df['last_scraped'] - df['host_since']

In [62]:
df["account_duration_days"] = df["account_duration_days"].dt.days

In [63]:
df['evaluation_period_elapsed'] = df['account_duration_days'].apply(lambda x: True if x >= 100 else False)

In [64]:
pd.crosstab(df["evaluation_period_elapsed"],df["host_is_superhost"])

host_is_superhost,0,1
evaluation_period_elapsed,Unnamed: 1_level_1,Unnamed: 2_level_1
False,15,0
True,2360,2889


As one can see, none of the superhosts have AirBnB account age of less than or equal to 100 days old. I will drop the rows with account age less than or equal to 100 days old:

In [65]:
df = df[df.account_duration_days >= 100]

In [66]:
pd.crosstab(df["evaluation_period_elapsed"],df["host_is_superhost"])

host_is_superhost,0,1
evaluation_period_elapsed,Unnamed: 1_level_1,Unnamed: 2_level_1
True,2360,2889


In [67]:
df.dtypes

price                                      float64
amenities                                   object
security_deposit                           float64
cleaning_fee                               float64
availability_365                             int64
calculated_host_listings_count               int64
reviews_per_month                          float64
minimum_nights                               int64
host_is_superhost                            int64
host_since                          datetime64[ns]
last_review                         datetime64[ns]
first_review                        datetime64[ns]
last_scraped                        datetime64[ns]
number_of_reviews                            int64
host_response_time                          object
space_char_count                           float64
neighborhood_overview_char_count           float64
notes_char_count                           float64
host_about_char_count                      float64
price_per_person               

In [68]:
df = df.drop(['evaluation_period_elapsed','last_scraped','first_review','last_review','host_since'], axis = 1)

In [69]:
df.dtypes

price                               float64
amenities                            object
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                     int64
number_of_reviews                     int64
host_response_time                   object
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
listing_duration_days                 int64
hosting_duration_days                 int64
days_since_calendar_updated           int64
no_of_host_verifications            float64
account_duration_days                 int64
dtype: object

In [70]:
pd.crosstab(df['host_response_time'],df["host_is_superhost"], normalize='columns')

host_is_superhost,0,1
host_response_time,Unnamed: 1_level_1,Unnamed: 2_level_1
a few days or more,0.017797,0.001731
within a day,0.064407,0.033922
within a few hours,0.124153,0.106265
within an hour,0.793644,0.858082


For 'host_response_time', similarly it will be converted correspondingly, with 1 for 'within an hour', the fastest, 2 for 'within a few hours', 3 for 'within a day' and 4 for 'a few days or more', the slowest.

In [71]:
dict_3 = {'within an hour' : 1, 'within a few hours': 2, 'within a day': 3, 'a few days or more': 4}

In [72]:
df['host_response_time'] = df['host_response_time'].map(dict_3)

In [73]:
df.dtypes

price                               float64
amenities                            object
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                     int64
number_of_reviews                     int64
host_response_time                    int64
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
listing_duration_days                 int64
hosting_duration_days                 int64
days_since_calendar_updated           int64
no_of_host_verifications            float64
account_duration_days                 int64
dtype: object

Lastly, for amenities first a count of amenities for each listing is obtained:

In [74]:
for i in df["amenities"].index:
    if len(df.at[i,"amenities"]) > 2 :
        count = 1
        for char in df.at[i,"amenities"]:
            if char == ',':
                count+=1
    else:
        count = 0
    df.at[i,"amenities_count"] = count
    count = 0

Also, some pre-calculations show that certain specific amenities may be more common in superhosts than non-superhosts. These shall be extracted as 1/0 (T/F) in a new column

In [101]:
df["Dog(s)"] = df["amenities"].map(lambda x: 1 if "Dog(s)" in x else 0)

In [102]:
pd.crosstab(df["Dog(s)"],df["host_is_superhost"], normalize = 'columns')

host_is_superhost,0,1
Dog(s),Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.885797,0.808067
1,0.114203,0.191933


In [103]:
df["Pets live on this property"] = df["amenities"].map(lambda x: 1 if "Pets live on this property" in x else 0)

In [104]:
pd.crosstab(df["Pets live on this property"],df["host_is_superhost"], normalize = 'columns')

host_is_superhost,0,1
Pets live on this property,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.793186,0.682893
1,0.206814,0.317107


In [76]:
df.dtypes

price                               float64
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                     int64
number_of_reviews                     int64
host_response_time                    int64
space_char_count                    float64
neighborhood_overview_char_count    float64
notes_char_count                    float64
host_about_char_count               float64
price_per_person                    float64
listing_duration_days                 int64
hosting_duration_days                 int64
days_since_calendar_updated           int64
no_of_host_verifications            float64
account_duration_days                 int64
amenities_count                     float64
dtype: object

In [75]:
df = df.drop(["amenities"], axis = 1)

Finally, convert some floats to int

In [77]:
df["amenities_count"] = df["amenities_count"].astype(int)

df["space_char_count"] = df["space_char_count"].astype(int)

df["neighborhood_overview_char_count"] = df["neighborhood_overview_char_count"].astype(int)

df["notes_char_count"] = df["notes_char_count"].astype(int)

df["host_about_char_count"] = df["host_about_char_count"].astype(int)

In [78]:
df["no_of_host_verifications"] = df["no_of_host_verifications"].astype(int)

In [112]:
df = df.rename(columns={"Dog(s)": "Dog(s) present"})

In [79]:
df.dtypes

price                               float64
security_deposit                    float64
cleaning_fee                        float64
availability_365                      int64
calculated_host_listings_count        int64
reviews_per_month                   float64
minimum_nights                        int64
host_is_superhost                     int64
number_of_reviews                     int64
host_response_time                    int64
space_char_count                      int64
neighborhood_overview_char_count      int64
notes_char_count                      int64
host_about_char_count                 int64
price_per_person                    float64
listing_duration_days                 int64
hosting_duration_days                 int64
days_since_calendar_updated           int64
no_of_host_verifications              int64
account_duration_days                 int64
amenities_count                       int64
dtype: object

cancellation_policy, host_identity_verified, instant_bookable, host_is_superhost, host_response_time, Dog(s) present, Pets live on this property are categorical variables but are encoded with labels.

Additional steps to further discretize the char_counts for the textual variables and sort them into bins.

In [5]:
df.dtypes

price                                float64
security_deposit                     float64
cleaning_fee                         float64
availability_365                       int64
calculated_host_listings_count         int64
reviews_per_month                    float64
minimum_nights                         int64
cancellation_policy                    int64
host_identity_verified                 int64
instant_bookable                       int64
host_is_superhost                      int64
number_of_reviews                      int64
host_response_time                     int64
space_char_count                       int64
description_char_count                 int64
neighborhood_overview_char_count       int64
notes_char_count                       int64
host_about_char_count                  int64
price_per_person                     float64
listing_duration_days                  int64
hosting_duration_days                  int64
days_since_calendar_updated            int64
no_of_host

In [80]:
df['space_char_count_grouped'] = pd.cut(df['space_char_count'], range(0, 1001, 50), precision=0, include_lowest = True)


In [82]:
df['neighborhood_overview_char_count_grouped'] = pd.cut(df['neighborhood_overview_char_count'], range(0, 1001, 50), precision=0, include_lowest = True)


In [83]:
df['notes_char_count_grouped'] = pd.cut(df['notes_char_count'], range(0, 1001, 50), precision=0, include_lowest = True)


In [84]:
df.dtypes

price                                        float64
security_deposit                             float64
cleaning_fee                                 float64
availability_365                               int64
calculated_host_listings_count                 int64
reviews_per_month                            float64
minimum_nights                                 int64
host_is_superhost                              int64
number_of_reviews                              int64
host_response_time                             int64
space_char_count                               int64
neighborhood_overview_char_count               int64
notes_char_count                               int64
host_about_char_count                          int64
price_per_person                             float64
listing_duration_days                          int64
hosting_duration_days                          int64
days_since_calendar_updated                    int64
no_of_host_verifications                      

Encode each bin with a numerical label; **1 = 0-50, 2 = 51-100, 3 = 101-150, ..., 20 = 951-1000**

In [85]:
df['space_char_count_grouped'] = pd.cut(df['space_char_count'], range(0, 1001, 50), precision=0, include_lowest = True, labels=[1, 2, 3, 4,
                                                                                              5,6,7,8,9,10,11,12,
                                                                                              13,14,15,16,17,18,
                                                                                              19,20])


In [86]:
df['notes_char_count_grouped'] = pd.cut(df['notes_char_count'], range(0, 1001, 50), precision=0, include_lowest = True, labels=[1, 2, 3, 4,
                                                                                              5,6,7,8,9,10,11,12,
                                                                                              13,14,15,16,17,18,
                                                                                              19,20])


In [87]:
df['neighborhood_overview_char_count_grouped'] = pd.cut(df['neighborhood_overview_char_count'], range(0, 1001, 50), precision=0, include_lowest = True, labels=[1, 2, 3, 4,
                                                                                              5,6,7,8,9,10,11,12,
                                                                                              13,14,15,16,17,18,
                                                                                              19,20])

In [88]:
df.to_csv ('~/Downloads/cleaned_df_2020.csv', index = False, header=True)

In [96]:
df.head()

Unnamed: 0,price,security_deposit,cleaning_fee,availability_365,calculated_host_listings_count,reviews_per_month,host_is_superhost,number_of_reviews,host_response_time,space_char_count,...,price_per_person,listing_duration_days,hosting_duration_days,days_since_calendar_updated,no_of_host_verifications,account_duration_days,amenities_count,space_char_count_grouped,neighborhood_overview_char_count_grouped,notes_char_count_grouped
0,296.0,500.0,250.0,86,2,0.23,1,32,1,1000,...,32.888889,4126,4146,21,8,4221,39,20,9,4
1,90.0,200.0,40.0,45,3,1.15,0,150,3,999,...,45.0,3725,3807,92,5,3979,21,20,4,8
2,62.0,100.0,20.0,365,8,1.26,1,148,2,1000,...,31.0,3437,3762,15,6,3844,34,20,14,5
3,79.0,0.0,45.0,10,4,3.63,1,466,1,1000,...,39.5,3959,3921,5,7,3814,48,20,7,20
4,165.0,300.0,120.0,276,2,0.4,1,40,2,1000,...,41.25,2585,3791,2,4,3870,50,20,5,4


In [95]:
df = df.drop(["minimum_nights"], axis = 1)

In [98]:
df=df[['host_is_superhost', 
         'no_of_host_verifications', 'host_response_time',
        'number_of_reviews', 'amenities_count', 'calculated_host_listings_count', 'reviews_per_month',
'price',
 'security_deposit',
 'cleaning_fee',
 'availability_365',
'listing_duration_days',
 'hosting_duration_days',
'price_per_person',
'days_since_calendar_updated',
 'account_duration_days',
 'space_char_count',
 'neighborhood_overview_char_count',
 'notes_char_count',
 'host_about_char_count',
 'space_char_count_grouped',
 'neighborhood_overview_char_count_grouped',
 'notes_char_count_grouped']]

In [99]:
df.to_csv ('~/Downloads/cleaned_df_2020.csv', index = False, header=True)