In [1]:
import pandas as pd
import numpy as np
import gzip
from sklearn.preprocessing import OneHotEncoder
import string
from nltk.corpus import stopwords
from collections import defaultdict
import os

In [2]:
in_fp = '../data/raw/airbnb_raw.csv'
data = pd.read_csv(in_fp)

In [3]:
data.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


In [4]:
# original columns
data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

# Data Cleaning

In [6]:
# only keep columns below
keep_columns = ['id', 'name', 'summary', 'neighborhood_overview', 'transit', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'host_neighbourhood',
       'host_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'zipcode',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities',
       'review_scores_rating', 'security_deposit',
       'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification','reviews_per_month', 
       'price']

In [7]:
# only keep listings that have review_scores_rating 
listings = data.loc[data['review_scores_rating'].notnull()][keep_columns]

# fillna
listings.summary = listings.summary.fillna(' ')
listings.neighborhood_overview = listings.neighborhood_overview.fillna(' ')
listings.transit = listings.transit.fillna(' ')

# fill null with mode
listings.host_response_time = listings.host_response_time.fillna(listings.host_response_time.mode()[0])
# fill null with mean
listings.host_response_rate = listings.host_response_rate.fillna('999%').str[:-1].astype(int)
listings.loc[listings.host_response_rate == 999, 'host_response_rate'] = int(listings.host_response_rate.mean())

# impute with 0
listings.security_deposit = listings.security_deposit.fillna('$0')
listings.cleaning_fee = listings.cleaning_fee.fillna('$0')

listings = listings.dropna().reset_index()

In [8]:
listings.shape

(2844, 43)

In [9]:
listings.head()

Unnamed: 0,index,id,name,summary,neighborhood_overview,transit,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,...,availability_60,availability_90,availability_365,number_of_reviews,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,reviews_per_month,review_scores_rating
0,0,241032,Stylish Queen Anne Apartment,,,,within a few hours,96,f,Queen Anne,...,41,71,346,207,f,moderate,f,f,4.07,95.0
1,1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Queen Anne is a wonderful, truly functional vi...","Convenient bus stops are just down the block, ...",within an hour,98,t,Queen Anne,...,13,16,291,43,f,strict,t,t,1.48,96.0
2,2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,Upper Queen Anne is a charming neighborhood fu...,A bus stop is just 2 blocks away. Easy bus a...,within a few hours,67,f,Queen Anne,...,6,17,220,20,f,strict,f,f,1.15,97.0
3,4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,We are in the beautiful neighborhood of Queen ...,The nearest public transit bus (D Line) is 2 b...,within an hour,100,f,Queen Anne,...,60,90,365,38,f,strict,f,f,0.89,92.0
4,5,5956968,Private unit in a 1920s mansion,We're renting out a small private unit of one ...,This part of Queen Anne has wonderful views an...,The #1 and #2 bus picks up 2-3 blocks away (di...,within an hour,184,f,Queen Anne,...,0,27,302,17,f,strict,f,f,2.45,95.0


In [10]:
# Features and target
X = listings.iloc[:, 1:-1]
y = listings.iloc[:, -1]

# Numerical Features

In [11]:
columns_num = ['host_response_rate', 'host_listings_count', 'accommodates', 'bathrooms',
              'bedrooms', 'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
              'extra_people', 'minimum_nights', 'maximum_nights',
               'availability_30', 'availability_60', 'availability_90',
               'availability_365', 'number_of_reviews', 'reviews_per_month']

In [12]:
len(columns_num)

19

In [13]:
feat_num = X[columns_num]

In [14]:
for i in ['price','security_deposit','cleaning_fee','extra_people']:
    feat_num[i] = feat_num[i].str[1:].str.replace(',','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat_num[i] = feat_num[i].str[1:].str.replace(',','')


In [15]:
feat_num = feat_num.astype(float)

In [16]:
feat_num.dtypes

host_response_rate     float64
host_listings_count    float64
accommodates           float64
bathrooms              float64
bedrooms               float64
beds                   float64
price                  float64
security_deposit       float64
cleaning_fee           float64
guests_included        float64
extra_people           float64
minimum_nights         float64
maximum_nights         float64
availability_30        float64
availability_60        float64
availability_90        float64
availability_365       float64
number_of_reviews      float64
reviews_per_month      float64
dtype: object

# Categorical Features

In [17]:
X.columns

Index(['id', 'name', 'summary', 'neighborhood_overview', 'transit',
       'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood', 'zipcode',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'reviews_per_month'],
      dtype='object')

In [18]:
columns_cat = ['host_response_time', 'host_is_superhost', 'host_neighbourhood', 
               'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'zipcode',
               'is_location_exact', 'property_type', 'room_type', 'bed_type', 'has_availability',
               'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture',
               'require_guest_phone_verification']

In [19]:
len(columns_cat)

16

In [20]:
feat_cat = X[columns_cat]
for i in feat_cat.columns:
    feat_cat[i] = i + ': ' + feat_cat[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat_cat[i] = i + ': ' + feat_cat[i]


In [21]:
feat_cat.head()

Unnamed: 0,host_response_time,host_is_superhost,host_neighbourhood,host_has_profile_pic,host_identity_verified,neighbourhood,zipcode,is_location_exact,property_type,room_type,bed_type,has_availability,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
0,host_response_time: within a few hours,host_is_superhost: f,host_neighbourhood: Queen Anne,host_has_profile_pic: t,host_identity_verified: t,neighbourhood: Queen Anne,zipcode: 98119,is_location_exact: t,property_type: Apartment,room_type: Entire home/apt,bed_type: Real Bed,has_availability: t,instant_bookable: f,cancellation_policy: moderate,require_guest_profile_picture: f,require_guest_phone_verification: f
1,host_response_time: within an hour,host_is_superhost: t,host_neighbourhood: Queen Anne,host_has_profile_pic: t,host_identity_verified: t,neighbourhood: Queen Anne,zipcode: 98119,is_location_exact: t,property_type: Apartment,room_type: Entire home/apt,bed_type: Real Bed,has_availability: t,instant_bookable: f,cancellation_policy: strict,require_guest_profile_picture: t,require_guest_phone_verification: t
2,host_response_time: within a few hours,host_is_superhost: f,host_neighbourhood: Queen Anne,host_has_profile_pic: t,host_identity_verified: t,neighbourhood: Queen Anne,zipcode: 98119,is_location_exact: t,property_type: House,room_type: Entire home/apt,bed_type: Real Bed,has_availability: t,instant_bookable: f,cancellation_policy: strict,require_guest_profile_picture: f,require_guest_phone_verification: f
3,host_response_time: within an hour,host_is_superhost: f,host_neighbourhood: Queen Anne,host_has_profile_pic: t,host_identity_verified: t,neighbourhood: Queen Anne,zipcode: 98119,is_location_exact: t,property_type: House,room_type: Entire home/apt,bed_type: Real Bed,has_availability: t,instant_bookable: f,cancellation_policy: strict,require_guest_profile_picture: f,require_guest_phone_verification: f
4,host_response_time: within an hour,host_is_superhost: f,host_neighbourhood: Queen Anne,host_has_profile_pic: t,host_identity_verified: t,neighbourhood: Queen Anne,zipcode: 98119,is_location_exact: t,property_type: House,room_type: Private room,bed_type: Real Bed,has_availability: t,instant_bookable: f,cancellation_policy: strict,require_guest_profile_picture: f,require_guest_phone_verification: f


In [22]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(feat_cat)
cat_array = enc.transform(feat_cat).toarray()
cat_name = np.concatenate(enc.categories_).tolist()
feat_cat = pd.DataFrame(cat_array, columns = cat_name)

In [23]:
feat_cat.head()

Unnamed: 0,host_response_time: a few days or more,host_response_time: within a day,host_response_time: within a few hours,host_response_time: within an hour,host_is_superhost: f,host_is_superhost: t,host_neighbourhood: Alki,host_neighbourhood: Anaheim,host_neighbourhood: Arbor Heights,host_neighbourhood: Atlantic,...,has_availability: t,instant_bookable: f,instant_bookable: t,cancellation_policy: flexible,cancellation_policy: moderate,cancellation_policy: strict,require_guest_profile_picture: f,require_guest_profile_picture: t,require_guest_phone_verification: f,require_guest_phone_verification: t
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


# Text Features (Bag of Words: 500 most common words)

In [24]:
columns_text_bow = ['name', 'amenities', 'summary', 'neighborhood_overview', 'transit']

In [25]:
feat_text_bow = X[columns_text_bow]

In [26]:
feat_text_bow.amenities = feat_text_bow.amenities.str[1:-1].str.replace('"','').str.replace(',', ' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [27]:
def bow(sentence, words, wordId):
    feat = [0]*len(words)
    for w in sentence.split():
        if w in words:
            feat[wordId[w]] += 1
    return feat

In [28]:
def to_bow(col):
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in col:
        r = ''.join([c for c in d.lower() if not c in punctuation])
        for w in r.split():
            wordCount[w] += 1

    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()
    words = [x[1] for x in counts[:500]]
    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)
    
    return np.array([bow(s, words, wordId) for s in col]), words
    

In [29]:
def to_df(col):
    feat, words = to_bow(col)
    column_name = [col.name + ': ' + x for x in words]
    feat_text1 = pd.DataFrame(feat, columns = column_name)
    return feat_text1

In [30]:
feat_text_name = to_df(feat_text_bow.name)
feat_text_amenities = to_df(feat_text_bow.amenities)
feat_text_summary = to_df(feat_text_bow.summary)
feat_text_neighborhood_overview = to_df(feat_text_bow.neighborhood_overview)
feat_text_transit = to_df(feat_text_bow.transit)

In [31]:
feat_text = pd.concat([feat_text_name,feat_text_amenities,feat_text_summary,feat_text_neighborhood_overview,feat_text_transit], axis = 1)

In [32]:
feat_text

Unnamed: 0,name: in,name: seattle,name: room,name: hill,name: home,name: private,name: modern,name: apartment,name: view,name: capitol,...,transit: 45,transit: visit,transit: surrounding,transit: rainier,transit: per,transit: long,transit: give,transit: frequent,transit: four,transit: each
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2840,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2842,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Final Feature Matrix

In [33]:
X_final = pd.concat([feat_num, feat_cat, feat_text], axis = 1)

In [34]:
X_final.shape

(2844, 2329)

In [35]:
X_final

Unnamed: 0,host_response_rate,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,guests_included,...,transit: 45,transit: visit,transit: surrounding,transit: rainier,transit: per,transit: long,transit: give,transit: frequent,transit: four,transit: each
0,96.0,3.0,4.0,1.0,1.0,1.0,85.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,98.0,6.0,4.0,1.0,1.0,1.0,150.0,100.0,40.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,67.0,2.0,11.0,4.5,5.0,7.0,975.0,1000.0,300.0,10.0,...,0,0,0,0,0,0,0,0,0,0
3,100.0,2.0,6.0,2.0,3.0,3.0,450.0,700.0,125.0,6.0,...,0,0,0,0,0,0,0,0,0,0
4,184.0,1.0,2.0,1.0,1.0,1.0,120.0,0.0,40.0,1.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2839,100.0,1.0,5.0,1.0,2.0,3.0,154.0,150.0,85.0,4.0,...,0,0,0,0,0,0,0,0,0,0
2840,100.0,1.0,3.0,1.0,1.0,1.0,65.0,0.0,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2841,100.0,3.0,4.0,1.0,1.0,2.0,95.0,500.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2842,99.0,354.0,6.0,2.0,3.0,3.0,359.0,0.0,230.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
y_final = y > np.median(y)
y_final = y_final.astype(int)
y_final

0       0
1       0
2       1
3       0
4       0
       ..
2839    0
2840    1
2841    0
2842    0
2843    1
Name: review_scores_rating, Length: 2844, dtype: int32

In [37]:
out_fp = '../data/out/'
X_final.to_csv(out_fp + 'airbnb_features.csv')
y_final.to_csv(out_fp + 'airbnb_target.csv')