In [None]:
#Importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')

In [None]:
# Load the raw listings export and preview the first rows.
# NOTE(review): absolute local path — adjust it when running on another machine.
LISTINGS_CSV = "D:/Yuvoh Assessment Dataset/listings.csv"
listings = pd.read_csv(LISTINGS_CSV, encoding="ISO-8859-1", low_memory=False)
listings.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
listings.info()

In [None]:
# Report how many distinct listing ids the frame contains.
n_listings = listings.id.nunique()
print("We have ", n_listings, "listings in the listing data.")

In [None]:
# Replace every value equal to 0 with None (i.e. missing), column by column.
# NOTE(review): this treats all zeros as missing, including zeros that may be
# genuine values (counts, prices, availability) — confirm this is intended.
for col in listings.columns:
    listings[col] = listings[col].map(lambda x:x if x!=0 else None)

In [None]:
# Percentage of missing values per column, largest first.
listings.isnull().sum().sort_values(ascending=False)*100/len(listings)

In [None]:
# Drop every column in which more than 40% of the values are missing.
# The missing fraction of all columns is computed in one vectorised pass
# (mean of the boolean null mask) instead of a Python-level sum() over each
# column's boolean Series.
MAX_MISSING_FRACTION = 0.4
missing_fraction = listings.isnull().mean()
sparse_columns = missing_fraction[missing_fraction > MAX_MISSING_FRACTION].index
listings.drop(columns=sparse_columns, inplace=True)

In [None]:
# Number of columns currently in the frame.
len(listings.columns)

In [None]:
# Percentage of missing values per remaining column, largest first.
listings.isnull().sum().sort_values(ascending=False)*100/len(listings)

In [None]:
# Columns removed before the analysis, kept in a named list so the drop call
# itself stays short. The frame is modified in place.
columns_to_drop = [
    'transit', 'first_review', 'last_review',
    'availability_60', 'availability_90', 'availability_365',
    'calculated_host_listings_count_entire_homes',
    'host_response_time',
    'review_scores_checkin', 'review_scores_value', 'review_scores_location',
    'review_scores_accuracy', 'host_location',
    'review_scores_communication', 'review_scores_cleanliness',
    'space', 'neighborhood_overview', 'host_neighbourhood',
    'listing_url', 'scrape_id',
    'state', 'host_thumbnail_url', 'host_name', 'host_picture_url',
    'host_since', 'summary', 'description', 'market', 'zipcode',
    'last_scraped', 'city', 'neighbourhood',
    'name', 'host_url', 'picture_url', 'experiences_offered',
    'host_verifications', 'street', 'smart_location',
    'country_code', 'country',
]
listings.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Number of columns currently in the frame.
len(listings.columns)

In [None]:
# Percentage of missing values per remaining column, largest first.
listings.isnull().sum().sort_values(ascending=False)*100/len(listings)

In [None]:
# Ten neighbourhoods with the most listings (non-null ids counted per neighbourhood).
listing_counts = listings.groupby(by='neighbourhood_cleansed')[['id']].count()
listing_counts.sort_values(by='id', ascending=False).head(10)

In [None]:
# Distribution of the overall review score; listings without a score are left out.
fig, ax = plt.subplots(figsize=(12, 6))
review_scores = listings.review_scores_rating.dropna()
sns.distplot(review_scores, rug=True, ax=ax)
sns.despine()
plt.show()

In [None]:
# Summary statistics of the overall review score.
listings.review_scores_rating.describe()

As expected, most reviewers leave very high scores.

Exploring Price

In [None]:
# Convert the price strings to floats by stripping "," and "$".
# regex=False is passed explicitly: "$" is a regular-expression anchor and the
# default of `regex` differs between pandas versions, so the literal match
# must not depend on that default.
price_text = listings['price'].str.replace(',', '', regex=False)
price_text = price_text.str.replace('$', '', regex=False)
listings['price'] = price_text.astype(float)

In [None]:
# First few values of the price column.
listings['price'].head()

In [None]:
# Summary statistics of the price column.
listings['price'].describe()

The most expensive Airbnb listing in London is $13700/night.

In order not to be affected by the extreme cases, I decided to remove listings that exceed $600/night for the following exploratory analysis.

In [None]:
# Histogram of listing prices restricted to 0 < price <= 600.
in_price_range = (listings.price > 0) & (listings.price <= 600)
ax = listings.loc[in_price_range, 'price'].hist(bins=200)
ax.set_ylabel('Count')
ax.set_xlabel('Listing price in $')
ax.set_title('Histogram of listing prices');

Neighbourhood vs. Price

In [None]:
# Box plot of price per neighbourhood (0 < price <= 600), with the
# neighbourhoods ordered by median price, highest first.
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
median_by_area = priced.groupby('neighbourhood_cleansed')['price'].median()
sort_price = median_by_area.sort_values(ascending=False).index
plt.figure(figsize=(15,6))
ax = sns.boxplot(x='neighbourhood_cleansed', y='price', data=priced, order=sort_price)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

City of London has the highest median price. Barking and Dagenham has the lowest median price. 

Property type vs. Price

In [None]:
# Box plot of price per property type (0 < price <= 600), with the
# property types ordered by median price, highest first.
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
median_by_type = priced.groupby('property_type')['price'].median()
sort_price = median_by_type.sort_values(ascending=False).index
ax = sns.boxplot(x='property_type', y='price', data=priced, order=sort_price)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

Nature Lodge has the highest median price, followed by Serviced apartment and Boutique hotel. At the other end of the spectrum are house and dome house.

room type vs. price

In [None]:
# Box plot of price per room type (0 < price <= 600), with the room types
# ordered by median price, highest first.
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
median_by_room = priced.groupby('room_type')['price'].median()
sort_price = median_by_room.sort_values(ascending=False).index
ax = sns.boxplot(x='room_type', y='price', data=priced, order=sort_price)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

Entire room/apt has a much higher median price than the other room types. No surprise.

In [None]:
# Stacked price histogram with one series per room type (0 < price <= 600).
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
price_by_room_type = priced.pivot(columns='room_type', values='price')
price_by_room_type.plot.hist(stacked=True, bins=100)
plt.xlabel('Listing price in $');

bed type vs. price

In [None]:
# Box plot of price per bed type (0 < price <= 600), with the bed types
# ordered by median price, highest first.
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
median_by_bed_type = priced.groupby('bed_type')['price'].median()
sort_price = median_by_bed_type.sort_values(ascending=False).index
ax = sns.boxplot(x='bed_type', y='price', data=priced, order=sort_price)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

Amenities.

In [None]:
# First few raw values of the amenities column.
listings['amenities'].head()

In [None]:
# Remove braces and double quotes from the amenities strings, leaving a
# comma-separated list.
# "[{}]" is a character class and only works as a regular expression, while the
# quote is a plain literal. Both are spelled out because the default of `regex`
# changed from True to False in pandas 2.0; without regex=True the braces
# would be left in place there.
listings['amenities'] = (
    listings['amenities']
    .str.replace("[{}]", "", regex=True)
    .str.replace('"', "", regex=False)
)
listings['amenities'].head()

In [None]:
# Bar chart of the 20 amenities that occur most often across all listings.
amenity_lists = listings['amenities'].map(lambda amns: amns.split(","))
amenity_counts = pd.Series(np.concatenate(amenity_lists)).value_counts()
ax = amenity_counts.head(20).plot(kind='bar')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
plt.show()

Wifi, heating, essentials, kitchen and smoke detector are among the most common amenities.

Amenities vs. price top 20

In [None]:
# Mean price of the listings that offer each amenity; plot the 20 highest means.
# Every listing's amenities are split once and stored as a set so membership is
# an exact match. Testing `amn in <raw amenities string>` is a substring search,
# which would also count e.g. "TV" for a listing that only lists "Cable TV".
amenity_lists = listings['amenities'].map(lambda amns: amns.split(","))
amenities = np.unique(np.concatenate(amenity_lists.tolist()))
amenity_sets = amenity_lists.map(set)
amenity_prices = [
    (amn, listings.loc[amenity_sets.map(lambda offered: amn in offered), 'price'].mean())
    for amn in amenities
    if amn != ""  # the empty token comes from splitting an empty amenities string
]
amenity_srs = pd.Series(data=[a[1] for a in amenity_prices], index=[a[0] for a in amenity_prices])
ax = amenity_srs.sort_values(ascending=False)[:20].plot(kind='bar')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
plt.show()

Number of beds vs. price

In [None]:
# Stacked price histogram with one series per bed count (0 < price <= 600).
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
price_by_beds = priced.pivot(columns='beds', values='price')
price_by_beds.plot.hist(stacked=True, bins=100)
plt.xlabel('Listing price in $');

The vast majority of the listings have one bed, and one-bed listings span a very wide range of prices. There are also listings that have no bed.

In [None]:
# Box plot of price for each bed count (0 < price <= 600).
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
sns.boxplot(x='beds', y='price', data=priced)
plt.show()

Numerical Features

In [None]:
# Pairwise scatter plots of the selected numeric columns for listings with
# 0 < price <= 600; rows with a missing value in any of them are left out.
col = [
    'host_listings_count', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'price', 'number_of_reviews', 'review_scores_rating', 'reviews_per_month',
]
sns.set(style="ticks", color_codes=True)
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
sns.pairplot(priced[col].dropna())
plt.show()

In [None]:
# Correlation matrix of the numeric columns listed in `col`, computed on
# complete rows with 0 < price <= 600 and drawn as an annotated heat map.
priced = listings.loc[(listings.price > 0) & (listings.price <= 600)]
corr = priced[col].dropna().corr()
plt.figure(figsize = (6,6))
sns.set(font_scale=1)
sns.heatmap(corr, annot=True, fmt='.2f', square=True, cbar=True,
            xticklabels=col, yticklabels=col)
plt.show()

The correlation matrix above shows that accommodates is highly correlated with beds and bedrooms,
so we will keep only one of them.

Feature engineering

In [None]:
# Keep only listings with 0 < price <= 600 for feature engineering and modelling.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells modify this frame directly instead of writing into a
# slice of the unfiltered data (which raises SettingWithCopyWarning).
listings = listings.loc[(listings.price <= 600) & (listings.price > 0)].copy()

In [None]:
#Term document matrix for amenities feature.
from sklearn.feature_extraction.text import CountVectorizer

# Remove braces and double quotes. `regex` is explicit because its default
# differs between pandas versions and "[{}]" only works as a regular expression.
listings['amenities'] = (
    listings['amenities']
    .str.replace("[{}]", "", regex=True)
    .str.replace('"', "", regex=False)
)

# One token per comma-separated amenity.
count_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
amenities = count_vectorizer.fit_transform(listings['amenities'])

# Newer scikit-learn releases provide get_feature_names_out() and no longer
# have get_feature_names(); use whichever the installed version offers.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    amenity_names = count_vectorizer.get_feature_names_out()
else:
    amenity_names = count_vectorizer.get_feature_names()

# Reuse the listings index: the matrix rows are in the same order as
# `listings`, but `listings` may carry non-contiguous labels after filtering.
# With a default 0..n-1 index, a later index-aligned concat would attach
# amenity rows to the wrong listings and drop the labels that do not match.
df_amenities = pd.DataFrame(amenities.toarray(), columns=amenity_names, index=listings.index)

# Drop the empty-string token column if the vectorizer produced one.
df_amenities = df_amenities.drop(columns='', errors='ignore')

In [None]:
# Encode the "t"/"f" flag columns as 1/0.
columns = [
    'host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
    'is_business_travel_ready', 'is_location_exact', 'requires_license',
    'instant_bookable', 'has_availability',
    'require_guest_profile_picture', 'require_guest_phone_verification',
]
for flag in columns:
    # 'f' is replaced first, then 't', as two separate regex replacements.
    without_f = listings[flag].replace('f', 0, regex=True)
    listings[flag] = without_f.replace('t', 1, regex=True)

In [None]:
# Clean the other monetary columns: missing values become 0, the characters
# "$", "," and ")" are stripped, and the result is cast to float.
# The pattern is a raw string so that "\$" is not an invalid string escape
# sequence (a warning on current Python versions).
for fee_column in ['security_deposit', 'cleaning_fee']:
    fee = listings[fee_column].fillna(value=0)
    listings[fee_column] = fee.replace(r'[\$,)]', '', regex=True).astype(float)

In [None]:
# Host response rate: "%" is replaced by ".0", the text is cast to float and
# divided by 100.
# Extra-people fee: "$" is removed and the text is cast to float.
# Both replacements are literal, so regex=False is passed explicitly instead of
# relying on the version-dependent default ("$" is a regex anchor).
# NOTE(review): a thousands separator in extra_people would make the float
# cast fail — confirm the column never contains one.
listings['host_response_rate'] = (
    listings['host_response_rate'].str.replace('%', '.0', regex=False).astype('float') / 100
)
listings['extra_people'] = listings['extra_people'].str.replace('$', '', regex=False).astype('float')

In [None]:
# Names of the columns treated as numeric features; the median-imputation cell
# iterates over this list. Note that the list also contains the target, 'price'.
num_features = ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic','is_location_exact', 
                         'requires_license', 'instant_bookable', 'require_guest_profile_picture','accommodates', 
                         'require_guest_phone_verification', 'security_deposit', 'cleaning_fee','beds','bathrooms', 
                         'host_listings_count', 'host_total_listings_count', 'minimum_nights','host_response_rate',
                     'bedrooms', 'guests_included', 'number_of_reviews','review_scores_rating', 'price','reviews_per_month']

In [None]:
# Median-impute each numeric feature that has at least one missing value;
# complete columns are left untouched.
for col in num_features:
    feature_values = listings[col]
    if not feature_values.isnull().any():
        continue
    listings[col] = feature_values.fillna(feature_values.median())

In [None]:
# One-hot encode the categorical features and append the indicator columns
# to the frame (the source columns are still present afterwards).
cat_features = ['property_type', 'room_type', 'cancellation_policy', 'neighbourhood_cleansed', 'bed_type']
dummy_frames = [pd.get_dummies(listings[cat_feature]) for cat_feature in cat_features]
listings = pd.concat([listings] + dummy_frames, axis=1)

In [None]:
# Put the listing columns and the amenity indicator columns side by side.
# join="inner" keeps only the index labels present in both frames.
# NOTE(review): rows are matched by index label, not by position — confirm
# that both frames carry the same index.
listings_new = pd.concat([listings,df_amenities],axis=1,join="inner")

In [None]:
# Preview of the combined feature frame.
listings_new.head()

In [None]:
# Remove the text/categorical source columns now that their indicator columns
# exist. The frame is modified in place.
encoded_sources = ['property_type', 'room_type', 'cancellation_policy',
                   'neighbourhood_cleansed', 'bed_type', 'amenities']
listings_new.drop(columns=encoded_sources, inplace=True)

In [None]:
# (rows, columns) of the final feature frame.
listings_new.shape

Data pre-processing and feature engineering done!

Random Forest Regressor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Target is the price; every other column of listings_new is a predictor.
y = listings_new['price']
x = listings_new.drop(columns='price')
# Hold out 25% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=1)
# `criterion` is left at its default, which is mean squared error. The explicit
# spelling 'mse' is rejected by newer scikit-learn releases (renamed to
# 'squared_error'), so omitting it keeps the same model on every version.
rf = RandomForestRegressor(n_estimators=500, random_state=3, n_jobs=-1)

In [None]:
# Fit the forest, predict on both splits, and compute the test RMSE as the
# square root of the test mean squared error.
rf.fit(X_train, y_train)
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
rmse_rf = test_mse ** (1/2)

In [None]:
# Report the test-set RMSE and R^2 of the random forest.
test_r2 = r2_score(y_test, y_test_pred)
print('RMSE test: %.3f' % rmse_rf)
print('R^2 test: %.3f' % test_r2)

Feature importance of Random Forest

In [None]:
# Feature importances of the fitted forest, 20 largest first.
# 'est_int' holds the feature name and 'coefs' its importance.
coefs_df = pd.DataFrame({
    'est_int': X_train.columns,
    'coefs': rf.feature_importances_,
})
coefs_df.sort_values('coefs', ascending=False).head(20)

XGBoost

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Rebuild the target/predictor split for the boosted model: 25% held out,
# random_state=1.
y = listings_new['price']
x = listings_new.drop('price', axis =1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=1)


In [None]:
# Base estimator with default settings; hyper-parameters are supplied through the grid search.
xgb = XGBRegressor()

In [None]:
# Hyper-parameter grid for XGBRegressor (a single candidate per parameter).
# Two entries were removed because they only restate the estimator's defaults
# through names that newer XGBoost releases deprecate or no longer use:
#   - 'objective': 'reg:linear' is the deprecated name of the squared-error
#     regression objective, which XGBRegressor already uses by default;
#   - 'silent' was superseded by 'verbosity'.
parameters = {'learning_rate': [0.01], #so called `eta` value
              'max_depth': [10],
              'min_child_weight': [4],
              'subsample': [0.9],
              'colsample_bytree': [0.9],
              'n_estimators': [1000],
              'gamma': [0.2]}

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the `parameters` grid using the `xgb` estimator.
xgb_grid = GridSearchCV(estimator=xgb, param_grid=parameters, cv=5, verbose=True)

In [None]:
# Run the cross-validated search on the training split.
xgb_grid.fit(X_train,y_train)

# Predict with the fitted search object (the name `clf` is never defined in
# this notebook). X_test is passed as a DataFrame, the same way X_train was
# passed to fit, so the model sees the same column names.
y_pred = xgb_grid.predict(X_test)
print('R^2 test: %.3f' % (r2_score(y_test, y_pred)))