### Introduction
This is an exploratory analysis notebook for Airbnb listings in Nairobi. 
The data is scraped from Airbnb's website.


### Load Modules

In [None]:
# load modules
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# report the interpreter and core library versions used for this run
for template, version in (
    ('Python: {}', sys.version),
    ('Pandas: {}', pd.__version__),
    ('Numpy: {}', np.__version__),
    ('Matplotlib: {}', mpl.__version__),
    ('Seaborn: {}', sns.__version__),
):
    print(template.format(version))

### 1. Load Data

In [None]:
# load data
# the path is relative, so it resolves against the kernel's working directory
listings = pd.read_json('../data/listings.json')

In [None]:
# column names, dtypes and non-null counts of the raw frame
listings.info()

The dataset has 280 apartment listings

In [None]:
# preview the first raw rows
listings.head()

### 2. Data Cleaning

In [None]:
# Hand-written replacements for the `details` list of specific rows, keyed by
# row label. The empty strings sit at list position 5, which the cleanup cell
# unpacks as `kitchen` -- presumably placeholders that keep 'Free parking' in
# the parking slot (TODO confirm against the raw scrape).
detail_overrides = {
    31: ['2 guests', '1 bedroom','1 bed', '1 bath', 'Wifi'],
    73: ['2 guests', '1 bedroom','1 bed', '1 bath', 'Wifi','Kitchen'],
    230: ['2 guests', 'Studio','1 bed', '1 bath', 'Wifi','Kitchen','Free parking'],
    260: ['1 guest', '1 bedroom','1 bed', '1 shared bath', 'Wifi','Kitchen','Free parking'],
    104: ['2 guests', 'Studio', '2 beds', '1 bath', 'Wifi','','Free parking'],
    106: ['2 guests', '1 bedroom', '1 bed', '1 bath', 'Wifi','', 'Free parking'],
    169: ['2 guests', '1 bedroom', '1 bed', '1 bath', 'Wifi','', 'Free parking'],
    271: ['2 guests', 'Studio', '1 bed', '1 bath', 'Wifi','', 'Free parking'],
    278: ['2 guests', 'Studio', '1 bed', '1 bath', 'Wifi','', 'Free parking'],
}

for row_label, corrected_details in detail_overrides.items():
    listings.at[row_label, 'details'] = corrected_details

In [None]:
# Derive flat, typed columns from the list-valued scraped fields.
# NOTE: `is_superhost` is rewritten in place below, so re-running this cell
# without reloading `listings` turns every value into 'no'.

# number of entries in each listing's `details` list
listings['total_amenities'] = listings['details'].apply(len)

# expand the list-valued fields; `details` is unpacked purely by position, so
# the longest list must have exactly as many entries as there are target columns
listings[['guests','bedrooms','beds','baths','wifi','kitchen','parking','pool']] = listings['details'].apply(pd.Series)
listings['price($)'] = listings['price'].apply(pd.Series)
listings['rating'] = listings['review_score'].apply(pd.Series)
listings[['x','reviews','y']] = listings['total_reviews'].apply(pd.Series)
listings = listings.drop(['x','y'],axis=1)   # only the middle element is kept

# '$' is a regex anchor, so strip it as a literal
listings['price($)'] = listings['price($)'].str.replace('$','',regex=False).astype(int)
listings['is_superhost'] = np.where(listings['is_superhost']=='SUPERHOST','yes','no')

# listings without a rating / review count get 0
listings['rating'] = np.where(listings['rating'].isna(),0,listings['rating'])
listings['reviews'] = np.where(listings['reviews'].isna(),0,listings['reviews'])

# Strip the unit words so the counts can be cast to numbers later.
# `regex=True` is spelled out because the patterns use alternation and the
# pandas default for `str.replace` is not the same across major versions;
# with a literal match nothing would be stripped and the casts would fail.
listings['guests'] = listings['guests'].str.replace(' guests| guest', '', regex=True)
listings['bedrooms'] = listings['bedrooms'].str.replace(' bedrooms| bedroom','', regex=True)
listings['bedrooms'] = listings['bedrooms'].str.replace('Studio','0', regex=False)
# the optional leading space avoids leaving '1 ' behind for '1 bed'
listings['beds'] = listings['beds'].str.replace(' ?beds?','', regex=True)
listings['baths'] = listings['baths'].str.replace(' baths| bath','', regex=True)
listings['baths'] = listings['baths'].str.replace('Half-bath','0.5', regex=False)
# na=False: a missing bath entry must not be reported as a shared bath
listings['shared_bath'] = np.where(listings['baths'].str.contains('shared', na=False),'yes','no')
listings['baths'] = listings['baths'].str.replace(' shared| private','', regex=True)

# amenity flags: 'yes' only when the expected label sits in its positional slot
listings['wifi'] = np.where(listings['wifi']=='Wifi','yes','no')
listings['kitchen'] = np.where(listings['kitchen']=='Kitchen','yes','no')
listings['parking'] = np.where(listings['parking']=='Free parking','yes','no')
listings['pool'] = np.where(listings['pool']=='Pool','yes','no')


In [None]:
# select clean columns
clean_cols = ['title','description','is_superhost','guests',
 'bedrooms','beds','baths','wifi','kitchen','parking','pool',
 'price($)','rating','reviews','shared_bath']
# take an explicit copy: this frame is modified in later cells, and doing that
# on a slice of `listings` triggers SettingWithCopyWarning
listings_clean = listings[clean_cols].copy()
listings_clean.head()

In [None]:
# prices overview: count, mean, spread and quartiles of the nightly price
listings_clean['price($)'].describe()

In [None]:
# plot price distribution: number of listings at each distinct nightly price
price_distribution = listings_clean['price($)'].value_counts().reset_index()
price_distribution.columns = ['price','total']
price_distribution = price_distribution.sort_values(by='price',ascending=True)

fig,ax = plt.subplots(figsize=(10,6))
# `ci` is not a scatterplot option in recent seaborn releases (it would be
# forwarded to matplotlib and rejected) and `palette` does nothing without
# `hue`, so both are dropped; `ax=ax` binds the plot to the figure made above
sns.scatterplot(x='price', y='total',
                data=price_distribution,
                ax=ax)
ax.set_xlabel('Price ($)')
ax.set_ylabel('Number of Apartments')
ax.set_title('Price Distribution for Airbnb Apartments in Nairobi')
plt.show()

- Most apartments cost between \$20 and \$30 per night
- Few apartments cost more than \$80 per night

In [None]:
# number of listings for each bedroom count ('0' stands for a studio)
bedroom_counts = listings_clean['bedrooms'].value_counts()
bedrooms_distribution = bedroom_counts.reset_index()
bedrooms_distribution.columns = ['bedrooms','total']

fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(
    data=bedrooms_distribution,
    x='bedrooms',
    y='total',
    order=['0','1','2','3'],
    ax=ax,
)
ax.set(
    xlabel='Number of Bedrooms',
    ylabel='Number of Apartments',
    title='Bedrooms Distribution for Airbnb Apartments',
)
plt.show()

- One bedroom apartments are the most popular

In [None]:
# distribution by rating score: number of listings at each distinct rating
rating_distribution = listings_clean['rating'].value_counts().reset_index()
rating_distribution.columns = ['rating','total']
rating_distribution['rating'] = rating_distribution['rating'].astype(float)
# a rating of 0 is the fill value for listings with no score, so leave it out
rating_distribution = rating_distribution[rating_distribution['rating']>0]
fig,ax=plt.subplots(figsize=(8,4))
sns.scatterplot(x='rating',y='total',data=rating_distribution)

ax.set_xlabel('Rating')
ax.set_ylabel('Number of Apartments')
plt.title('Rating Distribution for Airbnb Apartments')
plt.show()

Customers are generally happy with their hosts

In [None]:
# mean nightly price for each bedroom count
mean_price_by_bedrooms = listings_clean.groupby('bedrooms')['price($)'].mean()
avg_price_per_bedroom = mean_price_by_bedrooms.rename('price').reset_index()

fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(
    data=avg_price_per_bedroom,
    x='bedrooms',
    y='price',
    order=['0','1','2','3'],
    palette='Blues_d',
    ax=ax,
)
ax.set(
    xlabel='Number of Bedrooms',
    ylabel='Price ($)',
    title='Average Price per Bedroom',
)
plt.show()

- As expected, average prices increase with the number of bedrooms

In [None]:
# mean price at each rating score, restricted to listings that have a score
# (0 is the fill value used for listings without one)
rating_greater_than_zero = listings_clean.copy()
rating_greater_than_zero['rating'] = rating_greater_than_zero['rating'].astype(float)
rating_greater_than_zero = rating_greater_than_zero[rating_greater_than_zero['rating']>0]

# NOTE: this rebinds `rating_distribution`, replacing the per-rating listing
# counts built in the earlier rating-distribution cell
rating_distribution = rating_greater_than_zero.groupby('rating')['price($)'].mean().rename('price').reset_index()

fig,ax = plt.subplots(figsize=(8,4))
sns.scatterplot(x='rating',y='price',data=rating_distribution)
ax.set_xlabel('Rating')
ax.set_ylabel('Mean Price ($)')
plt.title('Price Distribution by Rating Scores')

plt.show()

In [None]:
# total number of reviews for each bedroom count, over rated listings only
# (`rating_greater_than_zero` comes from the previous cell and is modified here)
rating_greater_than_zero['reviews'] = rating_greater_than_zero['reviews'].astype(int)
reviews_summary = rating_greater_than_zero.groupby('bedrooms')['reviews'].sum().rename('total').reset_index()

fig,ax=plt.subplots(figsize=(8,4))
sns.barplot(x='bedrooms',y='total',palette='Blues_d',order=['0','1','2','3'],data=reviews_summary)

ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Total Reviews')
plt.title('Number of Reviews per Bedroom')
plt.show()

In [None]:
# Rebind instead of renaming in place: `listings_clean` was selected out of
# `listings`, and in-place edits on such a frame raise SettingWithCopyWarning.
# NOTE: cells above this point refer to 'price($)' and must be run before it.
listings_clean = listings_clean.rename(columns={'price($)':'price'})

In [None]:
# confirm the price column now appears under its new name
listings_clean.head()

In [None]:
# drop the free-text columns and cast the numeric-looking columns in one pass
numeric_dtypes = {
    'reviews': int,
    'rating': float,
    'guests': int,
    'baths': float,
    'bedrooms': int,
    'beds': int,
}
listings_final = (
    listings_clean
    .drop(['title','description'], axis=1)
    .astype(numeric_dtypes)
)
listings_final.head()

In [None]:
# summary statistics for the numeric columns
listings_final.describe()

In [None]:
# box plot per numeric column, each in its own panel of a 2x4 grid with
# independent axes so differently scaled columns stay readable
listings_final.plot(kind='box', subplots=True,layout=(2,4),
                    sharex=False,sharey=False, figsize=(12,6))
plt.show()

In [None]:
# histogram of each numeric column in listings_final
# fig,ax = plt.subplots(figsize=(12,6))
listings_final.hist(figsize=(12,6))
plt.show()

In [None]:
# # scale listings data
# listings_final_scaled = listings_final.copy()

# scaler_X = MinMaxScaler()
# scaler_y = MinMaxScaler()

# cols_to_scale = ['guests', 'bedrooms','beds', 'baths', 'rating', 'reviews']
# listings_final_scaled[cols_to_scale] = scaler_X.fit_transform(listings_final_scaled[cols_to_scale])
# listings_final_scaled['price'] = scaler_y.fit_transform(listings_final_scaled['price'])
# listings_final_scaled.head()


### One Hot Encoding
In this section we prepare the data for machine learning

In [None]:
# one-hot encode the remaining non-numeric ('yes'/'no') columns
# NOTE(review): every two-valued column becomes a complementary pair
# (e.g. wifi_no / wifi_yes), which is redundant for the linear models --
# consider drop_first=True or the manual 0/1 mapping kept below
listings_ml = pd.get_dummies(listings_final)
# alternative kept for reference: map each flag to a single 0/1 column
# listings_ml = listings_final.copy()
# listings_ml['kitchen'] = np.where(listings_ml['kitchen']=='yes',1,0)
# listings_ml['wifi'] = np.where(listings_ml['wifi']=='yes',1,0)
# listings_ml['is_superhost'] = np.where(listings_ml['is_superhost']=='yes',1,0) 
# listings_ml['parking'] = np.where(listings_ml['parking']=='yes',1,0) 
# listings_ml['pool'] = np.where(listings_ml['pool']=='yes',1,0) 
# listings_ml['shared_bath'] = np.where(listings_ml['shared_bath']=='yes',1,0)
listings_ml.head()

###  Machine Learning
- First we set aside 10% of our data for validation
- Then we train different regression algorithms and compare their performance
- We then choose the model with the best score

In [None]:
# split the data into training
# & validation set

SPLIT_SEED = 42   # fixed so the hold-out set is the same on every run
val = 0.10
len_dataset = len(listings_ml)

# Draw the hold-out rows WITHOUT replacement. Drawing random integers with
# replacement can repeat an index, which silently shrinks the validation set
# below the intended fraction.
validation = listings_ml.sample(n=int(val*len_dataset), random_state=SPLIT_SEED).sort_index()
val_indices = list(validation.index)

# everything that was not held out is available for training
train = listings_ml.drop(index=validation.index)
print("Train Set: ",len(train),"Validation Set: ", len(validation))
validation.head()

In [None]:
# hold-out set: target first, then the feature columns
val_actual = validation['price']
val_test = validation.drop(columns='price')

# remaining rows: renumber, separate target from features, then carve out a
# 30% test portion with a fixed seed
train = train.reset_index(drop=True)
y = train['price']
X = train.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# scale_data
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Fit every candidate regressor on the training split and score it on both the
# test split and the held-out validation set.
#
# `criterion` is left at its default (mean squared error): the 'mse' spelling
# is rejected by current scikit-learn releases. The stochastic models get a
# fixed seed so the ranking is reproducible.
rf_model = RandomForestRegressor(n_estimators=2000,verbose=0,max_leaf_nodes=25,random_state=42)
rg_model = DecisionTreeRegressor(max_depth=5,random_state=42)
svm_model = SVR()
lasso_model = Lasso(alpha=1.0)
ridge_model = Ridge(alpha=1.0)
linear_model = LinearRegression()
gbr_model = GradientBoostingRegressor(max_depth=3,n_estimators=1000,learning_rate=0.001,random_state=42)
xgb_model = xgboost.XGBRegressor()
neighbors_model = KNeighborsRegressor(n_neighbors=2)

models = [rf_model, rg_model, svm_model,
          lasso_model, ridge_model, linear_model,
          gbr_model, xgb_model, neighbors_model]

model_scores = []
for model in models:
    model.fit(X_train, y_train)

    # Both metrics use the raw predictions (the test score used to be computed
    # on rounded predictions, the validation score was not) and the RMSE is
    # taken from the unrounded MSE; rounding happens only for display.
    test_mse = mean_squared_error(y_test, model.predict(X_test))
    val_mse = mean_squared_error(val_actual, model.predict(val_test))

    model_scores.append({
        'Model': type(model).__name__,      # class name, e.g. 'RandomForestRegressor'
        'test_mse': round(test_mse, 2),
        'test_rmse': round(test_mse**0.5, 2),
        'validation_mse': round(val_mse, 2),
        'validation_rmse': round(val_mse**0.5, 2),
    })

# best (lowest) validation RMSE first
model_scores_df = pd.DataFrame(model_scores)
model_scores_df = model_scores_df.sort_values(by='validation_rmse',ascending=True)
model_scores_df

In [None]:
# short tick labels: keep the part of each class name before the first 'Re'
labels = [name.split('Re')[0] for name in model_scores_df['Model']]
x_ticks = list(range(len(labels)))

# validation RMSE per model, in the order of the sorted score table
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=model_scores_df, x='Model', y='validation_rmse', ax=ax)
ax.set_xlabel('')
ax.set_xticks(x_ticks)
ax.set_xticklabels(labels, rotation=10)
plt.show()

In [None]:
# refit a subset of the models on all non-validation rows, then predict the
# validation set with each of them
refit_models = (rf_model, rg_model, ridge_model, neighbors_model, linear_model)
for estimator in refit_models:
    estimator.fit(X, y)

# one list of whole-number predictions per model, in the same order as above
rf_vals, dt_vals, rg_vals, kn_vals, lnr_vals = (
    [round(prediction, 0) for prediction in estimator.predict(val_test)]
    for estimator in refit_models
)

# predictions side by side with the true validation prices
results_df = pd.DataFrame({
    'RandomForest': rf_vals,
    'DecisionTree': dt_vals,
    'Ridge': rg_vals,
    'Kneighbors': kn_vals,
    'Linear': lnr_vals,
    'actual': val_actual,
})
results_df.head(10)

In [None]:
# compare the spread of each model's predictions with the actual prices
results_df.describe()

In [None]:
# plot results for RF & DT models against the actual validation prices
# x: price (predicted or actual), y: the listing's review count
fig,ax=plt.subplots(figsize=(8,6))
ax.plot(results_df['RandomForest'],val_test['reviews'],'go', label='RandomForest')
ax.plot(results_df['DecisionTree'],val_test['reviews'],'bo', label='DecisionTree')
ax.plot(results_df['actual'],val_test['reviews'],'ro', label = 'Actual')
# axis labels were missing, which left the figure unreadable on its own
ax.set_xlabel('Price ($)')
ax.set_ylabel('Total Reviews')
ax.set_title("Predicted Prices for Total Reviews")
ax.legend()
plt.show()

### Create Final Model: RandomForest Regression


In [None]:
# prepare the data: the final model uses every listing (no separate hold-out)
X = listings_ml.drop('price',axis=1)
y = listings_ml['price']

# split the data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

# scale features to [0, 1]; the scaler is fitted on the training rows only
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# create model
# `criterion` stays at its default (mean squared error) because the 'mse'
# spelling is rejected by current scikit-learn; the seed makes the saved
# model reproducible
model = RandomForestRegressor(n_estimators=2000,max_depth=5,random_state=42)
model.fit(X_train_scaled,y_train)

# save model -- `with` closes the file handle even if pickling fails
# NOTE(review): the model was trained on scaled inputs but `scaler` is not
# saved, so the pickle alone cannot score raw features
with open('models/model_v1.pkl','wb') as model_file:
    pickle.dump(model, model_file)

# load model (only unpickle files this notebook produced itself)
with open('models/model_v1.pkl','rb') as model_file:
    saved_model = pickle.load(model_file)


In [None]:
# score the reloaded model on the test split, using the scaler fitted on train
X_test_scaled = scaler.transform(X_test)
test_predictions = saved_model.predict(X_test_scaled)
mse = round(mean_squared_error(test_predictions, y_test), 1)
rmse = round(mse ** 0.5, 1)
print('\n', "MSE: ", mse, '\n', "RMSE:", rmse)

In [None]:
# position of the test row used for a spot check here and in the next cells
x_level = 10
# the model's prediction for that row, rounded to a whole number
round(saved_model.predict(X_test_scaled)[x_level],0)

In [None]:
# the unscaled feature row at position x_level, kept as a one-row frame
x_val = X_test.iloc[x_level:x_level + 1]
x_val

In [None]:
# the actual price at the same position, for comparison with the prediction
y_test.iloc[x_level:x_level + 1]

In [None]:
# linear regression features
# important_features_dict = {}

# model_coefficients = [x for x in saved_model.coef_]
# model_columns = [x for x in X_train.columns]
# linear_model_features = pd.DataFrame(list(zip(model_columns,model_coefficients)),columns=['feature','coef'])
# linear_model_features['coef_abs'] = abs(linear_model_features['coef'])
# linear_model_features = linear_model_features.sort_values(by='coef_abs',ascending=False)
# linear_model_features.to_csv('models/feature_importances.csv',index=False)
# linear_model_features

In [None]:
# random forest regression: feature importances
# map each feature's column position to its importance score
important_features_dict = dict(enumerate(saved_model.feature_importances_))

# column positions ordered from most to least important
important_features_list = sorted(
    important_features_dict,
    key=lambda position: important_features_dict[position],
    reverse=True,
)

print('Most important features: %s' %important_features_list)

In [None]:
# feature names, in the column order the model was trained on
feature_names = list(X_train.columns)

# pair every name with its importance score (the dict is keyed by column position)
importances_df = pd.DataFrame({
    'feature': feature_names,
    'value': list(important_features_dict.values()),
})

# save file
importances_df.to_csv('models/feature_importances.csv',index=False)

importances_df.sort_values(by='value',ascending=False)
