In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
#The goal is to build a machine learning model that predicts the price,
#so that a user can tell from the app alone whether a listing's price is good value.
#This means the columns are limited to data readily available on the app.
#I am also going to remove descriptive text, such as the summary;
#adding this to a model is beyond the scope of this project.

In [3]:
#Load the calendar and listings tables
calendar_df = pd.read_csv('../Data/calendar.csv')
#Rename the listings key column from 'id' to 'listing_id' so both tables share a join key.
#NOTE(review): index=str also converts the row labels to strings; the merge below
#does not appear to need that - confirm before removing.
listings_df = pd.read_csv('../Data/listings.csv').rename(index=str, columns={'id': 'listing_id'})

#Join calendar rows to their listing, then keep only rows that have a price_x,
#since price_x is the value the model is trained to predict (the target)
combined_df = (
    calendar_df
    .merge(listings_df, on='listing_id')
    .dropna(subset=['price_x'])
)

#Work on an independent copy holding just the columns usable for the model
feature_columns = ['bathrooms', 'bedrooms', 'beds', 'date',
                   'host_total_listings_count', 'number_of_reviews',
                   'review_scores_rating', 'price_x']
df = combined_df[feature_columns].copy()

In [4]:
#Now for some basic analysis to see what kind of cleaning we need to do

In [5]:
#First check for NaNs: percentage of missing values per column, smallest first
percent_missing = df.isna().sum() * 100 / len(df)
missing_value_df = pd.DataFrame(
    {'column_name': df.columns, 'percent_missing': percent_missing}
).sort_values('percent_missing')

missing_value_df

Unnamed: 0,column_name,percent_missing
date,date,0.0
number_of_reviews,number_of_reviews,0.0
price_x,price_x,0.0
host_total_listings_count,host_total_listings_count,0.009844
beds,beds,0.039057
bedrooms,bedrooms,0.21433
bathrooms,bathrooms,0.513942
review_scores_rating,review_scores_rating,15.466079


In [6]:
#Then check each column's dtype to see which columns need converting
#before they can be used as numeric model inputs
df.dtypes

bathrooms                    float64
bedrooms                     float64
beds                         float64
date                          object
host_total_listings_count    float64
number_of_reviews              int64
review_scores_rating         float64
price_x                       object
dtype: object

In [7]:
#checklist
#1.Bathrooms, Bedrooms, and beds have some missing values. Simple thing is to replace with the mode
#2.host_total_listings_count, is missing less than 1%, take the mode
#3.review_scores_rating is missing quite a few values, will just take the mean
#4.number_of_reviews does not need any changes
#5.date has no missing values. nothing directly needs to be done
#6.Price needs to be converted to float and renamed
#7.create a new value that has the month extracted from date
#8.create a new value that is number of reviews * review scores rating

In [8]:
#Fill NaNs in the room/host count columns with each column's mode.
#The mode is computed on just these columns: calling df.mode() on the whole
#frame would also scan the text date and price columns for no benefit.
cols = ['bathrooms','bedrooms','beds','host_total_listings_count']
df[cols] = df[cols].fillna(df[cols].mode().iloc[0])
#NaNs in the review score are filled with the column mean
df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].mean())
#Parse date, keep only its month number, then drop the original date column
df['month_of_date'] = pd.to_datetime(df['date']).dt.month
df = df.drop(columns=['date'])
#Turn the price_x text into a float column named price by stripping
#'$', ',' and spaces, then drop the original column.
#Series string methods are used rather than assigning a one-column DataFrame.
df['price'] = (
    df['price_x']
    .astype(str)
    .str.replace(r'[$, ]', '', regex=True)
    .astype(float)
)
df = df.drop(columns=['price_x'])
#Trying some feature engineering I saw in a kaggle competition
#It didn't add too much value to the model, but I thought it was nice to keep in
df['engineered_review'] = df['number_of_reviews'] * df['review_scores_rating']
df['engineered_review_2'] = df['number_of_reviews'] * df['review_scores_rating'] * df['review_scores_rating']

In [9]:
#Now to build and run the model

In [10]:
#Separate the target (price) from the feature columns,
#then hold out 25% of the rows as a test set (fixed seed for repeatability).
#NOTE(review): df has one row per calendar date per listing, and most features
#come from the listing itself, so a random row split probably puts the same
#listing in both train and test and makes the test score optimistic - consider
#splitting by listing instead.
y = df[['price']]
X = df.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
)

In [11]:
#Standardise the features. The scaler is fitted on the training rows only
#and then applied to both splits, so no test-set statistics are used.
#NOTE(review): tree-based models are generally insensitive to feature scaling,
#so this step is probably optional for the random forest - confirm.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
#Using a random forest regressor as this gave the best results.
#The split criterion is left at its default (mean squared error). The old
#spelling criterion='mse' was deprecated in scikit-learn 1.0 and removed in
#1.2, so passing it fails on current versions; the default works on both.
reg = RandomForestRegressor(n_estimators=100,
                            random_state=42,
                            n_jobs=-1)

#y_train is a single-column DataFrame; squeeze() passes it to fit() as a 1-D Series
reg.fit(X_train, y_train.squeeze())

In [None]:
#Get the predicted values for both splits and see how well our model scored
y_train_preds = reg.predict(X_train)
y_test_preds = reg.predict(X_test)

#Mean squared error (lower is better) on train vs. test
mse_train = mean_squared_error(y_train, y_train_preds)
mse_test = mean_squared_error(y_test, y_test_preds)
print('Random Forest MSE train: %.3f, test: %.3f' % (mse_train, mse_test))

#R^2 (closer to 1 is better) on train vs. test
r2_train = r2_score(y_train, y_train_preds)
r2_test = r2_score(y_test, y_test_preds)
print('Random Forest R^2 train: %.3f, test: %.3f' % (r2_train, r2_test))

In [None]:
#This wasn't the best performing model, but considering the limitations I put in place, it will do

In [None]:
#To visualize the important parameters I need to get them in a dataframe:
#pair each feature name with its importance score, highest score first
headers = ["name", "score"]
values = sorted(
    zip(X.columns, reg.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
important_features = (
    pd.DataFrame(values, columns=headers)
    .sort_values(by=['score'], ascending=False)
)

In [None]:
#Now to graph the results: one bar per feature, in the ranked order of important_features
ax = sns.barplot(data=important_features, x='name', y='score', palette='Paired')

ax.set_title('Important Features, ranked')
#Rotate the feature names so the long labels do not overlap
plt.xticks(rotation=80)
plt.tight_layout()
#Save the same figure as both PNG and JPEG
for filename in ('Important_Features.png', 'Important_Features_jpg.jpg'):
    plt.savefig(filename, dpi=300)

In [None]:
#The number of bedrooms and bathrooms are the most significant factors in price.
#This is likely because both are correlated with the size of the rental.