# Predicting Rent per Sqft for Bay Area

Arezoo Besharati, UrbanSim, July 2018

In [1]:
from __future__ import print_function
import warnings;warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

from scipy.stats import norm
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
import os; os.chdir('../..')

In [3]:
import orca

In [4]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [5]:
orca.list_tables()

['parcels',
 'buildings',
 'craigslist',
 'rentals',
 'nodessmall',
 'nodeswalk',
 'units',
 'households',
 'persons',
 'jobs']

In [6]:
rentals = orca.get_table('rentals').to_frame()
rentals.shape

(363055, 16)

In [7]:
rentals = orca.get_table('rentals').to_frame()
rentals['pred_rent_sqft'] = 0.0
rentals['res_sqft_per_unit'] = rentals['sqft']

orca.add_table('rentals', rentals)

<orca.orca.DataFrameWrapper at 0x1013cb908>

In [8]:
df = orca.merge_tables(target = 'rentals', tables = ['rentals','nodeswalk','nodessmall'])
df.shape

(363055, 107)

In [9]:
df.rent_sqft.max()

19.609375

In [10]:
#df.columns.tolist()

In [11]:
df.drop(['state',
         'bathrooms',
         'rent',
         'longitude',
         'latitude',
         'neighborhood',
         'county',
         'date',
         'avg_rent_1500_walk',
         'avg_rent_10000',
         'region',
         'avg_rent_25000',
         'node_id_small',
         'node_id_walk',
         'Unnamed: 0',
         'sqft',
         'avg_rent_500_walk',
         'fips_block'] , axis = 1, inplace = True)

In [12]:
from scipy.stats import skew

df.dtypes[df.dtypes != "object"]
numeric_feats = df.dtypes[df.dtypes != "object"].index

skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness

skewed_feats = skewed_feats[skewed_feats > 0.7]
skewed_feats = skewed_feats.index

df[skewed_feats] = np.log1p(df[skewed_feats])

In [13]:
#print(df.shape)
#df.columns.tolist()

In [14]:
# all_data_na = (df.isnull().sum() / len(df)) * 100
# all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
# missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
# missing_data.head(20)

In [15]:
df = df.dropna()

In [16]:
#creating matrices for sklearn:
X = df.loc[:, df.columns != 'rent_sqft']
y = np.log1p(df.rent_sqft)

## Spliting the dataset for validation

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
y_train.max() 

1.3317226100425972

In [19]:
y_test.max()

1.3927102525403174

## Model

In [20]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(n_estimators=100, max_features=30,n_jobs= -1)
forest_model_2= RandomForestRegressor(n_estimators=200, max_features=.5,n_jobs= -1)
forest_model = forest_model.fit(X_train, y_train)
forest_model_2 = forest_model_2.fit(X_train, y_train)
preds = forest_model.predict(X_test)
preds_2 = forest_model_2.predict(X_test)
print('Number of predictions = %2.f' % len(preds))

Number of predictions = 115763


## Prediction and Model evaluation 

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
# The mean squared error
meanSquaredError = mean_squared_error(y_test, preds)
print('Mean squared error = %.2f' %meanSquaredError)

# rmse
rootMeanSquaredError = sqrt(meanSquaredError)
print('RMSE = %.2f' %rootMeanSquaredError)

# Explained variance score: 1 is perfect prediction
print('R2_score = %.2f' % r2_score(y_test, preds))


print('R2_score = %.2f' % r2_score(y_test, preds_2))



Mean squared error = 0.00
RMSE = 0.04
R2_score = 0.89
R2_score = 0.89


### Tuning the model for overfit

In [29]:
forest_model_3= RandomForestRegressor(n_estimators=100, max_features=30,n_jobs= -1, max_depth = 30)
forest_model_3 = forest_model_3.fit(X_train, y_train)
preds_3 = forest_model_3.predict(X_test)
print('R2_score = %.2f' % r2_score(y_test, preds_3))

R2_score = 0.89


In [None]:
preds = np.exp(preds)-1
y_test = np.exp(y_test) -1

In [None]:
% matplotlib inline

f, ax = plt.subplots(1, figsize=(10, 10))
plt.title("Predicted vs Observed Rent per Sqft", size = 30,color="b",alpha=1)

ax = sns.regplot(y = preds, x = y_test, \
                scatter_kws={"color":"darkred","alpha":0.3,"s":1})

ax.set_xlabel("Observed Values",size = 20,color="b",alpha=0.5)
ax.set_ylabel("Predicted Values",size = 20,color="b",alpha=0.5)

## Feature Importance

In [None]:
sorted(zip(map(lambda x: round(x, 3), forest_model.feature_importances_), X_train.columns), reverse=True)[:15]

In [None]:
coef = pd.Series(map(lambda x: round(x, 3), forest_model.feature_importances_), index = X_train.columns)
#print("Random Forest picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

In [None]:
% matplotlib inline

from matplotlib.pyplot import *
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)

imp_coef = coef.sort_values().tail(10)
imp_coef.plot(kind = "barh")

plt.title("Coefficients in the Random Forest Model")
plt.show()

### Try simple regression tree rather than random forest

In [None]:
df_small = df[['rent_sqft','res_sqft_per_unit','rich_1500_walk','avg_income_25000','bedrooms','units_mf_25000',\
         'avg_income_10000','jobs_25000','renters_25000','pop_white_2500_walk','units_25000','pop_asian_25000',
        'jobs_10000','hh_25000','jobs_1500_walk','pop_black_10000']]

In [None]:
#creating matrices for sklearn:
X = df_small.loc[:, df_small.columns != 'rent_sqft']
y = np.log1p(df_small.rent_sqft)

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor

dtr_1 = DecisionTreeRegressor(max_depth=2)
dtr_2 = DecisionTreeRegressor(max_depth=4)

dtr_1= dtr_1.fit(X_train, y_train)
dtr_2= dtr_2.fit(X_train, y_train)


preds_1= dtr_1.predict(X_test)
preds_2= dtr_2.predict(X_test)


preds_2_train = dtr_2.predict(X_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
# The mean squared error
meanSquaredError = mean_squared_error(y_test, preds_2)
print('Mean squared error = %.2f' %meanSquaredError)

# rmse
rootMeanSquaredError = sqrt(meanSquaredError)
print('RMSE = %.2f' %rootMeanSquaredError)

# Explained variance score: 1 is perfect prediction
print('R2_score = %.2f' % r2_score(y_test, preds_2))
print('R2_score = %.2f' % r2_score(y_train, preds_2_train))

In [None]:
# Plot the results
plt.figure()
plt.scatter(X_test['res_sqft_per_unit'], y_test, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(X_test['res_sqft_per_unit'], preds_1, color="cornflowerblue",
         label="max_depth=2", linewidth=2)
plt.plot(X_test['res_sqft_per_unit'], preds_2, color="yellowgreen", label="max_depth=5", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

In [None]:


plt.figure(figsize=(8,6))
plt.plot(X_test['res_sqft_per_unit'], y_test, 'o', label='observation')
plt.plot(X_test['res_sqft_per_unit'], preds_2, linewidth=4, alpha=.7, label='prediction')
plt.xlabel('temperature')
plt.ylabel('bikes')
plt.legend()
plt.show()

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(dtr_2, out_file='tree.dot',feature_names=['res_sqft_per_unit','rich_1500_walk','avg_income_25000','bedrooms','units_mf_25000',\
         'avg_income_10000','jobs_25000','renters_25000','pop_white_2500_walk','units_25000','pop_asian_25000',
        'jobs_10000','hh_25000','jobs_1500_walk','pop_black_10000'])

In [None]:
!dot -Tpng tree.dot > tree.png # to convert the tree in a png image


In [None]:
from IPython.display import Image
Image(filename='tree.png')

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(dtr_2, out_file=dot_data,feature_names=['res_sqft_per_unit','rich_1500_walk','avg_income_25000','bedrooms','units_mf_25000',\
         'avg_income_10000','jobs_25000','renters_25000','pop_white_2500_walk','units_25000','pop_asian_25000',
        'jobs_10000','hh_25000','jobs_1500_walk','pop_black_10000']  
                ,filled=True, rounded=True,
                special_characters=True, max_depth=None)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())