## HLCM Experiment
Arezoo Besharati, UrbanSim, July 2018 


In [None]:
import os

# Work from two directories above the notebook's starting directory. The path
# is relative, so re-running this cell without restarting the kernel moves up
# two more levels each time.
os.chdir('../../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca
import seaborn as sns
%matplotlib notebook

### Load data

In [None]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [None]:
#mm.list_steps()

In [None]:
#mm.initialize()

In [None]:
# Show what is currently registered with Orca. Jupyter only echoes the last
# expression of a cell, so both lists are printed explicitly; otherwise the
# table list would be computed and silently thrown away.
print('tables:', orca.list_tables())
print('broadcasts:', orca.list_broadcasts())
#orca.list_steps()

### Load accessibility vars

In [None]:
%%capture
# Execute the `initialize_network_walk` Orca step (presumably registered by the
# script imports earlier in the notebook); this cell's %%capture hides its output.
orca.run(steps=["initialize_network_walk"])

In [None]:
%%capture
# Execute the `initialize_network_small` Orca step (presumably registered by the
# script imports earlier in the notebook); this cell's %%capture hides its output.
orca.run(steps=["initialize_network_small"])

### Explore the data

In [None]:
# Snapshot the registered `buildings` table as a DataFrame and list its columns.
bld = orca.get_table('buildings').to_frame()
list(bld.columns)

In [None]:
# Snapshot the registered `households` table as a DataFrame and list its columns.
hh = orca.get_table('households').to_frame()
list(hh.columns)

### Some Data Processing

In [None]:
# Express household income in thousands.
# Bracket assignment is required to create the column: `hh.income_cl = ...`
# only sets a Python attribute on the DataFrame, and the pandas warning about
# it is hidden because warnings are globally ignored in this notebook.
hh['income_cl'] = hh['income'] / 1000

# Overwrite the registered `income` column with the rescaled values.
# NOTE(review): `hh` is a snapshot taken before this update; re-fetching it and
# re-running this cell would divide by 1000 a second time.
# NOTE(review): thresholds applied to `income` elsewhere in the notebook (e.g.
# the chooser filter of the model) now see the rescaled values - confirm they
# are expressed in the same unit.
orca.get_table('households').update_col_from_series('income', hh['income_cl'])

In [None]:
# First few values of the household-level building type code.
hh['building_type'].head()

In [None]:
# Cast the building type code to integers, then split the households into the
# code-2 group (named single family here) and everything else (multi family).
hh['building_type'] = hh['building_type'].astype(int)
is_single_family = hh['building_type'] == 2
hh_singlefamily = hh[is_single_family]
hh_multifamily = hh[~is_single_family]

In [None]:
# Histogram and selected quantiles of income for single-family households.
# The plot gets its own figure: under the interactive `%matplotlib notebook`
# backend a bare `.plot()` draws onto whichever figure is still active from an
# earlier cell instead of starting a new one.
fig, ax = plt.subplots()
hh_singlefamily['income'].plot(kind='hist', bins=100, ax=ax)
ax.set(title='Single-family household income', xlabel='income')
print(hh_singlefamily['income'].quantile([0.10, 0.5, 0.9, 0.99, 1]))

In [None]:
# NOTE(review): this cell repeats the single-family income histogram of the
# preceding cell verbatim - confirm whether it is still needed.
# The plot gets its own figure: under the interactive `%matplotlib notebook`
# backend a bare `.plot()` would otherwise be drawn on top of the figure that
# is still active from the earlier cell.
fig, ax = plt.subplots()
hh_singlefamily['income'].plot(kind='hist', bins=100, ax=ax)
ax.set(title='Single-family household income', xlabel='income')
print(hh_singlefamily['income'].quantile([0.10, 0.5, 0.9, 0.99, 1]))

In [None]:
%matplotlib notebook
# Box plot of income for the single-family households.
single_family_income = hh_singlefamily['income']
ax = sns.boxplot(x=single_family_income, palette="Set2", width=0.5)


In [None]:
%matplotlib notebook
# Box plot of income for the multi-family households.
multi_family_income = hh_multifamily['income']
ax = sns.boxplot(x=multi_family_income, palette="Set2", width=0.5)

In [None]:
# Histogram and selected quantiles of income for multi-family households.
# The plot gets its own figure: under the interactive `%matplotlib notebook`
# backend a bare `.plot()` would otherwise be drawn onto the box plot figure
# that is still active from the preceding cell.
fig, ax = plt.subplots()
hh_multifamily['income'].plot(kind='hist', bins=100, ax=ax)
ax.set(title='Multi-family household income', xlabel='income')
print(hh_multifamily['income'].quantile([0.10, 0.5, 0.9, 0.99, 1]))

In [None]:
# Derived 0/1 indicator columns, each stored on the local frame and then
# registered with Orca. Bracket assignment is used throughout because
# `frame.new_name = series` does not create a column - it only sets a Python
# attribute on the DataFrame, and the pandas warning about it is hidden by the
# notebook-wide `warnings.simplefilter('ignore')`.

# Buildings: 1 when building_type_id == 1 (treated as single family here).
bld['is_building_type_id_1'] = (bld['building_type_id'] == 1).astype(int)
orca.add_column('buildings', 'is_building_type_id_1', bld['is_building_type_id_1'])

# Buildings: 1 when the year built is after 1950.
bld['is_year_built_after_1950'] = (bld['year_built'] > 1950).astype(int)
orca.add_column('buildings', 'is_year_built_after_1950', bld['is_year_built_after_1950'])

# Buildings: 1 when the building has fewer than 5 stories.
bld['is_stories_less_than_5'] = (bld['stories'] < 5).astype(int)
orca.add_column('buildings', 'is_stories_less_than_5', bld['is_stories_less_than_5'])

# Households: 1 when race_of_head == 1 (white) and 1 when hispanic_head == 'yes'.
hh['white'] = (hh['race_of_head'] == 1).astype(int)
hh['hisp'] = (hh['hispanic_head'] == 'yes').astype(int)

# The Hispanic dummy is kept locally as `hisp` but registered as `hispanic`.
orca.add_column('households', 'white', hh['white'])
orca.add_column('households', 'hispanic', hh['hisp'])


In [None]:
# Snapshot both node tables as DataFrames and print the columns each provides.
nodeswalk, nodessmall = (
    orca.get_table(table_name).to_frame()
    for table_name in ('nodeswalk', 'nodessmall')
)
for node_frame in (nodeswalk, nodessmall):
    print(node_frame.columns.tolist())

In [None]:

#np.log1p(nodeswalk.pop_white_500_walk[nodeswalk['pop_white_500_walk']>0]).plot(kind='hist', bins= 100)
#nodeswalk.pop_white_500_walk.quantile([0.10,0.5,0.9,0.99,1])
#np.log(nodeswalk.pop_white_1500_walk+1).plot(kind='hist', bins= 100)
# nodeswalk.pop_1500_walk.plot(kind='hist',bins=100)
# nodeswalk.pop_1500_walk.quantile([0.10,0.5,0.9,0.99,1])
#np.cbrt(nodeswalk.pop_1500_walk).plot(kind='hist', bins= 100)
#np.log1p(nodeswalk.pop_1500_walk).plot(kind='hist', bins= 100)
# hh.building_type.value_counts().plot(kind='bar')
# hh.building_type.value_counts()

# bld.building_type_id.value_counts().sort_index().plot(kind='bar')
# bld.building_type_id.value_counts().sort_index()

#bld.stories.value_counts().plot(kind='bar')
#bld.stories.value_counts()

## Get the predicted price from REPM

In [None]:
#orca.run(['hedonic_rent_sqft'])

## Model Estimation

In [None]:
%%time
# Configure and fit a LargeMultinomialLogitStep named 'hlcm': households are
# the choosers, and the alternatives combine the buildings, parcels and both
# node tables.
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
# Column that holds each chooser's observed choice.
m.choice_column = 'building_id'
# presumably the number of alternatives sampled per chooser - confirm against
# the urbansim_templates documentation
m.alt_sample_size = 50

# Restrict the choosers to low household ids, recent movers and a bounded income.
# NOTE(review): the registered `income` column is divided by 1000 earlier in
# this notebook, so the 600000 bound applies to rescaled values - confirm.
m.chooser_filters = ['household_id <50000 & recent_mover == 1 & 0 <income < 600000']

# Restrict the alternatives to buildings with residential units, positive
# size/stories and bounded neighbourhood income and population.
m.alt_filters = ['residential_units!=0',
                 '0 < avg_income_500_walk < 300000',
                 '0 < pop_1500_walk < 50000',
                 '0 < pop_white_1500_walk < 30000',
                 'sqft_per_unit > 0',
                 'stories > 0']

# Utility specification. `a:b` terms are interactions between an alternative
# attribute and a chooser attribute; the trailing `- 1` presumably drops the
# intercept (patsy-style formula - confirm).
# NOTE: the backslash continuations sit inside the string literal, so the
# leading spaces of every continued line are part of the expression text.
m.model_expression = 'np.log1p(res_price_per_sqft)+\
                      np.log(residential_units)+\
                      np.cbrt(pop_1500_walk)+\
                      np.log(avg_income_500_walk):np.log(income)+\
                      np.log1p(pop_hisp_1500_walk)+\
                      np.log1p(pop_black_1500_walk)+\
                      np.log1p(jobs_25000)+\
                      np.log1p(pop_white_1500_walk):white +\
                      is_year_built_after_1950+\
                      is_stories_less_than_5+\
                      np.log1p(sqft_per_unit) - 1'

# Name and tags assigned to the step, then estimate it.
m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

### Check for multicollinearity

In [None]:
# Merge the alternative tables onto `buildings`, then split the result into
# one-unit buildings (single family) and buildings whose unit count is neither
# 0 nor 1 (multi family).
df = orca.merge_tables(
    target='buildings',
    tables=['buildings', 'parcels', 'nodeswalk', 'nodessmall'],
)
residential_units = df['residential_units']
df_sf = df[residential_units == 1]
df_mf = df[~((residential_units == 1) | (residential_units == 0))]

In [None]:
%matplotlib notebook

# Histogram and selected quantiles of the 1500 m walk average income around
# single-family buildings.
sf_avg_income = df_sf['avg_income_1500_walk']
sf_avg_income.plot.hist(bins=100)
plt.show()
print(sf_avg_income.quantile([0.10, 0.5, 0.9, 0.99, 1]))

In [None]:
%matplotlib notebook

# Histogram and selected quantiles of the 1500 m walk average income around
# multi-family buildings.
mf_avg_income = df_mf['avg_income_1500_walk']
mf_avg_income.plot.hist(bins=100)
plt.show()
print(mf_avg_income.quantile([0.10, 0.5, 0.9, 0.99, 1]))

In [None]:
%matplotlib inline
# Heat map of the single-family columns that are almost perfectly correlated
# (|r| > 0.95) with at least one other column.

# Pairwise correlations. Only numeric/boolean columns are used so that string
# columns in the merged frame cannot make `corr()` fail on newer pandas.
corr_matrix = df_sf.select_dtypes(include=['number', 'bool']).corr()

# Keep the strict upper triangle so every pair is inspected once.
# The builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Compare absolute values: a correlation near -1 is as collinear as one near +1.
high_corr = [column for column in upper.columns if (upper[column].abs() > 0.95).any()]
X = df_sf[high_corr]

# An empty selection cannot be drawn as a heat map, so report it instead.
if high_corr:
    fig, ax = plt.subplots(figsize=(20, 20))
    sns.heatmap(X.corr(), annot=True, cmap="RdYlGn", ax=ax)
else:
    print('No single-family columns exceed the 0.95 correlation threshold.')

In [None]:
%matplotlib inline
# Heat map of the multi-family columns that are almost perfectly correlated
# (|r| > 0.95) with at least one other column.

# Pairwise correlations. Only numeric/boolean columns are used so that string
# columns in the merged frame cannot make `corr()` fail on newer pandas.
corr_matrix = df_mf.select_dtypes(include=['number', 'bool']).corr()

# Keep the strict upper triangle so every pair is inspected once.
# The builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# Compare absolute values: a correlation near -1 is as collinear as one near +1.
high_corr = [column for column in upper.columns if (upper[column].abs() > 0.95).any()]
X = df_mf[high_corr]

# A single figure is created (the original opened two, leaving one blank), and
# an empty selection is reported instead of being passed to the heat map.
if high_corr:
    fig, ax = plt.subplots(figsize=(20, 20))
    sns.heatmap(X.corr(), annot=True, cmap="RdYlGn", ax=ax)
else:
    print('No multi-family columns exceed the 0.95 correlation threshold.')

In [None]:
%matplotlib inline
# Correlation heat map restricted to a hand-picked set of columns from the
# merged buildings frame.
Selected_features = [
    'res_price_per_sqft',
    'residential_units',
    'pop_500_walk',
    'jobs_1500_walk',
    'pop_white_500_walk',
    'avg_income_500_walk',
    'pop_black_1500_walk',
    'sqft_per_unit',
    'jobs_25000',
]
X = df.loc[:, Selected_features]

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn", ax=ax)
plt.show()