## HLCM 2 - with saved network vars
Arezoo Besharati, UrbanSim, July 2018 


In [None]:
import os; os.chdir('../../')
import numpy as np, pandas as pd 
import matplotlib.pyplot as plt
import warnings;
warnings.simplefilter('ignore')
%load_ext autoreload
%autoreload 2

In [None]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca
import seaborn as sns
%matplotlib notebook

In [None]:
# Show floats with two decimals in every pandas display in this notebook.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Load data

In [None]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [None]:
#mm.list_steps()

In [None]:
#mm.initialize()

In [None]:
# Display the names of the tables currently registered with Orca
# (registered by the `scripts` modules imported above).
orca.list_tables()
#orca.list_broadcasts()
#orca.list_steps()

### Load accessibility vars

In [None]:
%%capture
# Run the Orca step named "initialize_network_walk"; %%capture hides its output.
# NOTE(review): presumably this is what provides the 'nodeswalk' table read
# further down — confirm in scripts/models.
orca.run(["initialize_network_walk"])

In [None]:
%%capture
# Run the Orca step named "initialize_network_small"; %%capture hides its output.
# NOTE(review): presumably this is what provides the 'nodessmall' table read
# further down — confirm in scripts/models.
orca.run(["initialize_network_small"])

### Explore the data

In [None]:
# Materialise the Orca 'parcels' table as a DataFrame and list its columns.
parcel = orca.get_table('parcels').to_frame()
parcel.columns

In [None]:
# Materialise the Orca 'buildings' table as a DataFrame and list its columns.
bld = orca.get_table('buildings').to_frame()
bld.columns

In [None]:
bld.head()

In [None]:
# Total residential units per building type.
bld.groupby('building_type_id')['residential_units'].sum()

In [None]:
# Summary statistics, one row per column.
bld.describe().transpose()

In [None]:
# 99th percentile of every column, to spot extreme values.
bld.quantile(.99)

In [None]:
# Region-wide totals of units and floor space.
bld[['residential_units','residential_sqft', 'non_residential_sqft']].sum()

In [None]:
# Number of buildings per building type.
bld.building_type_id.value_counts()

In [None]:
# Building-type dummies (0/1) derived from building_type_id.
# NOTE(review): the id mapping (1 -> single family, 3 -> multi family,
# > 3 -> mixed use) is carried over unchanged — confirm against the data
# dictionary.
building_dummies = {
    'single_family': (bld['building_type_id'] == 1).astype(int),
    'multi_family': (bld['building_type_id'] == 3).astype(int),
    'mixed_use': (bld['building_type_id'] > 3).astype(int),
}

for col_name, col_values in building_dummies.items():
    # Bracket assignment creates a real column. `bld.<name> = ...` only sets a
    # Python attribute on the DataFrame object when the column does not exist
    # yet, and pandas' warning about it is silenced at the top of the notebook.
    bld[col_name] = col_values
    # Register the dummy on the Orca 'buildings' table.
    orca.add_column('buildings', col_name, col_values)

In [None]:
# Re-read 'buildings' from Orca so the frame reflects the columns registered
# on the table in the previous cell.
bld = orca.get_table('buildings').to_frame()
bld.head()

In [None]:
# Materialise the Orca 'households' table and list its columns.
hh = orca.get_table('households').to_frame()
hh.columns.tolist()

In [None]:
# Household-level variables: income in thousands plus 0/1 dummies.
# NOTE(review): the race_of_head codes (1, 2, 6) and the 'yes' label of
# hispanic_head are carried over unchanged — confirm against the data
# dictionary.
household_vars = {
    'income_k': hh['income'] / 1000,
    'white': (hh['race_of_head'] == 1).astype(int),
    'black': (hh['race_of_head'] == 2).astype(int),
    'asian': (hh['race_of_head'] == 6).astype(int),
    'hispanic': (hh['hispanic_head'] == 'yes').astype(int),
    'elderly': (hh['age_of_head'] > 65).astype(int),
    'single': (hh['persons'] == 1).astype(int),
}

for col_name, col_values in household_vars.items():
    # Bracket assignment creates a real column. `hh.<name> = ...` only sets a
    # Python attribute on the DataFrame object when the column does not exist
    # yet. The local frame now uses the same names as the Orca columns
    # ('hispanic' rather than the former 'hisp' attribute).
    hh[col_name] = col_values
    # Register the variable on the Orca 'households' table.
    orca.add_column('households', col_name, col_values)



In [None]:
#bld.building_type_id.value_counts()

In [None]:
# Materialise both node-level tables and print their column names.
nodeswalk = orca.get_table('nodeswalk').to_frame()
nodessmall = orca.get_table('nodessmall').to_frame()
print(nodeswalk.columns.tolist())
print(nodessmall.columns.tolist())

In [None]:
# Summary statistics, one row per column.
nodessmall.describe().transpose()

In [None]:
# Per-column 99th percentile; used below as the upper clipping threshold.
nodessmall_upper = nodessmall.quantile(.99)
nodessmall_upper

In [None]:
# Cap every column at its own 99th percentile (axis=1 aligns the threshold
# Series with the column labels).
# DataFrame.clip_upper was deprecated in pandas 0.24 and removed in 1.0;
# clip(upper=...) is the equivalent call and also works on older versions.
nodessmall_clipped = nodessmall.clip(upper=nodessmall_upper, axis=1)
nodessmall_clipped.describe().transpose()

In [None]:
# Register the clipped frame under the existing 'nodessmall' table name.
orca.add_table('nodessmall', nodessmall_clipped)

In [None]:
# Summary statistics, one row per column.
nodeswalk.describe().transpose()

In [None]:
# Per-column 99th percentile; used below as the upper clipping threshold.
nodeswalk_upper = nodeswalk.quantile(.99)
nodeswalk_upper

In [None]:
# Cap every column at its own 99th percentile (axis=1 aligns the threshold
# Series with the column labels).
# DataFrame.clip_upper was deprecated in pandas 0.24 and removed in 1.0;
# clip(upper=...) is the equivalent call and also works on older versions.
nodeswalk_clipped = nodeswalk.clip(upper=nodeswalk_upper, axis=1)
nodeswalk_clipped.describe().transpose()

In [None]:
# Composition shares for the `_500_walk` and `_1500_walk` variables.
# Household-based shares divide by hh_*; population-based shares divide by
# pop_*.
#
# Each share is stored on `nodeswalk_clipped` as well as on `nodeswalk`.
# `nodeswalk_clipped` was built before this cell and is the frame registered
# with Orca afterwards, so columns added only to `nodeswalk` would never reach
# the 'nodeswalk' Orca table.
# The shares are computed from the unclipped counts, so numerator and
# denominator stay consistent with each other.
# NOTE(review): nodes with a zero denominator yield NaN/inf — decide whether
# these should be filled before estimation.
household_based = [('singles', 'singles'), ('elderly', 'elderly_hh')]
population_based = [
    ('black', 'pop_black'),
    ('white', 'pop_white'),
    ('asian', 'pop_asian'),
    ('hisp', 'pop_hisp'),
    ('rich', 'rich'),
    ('poor', 'poor'),
]

for radius in ('500', '1500'):
    suffix = '_%s_walk' % radius
    households = nodeswalk['hh' + suffix]
    population = nodeswalk['pop' + suffix]

    # NOTE(review): this is a 0/1 "any children" flag divided by the household
    # count, not children / households as the column name suggests. Kept
    # exactly as in the original — confirm the intended definition.
    shares = [(
        'prop_children' + suffix,
        (nodeswalk['children' + suffix] > 0).astype(int) / households,
    )]
    shares += [
        ('prop_%s%s' % (label, suffix), nodeswalk[source + suffix] / households)
        for label, source in household_based
    ]
    shares += [
        ('prop_%s%s' % (label, suffix), nodeswalk[source + suffix] / population)
        for label, source in population_based
    ]

    for col_name, col_values in shares:
        nodeswalk[col_name] = col_values
        nodeswalk_clipped[col_name] = col_values

In [None]:
# Register the clipped frame under the existing 'nodeswalk' table name.
orca.add_table('nodeswalk', nodeswalk_clipped)

In [None]:
#bld.stories.value_counts().plot(kind='bar')
#bld.stories.value_counts()

## Get the predicted price from REPM

In [None]:
#orca.run(['hedonic_rent_sqft'])

## Check Distributions

In [None]:
# Split households into those living in building type 2 (treated as
# single-family in this notebook) and all others.
hh['building_type'] = hh['building_type'].astype(int)
in_single_family = hh['building_type'] == 2
hh_singlefamily = hh[in_single_family]
hh_multifamily = hh[~in_single_family]

In [None]:
# 99th percentile of income among single-family households.
hh_income_upper = hh_singlefamily.income.quantile(0.99)
# NOTE(review): the threshold comes from single-family households, but the
# count below is taken over ALL households in `hh` — confirm this is intended.
len(hh[hh['income']>hh_income_upper])

In [None]:
# Income histogram and selected quantiles for single-family households.
hh_singlefamily.income.plot(kind='hist',bins=100)
print(hh_singlefamily.income.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
# NOTE(review): exact duplicate of the previous cell.
hh_singlefamily.income.plot(kind='hist',bins=100)
print(hh_singlefamily.income.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
%matplotlib notebook
# Income box plot, single-family households.
ax = sns.boxplot(x = hh_singlefamily.income,palette="Set2", width=0.5)


In [None]:
%matplotlib notebook
# Income box plot, households in all other building types.
ax = sns.boxplot(x = hh_multifamily.income,palette="Set2", width=0.5)

In [None]:
%matplotlib notebook
# Income histogram and selected quantiles, households in all other building types.
hh_multifamily.income.plot(kind='hist',bins=100)
print(hh_multifamily.income.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
%matplotlib notebook
# Income histogram and selected quantiles, single-family households
# (same content as the first histogram cell of this section).
hh_singlefamily.income.plot(kind='hist',bins=100)
print(hh_singlefamily.income.quantile([0.10,0.5,0.9,0.99,1]))

## Check for Multicollinearity

In [None]:
# Merge buildings with their parcel and network-node attributes, then split
# the result by unit count: exactly one unit vs. more than one unit.
df = orca.merge_tables(
    target='buildings',
    tables=['buildings', 'parcels', 'nodeswalk', 'nodessmall'],
)
#df.columns.tolist()
unit_count = df['residential_units']
df_sf = df[unit_count == 1]
df_mf = df[unit_count > 1]

In [None]:
%matplotlib notebook
# Histogram and selected quantiles of avg_income_1500_walk for buildings with
# exactly one residential unit.
df_sf.avg_income_1500_walk.plot(kind='hist',bins=100)
plt.show()
print(df_sf.avg_income_1500_walk.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
%matplotlib notebook

# Histogram and selected quantiles of avg_income_1500_walk for buildings with
# more than one residential unit.
df_mf.avg_income_1500_walk.plot(kind='hist',bins=100)
plt.show()
print(df_mf.avg_income_1500_walk.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
%matplotlib inline
plt.subplots(figsize=(20, 20))


# Create correlation matrix
corr_matrix = df_sf.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
high_corr = [column for column in upper.columns if any(upper[column] > 0.98)]
X= df_sf[high_corr]
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")

In [None]:
%matplotlib inline
plt.subplots(figsize=(20, 20))

# Create correlation matrix
corr_matrix = df_mf.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
high_corr = [column for column in upper.columns if any(upper[column] > 0.95)]
X= df_mf[high_corr]
plt.subplots(figsize=(20, 20))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")

## Model Estimation

In [None]:
%%time
# Location choice model for households in building type 2 (the single-family
# group used in the distribution checks above): households choose a building
# among alternatives built from buildings merged with parcels and both node
# tables.
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
# NOTE(review): presumably the number of alternatives sampled per chooser —
# confirm in the urbansim_templates documentation.
m.alt_sample_size = 50

#Filters on choosers
# Recent movers in building type 2 with positive income below 1,000,000,
# restricted to household_id < 3,000,000.
m.chooser_filters = ['building_type == 2 & household_id <3000000 & recent_mover == 1 & 0 <income < 1000000']

#Filters on alternatives
# Buildings with exactly one unit, a positive average income below 500,000
# for the `_500_walk` variable, and a positive unit size.
m.alt_filters = ['residential_units == 1',
                 '0 < avg_income_500_walk < 500000',
                 'sqft_per_unit > 0']

# Terms tried earlier and currently left out of the expression:
# np.log(residential_units) +
# np.log(income):np.log(avg_income_500_walk) + \
# np.log1p(income):np.log1p(rich_1500_walk) + \
# np.log1p(income):np.log1p(poor_1500_walk) + \
# np.log1p(persons):np.log1p(sqft_per_unit) + \

# `a:b` terms are chooser-by-alternative interactions; the trailing `- 1`
# drops the intercept.
# NOTE(review): `children` is not created anywhere in this notebook;
# presumably it is an existing households column — confirm.
m.model_expression = ' \
np.log1p(res_price_per_sqft) + \
np.log1p(income):np.log1p(res_price_per_sqft) + \
np.log1p(sqft_per_unit) + \
np.log1p(income):np.log1p(sqft_per_unit) + \
np.log1p(acres) + \
np.log1p(income):np.log1p(acres) + \
np.log1p(pop_1500_walk) + \
np.log1p(jobs_1500_walk) + \
np.log1p(jobs_25000) + \
single:np.log1p(singles_1500_walk) + \
elderly:np.log1p(elderly_hh_1500_walk) + \
children:np.log1p(children_1500_walk) + \
white:np.log1p(pop_white_1500_walk) + \
black:np.log1p(pop_black_1500_walk) + \
asian:np.log1p(pop_asian_1500_walk) + \
hispanic:np.log1p(pop_hisp_1500_walk) \
- 1'

# NOTE(review): the multi-family model in the next estimation cell uses the
# same name 'hlcm'; give the two distinct names before registering either.
m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
%%time
# Location choice model for households in building types above 2: households
# choose a building among alternatives built from buildings merged with
# parcels and both node tables. Rebinding `m` replaces the model object from
# the previous estimation cell.
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
# NOTE(review): presumably the number of alternatives sampled per chooser —
# confirm in the urbansim_templates documentation.
m.alt_sample_size = 50

#Filters on choosers
# Recent movers in building types > 2 with positive income below 600,000,
# restricted to household_id < 1,000,000.
m.chooser_filters = ['building_type > 2 & household_id <1000000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
# Buildings with more than one unit, a positive average income below 500,000
# for the `_500_walk` variable, and a positive unit size.
m.alt_filters = ['residential_units > 1',
                 '0 < avg_income_500_walk < 500000',
                 'sqft_per_unit > 0']

# np.log(residential_units) +

# `a:b` terms are chooser-by-alternative interactions; the trailing `- 1`
# drops the intercept.
# NOTE(review): the expression takes np.log(avg_income_1500_walk), but the
# alternative filter only guarantees avg_income_500_walk > 0 — confirm that
# the 1500 variable cannot be zero for the filtered rows.
m.model_expression = ' np.log(residential_units) + \
np.log1p(res_price_per_sqft) + \
np.log1p(sqft_per_unit) + \
np.log1p(income):np.log1p(sqft_per_unit) + \
np.log1p(jobs_1500_walk) + \
np.log1p(jobs_25000) + \
np.log(income):np.log(avg_income_1500_walk) + \
np.log1p(pop_1500_walk) + \
white:np.log1p(pop_white_1500_walk) + \
black:np.log1p(pop_black_1500_walk) + \
asian:np.log1p(pop_asian_1500_walk) + \
hispanic:np.log1p(pop_hisp_1500_walk) \
- 1'

# NOTE(review): same name as the single-family model estimated in the previous
# cell; give the two distinct names before registering either.
m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
## Check for multicollinearity

In [None]:
# Rebuild the merged buildings / parcels / node-variable frame.
# NOTE(review): the following cell reads `df_mf`, which was derived from the
# earlier merge, not from this `df` — confirm whether df_mf should be rebuilt.
df = orca.merge_tables(target = 'buildings', tables = ['buildings','parcels','nodeswalk','nodessmall'])


In [None]:
# Alternative-side variables to check for multicollinearity among the
# multi-unit buildings.
# 'pop_black_1500_walk' was listed twice in the original, which produced a
# duplicated row and column in the heatmap; each variable now appears once.
# NOTE(review): the list has 'pop_500_walk' while the model expressions use
# pop_1500_walk — confirm which one is meant.
Selected_features = [
    'residential_units', 'res_price_per_sqft', 'pop_500_walk',
    'jobs_1500_walk', 'singles_1500_walk', 'elderly_hh_1500_walk',
    'children_1500_walk', 'avg_income_1500_walk', 'pop_black_1500_walk',
    'sqft_per_unit', 'jobs_25000', 'pop_white_1500_walk',
    'pop_asian_1500_walk', 'pop_hisp_1500_walk',
]

X = df_mf[Selected_features]

# Annotated correlation heatmap of the selected variables.
plt.subplots(figsize=(12, 12))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")
plt.show()