## HLCM - Data Preprocessing & Model Estimation

Arezoo Besharati, UrbanSim, June 2018 


In [1]:
# Switch the working directory to two levels above the notebook (presumably the
# project root, so that the `scripts` package imported further down resolves).
# The target is resolved to an absolute path once and remembered: re-running
# this cell in the same kernel no longer climbs two more directories each time.
import os
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('../../')
os.chdir(_PROJECT_ROOT)

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew

# NOTE(review): this silences *all* warnings, including pandas deprecation
# warnings about the data-cleaning code below.
import warnings
warnings.simplefilter('ignore')

In [2]:
class color:
    """ANSI escape sequences used to highlight text printed by this notebook."""

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'

    # Emphasis on / styling off: append END after any of the codes above.
    BOLD = '\033[1m'
    END = '\033[0m'

In [3]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca

### Load data

In [4]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [5]:
# Show the names of the tables currently registered with Orca.
orca.list_tables()

['parcels', 'buildings', 'rentals', 'units', 'households', 'persons', 'jobs']

### Generate Node variables

In [None]:
# Run the Orca step registered under the name 'initialize_network'
# (presumably defined in scripts.models, imported earlier — TODO confirm).
orca.run(['initialize_network'])

In [None]:
# Run the Orca step registered under the name 'network_aggregations'.
# NOTE(review): the 'nodes' table read later in this notebook is not in the
# initial table list, so it is presumably created by this step — confirm.
orca.run(['network_aggregations'])

In [None]:
# For every registered Orca table: its name in upper case, then the list of
# its column names, then a blank separator line.
for table_name in orca.list_tables():
    print(table_name.upper())
    column_names = orca.get_table(table_name).to_frame().columns.tolist()
    print(column_names, end='\n\n')

In [None]:
#orca.list_broadcasts()

## Data Cleaning

 - Handling missing values
 - Checking the data types
 - Normalization: check the feature distributions (skewness)
 - Scaling: check the units of the data
 - Create dummy variables if needed


## 1. Buildings Table

### 1. 1. Missing values and data types

In [None]:
# Pull the buildings table into a DataFrame and summarise its data quality.
bld = orca.get_table('buildings').to_frame()

# 1) Is there at least one missing value anywhere in the frame?
if bld.isnull().values.any():
    print(color.RED + 'Data has missing values' + color.END)
else:
    print('Data doesnt have missing values')
print('')

# 2) Which columns contain missing values?
bld_cols_with_nan = bld.columns[bld.isna().any()].tolist()
print(color.BOLD + 'Columns with missing values :\n' + color.END + '{}'.format(bld_cols_with_nan))
print('')

# 3) Which columns are stored as object (string) dtype and may need casting to int/float?
bld_object_dtypes = bld.dtypes[bld.dtypes == "object"]
print(color.BOLD + 'String Features: \n' + color.END + '{}'.format(bld_object_dtypes))


`costar_rent` is stored as a string (object) column but should be numeric.

In [None]:
# Convert costar_rent from string to numeric and update the column.
# `errors='coerce'` turns every string that cannot be parsed into NaN, so count
# how many non-null values are lost that way; otherwise they are
# indistinguishable from genuinely missing rents in the summary below.
costar_rent_raw = bld['costar_rent']
bld['costar_rent'] = pd.to_numeric(costar_rent_raw, errors='coerce')
n_coerced = int((costar_rent_raw.notna() & bld['costar_rent'].isna()).sum())
print('costar_rent values that could not be parsed as numbers: {}'.format(n_coerced))

# Take a count of missing values. What proportion of the data is missing?
missing_values_count_bld = bld[['redfin_sale_price', 'redfin_sale_year', 'redfin_home_type', 'costar_property_type', 'costar_rent', 'node_id']].isnull().sum()
print ('')
print(color.BOLD + 'BUILDINGS' + color.END + '\nCount of missing points: \n{}'.format (missing_values_count_bld))
print('Proportion of missing points: \n{}'.format (missing_values_count_bld/ len (bld.costar_rent)))
print ('')


- 99% of `costar_rent` is missing, so this feature should not be used.
- About 40% of the `redfin_sale_price` and `redfin_sale_year` values are missing.

In [None]:
# Keep only buildings where both type fields are known: any row missing
# "redfin_home_type" or "costar_property_type" is removed.
required_type_cols = ['redfin_home_type', 'costar_property_type']
bld = bld.dropna(subset=required_type_cols)

# Re-register the reduced frame as the Orca 'buildings' table.
orca.add_table('buildings', bld)

In [None]:
#bld.redfin_home_type.value_counts()

In [None]:
#bld = bld[(bld.redfin_home_type !='Ranch')&(bld.redfin_home_type !='Timeshare')&(bld.redfin_home_type !='Other')& (bld.redfin_home_type !='Unknown')]

### 1. 2. Normalization

In [None]:
# Every column whose dtype is not object is treated as numeric here.
is_non_object = bld.dtypes != "object"
numeric_feats = bld.dtypes[is_non_object].index

# Skewness per column, computed on the non-missing values only;
# keep the features whose skewness exceeds 1.
skewed_feats = bld[numeric_feats].apply(lambda col: skew(col.dropna()))
above_threshold = skewed_feats > 1
skewed_feats = skewed_feats[above_threshold]
print(color.BOLD + 'Skewed features are as follows' + color.END + '\n{}'.format(skewed_feats))

In [None]:
# Hand-picked subset of the skewed features to log-transform in the next cell.
# Selection is by label, so every name must be present in `skewed_feats`.
features_to_log = [
    'improvement_value',
    'residential_sqft',
    'sqft_per_unit',
    'non_residential_sqft',
    'building_sqft',
    'nonres_rent_per_sqft',
    'res_price_per_sqft',
    'redfin_sale_price',
]
skewed_feats = skewed_feats.loc[features_to_log]

In [None]:
# From here on `skewed_feats` holds only the feature names (the Series index).
skewed_feats = skewed_feats.index

# Replace each selected column with log(1 + x).
log_values = np.log1p(bld[skewed_feats])
bld[skewed_feats] = log_values

# Re-register the transformed frame as the Orca 'buildings' table.
orca.add_table('buildings', bld)

_____________________________________________________________________________________________________

## 2. Households Table

### 2. 1. Missing values and data types

In [None]:
# Pull the households table into a DataFrame and summarise its data quality.
households = orca.get_table('households').to_frame()

# 1) Is there at least one missing value anywhere in the frame?
if households.isnull().values.any():
    print(color.RED + 'Data has missing values' + color.END)
else:
    print('Data doesnt have missing values')
print('')

# 2) Which columns contain missing values?
hh_cols_with_nan = households.columns[households.isna().any()].tolist()
print('Columns with missing values : {}'.format(hh_cols_with_nan))
print('')

# 3) Which columns are stored as object (string) dtype and may need casting to int/float?
hh_object_dtypes = households.dtypes[households.dtypes == "object"]
print(color.BOLD + 'String Features: \n' + color.END + '{}'.format(hh_object_dtypes))

In [None]:
# Count the missing entries in the three columns of interest, then express
# each count as a share of the number of household rows.
hh_audit_cols = ['workers', 'children', 'node_id']
missing_values_count_hh = households[hh_audit_cols].isnull().sum()
n_households = len(households)
print('')
print(color.BOLD + 'HOUSEHOLDS' + color.END + '\nCount of missing points: \n{}'.format(missing_values_count_hh))
print('Proportion of missing points: \n{}'.format(missing_values_count_hh / n_households))

### Handling missing values

#### Option 1: Not using the features with missing values !

#### Option 2: Deleting/ignoring the missing values !!!

In [None]:
##I don't recommend this method at all. But in case one wants to do it 

#households.dropna(axis=0, how='any', inplace = True)

## Update the whole table
#orca.add_table('households', households)

#### Option 3:  Filling in the Value

In [None]:
# Impute missing worker counts with the column mean.
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `households.workers` is chained assignment on
# a temporary Series, which pandas 2.x deprecates and which leaves
# `households` unchanged when Copy-on-Write is enabled.
workers_mean = households['workers'].mean()
households['workers'] = households['workers'].fillna(workers_mean)

# Push the imputed column into the registered Orca 'households' table.
orca.get_table('households').update_col_from_series('workers', households['workers'])

### 2. 2. Normalizing

In [None]:
# Income is the only household feature that gets transformed: log(1 + income).
log_income = np.log1p(households['income'])
households['income'] = log_income

# Push the transformed column into the registered Orca 'households' table.
orca.get_table('households').update_col_from_series('income', households['income'])

_____________________________________________________________________________________________________

## 3. Parcels Table

In [None]:
# Pull the parcels table into a DataFrame and summarise its data quality.
parcels = orca.get_table('parcels').to_frame()

# 1) Is there at least one missing value anywhere in the frame?
if parcels.isnull().values.any():
    print(color.RED + 'Data has missing values' + color.END)
else:
    print('Data doesnt have missing values')
print('')

# 2) Which columns contain missing values?
parcel_cols_with_nan = parcels.columns[parcels.isna().any()].tolist()
print('Columns with missing values : {}'.format(parcel_cols_with_nan))
print('')

# 3) Which columns are stored as object (string) dtype and may need casting to int/float?
parcel_object_dtypes = parcels.dtypes[parcels.dtypes == "object"]
print(color.BOLD + 'String Features: \n' + color.END + '{}'.format(parcel_object_dtypes))
print('')

# 4) Missing-value counts for the location/key columns, and their share of all parcel rows.
parcel_audit_cols = ['x', 'y', 'block_id', 'node_id']
missing_values_count_parcels = parcels[parcel_audit_cols].isnull().sum()
n_parcels = len(parcels)
print(color.BOLD + 'PARCELS' + color.END + '\nCount of missing points: \n {}'.format(missing_values_count_parcels))
print('Proportion of missing points: \n{}'.format(missing_values_count_parcels / n_parcels))

## 4. Node Variables


In [None]:
# Pull the nodes table into a DataFrame and check it for missing values.
nodes = orca.get_table('nodes').to_frame()
if nodes.isnull().values.any():
    print(color.RED + 'Data has missing values' + color.END)
else:
    print('Data doesnt have missing values')
print('')

print('Nodes table shape{}'.format(nodes.shape))
print('')

# Skewness check: every non-object column is treated as numeric, skewness is
# computed on non-missing values, and features above 0.75 are reported.
is_non_object = nodes.dtypes != "object"
numeric_feats = nodes.dtypes[is_non_object].index

skewed_feats = nodes[numeric_feats].apply(lambda col: skew(col.dropna()))
above_threshold = skewed_feats > 0.75
skewed_feats = skewed_feats[above_threshold]
print(color.BOLD + 'Skewed features are as follows' + color.END + '\n{}'.format(skewed_feats))

In [None]:

# Keep only the nodes whose ave_income_500 is strictly positive.
has_positive_income = nodes['ave_income_500'] > 0
nodes = nodes.loc[has_positive_income]

# Re-register the filtered frame as the Orca 'nodes' table and show its new shape.
orca.add_table('nodes', nodes)
print('Nodes table shape{}'.format(nodes.shape))


## Model Estimation

### Renters

In [None]:
# Merge the units, buildings and nodes tables into one frame, with 'units' as the target table.
# NOTE(review): `df` is not referenced by any later cell shown here — presumably kept for inspection.
df = orca.merge_tables(target='units', tables=['units','buildings','nodes'])

In [None]:
# Renter model, baseline specification.
m = LargeMultinomialLogitStep()
# Households choose among units; buildings and nodes are listed alongside units
# (presumably merged onto them to supply the explanatory variables — TODO confirm).
m.choosers = ['households']
m.alternatives = ['units','buildings','nodes']
m.choice_column = 'unit_id'
m.alt_sample_size = 50
# presumably tenure == 2 marks renters (this is the "Renters" section); the
# household_id cut-off keeps only a small subset, apparently for a quick test run.
m.chooser_filters = ['tenure == 2 & household_id <1000']

# NOTE(review): a trailing `- 1` removes the intercept in patsy-style formulas — confirm
# that this is how the template parses the expression.
m.model_expression = 'res_price_per_sqft + population + ave_income_500 + job_500 + renters - 1'

m.name = 'hlcm_renter'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
# Renter model, variant 2: same setup as 'hlcm_renter', but ave_income_500 enters
# only through its interaction with household income (`ave_income_500:income`).
m2 = LargeMultinomialLogitStep()
m2.choosers = ['households']
m2.alternatives = ['units','buildings','nodes']
m2.choice_column = 'unit_id'
m2.alt_sample_size = 50
# Same chooser filter as the baseline renter model.
m2.chooser_filters = ['tenure == 2 & household_id <1000']

m2.model_expression = 'res_price_per_sqft + population + ave_income_500:income + job_500 + renters - 1'

m2.name = 'hlcm_renter_2'
m2.tags = ['arezoo', 'test']
m2.fit()

In [None]:
# Renter model, variant 3: the baseline expression plus a redfin_home_type term.
# NOTE(review): redfin_home_type is presumably a string/categorical column; the
# To Do list at the end of the notebook says it still needs cleaning.
m3 = LargeMultinomialLogitStep()
m3.choosers = ['households']
m3.alternatives = ['units','buildings','nodes']
m3.choice_column = 'unit_id'
m3.alt_sample_size = 50
# Same chooser filter as the baseline renter model.
m3.chooser_filters = ['tenure == 2 & household_id <1000']

m3.model_expression = 'res_price_per_sqft + population + ave_income_500 + job_500+ renters + redfin_home_type - 1'

m3.name = 'hlcm_renter_3'
m3.tags = ['arezoo', 'test']
m3.fit()

### Owners

In [None]:
# Owner model. Bound to its own name (`m_owner`) instead of reusing `m3`, so
# that fitting it does not overwrite the fitted 'hlcm_renter_3' model object.
m_owner = LargeMultinomialLogitStep()
m_owner.choosers = ['households']
m_owner.alternatives = ['units','buildings','nodes']
m_owner.choice_column = 'unit_id'
m_owner.alt_sample_size = 50
# presumably tenure == 1 marks owners (this is the "Owners" section); the
# household_id cut-off keeps only a small subset, apparently for a quick test run.
m_owner.chooser_filters = ['tenure == 1 & household_id <1000']

# Unlike the renter models, this expression has no price term and adds year_built.
m_owner.model_expression = ' population +ave_income_500 + job_500 + renters +year_built - 1'

m_owner.name = 'hlcm_owner'
m_owner.tags = ['arezoo', 'test','hlcm','owner']
m_owner.fit()


### Warnings:

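Updating a single column of an Orca table raises an error when the update would change the column's type from object (string) to integer. In that case, convert the column in the DataFrame and re-register the whole table with `orca.add_table`, as was done for the buildings table.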


### To Do:
    
- create more node variables such as average household size 
- clean the redfin_home_type var from Buildings table
- check the skewness for node variables


### Cheat Sheet

In [None]:
# To add a new column to an existing orca table
#orca.add_column('name_of_the_table', 'new_column_name', new_column, cache=False, cache_scope='forever')