# Car ownership model

In [4]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')
from urbansim.utils import misc
import pandana as pdna
from collections import OrderedDict
    
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import pandas as pd
import orca
# import os; os.chdir('/home/juan/activitysynth/')
import warnings; warnings.simplefilter('ignore')
from matplotlib import pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from sklearn.neighbors import BallTree

import pylogit as pl                   # For choice model estimation
from pylogit import nested_logit as nl # For nested logit convenience funcs
import math 
from collections import OrderedDict 

import pickle
import dill
import time
import random
import scipy.stats as st

import urbansim_templates

from scripts import datasources, models, variables, utils

In [5]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [6]:
# Show the current working directory (the import cell calls `os.chdir('../')`)
os.getcwd()

'/home/juan'

## Preparing data

In [7]:
# File extension substituted into the `{0}` placeholder of the templates below
input_file_format = 'csv'
# Input data directory
# NOTE(review): absolute, machine-specific path — adjust per environment
input_data_dir = '/home/data/spring_2019/base/'

# Table name -> file-name template; `{0}` marks where the file extension goes.
# 'beam_skims_raw' has no placeholder, so formatting leaves its name unchanged.
formattable_fname_dict = {
    'parcels': 'parcels.{0}',
    'buildings': 'buildings.{0}',
    'jobs': 'jobs.{0}',
    'establishments': 'establishments.{0}',
    'households': 'households.{0}',
    'persons': 'persons.{0}',
    'rentals': 'craigslist.{0}',
    'units': 'units.{0}',
    'mtc_skims': 'mtc_skims.{0}',
    'beam_skims_raw': '30.skims-smart-23April2019-baseline.csv.gz',
    'beam_skims_imputed': 'beam_skims_imputed.{0}',
    # the following nodes and edges .csv's will be phased out and
    # replaced by travel model skims entirely
    'drive_nodes': 'drive_nodes.{0}',
    'drive_edges': 'drive_edges.{0}',
    'drive_access_vars': 'drive_net_vars.{0}',
    'walk_nodes': 'walk_nodes.{0}',
    'walk_edges': 'walk_edges.{0}',
    'walk_access_vars': 'walk_net_vars.{0}',
    'zones': 'zones.{0}',
    'zone_access_vars': 'zones_w_access_vars.{0}',
}

def format_fname_dict(formattable_fname_dict, format='csv'):
    """Fill the ``{0}`` placeholder of every file-name template.

    Parameters
    ----------
    formattable_fname_dict : dict
        Maps a table name to a file-name template such as ``'parcels.{0}'``.
        Templates without a placeholder are returned unchanged.
    format : str, default 'csv'
        File extension substituted into each template.

    Returns
    -------
    dict
        Same keys as the input, with each template formatted using ``format``.
    """
    # Use the requested extension; the previous version always used 'csv'
    # regardless of the `format` argument.
    return {
        table_name: template.format(format)
        for table_name, template in formattable_fname_dict.items()}

# Concrete file names for the configured input format
input_fnames = format_fname_dict(formattable_fname_dict, format=input_file_format)

In [8]:
# Register the input configuration with orca; 'store' is registered as None
for injectable_name, injectable_value in (
        ('input_file_format', input_file_format),
        ('input_data_dir', input_data_dir),
        ('input_fnames', input_fnames),
        ('store', None)):
    orca.add_injectable(injectable_name, injectable_value)

In [9]:
# Change directory to run the network aggregation steps
# NOTE(review): absolute, user-specific path — os.chdir raises if it does not exist on this machine
os.chdir('/home/juan/activitysynth/activitysynth/')

In [10]:
# Accessibility steps, listed in execution order
accessibility_steps = [
    # Driving accessibility variables
    'initialize_network_small', 'network_aggregations_small',
    # Walking accessibility variables
    'initialize_network_walk', 'network_aggregations_walk',
    # BEAM-based accessibility variables
    'initialize_imputed_skims', 'skims_aggregations',
]
orca.run(accessibility_steps)

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Running step 'network_aggregations_small'
Computing accessibility variables
Computing units_10000
Computing units_sf_10000
Computing units_mf_10000
Computing pop_10000
Removed 189769 rows because they contain missing values
Computing hh_10000
Removed 189769 rows because they contain missing values
Computing poor_10000
Removed 53114 rows because they contain missing values
Computing renters_10000
Removed 102597 rows because they contain missing values
Computing avg_income_10000
Removed 189769 rows because they contain missing values
Computing jobs_10000
Computing avg_rent_10000
Computing med_rent_10000
Computing pop_white_10000
Removed 107372 rows because they contain missing values
Computing pop_black_10000
Removed 10541 rows because they contain missing values
Computing pop_asian_10000
Removed 51048 rows because they contain missing values
Computing pop_hisp_10000
Removed 31685 rows because

Computing pop_asian_1500_walk
Removed 51048 rows because they contain missing values
Computing pop_hisp_1500_walk
Removed 31685 rows because they contain missing values
Computing pop_2500_walk
Removed 189769 rows because they contain missing values
Computing pop_white_2500_walk
Removed 107372 rows because they contain missing values
Computing pop_black_2500_walk
Removed 10541 rows because they contain missing values
Computing pop_asian_2500_walk
Removed 51048 rows because they contain missing values
Computing pop_hisp_2500_walk
Removed 31685 rows because they contain missing values
Computing jobs_500_walk_retail
Computing jobs_1500_walk_retail
Computing jobs_2500_walk_retail
Computing jobs_500_walk_fire
Computing jobs_1500_walk_fire
Computing jobs_2500_walk_fire
Computing jobs_500_walk_tech
Computing jobs_1500_walk_tech
Computing jobs_2500_walk_tech
Computing jobs_500_walk_serv
Computing jobs_1500_walk_serv
Computing jobs_2500_walk_serv
       units_500_walk  sqft_unit_500_walk  single

In [11]:
# Tables: materialize each registered orca table as a DataFrame
table_list = ['households', 'persons', 'nodessmall', 'nodeswalk', 'zones', 'access_indicators_ampeak', 'parcels']

# Unpacking follows the order of `table_list`; 'access_indicators_ampeak' is bound to `transit`
households, persons, nodessmall, nodeswalk, zones, transit, parcels = [
    orca.get_table(table_name).to_frame() for table_name in table_list]

In [91]:
# orca.list_tables()

In [12]:
#Adding variables to the model
#Person table
@orca.column('persons', cache=True)
def children(persons):
    """Flag persons younger than 18 (1 = under 18, 0 = otherwise)."""
    # Registered once (this column used to be defined twice in the same cell)
    # and vectorized so it returns an index-aligned Series like the other
    # person columns instead of a plain list.
    return (persons.age < 18).astype(int)

# Age-band indicator columns (1 = person's age falls in the band, 0 = otherwise).
# `Series.between` includes both endpoints by default, so the boolean
# `inclusive=True` argument (deprecated in pandas 1.3, rejected in 2.0) is omitted.
@orca.column('persons')
def age_0_15(persons):
    """Indicator for ages 0 through 15, both ends included."""
    return persons.age.between(0, 15).astype(int)

@orca.column('persons')
def age_16_17(persons):
    """Indicator for ages 16 through 17, both ends included."""
    return persons.age.between(16, 17).astype(int)

@orca.column('persons')
def age_18_25(persons):
    """Indicator for ages 18 through 25, both ends included."""
    return persons.age.between(18, 25).astype(int)

@orca.column('persons')
def age_26_40(persons):
    """Indicator for ages 26 through 40, both ends included."""
    return persons.age.between(26, 40).astype(int)

@orca.column('persons')
def age_41_60(persons):
    """Indicator for ages 41 through 60, both ends included."""
    return persons.age.between(41, 60).astype(int)

@orca.column('persons')
def age_60(persons):
    """Indicator for ages 61 through 100, both ends included."""
    # NOTE(review): ages above 100 are not flagged by any band — confirm the data is capped at 100
    return persons.age.between(61, 100).astype(int)

@orca.column('persons')
def worker_student(persons):
    """Integer product of the `worker` and `student` columns."""
    worker_times_student = persons.worker * persons.student
    return worker_times_student.astype(int)

@orca.column('persons')
def non_worker_non_student(persons):
    """Indicator (1/0) for persons whose `worker` and `student` values are both falsy."""
    is_worker = persons.worker.astype(bool)
    is_student = persons.student.astype(bool)
    # not worker AND not student  ==  not (worker OR student)
    return (~(is_worker | is_student)).astype(int)

In [64]:
# persons = orca.get_table('persons').to_frame()
# persons

In [13]:
# Person-based variables, aggregated per household, to merge onto the households table
# Missing variables >> Parcel ID

# Re-materialize the persons table in this cell: a frame fetched before the
# @orca.column definitions were registered does not contain the derived
# columns aggregated below, so the groupby would fail on a fresh kernel.
persons = orca.get_table('persons').to_frame()

# Column -> aggregation applied within each household
person_aggregations = {
    'age': 'max',
    'zone_id_home': 'first',
    'children': 'sum',
    'age_0_15': 'sum',
    'age_16_17': 'sum',
    'age_18_25': 'sum',
    'age_26_40': 'sum',
    'age_41_60': 'sum',
    'age_60': 'sum',
    'non_worker_non_student': 'sum',
    'worker_student': 'sum',
    'worker': 'sum',
    'student': 'sum',
}
person_var = persons.groupby(by='household_id').agg(person_aggregations)

orca.add_table('person_var', person_var);

<orca.orca.DataFrameWrapper at 0x7f4bdde61710>

In [15]:
# Broadcasting: declare how the node and person-aggregate tables join onto households
orca.broadcast(cast='nodessmall', onto='households', cast_index=True, onto_on='node_id_small')
# Fixed typos: the table is 'nodeswalk' and the households join column is 'node_id_walk'
orca.broadcast(cast='nodeswalk', onto='households', cast_index=True, onto_on='node_id_walk')
orca.broadcast(cast='person_var', onto='households', cast_index=True, onto_index=True)

# parcels_acc = orca.merge_tables(target = 'parcels', tables = ['parcels', 'nodessmall', 'nodeswalk'])

In [38]:
# Tables combined into one household-level frame
merge_table_names = ['zones', 'parcels', 'buildings', 'units', 'nodessmall',
                     'nodeswalk', 'person_var', 'households']
final_merge = orca.merge_tables(target='households', tables=merge_table_names)

In [49]:
# Hispanic-head flag as a 0/1 dummy; values other than 'no'/'yes' map to NaN
hispanic_codes = {'no': 0, 'yes': 1}
final_merge['hisp_head'] = final_merge.hispanic_head.map(hispanic_codes)

# Drop every object-dtype column
drop_var = list(final_merge.select_dtypes(include=['object']).columns)
final_merge.drop(columns=drop_var, inplace=True)


def _cars_category(n_cars):
    """Collapse a car count into 0, 1, 2 or 3 (3 means three or more); anything else maps to 0."""
    if n_cars >= 3:
        return 3
    if n_cars == 2:
        return 2
    if n_cars == 1:
        return 1
    return 0


# New categorization for number of cars per household
final_merge['cars'] = final_merge['cars'].apply(_cars_category)

In [None]:
#Getting variables names for each data source
# NOTE(review): `parcels_acc` is only assigned in a commented-out line and
# `skim_based_accessibilities` is not defined anywhere in the visible notebook,
# so this cell raises NameError as written — confirm whether it is still needed.
# NOTE(review): 'age_25_40' and 'age_40_60' do not match the registered person
# columns `age_26_40` and `age_41_60` — verify the intended names.
accessibility_var = set(parcels_acc) - set(parcels)
parcel_var = set(parcels.columns) - set(accessibility_var)
skim_accessibility = set(skim_based_accessibilities.columns)
households_var_continious = ['INCOM','AGE', 'children']#, 'TransitPass'] # ,'PERNO', 'EMPLY'                   
households_var_dummy = ['max_35', 'age_0_15', 
                    'age_16_17', 'age_18_25', 'age_25_40','age_40_60',
                    'age_60', 'worker_student', 'non_worker_non_student',
                   'worker','student','RESTY', 'TEN']#, 'HHLIC', ]
dependent_var = ['cars_alt']

## Machine learning approach

In [53]:
def normalize(data):
    """Standardize a Series to zero mean and unit standard deviation.

    Parameters
    ----------
    data : Series-like
        Values to standardize; must expose a ``dtype`` attribute.

    Returns
    -------
    Series-like or int
        ``(data - mean) / std`` computed with ``np.mean`` and ``np.std``,
        or the scalar ``0`` when ``data`` has object dtype.
    """
    # Object-dtype input cannot be standardized; the scalar 0 is returned instead
    if data.dtype == 'O':
        return 0

    centered = data - np.mean(data)
    spread = np.std(data)
    return centered / spread

In [None]:
# Scratch reference: table names passed to the household-level merge.
# The closing bracket was missing, which made this cell a SyntaxError.
['zones', 'parcels', 'buildings', 'units', 'nodessmall',
 'nodeswalk', 'person_var', 'households']

In [132]:
# Household columns left out of the candidate variables
hh_excluded = {
    'state', 'serialno', 'income_10', 'income_12', 'income_12p', 'income_2',
    'income_4', 'income_6', 'income_8', 'hh_inc_150kplus', 'hh_inc_25_to_75k',
    'hh_inc_75_to_200k', 'hh_inc_under_25k', 'hh_size_1per', 'hh_size_over_4',
    'building_type_2', 'node_id_small', 'node_id_walk', 'single_family',
    'single_family_int', 'tenure_1', 'tenure_2', 'tenure_3', 'tenure_4', 'unit_id',
    'block_group', 'block_group_id', 'hispanic_head', 'county', 'tract',
}
hh_var = set(households.columns).difference(hh_excluded)

zone_var = set(zones.columns).difference({'acres'})

# Parcel columns left out of the candidate variables
parcel_excluded = {
    'node_id', 'node_id_small', 'node_id_walk', 'shape_area', 'x', 'y',
    'zone_id', 'apn', 'parcel_id_local', 'block_id', 'imputation_flag',
}
parcel_var = set(parcels.columns).difference(parcel_excluded)

per_var = set(person_var.columns).difference({'zone_id_home'})

In [133]:
# Union of the household, zone, parcel and person-aggregate variable names
final_vars = set().union(hh_var, zone_var, parcel_var, per_var)

In [134]:
# `final_vars` is a set, so iterating it directly gives an arbitrary column
# order, and some requested names may be absent from `final_merge` (for
# example, object-dtype columns are dropped from it). Select only the columns
# that exist, in sorted order, and report the rest rather than raising a
# KeyError or producing all-NaN columns.
missing_vars = sorted(final_vars.difference(final_merge.columns))
if missing_vars:
    print('Columns not found in final_merge (skipped):', missing_vars)
df = final_merge.loc[:, sorted(final_vars.intersection(final_merge.columns))]

In [135]:
# set(parcels)

In [136]:
# Print columns that take at most two distinct values (NaN counts as a value)
low_cardinality_cols = [
    col for col in df.columns if len(df[col].unique()) <= 2]
for col in low_cardinality_cols:
    print(col)

recent_mover
proportion_undevelopable
tenure
tax_exempt_status


In [119]:
# NOTE(review): 'hispanic_head' is excluded from `hh_var`, so whether `df` has this column
# depends on how `df` was built in an earlier, out-of-order run — confirm the intended column
df['hispanic_head'].unique()

array([nan])

In [None]:
# Scratch note: column names taken from the low-cardinality check output
# NOTE(review): these are bare names, not strings, and are not assigned as variables
# anywhere in the visible notebook — running this cell would raise NameError
recent_mover, proportion_undevelopable, tax_exempt_status

In [138]:
# Distinct values of the `tenure` column in the households frame
households.tenure.unique()

array([2, 1])

In [None]:
"""
Merge is complete. 
>> Run the ML algorithms for feature selection
>> Run a real model using pylogit instead of statsmodels
>> Save the pickle file and run simulation. Do a step for simulation and add it to the run.py
"""