In [1]:
import os

# Move the working directory up one level; presumably so the relative
# './data/...' paths used further down resolve -- TODO confirm.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell without restarting the kernel climbs one more level each time.
os.chdir('../')

from tqdm import tqdm

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

In [3]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import MNLDiscreteChoiceStep
import orca

  from pandas.core import datetools


### Load data

In [4]:
# CHTS person table. low_memory=False has pandas parse the file in a single
# pass instead of in chunks, which avoids mixed-dtype inference per column.
chts_persons = pd.read_csv(
    '/home/mgardner/data/chts-orig/data/Deliv_PER.csv',
    low_memory=False,
)

In [5]:
# Person-level lookup table that accompanies the CHTS person table.
chts_persons_lookup = pd.read_csv(
    '/home/mgardner/data/chts-orig/data/LookUp_PER.csv'
)

In [6]:
# Inner-join the person records with their lookup rows on (SAMPN, PERNO).
# Columns present in both tables get '_persons' / '_lookup' suffixes, and the
# key is restored to ordinary columns afterwards (leaving a default index).
person_key = ['SAMPN', 'PERNO']
chts_persons = (
    chts_persons.set_index(person_key)
    .merge(
        chts_persons_lookup.set_index(person_key),
        left_index=True,
        right_index=True,
        suffixes=('_persons', '_lookup'),
    )
    .reset_index()
)

In [7]:
# Jobs table; judging by the file name it carries occupation codes.
jobs = pd.read_csv(
    '/home/mgardner/data/jobs_w_occup.csv'
)

In [10]:
# Buildings table from the HDF5 store (path is relative to the working directory).
buildings = pd.read_hdf(
    './data/bayarea_ual.h5',
    key='buildings',
)

In [11]:
# Parcels table from the same HDF5 store; supplies the x/y coordinates.
parcels = pd.read_hdf(
    './data/bayarea_ual.h5',
    key='parcels',
)

### Get job coords

In [12]:
# Attach each building's parcel coordinates: buildings.parcel_id is matched
# against the parcels index (inner join, so unmatched buildings are dropped).
parcel_coords = parcels[['x', 'y']]
buildings = buildings.merge(
    parcel_coords,
    left_on='parcel_id',
    right_index=True,
)

In [14]:
# Attach each job's building coordinates: jobs.building_id is matched against
# the buildings index (inner join, so jobs without a matching building are dropped).
building_coords = buildings[['x', 'y']]
jobs = jobs.merge(
    building_coords,
    left_on='building_id',
    right_index=True,
)

### Prepare jobs table and CHTS persons table for job assignment

In [43]:
# Rename the coordinate columns to lng/lat and start with every job unassigned.
jobs = (
    jobs
    .rename(columns={'x': 'lng', 'y': 'lat'})
    .assign(taken=False)
)

In [44]:
# Placeholder column; filled in per person by the assignment loop below.
chts_persons['job_id'] = None

In [19]:
# The haversine metric works on [lat, lng] pairs expressed in radians, so both
# the persons' work coordinates and the job coordinates are converted here.
work_coord_cols = ['WYCORD_lookup', 'WXCORD_lookup']
persons_rad = np.deg2rad(chts_persons[work_coord_cols])

jobs_rad = np.deg2rad(jobs[['lng', 'lat']])

# Radian coordinates are stored on the jobs table as x (lng) and y (lat);
# the tree built later reads them in ['y', 'x'] order, i.e. [lat, lng].
jobs['x'] = jobs_rad['lng']
jobs['y'] = jobs_rad['lat']

### Assign CHTS persons a job ID

In [None]:
# CHTS industry / occupation codes above this value are treated as "unknown".
MAX_KNOWN_CODE = 96


def _is_known_code(code):
    """Return True when `code` is present (not NaN) and not an 'unknown' code."""
    return bool(pd.notnull(code) and code <= MAX_KNOWN_CODE)


def _find_potential_jobs(person, jobs):
    """Return the untaken jobs that best match a person's industry/occupation.

    Matching is tried from most to least specific and stops at the first
    non-empty result:
      * industry and occupation both known: industry+occupation, then
        industry only, then occupation only;
      * only industry known: industry only;
      * only occupation known: occupation only.

    The caller must ensure at least one of the two codes is known. The returned
    frame is empty when no untaken job matches.
    """
    # 'taken' is created as a boolean column and only ever set to True.
    untaken = ~jobs['taken']
    same_industry = jobs['naics'] == person['INDUS']
    same_occupation = jobs['occupation_id'] == person['OCCUP']

    indus_known = _is_known_code(person['INDUS'])
    occup_known = _is_known_code(person['OCCUP'])

    if indus_known and occup_known:
        criteria = [same_industry & same_occupation, same_industry, same_occupation]
    elif indus_known:
        criteria = [same_industry]
    else:
        criteria = [same_occupation]

    for criterion in criteria:
        candidates = jobs[criterion & untaken]
        if len(candidates) > 0:
            break
    return candidates


dists = []            # haversine distance (radians) from work location to assigned job
no_job_info = []      # persons skipped: neither industry nor occupation is known
no_work_coords = []   # persons skipped: work coordinates are missing

for i, person in tqdm(chts_persons.iterrows(), total=len(chts_persons)):

    # only assign a job ID for employed persons with a fixed work location
    if not (person['EMPLY'] == 1 and person['WLOC'] == 1):
        continue

    # skip person if no CHTS industry or occupation; missing (NaN) codes count
    # as unknown so they can never fall through to the matching step
    if not (_is_known_code(person['INDUS']) or _is_known_code(person['OCCUP'])):
        no_job_info.append(i)
        continue

    # skip person if no work location
    if pd.isnull(person[['WYCORD_lookup', 'WXCORD_lookup']]).any():
        no_work_coords.append(i)
        continue

    # candidates are recomputed for every person, and an empty result is
    # skipped for every matching strategy (BallTree cannot be built on no rows)
    potential_jobs = _find_potential_jobs(person, jobs)
    if len(potential_jobs) == 0:
        continue

    # build the tree of potential jobs for fast nearest-neighbor search
    tree = BallTree(potential_jobs[['y', 'x']], metric='haversine')

    # query the tree for the job nearest to this person's workplace.
    # `i` is an index label from iterrows(), so look it up with .loc, and note
    # that query() returns (distances, indices) in that order.
    workplace = persons_rad.loc[i].values.reshape(1, -1)
    dist, idx = tree.query(workplace, return_distance=True)

    # save results
    job = potential_jobs.iloc[idx[0]]
    job_id = job['job_id'].values[0]
    dists.append(dist[0][0])
    chts_persons.loc[i, 'job_id'] = job_id
    jobs.loc[jobs['job_id'] == job_id, 'taken'] = True


  1%|          | 1330/109113 [01:14<1:40:18, 17.91it/s]

In [33]:
# Debugging: industry / occupation codes of the person the loop stopped on.
code_cols = ['INDUS', 'OCCUP']
person.loc[code_cols]

INDUS    62
OCCUP    29
Name: 8567, dtype: object

In [36]:
# Debugging: the 1 x 2 query array that would be handed to the tree for row i.
persons_rad.iloc[[i]].values

array([[nan, nan]])

In [37]:
# Debugging: radian work coordinates stored for row i.
persons_rad.iloc[i, :]

WYCORD_lookup   NaN
WXCORD_lookup   NaN
Name: 8567, dtype: float64

In [40]:
# Debugging: is either work coordinate missing for this person?
person[['WYCORD_lookup', 'WXCORD_lookup']].isnull().any()

True