# Create UrbanSim Jobs Table

In [26]:
import pandas as pd
import numpy as np
from tqdm import tqdm

#### 1. Load data

In [2]:
establishments = pd.read_csv('/home/mgardner/data/establishments.csv')

In [3]:
buildings = pd.read_hdf('/home/mgardner/data/2015_09_01_bayarea_v3.h5', 'buildings')

In [4]:
parcels = pd.read_hdf('/home/mgardner/data/2015_09_01_bayarea_v3.h5', 'parcels')

In [17]:
persons = pd.read_csv('/home/mgardner/data/chts-orig/data/Deliv_PER.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
persons_lookup = pd.read_csv('/home/mgardner/data/chts-orig/data/LookUp_PER.csv')

#### 2. Expand establishments based on number of employees

In [6]:
jobs = pd.DataFrame(columns=['job_id','sector_id','building_id'])

In [7]:
for i, establishment in tqdm(establishments.iterrows(), total=len(establishments)):
    multiplier = establishment['employees']
    if multiplier > 0:
        jobs = jobs.append([establishment[['establishment_id','sector_id','building_id']]] * int(multiplier), ignore_index=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  other = other.loc[:, self.columns]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
100%|██████████| 234956/234956 [3:31:43<00:00, 18.50it/s]


#### 3. Get county ID for jobs

In [8]:
bldgs_w_parcel = pd.merge(buildings, parcels, left_on='parcel_id', right_index=True)

In [14]:
bldgs_w_parcel.index.name = 'building_id'

In [15]:
jobs = pd.merge(jobs, bldgs_w_parcel[['county_id']].reset_index(), on='building_id')

#### 4. Compute `county` $\times$ `industry` $\times$ `occupation` probabilities from CHTS

In [19]:
persons = pd.merge(
    persons.set_index(['SAMPN','PERNO']),
    persons_lookup.set_index(['SAMPN','PERNO']),
    left_index=True, right_index=True,
    suffixes=('_persons', '_lookup')).reset_index()

In [56]:
county_industry_occupation_counts = persons.groupby(['WCTFIP_lookup','INDUS','OCCUP']).agg({'SAMPN':'count'})

In [57]:
county_industry_occupation_probs = county_industry_occupation_counts.groupby(level=[0,1]).apply(
    lambda x: x/x.sum()).reset_index().rename(columns={'SAMPN': 'prob', 'WCTFIP_lookup': 'county_FIPS'})

In [58]:
county_industry_occupation_probs['county_FIPS'] = county_industry_occupation_probs['county_FIPS'].astype(int).astype(str).str.zfill(3)
county_industry_occupation_probs['INDUS'] = county_industry_occupation_probs['INDUS'].astype(int).astype(str)

In [59]:
industry_occupation_counts = persons.groupby(['INDUS','OCCUP']).agg({'SAMPN':'count'})

In [60]:
industry_occupation_probs = industry_occupation_counts.groupby(level=0).apply(
    lambda x: x/x.sum()).reset_index().rename(columns={'SAMPN': 'prob'})

In [61]:
industry_occupation_probs['INDUS'] = industry_occupation_probs['INDUS'].astype(int).astype(str)

In [62]:
county_occupation_counts = persons.groupby(['WCTFIP_lookup','OCCUP']).agg({'SAMPN':'count'})

In [63]:
county_occupation_probs = county_occupation_counts.groupby(level=0).apply(
    lambda x: x/x.sum()).reset_index().rename(columns={'SAMPN': 'prob', 'WCTFIP_lookup': 'county_FIPS'})

In [64]:
county_occupation_probs['county_FIPS'] = county_occupation_probs['county_FIPS'].astype(int).astype(str).str.zfill(3)

#### 5. Impute occupation in jobs table from CHTS data

In [92]:
jobs['county_FIPS'] = jobs['county_id'].astype(str).str.zfill(3)
jobs['occupation_id'] = -99  # 98 == "DK"
jobs['2_digit_sector_id'] = jobs['sector_id'].astype(str).str[0:2]

In [96]:
for i, row in tqdm(jobs.iterrows(), total=len(jobs)):
    
    # define occupation probability by industry and county
    probs = county_industry_occupation_probs[(
        county_industry_occupation_probs['county_FIPS'] == row['county_FIPS']) & (
        county_industry_occupation_probs['INDUS'] == row['2_digit_sector_id']
    )]
    
    # if no such entries exist, define probabilities by industry
    if len(probs['prob']) == 0:
        probs = industry_occupation_probs[(
            industry_occupation_probs['INDUS'] == row['2_digit_sector_id'])]
        
    # if no such entries exist, define probabilities by county
    if len(probs['prob']) == 0:
        probs = county_occupation_probs[(
            county_occupation_probs['county_FIPS'] == row['county_FIPS'])]
        
    
    jobs.loc[i, 'occupation_id'] = np.random.choice(probs['OCCUP'], p=probs.prob)

100%|██████████| 2656876/2656876 [12:51:22<00:00, 57.41it/s]  


In [114]:
jobs = pd.read_csv('/home/mgardner/data/jobs_w_occup.csv')

In [123]:
jobs.index.name = 'job_id'
jobs = jobs.rename(columns={'2_digit_sector_id': 'naics'})
jobs['occupation_id'] = jobs['occupation_id'].astype(int)

In [124]:
jobs[['building_id', 'naics', 'occupation_id']].to_csv('/home/mgardner/data/jobs_w_occup.csv')