# Create UrbanSim Jobs Table

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests
from lxml import html

#### 1. Load data

In [2]:
establishments = pd.read_csv('/home/data/spring_2019/base/establishments.csv', index_col='establishment_id')

In [3]:
buildings = pd.read_csv('/home/data/spring_2019/base/buildings.csv', index_col='building_id')

  mask |= (ar1 == a)


In [4]:
parcels = pd.read_csv('/home/data/spring_2019/base/parcels.csv', index_col='primary_id',
                      dtype={'primary_id': int, 'block_id':str, 'county_id': str})

#### 2. Expand establishments based on number of employees

In [44]:
jobs = pd.DataFrame(columns=['job_id','sector_id','building_id'])

In [45]:
jobs = establishments.reset_index()[['establishment_id', 'sector_id', 'building_id', 'employees']]
jobs = jobs.reindex(jobs.index.repeat(jobs.employees)).reset_index(drop=True)

#### 3. Get county ID for jobs

In [46]:
bldgs_w_parcel = pd.merge(buildings, parcels, left_on='parcel_id', right_index=True)

In [47]:
bldgs_w_parcel.index.name = 'building_id'

In [48]:
jobs = pd.merge(jobs, bldgs_w_parcel[['county_id', 'block_id', 'x', 'y']].reset_index(), on='building_id')

In [49]:
jobs['county_id'] = jobs['county_id'].astype(str).str.zfill(3)

In [11]:
naics_to_ctpp_label = {
    11: 'Agriculture, forestry, fishing and hunting, and mining',
    21: 'Agriculture, forestry, fishing and hunting, and mining',
    22: 'Transportation and warehousing, and utilities',
    23: 'Construction',
    31: 'Manufacturing',
    32: 'Manufacturing',
    33: 'Manufacturing',
    42: 'Wholesale trade',
    44: 'Retail trade',
    45: 'Retail trade',
    48: 'Transportation and warehousing, and utilities',
    49: 'Transportation and warehousing, and utilities',
    51: 'Information',
    52: 'Finance, insurance, real estate and rental and leasing',
    53: 'Finance, insurance, real estate and rental and leasing',
    54: 'Professional, scientific, management, administrative,  and waste management services',
    55: 'Professional, scientific, management, administrative,  and waste management services',
    56: 'Professional, scientific, management, administrative,  and waste management services',
    61: 'Educational, health and social services',
    62: 'Educational, health and social services',
    71: 'Arts, entertainment, recreation, accommodation and food services',
    72: 'Arts, entertainment, recreation, accommodation and food services',
    81: 'Other services (except public administration)',
    92: 'Public administration',
    99: None
}

#### 4. Compute `county` $\times$ `industry` $\times$ `occupation` probabilities from CHTS

In [12]:
naics_soc_df = pd.read_csv(
    "/home/max/A202212 - Occupation (25) by Industry (15)(Workers 16 years and over) (3).csv",
    skiprows=1
)

In [13]:
ctpp_occ_to_code_dict = {
    'Management occupations': 11,
    'Farmers and farm managers': 45,
    'Business and financial operations specialists': 13,
    'Computer and mathematical occupations': 15,
    'Architecture and engineering occupations': 17,
    'Life, physical, and social science occupations': 19,
    'Community and social service occupations': 21,
    'Legal occupations': 23,
    'Education, training, and library occupations': 25,
    'Arts, design, entertainment, sports, and media occupations': 27,
    'Healthcare practitioners and technicians occupations': 29,
    'Healthcare support occupations': 31,
    'Protective service occupations': 33,
    'Food preparation and serving related occupations': 35,
    'Building and grounds cleaning and maintenance occupations': 37,
    'Personal care and service occupations': 39,
    'Sales and related occupations': 41,
    'Office and administrative support occupations': 43,
    'Farming, fishing, and forestry occupations': 45,
    'Construction and extraction occupations': 47,
    'Installation, maintenance, and repair occupations': 49,
    'Production occupations': 51,
    'Transportation and material moving occupations': 53,
    'Armed Forces': 55,
}

In [14]:
naics_soc_df['Occupation 25'] = naics_soc_df['Occupation 25'].map(ctpp_occ_to_code_dict)

In [15]:
naics_soc_df['prob'] = naics_soc_df['Workers 16 and Over pct1'] / 100

In [16]:
naics_soc_df.sample(5)

Unnamed: 0,WORKPLACE,Industry 15,Occupation 25,Output,Workers 16 and Over pct1,Unnamed: 5,prob
13854,6077,Information,21.0,Estimate,0.36,,0.0036
18114,6101,Wholesale trade,45.0,Estimate,4.35,,0.0435
12450,6069,"Finance, insurance, real estate and rental and...",45.0,Estimate,0.0,,0.0
12607,6071,"Total, all industries",23.0,Estimate,0.56,,0.0056
12135,6067,"Educational, health and social services",39.0,Estimate,10.81,,0.1081


In [17]:
wages = pd.read_csv('/home/max/msa_occ_wages.csv')

In [18]:
wages.msa.value_counts()

solano    22
sf        22
sc        22
sonoma    22
napa      22
Name: msa, dtype: int64

In [19]:
county_to_msa_dict = {
    '001': 'sf',
    '013': 'sf',
    '041': 'sf',
    '055': 'napa',
    '075': 'sf',
    '081': 'sf',
    '085': 'sc',
    '095': 'solano',
    '097': 'sonoma',
}

In [21]:
wages.ann_mean_wage.value_counts()

66130     2
46150     1
43130     1
65350     1
127040    1
         ..
31140     1
28070     1
45390     1
93610     1
37890     1
Name: ann_mean_wage, Length: 109, dtype: int64

#### 5. Impute occupation in jobs table from CHTS data

In [51]:
jobs['occupation_id'] = -99  # 98 == "DK"

In [52]:
jobs.index.name = 'job_id'

In [53]:
jobs.reset_index(inplace=True)

In [54]:
missing = []
for i, group in tqdm(jobs.groupby(['county_id', 'sector_id'])):
    probs = naics_soc_df.loc[
        (naics_soc_df['WORKPLACE'] == '06' + i[0]) &
        (naics_soc_df['Industry 15'] == naics_to_ctpp_label[i[1]])]
    if len(probs) == 0:
        missing.append(i)
    else:
        probs['prob'] = probs['prob'] / probs['prob'].sum()
        jobs.loc[jobs['job_id'].isin(group['job_id']), 'occupation_id'] = np.random.choice(
            probs['Occupation 25'], size=len(group), p=probs['prob'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
100%|██████████| 225/225 [00:35<00:00,  6.26it/s]


In [55]:
jobs.sample(20)

Unnamed: 0,job_id,establishment_id,sector_id,building_id,employees,county_id,block_id,x,y,occupation_id
671426,671426,61988,33,1660305.0,120.0,85,60855050062023,-121.93918,37.395529,17.0
1972798,1972798,168424,44,1220941.0,2.0,81,60816041013029,-122.410958,37.626933,37.0
566866,566866,55458,71,1184036.0,20.0,1,60014507431086,-121.896924,37.68333,53.0
1403607,1403607,122708,61,1095185.0,15.0,1,60014228001002,-122.257426,37.867373,11.0
1938571,1938571,165591,56,71292.0,3.0,81,60816001001014,-122.406744,37.698047,11.0
708187,708187,63395,81,458447.0,25.0,1,60014422004008,-121.945621,37.521426,49.0
1022385,1022385,89618,92,1306117.0,102.0,13,60133200041061,-122.07901,37.993072,15.0
1955440,1955440,166129,44,335069.0,23.0,75,60750117002029,-122.4071,37.785155,41.0
548123,548123,54330,42,1183569.0,65.0,1,60014506071034,-121.883494,37.648203,51.0
2573807,2573807,239809,42,705876.0,15.0,95,60952534024003,-121.824913,38.445042,35.0


OFFICIAL SOC Codes

- 11: Management Occupations
- 13: Business and Financial Operations Occupations
- 15: Computer and Mathematical Occupations
- 17: Architecture and Engineering Occupations
- 19: Life, Physical, and Social Science Occupations
- 21: Community and Social Services Occupations
- 23: Legal Occupations
- 25: Education, Training, and Library Occupations
- 27: Arts, Design, Entertainment, Sports, and Media Occupations
- 29: Healthcare Practitioners and Technical Occupations
- 31: Healthcare Support Occupations
- 33: Protective Service Occupations
- 35: Food Preparation and Serving Related Occupations
- 37: Building and Grounds Cleaning and Maintenance Occupations
- 39: Personal Care and Service Occupations
- 41: Sales and Related Occupations
- 43: Office and Administrative Support Occupations
- 45: Farming, Fishing, and Forestry Occupations
- 47: Construction and Extraction Occupations
- 49: Installation, Maintenance, and Repair Occupations
- 51: Production Occupations
- 53: Transportation and Material Moving Occupations
- 55: Military Specific Occupations

In [57]:
jobs['occupation_id'].value_counts(dropna=False)

 41.0    330147
 11.0    321836
 43.0    298472
 13.0    167627
 15.0    148469
 35.0    147545
 25.0    109567
 29.0    108281
 53.0    108269
 39.0    108060
 51.0    101696
 47.0     97524
 37.0     91747
 17.0     82676
 27.0     72718
 49.0     59300
 33.0     50438
 19.0     37633
 23.0     37308
 31.0     37102
 21.0     33282
-99.0     20771
 45.0      7578
Name: occupation_id, dtype: int64

In [58]:
missing

[('001', 99),
 ('013', 99),
 ('041', 99),
 ('055', 99),
 ('075', 99),
 ('081', 99),
 ('085', 99),
 ('095', 99),
 ('097', 99)]

In [59]:
jobs.set_index('job_id', inplace=True)
jobs['occupation_id'] = jobs['occupation_id'].astype(int)

In [60]:
jobs = jobs[[col for col in jobs.columns if col not in ['employees']]]

In [61]:
jobs['msa'] = jobs['county_id'].map(county_to_msa_dict)

In [62]:
jobs.shape

(2578046, 9)

In [66]:
jobs2 = pd.merge(jobs, wages[['occ', 'msa', 'ann_mean_wage']],
                left_on=['occupation_id', 'msa'], right_on=['occ', 'msa'])

In [67]:
jobs2.shape

(2557275, 11)

In [42]:
jobs['ann_mean_wage'].value_counts()

53970     207934
156940    206998
49760     194531
96580     114947
33570      90920
           ...  
83580        676
31140        618
86210        494
134560       444
88120        408
Name: ann_mean_wage, Length: 109, dtype: int64

In [69]:
jobs.head()

Unnamed: 0_level_0,establishment_id,sector_id,building_id,county_id,block_id,x,y,occupation_id,msa
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,42,1546057.0,81,60816138003032,-122.271983,37.320879,49,sf
1,0,42,1546057.0,81,60816138003032,-122.271983,37.320879,11,sf
2,3,54,1200520.0,81,60816134003042,-122.267916,37.391072,23,sf
3,6,62,1545946.0,81,60816132004001,-122.221147,37.372496,43,sf
4,6,62,1545946.0,81,60816132004001,-122.221147,37.372496,37,sf


In [70]:
jobs2.to_csv('/home/max/jobs_w_occup_wages.csv')

In [114]:
jobs = pd.read_csv('/home/max/jobs_w_occup.csv')

#### Create NAICS Code Data Dict

In [89]:
page = requests.get('https://www.naics.com/business-lists/counts-by-naics-code/?#countsByNAICS')

In [90]:
tree = html.fromstring(page.content)

In [91]:
table = tree.xpath('//*[@id="post-3900"]/div/table')[0]

IndexError: list index out of range

In [92]:
df = pd.DataFrame(columns=['code', 'industry'])
for i, row in enumerate(table.getchildren()):
    if i > 0 and i < len(table.getchildren()) - 1:
        if '-' in row[0].text_content():
            range_codes = [int(x) for x in row[0].text_content().split('-')]
            all_codes = list(range(range_nums[0], range_nums[1] + 1))
        else:
            all_codes = [row[0].text_content()]
        for code in all_codes:
            df = df.append(
                {'code': code, 'industry': row[1].text_content()},
                ignore_index=True)
    

NameError: name 'table' is not defined

In [107]:
range_nums = [int(x) for x in row[0].text_content().split('-')]

In [118]:
df.to_csv('NAICS_2_digit_data_dict.csv', index=False)