In [12]:
import pandas as pd
from zipfile import ZipFile
import os
import numpy as np
import re
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

## Hardcoded data structures

In [13]:
# Columns requested from each yearly InfoGroup extract (passed to read_csv as usecols).
cols = [
    'Company',
    'Address Line 1',
    'City',
    'State',
    'ZipCode',
    'Employee Size (5) - Location',
    'County Code',
    'Sales Volume (9) - Location',
    'Primary NAICS Code',
    'NAICS8 Descriptions',
    'Primary SIC Code',
    'SIC6_Descriptions',
    'Business Status Code',
    'Year Established',
    'Subsidiary Number',
    'Parent Number',
    'Parent Actual Employee Size',
    'Parent Actual Sales Volume',
    'ABI',
    'Census Tract',
    'Census Block',
    'Latitude',
    'Longitude',
    'CBSA Code',
    'CBSA Level',
    'CSA Code',
    'FIPS Code',
]

In [14]:
# USPS abbreviation -> 2-digit state FIPS code, kept as zero-padded strings.
# Prior examination showed data-entry errors in the source 'FIPS Code' variable,
# so the state part is re-derived from the 'State' abbreviation by brute force.
state_fips = {
    'AL': '01', 'AK': '02', 'AS': '60', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12', 'FM': '64',
    'GA': '13', 'GU': '66', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18',
    'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MH': '68',
    'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
    'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35',
    'NY': '36', 'NC': '37', 'ND': '38', 'MP': '69', 'OH': '39', 'OK': '40',
    'OR': '41', 'PW': '70', 'PA': '42', 'PR': '72', 'RI': '44', 'SC': '45',
    'SD': '46', 'TN': '47', 'TX': '48', 'UM': '74', 'UT': '49', 'VT': '50',
    'VA': '51', 'VI': '78', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
}

In [15]:
# Short descriptions for 2-digit NAICS codes.
# Several sectors span more than one 2-digit code (31-33, 44-45, 48-49), so the
# same description appears under each. '99' is the label for records whose
# missing NAICS code was filled with '99999999'.
naics_desc = {
    '11': 'Agriculture, Forestry, Fishing and Hunting',
    '21': 'Mining',
    '22': 'Utilities',
    '23': 'Construction',
    '31': 'Manufacturing',
    '32': 'Manufacturing',
    '33': 'Manufacturing',
    '42': 'Wholesale Trade',
    '44': 'Retail Trade',
    '45': 'Retail Trade',
    '48': 'Transportation and Warehousing',
    '49': 'Transportation and Warehousing',
    '51': 'Information',
    '52': 'Finance and Insurance',
    '53': 'Real Estate Rental and Leasing',
    '54': 'Professional, Scientific, and Technical Services',
    '55': 'Management of Companies and Enterprises',
    '56': 'Administrative and Support and Waste Management and Remediation Services',
    # Was misspelled 'Eucational Services'; the text is written to the output CSV.
    '61': 'Educational Services',
    '62': 'Health Care and Social Assistance',
    '71': 'Arts, Entertainment, and Recreation',
    '72': 'Accommodation and Food Services',
    '81': 'Other Services (except Public Administration)',
    '92': 'Public Administration',
    '99': 'Unknown',
}

In [16]:
# 6-digit NAICS codes treated as agricultural activities, with their descriptions.
# Codes absent from this table get no 'NAICS6 desc' (see naics6_desc).
ag_naics = {
    '112112': 'Cattle Feedlots',
    '115111': 'Cotton Ginning',
    '115112': 'Soil Preparation, Planting and Cultivating',
    '115113': 'Crop Harvesting, Primarily by Machine',
    '115114': 'Postharvest Crop Activities except Cotton Ginning',
    '115115': 'Farm Labor Contractors and Crew Leaders',
    '115116': 'Farm Management Services',
    '423820': 'Farm and Garden Machinery and Equipment Merchant Wholesalers',
    '424430': 'Dairy Product Merchant Wholesalers',
    '424440': 'Poultry and Poultry Product Merchant Wholesalers',
    '424480': 'Fresh Fruit and Vegetable Merchant Wholesalers',
    '424510': 'Grain and Field Bean Merchant Wholesalers',
    '424520': 'Livestock Merchant Wholesalers',
    '424590': 'Other Farm Product Raw Material Merchant Wholesalers',
    '424910': 'Farm Supplies Merchant Wholesalers',
    '493130': 'Farm Product Warehousing and Storage',
}

## Main functions

In [17]:
def extract_and_correct():
    """Load one year's InfoGroup extract and normalise its geography and NAICS fields.

    Reads the module-level ``yr`` (year being processed) and ``logfile`` (open log
    handle); both must exist before the call. The year's zip archive is read from
    /InfoGroup/data/original/, its CSV is extracted to /tmp/xtrcts/, loaded with
    every column as text (dtype=object, columns limited to ``cols``) and the
    extracted CSV is deleted again.

    Returns:
        DataFrame restricted to records whose state FIPS code is <= 56, with the
        added columns 'State FIPS' (int), 'Continental', 'NAICS2', 'NAICS2 desc',
        'NAICS6' and 'NAICS6 desc', a rebuilt 'FIPS Code', and missing
        'CBSA Level' / 'Primary NAICS Code' replaced by '0' / '99999999'.

    Raises:
        KeyError: from state_code / naics2_desc when a state abbreviation or
            2-digit NAICS code is not in the lookup tables.
    """
    print(f'\n{yr}:',file=logfile)
    src_dir = '/InfoGroup/data/original/'
    xdir = '/tmp/xtrcts/'
    fname = f'{yr}_Business_Academic_QCQ_utf-8.'
    extracted_csv = xdir + fname + 'csv'
    with ZipFile(src_dir + fname + 'zip','r') as myzip:
        myzip.extract(fname + 'csv',xdir)
    try:
        df = pd.read_csv(extracted_csv,dtype=object,usecols=cols)
    finally:
        # Remove the temporary extract even when parsing fails, so a bad year
        # does not leave a multi-gigabyte file behind in /tmp.
        os.remove(extracted_csv)

    # Overwrite the state FIPS code
    df['State FIPS'] = df['State'].apply(state_code)
    df['FIPS Code'] = df.apply(new_fips, axis=1)

    # Exclude territories, leaving only the 50 states and DC.
    df['State FIPS'] = df['State FIPS'].astype(int)
    df = df[df['State FIPS'] <= 56]

    # Flag the 48 continental states (and DC); see is_continental.
    df['Continental'] = df['State FIPS'].apply(is_continental)

    # Give 'CBSA Level' a missing value of 0, meaning 'rural'.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    df['CBSA Level'] = df['CBSA Level'].fillna('0')

    # Give "Primary NAICS Code" a useable missing value code.
    df['Primary NAICS Code'] = df['Primary NAICS Code'].fillna('99999999')

    # Create 2-digit NAICS code
    df['NAICS2'] = df['Primary NAICS Code'].apply(lambda x: str(x)[:2])

    # Create descriptions of the 2-digit NAICS codes
    df['NAICS2 desc'] = df['NAICS2'].apply(naics2_desc)

    # The project statement mentions some 6-digit NAICS codes as
    # definitely referring to agricultural activities.
    # Create 6-digit versions of all NAICS code and descriptions for
    # those specified in the project statement.
    df['NAICS6'] = df['Primary NAICS Code'].apply(lambda x: str(x)[:6])
    df['NAICS6 desc'] = df['NAICS6'].apply(naics6_desc)
    return df

def CBSA_partition(df):
    """Split enterprises into urban, rural and unknown groups by their CBSA fields.

    'CBSA Code', 'CBSA Level' and 'FIPS Code' of ``df`` are converted to int in
    place (missing CBSA values become -9). Rows are then divided as:

    * urban   -- CBSA Level > 0
    * rural   -- CBSA Code > 0 and CBSA Level == 0
    * unknown -- CBSA Code < 0, or CBSA Code == 0 and CBSA Level == 0

    The unknown rows are passed through extract_corrections and cbsa_text to
    derive a level and label from the county FIPS code. A mismatch between the
    three group sizes and the input size is written to the module-level
    ``logfile``; it is reported, not raised.

    Args:
        df: frame produced by extract_and_correct.

    Returns:
        Tuple ``(urban, rural, corrected)`` of DataFrames, each carrying a
        'CBSA Text' column.
    """
    # fillna is assigned back rather than applied in place to the column
    # selection: the chained in-place form is a no-op under pandas
    # Copy-on-Write, which would leave NaN behind and break astype(int).
    df['CBSA Code'] = df['CBSA Code'].fillna(-9).astype(int)
    df['CBSA Level'] = df['CBSA Level'].fillna(-9).astype(int)
    df['FIPS Code'] = df['FIPS Code'].astype(int)

    urban = df[(df['CBSA Level'] > 0)]
    urban['CBSA Text'] = 'urban'
    rural = df[(df['CBSA Code'] > 0) & (df['CBSA Level']==0)]
    rural['CBSA Text'] ='rural'
    # 'FIPS Code' is already int here, so no further cast is needed on the slice.
    unknown = df[(df['CBSA Code'] < 0) | ((df['CBSA Code'] == 0) & (df['CBSA Level']==0))]

    # Sanity check: the three groups should account for every input row.
    nrows = len(df)
    sum_of_parts = len(urban) + len(rural) + len(unknown)
    if sum_of_parts != nrows:
        print('Error in dividing enterprises into categories:',file=logfile)
        print(f'\t{nrows} != {sum_of_parts}',file=logfile)

    corrected = extract_corrections(unknown)
    corrected = cbsa_text(corrected,yr)
    # Discard the lookup columns added by the join and restore the key's name.
    corrected = corrected.drop(columns=['CBSA','FIPS Code_r','LSAD'])
    corrected = corrected.rename(columns={"FIPS Code_l": "FIPS Code"})

    print(corrected['CBSA Level'].value_counts(),file=logfile)
    return (urban, rural, corrected)

def new_vars(df):
    """Add the urban-area, full-census-tract and HRSA rurality variables.

    Adds 'UA Code', 'UA Type', 'rural_by_UA', 'Full Census Tract' and
    'rural_HRSA', fills missing 'CBSA Code' / 'Census Tract' / 'FIPS Code' with
    sentinel values, and left-pads 'ZipCode' to five characters. Distribution
    summaries go to the module-level ``logfile``; progress stamps are printed
    through showtime.

    Args:
        df: concatenated urban/rural/corrected frame; must contain 'CBSA Text',
            'CBSA Code', 'CBSA Level', 'Census Tract', 'FIPS Code' and 'ZipCode'.

    Returns:
        The same DataFrame object with the new columns.
    """
    print(df['CBSA Text'].value_counts(),file=logfile)
    print(df['CBSA Text'].value_counts(normalize=True) * 100,file=logfile)
    print(f'rural/urban by OMB standard:',file=logfile)

    df['CBSA Code'] = df['CBSA Code'].fillna(99999)
    df['CBSA Code'] = df['CBSA Code'].astype(int)
    df['UA Code'] = df['CBSA Code'].apply(find_ua)
    showtime('\t5-0')
    df['UA Type'] = df['UA Code'].apply(get_ua_type)
    showtime('\t5-1')
    df['rural_by_UA'] = df['UA Code'].apply(how_rural)
    showtime('\t5-2')
    print(df['UA Type'].value_counts(),file=logfile)
    print(df['UA Type'].value_counts(normalize=True) * 100,file=logfile)
    print(df['rural_by_UA'].value_counts(),file=logfile)
    print(df['rural_by_UA'].value_counts(normalize=True) * 100,file=logfile)

    # Assign the filled column back; fillna(inplace=True) on a column selection
    # does not update the frame when pandas Copy-on-Write is active.
    df['Census Tract'] = df['Census Tract'].fillna('999999')
    df['FIPS Code'] = df['FIPS Code'].fillna('99999')
    df['Full Census Tract'] = df.apply(combo,axis=1)
    showtime('\t5-3')
    df['rural_HRSA'] = df.apply(rur3,axis=1)
    showtime('\t5-4')
    print(df['rural_HRSA'].value_counts(),file=logfile)
    print(df['rural_HRSA'].value_counts(normalize=True) * 100,file=logfile)

    # Restore leading zeros so the ZIP can serve as a merge key
    # (merges_and_output joins 'ZipCode' against df_zip['ZIP']).
    # The earlier condition `len(x) < 5 == 0` is a chained comparison, i.e.
    # `len(x) < 5 and 5 == 0`, which is always False, so nothing was padded.
    # zfill leaves values of five or more characters unchanged.
    df['ZipCode'] = df['ZipCode'].str.zfill(5)
    showtime('\t5-5')
    return df

def merges_and_output(df):
    """Join FAR codes and tract rurality onto the enterprises and write the year's CSV.

    Uses the module-level ``df_zip`` (FAR flags keyed by 'ZIP'), ``tracts``
    (keyed by 'GEOID'), ``logfile`` and ``yr``. Both joins are left joins, so
    every input row is kept. Returns nothing; the merged frame is written to
    /InfoGroup/data/rurality/InfoGroup_{yr}_all_steps.csv.
    """
    # Step 6-0: ZIP-level Frontier-and-Remote flags.
    with_far = df.merge(df_zip,how='left',left_on='ZipCode',right_on='ZIP',indicator=True)
    showtime('\t6-0')
    with_far = with_far.drop(columns=['ZIP','_merge'])

    # FAR codes apply only to the continental states; no such restriction is
    # applied here, so the logged distribution covers every row.
    log_far_levels(with_far)

    # Step 6-1: tract-level rural flag, matched on the full census tract id.
    merged = with_far.merge(
        tracts,
        how='left',
        left_on='Full Census Tract',
        right_on='GEOID',
        indicator=True,
    )
    showtime('\t6-1')
    print('Rural=1,Urban=0:',file=logfile)
    print(merged['rural_tract'].value_counts(),file=logfile)
    merged = merged.drop(columns=['GEOID','_merge'])
    print('Missing:',file=logfile)
    print(merged['rural_tract'].isnull().sum(),file=logfile)

    # Step 6-2: persist the fully annotated year.
    out_path = f'/InfoGroup/data/rurality/InfoGroup_{yr}_all_steps.csv'
    merged.to_csv(out_path,index=None)
    showtime('\t6-2\n')


## Little functions

In [18]:
def state_code(state):
    """Look up the FIPS code for a state abbreviation in ``state_fips``.

    Raises KeyError when the abbreviation is not in the table.
    """
    fips = state_fips[state]
    return fips

def new_fips(row):
    """Build a county FIPS string from a record's 'State FIPS' and 'County Code'.

    Returns '00000' when either part stringifies to 'nan' (i.e. is missing),
    otherwise the two parts concatenated as text.
    """
    state_part = str(row['State FIPS'])
    if state_part == 'nan':
        return '00000'
    county_part = str(row['County Code'])
    if county_part == 'nan':
        return '00000'
    return state_part + county_part

def is_continental(code):
    """Return a 1/0 flag for the 48 continental states and DC.

    The flag is 0 only for state FIPS codes 2 (AK) and 15 (HI) and 1 for any
    other code. This could be useful for mapping.
    """
    return 0 if int(code) in (2, 15) else 1

def naics2_desc(code):
    """Return the text description of a 2-digit NAICS code from ``naics_desc``.

    Raises KeyError for a code that has no entry in the table.
    """
    description = naics_desc[code]
    return description

def naics6_desc(code):
    """Return the ``ag_naics`` description of a 6-digit NAICS code, or NaN if it has none."""
    return ag_naics.get(code, np.nan)

def extract_corrections(unknowns):
    """Look up a CBSA and CBSA Level for enterprises from their county FIPS code.

    The module-level ``cbsa_df`` cross-references county FIPS codes ('FIPS Code',
    originally STCOU) with CBSA codes; the level is inferred from the text of
    its 'LSAD' column.

    Args:
        unknowns: DataFrame with an integer 'FIPS Code' column.

    Returns:
        A copy of ``unknowns`` (same index and row order) in which 'FIPS Code'
        is renamed 'FIPS Code_l', the ``cbsa_df`` columns are appended (its key
        as 'FIPS Code_r', plus 'CBSA' and 'LSAD'), and 'CBSA Level' is 2 where
        LSAD contains "Metropolitan", 1 where it contains "Micropolitan", and
        0 otherwise (including counties with no match).
    """
    # DataFrame.join(on=...) matches the caller's column against the *index* of
    # the other frame. cbsa_df carries a default 0..n-1 index, so joining it
    # directly compares FIPS codes with row numbers. Key the lookup by FIPS
    # code instead, keeping one row per county so the left join cannot add rows.
    lookup = cbsa_df.drop_duplicates(subset='FIPS Code')
    lookup = lookup.set_index(lookup['FIPS Code'].to_numpy())
    # Rename both key columns explicitly so the output column names do not
    # depend on how pandas suffixes an overlapping join key.
    lookup = lookup.rename(columns={'FIPS Code': 'FIPS Code_r'})
    unk = unknowns.join(lookup,on='FIPS Code',how='left',lsuffix='_l',rsuffix='_r')
    unk = unk.rename(columns={'FIPS Code': 'FIPS Code_l'})

    # Unmatched rows have LSAD = NaN, which becomes the text 'nan' and matches
    # neither pattern, leaving the level at 0.
    lsad = unk['LSAD'].astype(str)
    unk['CBSA Level'] = 0
    unk.loc[lsad.str.contains('Micropolitan', regex=False), 'CBSA Level'] = 1
    # Applied last so "Metropolitan" takes precedence, as in the if/elif order.
    unk.loc[lsad.str.contains('Metropolitan', regex=False), 'CBSA Level'] = 2
    return unk

def cbsa_text(df,yr):
    """Add a 'CBSA Text' label derived from 'CBSA Level' and 'CBSA Code'.

    Labels: 'urban' when the level is above 0; 'rural' when the level is 0 and
    the code is -9 or 0; 'unknown' when the level is 0 with any other code;
    '????' for anything else. ``yr`` is accepted but not used. The frame is
    modified in place and also returned.
    """
    df['CBSA Text'] = ''
    for idx in df.index:
        level = df.at[idx,'CBSA Level']
        if int(level) > 0:
            label = 'urban'
        elif level == 0:
            code = df.at[idx,'CBSA Code']
            label = 'rural' if (code == -9 or code == 0) else 'unknown'
        else:
            label = '????'
        df.at[idx,'CBSA Text'] = label
    return df

def all_nines(li):
    """Tell whether every CBSA code in the list is the missing value 99999.

    An empty list counts as all-missing.
    """
    return all(item == 99999 for item in li)

def how_rural(code):
    """Return the rurality flag stored in ``ua_keys`` for a UA code, or NaN if absent."""
    return ua_keys.get(code, np.nan)
    
def find_ua(code):
    """Return the UA code of the first entry in ``pairs`` whose CBSA equals ``code``.

    ``pairs`` holds [UA, CBSA] entries from the census relationship file.
    Returns None when no entry matches.
    """
    return next((pair[0] for pair in pairs if pair[1] == code), None)

def ua_type(place):
    """Derive the UA Type from a relationship-file UANAME value.

    The type is the last two whitespace-separated words of the name.

    Args:
        place: the UANAME text.

    Returns:
        The last two words joined by a space (fewer if the name is shorter), or
        NaN when the name contains 'Not in a 2010 urban area' or is not a string
        (e.g. a missing value read as NaN).
    """
    if not isinstance(place, str):
        # A missing name arrives as float NaN, which has no .find().
        return np.nan
    if 'Not in a 2010 urban area' in place:
        return np.nan
    # str.split() yields the same tokens as the pattern (\S+), without the
    # invalid "\S" escape in a non-raw literal and without shadowing `list`.
    words = place.split()
    return ' '.join(words[-2:])

def get_ua_type(code):
    """Look up the UA Type for a UA code in ``ua_type_dict``; NaN when the code is unknown."""
    return ua_type_dict.get(code, np.nan)

def combo(row):
    """Concatenate a record's county FIPS code and census tract into one tract id.

    Args:
        row: mapping/Series with 'FIPS Code' and 'Census Tract'.

    Returns:
        ``str(FIPS Code) + str(Census Tract)``, or NaN when either field holds
        its missing-value sentinel (99999 / 999999, as int or as text).
    """
    fips = str(row['FIPS Code'])
    tract = str(row['Census Tract'])
    # new_vars fills missing values with the *strings* '99999' and '999999',
    # so comparing against the ints 99999 / 999999 never matched. Comparing the
    # text form covers both the string and the integer spelling.
    if fips == '99999' or tract == '999999':
        return np.nan
    # NOTE(review): 'FIPS Code' is cast to int in CBSA_partition, so codes for
    # states with FIPS < 10 lose their leading zero here -- confirm the tract
    # reference data (GEOID, rural tract list) uses the same unpadded form.
    return fips + tract
        
def rur3(row):
    """Return 1 when 'CBSA Level' is 0 or the full tract id is in ``rural_tracts``, else 0."""
    is_rural = int(row['CBSA Level']) == 0 or row['Full Census Tract'] in rural_tracts
    return 1 if is_rural else 0

def farlevel(row):
    """Collapse the four FAR flags of a row into a single level.

    Returns 0 when the flags sum to zero; otherwise the level of the flag that
    equals 1 with every higher-numbered flag summing to zero (so far4 == 1
    gives 4). Returns None if no rule applies.
    """
    f1, f2, f3, f4 = row['far1'], row['far2'], row['far3'], row['far4']
    if f1 + f2 + f3 + f4 == 0:
        return 0
    if f1 == 1 and f2 + f3 + f4 == 0:
        return 1
    if f2 == 1 and f3 + f4 == 0:
        return 2
    if f3 == 1 and f4 == 0:
        return 3
    if f4 == 1:
        return 4
    return None
    
def log_far_levels(dfx):
    """Write the distribution of 'FAR Level' values to the module-level ``logfile``.

    Logs the share of all rows with FAR Level 0, then the share of each level
    1-4 among the rows whose FAR Level is above 0. Rows with a missing FAR Level
    count toward the first denominator only.

    Args:
        dfx: DataFrame with a numeric 'FAR Level' column.
    """
    def _share(part, whole):
        # Percentage text as before; 'n/a' instead of ZeroDivisionError when
        # there is nothing to divide by (empty frame, or no non-zero levels).
        if whole == 0:
            return 'n/a'
        return str((part / whole) * 100) + '%'

    levels = dfx['FAR Level']
    n_zero = int((levels == 0).sum())
    print('Not Far and remote in InfoGroup file:',_share(n_zero, len(dfx)),file=logfile)
    print('Distribution of FAR types for enterprises with non-zero FAR Level:',file=logfile)
    n_nonzero = int((levels > 0).sum())
    for level in (1, 2, 3, 4):
        n_level = int((levels == level).sum())
        print(f'\tFAR Level {level}:',_share(n_level, n_nonzero),file=logfile)
    
def showtime(num):
    """Print a progress label followed by the current local time (dd/mm/YYYY HH:MM:SS)."""
    stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(str(num),'  ',stamp)

## Reference datasets and derived data structures

In [19]:
# Census relationship file: counties to CBSAs.
# Only STCOU (renamed 'FIPS Code' and cast to int), CBSA and LSAD are kept;
# missing cells are replaced with the text '-9' before the cast.
cbsa_df = (
    pd.read_csv(
        '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/cbsa-county-relationships-2017.csv',
        usecols=['STCOU','CBSA','LSAD'],
    )
    .fillna('-9')
    .rename(columns={'STCOU':'FIPS Code'})
)
cbsa_df['FIPS Code'] = cbsa_df['FIPS Code'].astype(int)

# Census relationship files: Urban Areas to CBSAs
relationship = '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/Urban_Area_to_Metro_Micro_Area_utf-8.csv'
rel = pd.read_csv(relationship)
rel['UA Type'] = rel['UANAME'].apply(ua_type)
# Make a list of lists of all the UA/CBSA pairs in the relationship file.
pairs = rel[['UA','CBSA']].values.tolist()
# Create dict from list of lists. pair[0] is UA, pair[1] is CBSA.
# The result is a dict with keys of UAs and values of lists of CBSAs that are
# associated with each UA in the relationship file.
# setdefault replaces the earlier try / bare-except, which swallowed every
# exception type (not just the expected KeyError for a first-seen UA).
uas = {}
for pair in pairs:
    uas.setdefault(pair[0], []).append(pair[1])

# Classify each UA by the CBSAs it touches; 99999 is the missing CBSA code.
ua_keys = dict.fromkeys(uas.keys())
for k,v in uas.items():
    if len(v) > 1:
        if all_nines(v):
            ua_keys[k] = 'rural-multi'
        elif 99999 in v:
            ua_keys[k] = 'partly rural'
        elif k == 99999:
            # Only reached for UA 99999 when none of its CBSAs is 99999.
            ua_keys[k] = 'unknown'
        else:
            ua_keys[k]= 'urban-multi'
    else:
        if v[0] == 99999:
            ua_keys[k] = 'rural-single'
        else:
            ua_keys[k] = 'urban-single'

# Create a dict with the UA as key and the UA Type as value.
# When a UA occurs on several rows, the last row's type is kept.
ua_type_dict = {}
for v in rel[['UA','UA Type']].values.tolist():
    ua_type_dict[v[0]] = v[1]
    
# FORHP list of 2300+ rural census tracts.
# Only lines that begin with a space are considered; of those, the stripped
# text is kept when its first character is numeric.
rural_tracts = []
with open('/InfoGroup/data/rurality/tract_data.txt','r') as fin:
    for raw in fin:
        if raw[0] != ' ':
            continue
        candidate = raw.strip()
        # An all-whitespace line strips to '' and is skipped.
        if candidate and candidate[0].isnumeric():
            rural_tracts.append(candidate)

# Census Bureau: all census tracts, 2010 slightly updated in 2017.
# Both columns are read as text; GEOID is the key used by merges_and_output.
tracts_file = (
    '/home/tflory/notebooks/InfoGroup/rurality/points-in-polygons/data/all_tracts.csv'
)
tracts = pd.read_csv(tracts_file,usecols=['GEOID','rural_tract'],dtype=object)

# ERS: Frontier and Remote census data tracts
far_file = '/InfoGroup/data/rurality/FARcodesZIPdata2010WithAKandHI.csv'
df_far = pd.read_csv(far_file,dtype=object)
# Restore leading zeros so ZIP can serve as a 5-character merge key.
# The earlier condition `len(x) < 5 == 0` is a chained comparison
# (`len(x) < 5 and 5 == 0`) that is always False, so nothing was ever padded.
df_far['ZIP'] = df_far['ZIP'].str.zfill(5)
# One row per ZIP / flag combination, with the four flags as ints and a
# single derived 'FAR Level' (see farlevel).
df_zip = df_far[['ZIP','far1','far2','far3','far4']].copy()
df_zip[['far1','far2','far3','far4']] = df_zip[['far1','far2','far3','far4']].astype(int)
df_zip['FAR Level'] = df_zip.apply(farlevel,axis=1)
df_zip = df_zip.drop_duplicates()


## Main

In [20]:
# Open the run log (truncating any previous all_steps.log).
# `logfile` is read as a module-level global by extract_and_correct,
# CBSA_partition, new_vars, merges_and_output and log_far_levels, so this cell
# must run before the main loop; the handle is closed in the last cell.
logfile = open('all_steps.log','w')

In [21]:
# Process every year from 1997 through 2014 (range end is exclusive).
# The loop variable must be named `yr`: extract_and_correct, CBSA_partition and
# merges_and_output read it as a global to build file names and log entries.
# showtime() prints a timestamp after each stage so stage durations can be
# read off the cell output.
for yr in range(1997,2015):
    showtime('start')
    # Load the year's extract and normalise state/county FIPS and NAICS fields.
    df = extract_and_correct()
    showtime('extract_and_correct')
    # Split by CBSA fields; `corrected` holds the re-derived unknown rows.
    (urban,rural,corrected) = CBSA_partition(df)
    showtime('CBSA_partition')
    # Recombine the three groups into one frame with a fresh 0..n-1 index.
    final_df = pd.concat([urban,rural,corrected],ignore_index=True)
    showtime('inline concat')
    # Add UA, full-census-tract and HRSA rurality variables (steps 5-x).
    final_df = new_vars(final_df)
    showtime('new_vars (5)')
    # Merge FAR and tract data and write the year's CSV (steps 6-x).
    merges_and_output(final_df)
    showtime('merges_and_output (6)')

start    30/07/2020 17:49:49
extract_and_correct    30/07/2020 18:18:45
CBSA_partition    30/07/2020 18:20:40
inline concat    30/07/2020 18:21:10
	5-0    30/07/2020 18:37:45
	5-1    30/07/2020 18:37:51
	5-2    30/07/2020 18:37:58
	5-3    30/07/2020 19:02:29
	5-4    30/07/2020 19:28:43
	5-5    30/07/2020 19:28:46
new_vars (5)    30/07/2020 19:28:46
	6-0    30/07/2020 19:30:10
	6-1    30/07/2020 19:32:00
	6-2
    30/07/2020 19:36:51
merges_and_output (6)    30/07/2020 19:36:56
start    30/07/2020 19:36:56
extract_and_correct    30/07/2020 20:05:47
CBSA_partition    30/07/2020 20:07:39
inline concat    30/07/2020 20:08:10
	5-0    30/07/2020 20:24:43
	5-1    30/07/2020 20:24:48
	5-2    30/07/2020 20:24:55
	5-3    30/07/2020 20:49:53
	5-4    30/07/2020 21:16:36
	5-5    30/07/2020 21:16:40
new_vars (5)    30/07/2020 21:16:40
	6-0    30/07/2020 21:18:03
	6-1    30/07/2020 21:19:45
	6-2
    30/07/2020 21:24:41
merges_and_output (6)    30/07/2020 21:24:46


In [22]:
# Flush and close the run log opened before the main loop.
logfile.close()