In [17]:
# default_exp Census_Bureau_urban_areas

In [18]:
# hide
# From https://www2.census.gov/geo/pdfs/reference/GARM/Ch12GARM.pdf:

# "Urbanized Areas (UAs)
# A UA is a continuously built-up area with a population of 50,000 or more.
# It comprises one or more places—central place(s)—and the adjacent densely settled 
# surrounding area—urban fringe—consisting of other places and nonplace territory."

# The Urban Area concept is a combination of urbanized areas and urban clusters, the
# distinction being based on population size and settlement density. The urbanized area
# has a greater population size and density of settlement.

# "Rural Places and Territory
# Territory, population, and housing units that the Census Bureau does not classify as urban 
# are classified as rural."

# Census Urbanized Areas and Urban Clusters are not based on political boundaries at all.

In [19]:
# export
import pandas as pd
import numpy as np
import re

In [20]:
# export
# Open the run log for writing; mode 'w' truncates any existing '004-UA.log'.
# The path is relative, so the file is created in the current working directory.
# The handle is module-level state: later cells print to it with `file=logfile`
# and the final cell closes it with `logfile.close()`.
logfile = open('004-UA.log','w')

In [21]:
# export
def all_nines(li):
    """Return True when every CBSA code in `li` is the missing-value code 99999.

    An empty list also yields True, because it holds no code other than 99999.
    """
    missing = [code for code in li if code == 99999]
    return len(missing) == len(li)

def remove_all(li,val):
    """Remove all occurrences of a particular value from a list.

    The list is modified in place and the same list object is returned, so
    callers may use either the return value or the list they passed in.

    Parameters
    ----------
    li : list
        The list to clean.
    val : object
        The value to strip out; items are compared with ``!=``.

    Returns
    -------
    list
        `li` itself, with every item equal to `val` removed and the order of
        the remaining items unchanged.
    """
    # Slice assignment rewrites the contents of the caller's list object.
    # A counter-driven `remove` loop is not safe here: the list shrinks while
    # the counter grows, so it stops early whenever `val` makes up more than
    # about half of the items (e.g. [9, 9, 9] would come back as [9]).
    li[:] = [item for item in li if item != val]
    return li

def how_rural(code):
    """Look up the rurality tag recorded for a UA code.

    Reads the module-level `ua_keys` mapping. A code that is not a key of
    that mapping yields NaN.
    """
    return ua_keys.get(code, np.nan)
    
def find_ua(code):
    """Return the UA code paired with a CBSA code in the census relationship file.

    Walks the module-level `pairs` list of [UA, CBSA] rows in order and hands
    back the UA of the first row whose CBSA equals `code`. When no row matches
    the result is None.
    """
    return next((row[0] for row in pairs if row[1] == code), None)

def ua_type(place):
    """Create the UA Type value by taking the last two words of the
       relationship file's UANAME value.

    Parameters
    ----------
    place : str
        A UANAME value, e.g. 'Adrian, MI Urban Cluster'.

    Returns
    -------
    str or float
        The last two whitespace-separated words of `place` joined by a single
        space (fewer if the name has fewer than two words). NaN is returned
        when `place` contains 'Not in a 2010 urban area' or is not a string
        (for example a missing UANAME read in as NaN).
    """
    # A missing name has no type; without this guard a NaN would raise
    # AttributeError on the string operations below.
    if not isinstance(place, str):
        return np.nan
    if 'Not in a 2010 urban area' in place:
        return np.nan
    # Raw string: "\S" in a plain literal is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    words = re.findall(r"(\S+)", place)
    return ' '.join(words[-2:])

def get_ua_type(code):
    """Fetch the UA Type recorded for a UA code.

    Reads the module-level `ua_type_dict` mapping. A code that is not a key
    of that mapping yields NaN.
    """
    return ua_type_dict.get(code, np.nan)
    

In [22]:
# export
# Eventually add the Urban Area (UA) to each InfoGroup record by matching InfoGroup 'CBSA Code'
# (metro/micropolitan area) to the Census Relationship file that cross-references CBSA and UA.
# The file contains a record for all metro/micro areas (CBSAs) and all urban areas/clusters
# (UAs) in 2010.
# Prepare the data in the relationship file.
# NOTE(review): this is an absolute path into one user's home directory; the cell
# only runs where that file exists.
relationship = '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/Urban_Area_to_Metro_Micro_Area_utf-8.csv'
rel = pd.read_csv(relationship)

# Distinct CBSA and UA codes that occur in the file.
cbsa_list = rel['CBSA'].drop_duplicates().tolist()
ua_list = rel['UA'].drop_duplicates().tolist()
# 99999 is the code for a missing value in the pair: a UA with no corresponding CBSA
# or a CBSA with no corresponding UA.
# list.remove raises ValueError if 99999 is absent, so these two lines also act as
# a check that the file contains the missing-value code in both columns.
cbsa_list.remove(99999)
ua_list.remove(99999)
# Add a UA Type variable to the rel dataframe by extracting the last two
# words in the UANAME variable (ua_type returns NaN for 'Not in a 2010 urban area').
rel['UA Type'] = rel['UANAME'].apply(ua_type)
# Make a list of lists of all the UA/CBSA pairs in the relationship file.
# Each inner list is [UA, CBSA], in file row order; find_ua depends on that order.
pairs = rel[['UA','CBSA']].values.tolist()

# Create dict from list of lists. pair[0] is UA, pair[1] is CBSA.
# The result is a dict with keys of UAs and values of lists of CBSAs that are
# associated with each UA in the relationship file. The CBSAs in each list keep
# the row order of `pairs`, and the missing-value code 99999 is kept as well.
uas = {}
for pair in pairs:
    # setdefault starts an empty list the first time a UA is seen. Unlike a
    # bare `except:` fallback it cannot hide unrelated errors.
    uas.setdefault(pair[0], []).append(pair[1])
        
# This is the reverse of the uas dict. Each key is a CBSA, each value
# a list of the UAs associated with that CBSA, in the row order of `pairs`.
cbsas = {}
for pair in pairs:
    # setdefault starts an empty list the first time a CBSA is seen. Unlike a
    # bare `except:` fallback it cannot hide unrelated errors.
    cbsas.setdefault(pair[1], []).append(pair[0])

# Remove missing as a key and remove all missing values from each value list of UAs.
# `del` raises KeyError if no row has CBSA 99999.
del cbsas[99999]
# Re-assigning existing keys while iterating is safe: the set of keys does not change.
for k,v in cbsas.items():
    cbsas[k] = remove_all(v,99999)
    
# Map each UA code to its UA Type. Rows are visited in dataframe order, so when
# a UA appears on several rows the type from its last row is the one kept.
ua_type_dict = {
    ua_row[0]: ua_row[1]
    for ua_row in rel[['UA','UA Type']].values.tolist()
}

In [23]:
# hide
rel.columns

Index(['UA', 'UANAME', 'CBSA', 'MNAME', 'MEMI', 'POPPT', 'HUPT', 'AREAPT',
       'AREALANDPT', 'UAPOP', 'UAHU', 'UAAREA', 'UAAREALAND', 'MPOP', 'MHU',
       'MAREA', 'MAREALAND', 'UAPOPPCT', 'UAHUPCT', 'UAAREAPCT',
       'UAAREALANDPCT', 'MPOPPCT', 'MHUPCT', 'MAREAPCT', 'MAREALANDPCT',
       'UA Type'],
      dtype='object')

In [24]:
# hide
rel['UANAME'].value_counts()

Not in a 2010 urban area                                955
Charlotte, NC--SC Urbanized Area                          5
Aguadilla--Isabela--San Sebastián, PR Urbanized Area      5
New York--Newark, NY--NJ--CT Urbanized Area               5
Allentown, PA--NJ Urbanized Area                          5
                                                       ... 
Adrian, MI Urban Cluster                                  1
Helena-West Helena, AR Urban Cluster                      1
Portage, WI Urban Cluster                                 1
Mount Angel, OR Urban Cluster                             1
Bellefonte, PA Urban Cluster                              1
Name: UANAME, Length: 3602, dtype: int64

In [25]:
# export
# Categorize all the UAs as urban, rural, partly rural, or unknown.
# Unknown == UA 99999 (missing), whatever CBSAs it is paired with;
# Partly rural == UA with some valid CBSAs and at least 1 99999;
# Rural (entirely rural) == UA with no valid CBSAs, just some 99999s;
# Urban (not at all rural) == UA with some valid CBSAs but no 99999s.
# The -single / -multi suffix records whether the UA has one CBSA entry or several.
ua_keys = {}
for ua, cbsa_codes in uas.items():
    if ua == 99999:
        # Test the missing UA first. If this check came after the 99999-in-list
        # checks, or only ran for multi-entry lists, the missing UA could be
        # tagged as rural, partly rural or urban instead of unknown.
        ua_keys[ua] = 'unknown'
    elif len(cbsa_codes) > 1:
        if all_nines(cbsa_codes):
            ua_keys[ua] = 'rural-multi'
        elif 99999 in cbsa_codes:
            ua_keys[ua] = 'partly rural'
        else:
            ua_keys[ua] = 'urban-multi'
    elif cbsa_codes[0] == 99999:
        ua_keys[ua] = 'rural-single'
    else:
        ua_keys[ua] = 'urban-single'

In [26]:
# hide
# Tally the UA rurality tags in ua_keys and print a summary: entirely rural,
# partly rural, urban, and unknown UAs out of the total number of UAs.
len_att = len(ua_keys)
entire = sum(1 for tag in ua_keys.values() if tag in ('rural-single','rural-multi'))
part = sum(1 for tag in ua_keys.values() if tag == 'partly rural')
urban = sum(1 for tag in ua_keys.values() if tag in ('urban-single','urban-multi'))
unk = sum(1 for tag in ua_keys.values() if tag == 'unknown')

# print() separates its arguments with single spaces, which is why each line
# after the first starts with a space in the output.
print('Out of',len_att,'UAs:\n',entire,'are entirely rural\n',part,'are partly rural\n',
      urban,'are urban, and\n',unk,'= unknown.') #,file=logfile)

Out of 3602 UAs:
 908 are entirely rural
 130 are partly rural
 2563 are urban, and
 1 = unknown.


In [27]:
# hide
# On average, at least part of more than 3 UAs falls within each CBSA. Thus the
# only way to get a one-to-one mapping of CBSA to UA will be to do the shapely 'within' 
# comparison: determine whether the point coordinates of an IG enterprise fall within 
# the polygon coordinates of a particular UA.

# But to create a simple 'rural_Census' flag we need only to discover whether the CBSA Code 
# of the IG record matches any CBSA code in the relationship file that is not in an urban area.

In [28]:
# hide
# But there is more we can learn.
# Is a UA in a CBSA 1) partly, 2) entirely, or 3) not at all?
# Make a dict with the UA as the key and the value a list of all CBSA codes.

In [29]:
# hide
# The uas dict has a list of CBSAs for each UA. The ua_keys dict tags each UA as rural, 
# partly rural, or urban. But in InfoGroup we have only CBSA Code. We can determine if the 
# IG record's CBSA Code matches to a UA that is entirely urban and that is completely within 
# the CBSA; i.e., it has only the IG record's CBSA code in its
# CBSA list. And if it matches to a UA that is entirely rural; i.e., the UA has only a single 
# 99999 in its CBSA list.

# The cbsas dict is a reverse version of uas: a key for every CBSA and, as the value, the 
# list of UAs associated with the key. 

# The cbsas_keys dict substitutes the urban/rural tag of each UA for its code value.
# Using those lists of text values we can determine whether a record is rural, partly 
# rural, or urban.

In [30]:
# hide
# CBSA key
# A CBSA is not covered entirely by UAs, so is it valid to assess the rurality of a record if 
# it only has a CBSA Code and no way to categorize the parts of the CBSA that are not covered 
# by any UA?
# Create dict from list of lists.


In [31]:
# export
# Build the CBSA -> UA lookup once, before the yearly loop. setdefault keeps the
# UA from the FIRST row of `pairs` for each CBSA, which is the row find_ua would
# return. A dict lookup per record replaces a full scan of `pairs` per record.
ua_by_cbsa = {}
for pair in pairs:
    ua_by_cbsa.setdefault(pair[1], pair[0])

# For each yearly InfoGroup file: attach the UA code, UA type and UA rurality tag
# to every record, log the value distributions, and write the result as *_nb04.csv.
for yr in range(1997,2018):
    print(f'\n{yr}:',file=logfile)
    infile = f'/InfoGroup/data/rurality/InfoGroup_{yr}_nb03.csv'
    # Every column is read as object (string) so no values are re-typed on load.
    df = pd.read_csv(infile,dtype=object)
    # Records without a CBSA get the relationship file's missing-value code, so
    # they are matched to the first UA row that has no CBSA.
    df['CBSA Code'] = df['CBSA Code'].fillna(99999).astype(int)
    # dict.get returns None for a CBSA that is not in the relationship file,
    # the same result find_ua gives when no row matches.
    df['UA Code'] = df['CBSA Code'].apply(ua_by_cbsa.get)
    df['UA Type'] = df['UA Code'].apply(get_ua_type)
    df['rural_by_UA'] = df['UA Code'].apply(how_rural)
    # Log counts and percentages for both derived columns.
    print(df['UA Type'].value_counts(),file=logfile)
    print(df['UA Type'].value_counts(normalize=True) * 100,file=logfile)
    print(df['rural_by_UA'].value_counts(),file=logfile)
    print(df['rural_by_UA'].value_counts(normalize=True) * 100,file=logfile)
    # write out a new file
    outfile = f'/InfoGroup/data/rurality/InfoGroup_{yr}_nb04.csv'
    df.to_csv(outfile,index=None)

In [32]:
# export
# Close the run log opened near the top of the notebook so that everything
# printed to it is flushed to '004-UA.log'.
logfile.close()