In [24]:
#default_exp fix_cbsa_errors

In [25]:
# export
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [26]:
# export
# County-to-CBSA crosswalk (2017 relationship file). Only the county code (STCOU),
# the CBSA code and the LSAD description are kept. Missing cells become the text
# sentinel '-9', and the county code is exposed as an integer 'FIPS Code' column.
cbsa_df = (
    pd.read_csv(
        '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/cbsa-county-relationships-2017.csv',
        usecols=['STCOU', 'CBSA', 'LSAD'],
    )
    .fillna('-9')
    .rename(columns={'STCOU': 'FIPS Code'})
)
cbsa_df['FIPS Code'] = cbsa_df['FIPS Code'].astype(int)

In [27]:
# export
def extract_corrections(unknowns):
    """Look up each row's county in the CBSA crosswalk and derive its CBSA Level.

    Parameters
    ----------
    unknowns : pandas.DataFrame
        Rows to correct. Must contain a 'FIPS Code' column holding the county
        FIPS code with the same (integer) dtype as cbsa_df['FIPS Code'].

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and index of `unknowns` plus the matching
        columns of the module-level `cbsa_df`. Both frames carry a 'FIPS Code'
        column, so the caller's copy comes back as 'FIPS Code_l' and the
        crosswalk's as 'FIPS Code_r'. 'CBSA' and 'LSAD' are NaN for counties
        that are not in the crosswalk. 'CBSA Level' is overwritten with 2 when
        'LSAD' contains "Metropolitan", 1 when it contains "Micropolitan",
        and 0 otherwise. `unknowns` itself is not modified.
    """
    # DataFrame.join(on=...) matches the caller's column against the *index* of
    # the other frame. cbsa_df keeps its default 0..n-1 index, so joining against
    # it directly would pair FIPS codes with row numbers. Build a lookup indexed
    # by county FIPS code instead (first row per county, so the join can never
    # multiply input rows). The 'FIPS Code' column is kept so the suffixed output
    # columns stay exactly as callers expect.
    lookup = cbsa_df.drop_duplicates(subset='FIPS Code').copy()
    lookup.index = lookup['FIPS Code'].values
    unk = unknowns.join(lookup,on='FIPS Code',how='left',lsuffix='_l',rsuffix='_r')

    # The CBSA Level is inferred from the text of 'LSAD'. Unmatched counties
    # have NaN there, which becomes the string 'nan' and matches neither pattern.
    lsad = unk['LSAD'].astype(str)
    unk['CBSA Level'] = 0
    # Micropolitan first, Metropolitan last, so Metropolitan wins if both appear.
    unk.loc[lsad.str.contains('Micropolitan',regex=False).values,'CBSA Level'] = 1
    unk.loc[lsad.str.contains('Metropolitan',regex=False).values,'CBSA Level'] = 2
    return unk

def cbsa_text(df,yr):
    """Create a text description of the CBSA level code.

    Adds (or overwrites) a 'CBSA Text' column on `df` in place and returns `df`.
    The first matching rule wins:

    * 'CBSA Level' (converted to int) > 0                  -> 'urban'
    * 'CBSA Level' == 0 and 'CBSA Code' is -9 or 0         -> 'rural'
    * 'CBSA Level' == 0 with any other 'CBSA Code'         -> 'unknown'
    * anything else (e.g. a negative level)                -> '????'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CBSA Code' and 'CBSA Level' columns.
    yr : any
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    level = df['CBSA Level']
    code = df['CBSA Code']

    is_urban = (level.astype(int) > 0).values
    level_is_zero = (level == 0).values
    is_rural = level_is_zero & ((code == -9) | (code == 0)).values

    # Assign from the lowest-priority label to the highest so that later
    # assignments override earlier ones. Plain boolean arrays are used as masks
    # so the assignment is positional and also works with repeated index labels.
    df['CBSA Text'] = '????'
    df.loc[level_is_zero,'CBSA Text'] = 'unknown'
    df.loc[is_rural,'CBSA Text'] = 'rural'
    df.loc[is_urban,'CBSA Text'] = 'urban'
    return df

# CBSA Level code (as text) -> human-readable description.
words = {'0':'rural','1':'micropolitan','2':'metropolitan'}
def make_word(code):
    """Apply a text description to the CBSA Level code.

    `code` may be the level as a string ('0', '1', '2') or as an integer
    (0, 1, 2); it is converted to text before the lookup because the table is
    keyed by strings. Raises KeyError for any other code.
    """
    return words[str(code)]

In [28]:
# export
# Open the run log in write mode (an existing 003-CBSA.log is truncated). The
# per-year loop prints its summaries to this handle and the last cell closes it.
# NOTE(review): if a later cell raises, the handle stays open until the kernel exits.
logfile = open('003-CBSA.log','w')

In [29]:
# export
# For every extract year: split the enterprises into urban / rural / unknown by
# their CBSA fields, re-derive the CBSA Level of the unknown rows from the county
# FIPS code, label every row with a 'CBSA Text', log the counts and write a new CSV.
for yr in range(1997,2018):
    print(f'\n{yr}:',file=logfile)
    in_fname = f'/InfoGroup/data/rurality/InfoGroup_{yr}_extract.csv'
    df = pd.read_csv(in_fname,dtype=object)

    # Every column is read as text; missing CBSA fields become the sentinel -9.
    # The filled column is assigned back rather than calling fillna(inplace=True)
    # on a column selection, which leaves df untouched under pandas Copy-on-Write.
    df['CBSA Code'] = df['CBSA Code'].fillna(-9).astype(int)
    df['CBSA Level'] = df['CBSA Level'].fillna(-9).astype(int)
    df['FIPS Code'] = df['FIPS Code'].astype(int)

    # .copy() gives each category its own frame, so adding 'CBSA Text' below
    # never writes into a view of df.
    urban = df[df['CBSA Level'] > 0].copy()
    urban['CBSA Text'] = 'urban'
    rural = df[(df['CBSA Code'] > 0) & (df['CBSA Level'] == 0)].copy()
    rural['CBSA Text'] = 'rural'
    unknown = df[(df['CBSA Code'] < 0) | ((df['CBSA Code'] == 0) & (df['CBSA Level'] == 0))].copy()

    # The three masks are not guaranteed to partition df (e.g. a row with a
    # positive code and level -9 matches none of them), so any mismatch is logged.
    nrows = len(df)
    sum_of_parts = len(urban) + len(rural) + len(unknown)
    if sum_of_parts != nrows:
        print(f'Error in dividing enterprises into categories:',file=logfile)
        print(f'\t{nrows} != {sum_of_parts}',file=logfile)

    corrected = extract_corrections(unknown)
    corrected = cbsa_text(corrected,yr)
    # Remove the crosswalk columns added by the join and restore the original
    # column name, so corrected lines up with urban and rural for the concat.
    corrected = corrected.drop(columns=['CBSA','FIPS Code_r','LSAD'])
    corrected = corrected.rename(columns={"FIPS Code_l": "FIPS Code"})

    print(corrected['CBSA Level'].value_counts(),file=logfile)
    final_df = pd.concat([urban,rural,corrected],ignore_index=True)
    print(final_df['CBSA Text'].value_counts(),file=logfile)
    print(final_df['CBSA Text'].value_counts(normalize=True) * 100,file=logfile)
    print(f'rural/urban by OMB standard:',file=logfile)

    # Write a new file (without the row index)
    outfile = f'/InfoGroup/data/rurality/InfoGroup_{yr}_nb03.csv'
    final_df.to_csv(outfile,index=False)

In [30]:
# export
# Close the log handle opened earlier so everything printed to it is flushed to disk.
logfile.close()