In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [None]:
# County-to-CBSA relationship table (2017 delineation file). Only the three
# columns used downstream are read; missing values become the string '-9' and
# 'STCOU' is renamed to 'FIPS Code' so it lines up with the InfoGroup frames.
cbsa_df = (
    pd.read_csv(
        '/home/tflory/notebooks/InfoGroup/rurality/Relationship_Files/cbsa-county-relationships-2017.csv',
        usecols=['STCOU', 'CBSA', 'LSAD'],
    )
    .fillna('-9')
    .rename(columns={'STCOU': 'FIPS Code'})
)

In [None]:
def extract_corrections(unknowns, cbsa=None):
    """Look up the CBSA record for each row's county and derive its CBSA Level.

    Parameters
    ----------
    unknowns : pandas.DataFrame
        Rows to correct. Must have a 'FIPS Code' column whose values can be
        converted to int. The frame itself is not modified.
    cbsa : pandas.DataFrame, optional
        County-to-CBSA relationship table with columns 'FIPS Code', 'CBSA'
        and 'LSAD'. Defaults to the notebook-level ``cbsa_df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``unknowns`` left-joined to the
        relationship table. Because both inputs carry a 'FIPS Code' column the
        result has 'FIPS Code_l' (from ``unknowns``, as int) and 'FIPS Code_r'
        (from the table), plus 'CBSA', 'LSAD' and a recomputed 'CBSA Level':
        2 if 'LSAD' contains "Metropolitan", 1 if it contains "Micropolitan",
        otherwise 0 (including counties not found in the table).
    """
    if cbsa is None:
        cbsa = cbsa_df

    # Work on copies so neither the shared lookup table nor the caller's
    # (possibly sliced) frame is mutated.
    lookup = cbsa.copy()
    lookup['FIPS Code'] = lookup['FIPS Code'].astype(int)
    # DataFrame.join(on=...) matches the caller's column against the *index*
    # of the other frame, so the lookup must be indexed by county FIPS code;
    # with the default RangeIndex the codes would be matched to row numbers.
    # NOTE(review): assumes each county appears once in the relationship
    # file; a repeated county would duplicate the matching rows - confirm.
    lookup.index = lookup['FIPS Code'].to_numpy()

    unk = unknowns.copy()
    unk['FIPS Code'] = unk['FIPS Code'].astype(int)
    unk = unk.join(lookup, on='FIPS Code', how='left', lsuffix='_l', rsuffix='_r')

    # The CBSA Level is inferred from the text of 'LSAD'. Unmatched counties
    # have a missing LSAD, which stringifies to 'nan' and stays at level 0.
    lsad = unk['LSAD'].astype(str)
    unk['CBSA Level'] = 0
    unk.loc[lsad.str.contains('Micropolitan', regex=False), 'CBSA Level'] = 1
    # Applied last so that "Metropolitan" takes precedence, as before.
    unk.loc[lsad.str.contains('Metropolitan', regex=False), 'CBSA Level'] = 2
    return unk

In [None]:
def cbsa_text(df,yr):
    """Add a 'CBSA Text' label column derived from 'CBSA Level' and 'CBSA Code'.

    Labels, in order of precedence:
      * 'urban'   - CBSA Level (as int) is greater than 0
      * 'rural'   - CBSA Level is 0 and CBSA Code is -9 or 0
      * 'unknown' - CBSA Level is 0 with any other CBSA Code
      * '????'    - anything else (e.g. a negative CBSA Level)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CBSA Level' and 'CBSA Code'. Modified in place.
    yr : int
        Not used by the function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'CBSA Text' added or overwritten.
    """
    level = df['CBSA Level']
    code = df['CBSA Code']

    is_urban = level.astype(int) > 0
    level_is_zero = level == 0
    no_cbsa_code = (code == -9) | (code == 0)

    # Assign from lowest to highest precedence so later masks overwrite
    # earlier ones; boolean masks also work when the index is not unique,
    # which a per-row .at lookup does not.
    df['CBSA Text'] = '????'
    df.loc[level_is_zero, 'CBSA Text'] = 'unknown'
    df.loc[level_is_zero & no_cbsa_code, 'CBSA Text'] = 'rural'
    df.loc[is_urban, 'CBSA Text'] = 'urban'
    return df

In [None]:
# For each year: split enterprises by CBSA status, fill in the missing CBSA
# level from the county relationship table, label every row, and write the
# combined result.
for yr in range(1997,2018):
    # Header so each year's counts can be told apart in the cell output.
    print(f'*** {yr} ***')

    in_fname = f'/InfoGroup/data/rurality/df_{yr}_uncorrected.csv'
    # Every column is read as text; the CBSA fields are converted just below.
    df = pd.read_csv(in_fname,dtype=object)
    # -9 stands in for a missing CBSA code/level, '00000' for a missing county.
    # Plain assignment is used instead of fillna(inplace=True) on a column
    # selection, which is not guaranteed to write back to the frame.
    df['CBSA Code'] = df['CBSA Code'].fillna(-9).astype(int)
    df['CBSA Level'] = df['CBSA Level'].fillna(-9).astype(int)
    df['FIPS Code'] = df['FIPS Code'].fillna('00000')

    # .copy() makes each subset an independent frame, so adding a column does
    # not rely on chained assignment onto a filtered view of df.
    urban = df[(df['CBSA Level'] > 0)].copy()
    urban['CBSA Text'] = 'urban'
    rural = df[(df['CBSA Code'] > 0) & (df['CBSA Level']==0)].copy()
    rural['CBSA Text'] ='rural'
    unknown = df[(df['CBSA Code'] < 0) | ((df['CBSA Code'] == 0) & (df['CBSA Level']==0))].copy()

    # The three masks are not guaranteed to be disjoint or exhaustive, so
    # verify that every input row landed in exactly one subset.
    nrows = len(df)
    sum_of_parts = len(urban) + len(rural) + len(unknown)
    if sum_of_parts != nrows:
        print(f'{yr} Error in dividing enterprises into categories:')
        print(f'\t{nrows} != {sum_of_parts}')

    # Look up the county's CBSA record, label the rows, then drop the helper
    # columns added by the join and restore the original 'FIPS Code' name.
    corrected = extract_corrections(unknown)
    corrected = cbsa_text(corrected,yr)
    corrected = corrected.drop(columns=['CBSA','FIPS Code_r','LSAD'])
    corrected = corrected.rename(columns={"FIPS Code_l": "FIPS Code"})

    print(corrected['CBSA Level'].value_counts())
    final_df = pd.concat([urban,rural,corrected],ignore_index=True)
    print(final_df['CBSA Text'].value_counts())

    out_fname = f'/InfoGroup/data/rurality/InfoGroup_final_{yr}.csv'
    # Write the corrected, labelled frame; previously the uncorrected df was
    # written, so none of the work above reached the output file.
    final_df.to_csv(out_fname,index=False)


*** 1997 ***
0    794019
2      6891
1      4690
Name: CBSA Level, dtype: int64
urban    10469902
rural      794019
Name: CBSA Text, dtype: int64

*** 1998 ***
0    770262
2      6748
1      4601
Name: CBSA Level, dtype: int64
urban    10003949
rural      770262
Name: CBSA Text, dtype: int64

*** 1999 ***
0    789067
2      7062
1      4679
Name: CBSA Level, dtype: int64
urban    10230999
rural      789067
Name: CBSA Text, dtype: int64

*** 2000 ***
0    792797
2      7093
1      4798
Name: CBSA Level, dtype: int64
urban    10376479
rural      792797
Name: CBSA Text, dtype: int64

*** 2001 ***
0    903899
2      8265
1      5345
Name: CBSA Level, dtype: int64
urban    11645485
rural      903899
Name: CBSA Text, dtype: int64

*** 2002 ***
0    910425
2      8283
1      5410
Name: CBSA Level, dtype: int64
urban    11938524
rural      910425
Name: CBSA Text, dtype: int64

*** 2003 ***
0    887255
2      7966
1      5282
Name: CBSA Level, dtype: int64
urban    11768331
rural      887255
N