In [1]:
# default_exp log_cbsa_missing

# Log 'CBSA Level' Missing Data
> Finds and logs correctable anomalies in the 'CBSA Level' variable.

In [2]:
# export
import pandas as pd
import warnings
# Suppress all warnings from this point on: no category, module or message filter is
# given, so the "ignore" action applies to every warning raised afterwards.
warnings.filterwarnings("ignore")

In [3]:
# export
def count_nans(s):
    """Return how many items of ``s`` are missing values.

    An item is treated as missing when ``str(item)`` equals ``'nan'``, which is how a
    float NaN is rendered. A literal ``'nan'`` string is therefore counted as well.

    Parameters
    ----------
    s : iterable
        Values to inspect, e.g. a set or list of floats.

    Returns
    -------
    int
        Number of items whose string form is ``'nan'``.
    """
    return sum(1 for value in s if str(value) == 'nan')

In [4]:
# export
# Open the log file in append mode, so entries written by this run are added after
# whatever the file already contains.
logfile = open('001-CBSA.log', mode='a')

In [5]:
# export
# Scan each yearly extract (1997 through 2017) and log the records that carry a CBSA Code
# other than '00000' together with a CBSA Level of '0'.
for yr in range(1997,2018):
    fname = f'/InfoGroup/data/rurality/InfoGroup_{yr}_extract.csv'
    # dtype=object stops pandas from inferring numeric types, so the columns can be
    # compared against the string literals used below.
    df = pd.read_csv(fname,dtype=object)

    # 'unknown' holds the rows that have a CBSA Code other than '00000' but a CBSA Level
    # of '0'. A missing (NaN) code also compares as not equal to '00000', so such rows are
    # selected too. An empty frame means this year needs no further action in this module.
    unknown = df.loc[df['CBSA Code'].ne('00000') & df['CBSA Level'].eq('0')]
    print(f'\nNumber of unknown cases in {yr}: ' + str(len(unknown)),file=logfile)

    # 'unk_cbsas' is the set of distinct 'CBSA Code' values found on those rows. It is
    # expected to always be {nan}, in which case the proper identifiers for the small
    # number of cases in each year have to be looked up. An empty set means that no
    # corrections are necessary.
    unk_cbsas = set(unknown['CBSA Code'].tolist())
    unknowns = len(unk_cbsas)
    if unknowns > 0:
        print(f'Number of unknown values in {yr}: ' + str(unknowns),file=logfile)
        # Report whether every distinct code in the set is a missing value.
        nans = count_nans(unk_cbsas)
        summary = '\tAll are missing' if nans == unknowns else '\tNot all are missing'
        print(summary,file=logfile)
    else:
        print(f'No unknowns in {yr}',file=logfile)

In [6]:
# export
# Close the log file handle opened earlier in the notebook.
logfile.close()