In [None]:
# default_exp extract

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import pandas as pd
from zipfile import ZipFile
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Column names handed to `usecols` when each yearly csv is read.
cols = [
    'Company',
    'Address Line 1',
    'City',
    'State',
    'ZipCode',
    'Employee Size (5) - Location',
    'County Code',
    'Sales Volume (9) - Location',
    'Primary NAICS Code',
    'NAICS8 Descriptions',
    'Primary SIC Code',
    'SIC6_Descriptions',
    'Business Status Code',
    'Industry Specific First Byte',
    'Year Established',
    'Subsidiary Number',
    'Parent Number',
    'Parent Actual Employee Size',
    'Parent Actual Sales Volume',
    'ABI',
    'Census Tract',
    'Census Block',
    'Latitude',
    'Longitude',
    'CBSA Code',
    'CBSA Level',
    'FIPS Code',
]

In [None]:
# Lookup from two-letter state/territory abbreviation to two-digit state FIPS code.
# Prior examination showed data-entry errors in the 'FIPS Code' variable, so the
# state code is assigned to each record by brute force from this table instead.
state_fips = {
    'AL': '01', 'AK': '02', 'AS': '60', 'AZ': '04', 'AR': '05', 'CA': '06',
    'CO': '08', 'CT': '09', 'DE': '10', 'DC': '11', 'FL': '12', 'FM': '64',
    'GA': '13', 'GU': '66', 'HI': '15', 'ID': '16', 'IL': '17', 'IN': '18',
    'IA': '19', 'KS': '20', 'KY': '21', 'LA': '22', 'ME': '23', 'MH': '68',
    'MD': '24', 'MA': '25', 'MI': '26', 'MN': '27', 'MS': '28', 'MO': '29',
    'MT': '30', 'NE': '31', 'NV': '32', 'NH': '33', 'NJ': '34', 'NM': '35',
    'NY': '36', 'NC': '37', 'ND': '38', 'MP': '69', 'OH': '39', 'OK': '40',
    'OR': '41', 'PW': '70', 'PA': '42', 'PR': '72', 'RI': '44', 'SC': '45',
    'SD': '46', 'TN': '47', 'TX': '48', 'UM': '74', 'UT': '49', 'VT': '50',
    'VA': '51', 'VI': '78', 'WA': '53', 'WV': '54', 'WI': '55', 'WY': '56',
}

In [None]:
# Short descriptions for 2-digit NAICS codes.
# Several sectors span more than one 2-digit code (31-33, 44-45, 48-49) and share
# a label. '99' is the label for records whose 'Primary NAICS Code' was missing
# and filled with '99999999'.
naics_desc = {
    '11': 'Agriculture, Forestry, Fishing and Hunting',
    '21': 'Mining',
    '22': 'Utilities',
    '23': 'Construction',
    '31': 'Manufacturing',
    '32': 'Manufacturing',
    '33': 'Manufacturing',
    '42': 'Wholesale Trade',
    '44': 'Retail Trade',
    '45': 'Retail Trade',
    '48': 'Transportation and Warehousing',
    '49': 'Transportation and Warehousing',
    '51': 'Information',
    '52': 'Finance and Insurance',
    '53': 'Real Estate Rental and Leasing',
    '54': 'Professional, Scientific, and Technical Services',
    '55': 'Management of Companies and Enterprises',
    '56': 'Administrative and Support and Waste Management and Remediation Services',
    # Spelling fixed: this label was previously written out as 'Eucational Services'.
    '61': 'Educational Services',
    '62': 'Health Care and Social Assistance',
    '71': 'Arts, Entertainment, and Recreation',
    '72': 'Accommodation and Food Services',
    '81': 'Other Services (except Public Administration)',
    '92': 'Public Administration',
    '99': 'Unknown',
}

In [None]:
# Small helper functions, applied element-wise to DataFrame columns
def state_code(state):
    """Look up the state FIPS code for a state abbreviation.

    The value comes straight from the `state_fips` table, so an abbreviation
    that is not a key of that table raises KeyError.
    """
    fips = state_fips[state]
    return fips

def is_continental(code):
    """Create a 1/0 flag for the 48 continental states and DC, excluding HI and AK.

    This could be useful for mapping.

    Parameters
    ----------
    code : int or str
        State FIPS code; anything `int()` accepts (e.g. 6, '06').

    Returns
    -------
    str
        '1' for the 48 states and DC, '0' for Alaska (2), Hawaii (15) and any
        territory code.
    """
    fips = int(code)
    # State codes top out at 56 (WY); territory codes start at 60. Without the
    # upper bound a territory such as Puerto Rico (72) would be flagged '1'.
    if fips > 56 or fips in (2, 15):
        return '0'
    return '1'

def two_digit(code):
    """Derive the 2-digit NAICS code from the 8-digit Primary NAICS Code.

    Returns the first two characters of `code`; shorter inputs come back unchanged.
    """
    return code[:2]

def desc(code):
    """Return the text description for a 2-digit NAICS code.

    The label is read from the `naics_desc` table; a code that is not a key of
    that table raises KeyError.
    """
    label = naics_desc[code]
    return label


In [None]:
# Source directory holding the yearly zip archives, and a scratch directory for
# the csv extracted from each one. Both are loop-invariant, so set them once.
# NOTE(review): `dir` shadows the Python builtin; the name is kept here in case
# later cells rely on it.
dir = '/InfoGroup/data/original/'
xdir = '/tmp/xtrcts/'

# Build one enhanced extract per year, 1998 through 2016.
for yr in range(1998,2017):
    fname = f'{yr}_Business_Academic_QCQ_utf-8.'
    csv_path = xdir + fname + 'csv'
    try:
        # Unpack this year's csv into the scratch directory; the archive is
        # only needed for the extraction itself.
        with ZipFile(dir + fname + 'zip','r') as myzip:
            myzip.extract(fname + 'csv',xdir)

        # Read everything as strings so codes keep their leading zeros.
        df = pd.read_csv(csv_path,dtype=object,usecols=cols)

        # Overwrite the state FIPS code, derived from the 'State' abbreviation.
        # state_code raises KeyError for an abbreviation missing from state_fips.
        df['State FIPS'] = df['State'].apply(state_code).astype(int)

        # Exclude territories, leaving only the 50 states and DC. Take a copy so
        # the column assignments below act on an independent frame rather than
        # on a slice of the full one.
        df = df[df['State FIPS'] <= 56].copy()

        # Flag the 48 continental states
        df['Continental'] = df['State FIPS'].apply(is_continental)

        # Give 'Primary NAICS Code' a useable missing value. Assign the result
        # back: fillna(inplace=True) on a selected column is chained assignment
        # and is not guaranteed to update df.
        df['Primary NAICS Code'] = df['Primary NAICS Code'].fillna('99999999')

        # Give 'CBSA Level' a missing value of 0.
        df['CBSA Level'] = df['CBSA Level'].fillna('0')

        # Create 2-digit NAICS code
        df['NAICS2'] = df['Primary NAICS Code'].apply(two_digit)

        # Create descriptions of the 2-digit NAICS codes
        df['NAICS2 desc'] = df['NAICS2'].apply(desc)

        # Write out the enhanced extract csv file
        outfile = f'/InfoGroup/data/rurality/df_{yr}_uncorrected.csv'
        df.to_csv(outfile,index=False)
    finally:
        # Always clean up the extracted csv, even when a step above failed.
        if os.path.exists(csv_path):
            os.remove(csv_path)