# Retrieve files from census.gov

In [1]:
import requests
import zipfile
from zipfile import BadZipFile
from io import BytesIO
import os
import pandas as pd

In [2]:
# Create 4 character year and 2 character year
# The 2-character suffixes are derived from yr4 (not from a second range
# literal) so the two lists always stay aligned when zipped together.
yr4 = list(range(1986, 2021))
yr2 = [str(y)[2:] for y in yr4]

In [3]:
# Download each year's state-level CBP zip archive, unpack it into the
# working directory, and rename the first archive member to '<year>.txt'.
# A year that cannot be fetched or unpacked is reported and skipped so the
# remaining years are still processed.
for f, t in zip(yr4, yr2):
    print(f'...getting file for {f}')
    url = f'https://www2.census.gov/programs-surveys/cbp/datasets/{f}/cbp{t}st.zip'
    # Without a timeout a single stalled connection would hang the whole run
    req = requests.get(url, timeout=60)
    if not req.ok:
        # An HTTP error body is not an archive; report the real cause rather
        # than letting it surface as a misleading "Bad Zip File"
        print(f'HTTP {req.status_code} for {f}')
        continue
    try:
        # The context manager closes the archive even if extraction fails
        with zipfile.ZipFile(BytesIO(req.content)) as z:
            oldname = z.namelist()[0]
            z.extractall()
    except BadZipFile:
        print(f'Bad Zip File for {f}')
    else:
        # os.replace overwrites an existing '<year>.txt' on every platform,
        # so the cell can be re-run (os.rename fails on Windows in that case)
        os.replace(oldname, f'{f}.txt')

...getting file for 1986
...getting file for 1987
...getting file for 1988
...getting file for 1989
...getting file for 1990
...getting file for 1991
...getting file for 1992
...getting file for 1993
...getting file for 1994
...getting file for 1995
...getting file for 1996
...getting file for 1997
...getting file for 1998
...getting file for 1999
...getting file for 2000
...getting file for 2001
...getting file for 2002
...getting file for 2003
...getting file for 2004
...getting file for 2005
...getting file for 2006
...getting file for 2007
...getting file for 2008
...getting file for 2009
...getting file for 2010
...getting file for 2011
...getting file for 2012
...getting file for 2013
...getting file for 2014
...getting file for 2015
...getting file for 2016
...getting file for 2017
...getting file for 2018
...getting file for 2019
...getting file for 2020


### Column Description
**fipstate**: FIPS state code<br>
**sic**: 4-digit Standard Industrial Classification Code *1986-1997*<br>
**naics**: North American Industry Classification System *1998-2020*<br>
**emp**: Total Mid-March Employees<br>
**qp1**: First quarter payroll (1000 dollars)<br>
**ap**: Annual Payroll (1000 dollars)<br>
**est**: Number of establishments<br>

In [4]:
# Concat datasets together
# Each yearly file is read, normalised to a common set of columns, and
# collected in a list; a single pd.concat at the end avoids re-copying the
# growing result on every iteration.
keep_cols = ['year', 'fipstate', 'sic', 'naics', 'emp', 'qp1', 'ap', 'est']
year = list(range(1986, 2021))
frames = []
for y in year:
    # Both header spellings are listed so the state code is read as a string
    # whichever case a given file uses
    df = pd.read_csv(f'{y}.txt', dtype={'fipstate': str,
                                       'FIPSTATE': str})
    # Lower-case the headers first so every file exposes the same names
    df.columns = df.columns.str.lower()
    df['year'] = y
    # reindex keeps only the wanted columns and adds any that a file lacks
    # (e.g. 'sic' or 'naics') filled with NaN
    df = df.reindex(columns=keep_cols)
    frames.append(df)
full = pd.concat(frames, ignore_index=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Attach a state name to every row of the combined dataset

# Load the FIPS-code lookup table. The dtype key ' st' (with a leading
# space) is the raw header spelling this code expects -- presumably the CSV
# pads its fields with spaces, hence the stripping of headers and codes.
state = (
    pd.read_csv('us-state-ansi-fips.csv', dtype={' st': str})
    .rename(columns=lambda name: name.strip())
    .rename(columns={'st': 'fipstate'})
    .loc[:, ['stname', 'fipstate']]
    .assign(fipstate=lambda tbl: tbl['fipstate'].str.strip())
)
# Left merge keeps every row of `full`, including codes with no match
new = full.merge(state, on='fipstate', how='left')
output_cols = ['year', 'stname', 'sic', 'naics', 'emp', 'qp1', 'ap', 'est']
new = new.loc[:, output_cols]
new.head()

Unnamed: 0,year,stname,sic,naics,emp,qp1,ap,est
0,1986,Alabama,----,,1164166,4680884,19306212,81811
1,1986,Alabama,07--,,5627,13687,59631,838
2,1986,Alabama,0700,,4573,10473,47724,723
3,1986,Alabama,0710,,0,0,0,10
4,1986,Alabama,0720,,306,1176,4646,58


In [6]:
# Write the merged dataset to CSV, leaving out the DataFrame index
output_path = 'CBP_full_dataset.csv'
new.to_csv(output_path, index=False)