In [32]:
import pandas as pd
import geopandas as gpd
from census import Census
from us import states
import pathlib
import os
from tqdm import tqdm

In [33]:
from decouple import Config, RepositoryEnv
config = Config(RepositoryEnv(".env"))

In [34]:
YEAR = 2021

In [None]:
VARIABLE_NAME = 'B19013'

In [35]:
c = Census(config("CENSUS_API_KEY"), year=YEAR)

### Look for variable descriptions here: https://api.census.gov/data/2021/acs/acs5/variables.html

In [36]:
ga_census = c.acs5.state_county_blockgroup(fields = ('NAME', VARIABLE_NAME+'_001E', VARIABLE_NAME+"_001M", VARIABLE_NAME+"_001MA"),
                                      state_fips = states.AL.fips,
                                      county_fips = "*",
                                      tract = "*",
                                      blockgroup = '*')

# Create a dataframe from the census data
ga_df = pd.DataFrame(ga_census)
ga_df

Unnamed: 0,NAME,B19013_001E,B19013A_001M,B19013A_001MA,state,county,tract,block group
0,"Block Group 1, Census Tract 201, Autauga Count...",41607.0,,,01,001,020100,1
1,"Block Group 2, Census Tract 201, Autauga Count...",66313.0,,,01,001,020100,2
2,"Block Group 1, Census Tract 202, Autauga Count...",42288.0,,,01,001,020200,1
3,"Block Group 2, Census Tract 202, Autauga Count...",52609.0,,,01,001,020200,2
4,"Block Group 1, Census Tract 203, Autauga Count...",75074.0,,,01,001,020300,1
...,...,...,...,...,...,...,...,...
3920,"Block Group 4, Census Tract 9658, Winston Coun...",55375.0,,,01,133,965800,4
3921,"Block Group 5, Census Tract 9658, Winston Coun...",-666666666.0,,,01,133,965800,5
3922,"Block Group 1, Census Tract 9659, Winston Coun...",27232.0,,,01,133,965900,1
3923,"Block Group 2, Census Tract 9659, Winston Coun...",35370.0,,,01,133,965900,2


In [37]:
ga_df['GEOID21'] = ga_df['state'] + ga_df['county'] + ga_df['tract'] + ga_df['block group']
ga_df

Unnamed: 0,NAME,B19013_001E,B19013A_001M,B19013A_001MA,state,county,tract,block group,GEOID21
0,"Block Group 1, Census Tract 201, Autauga Count...",41607.0,,,01,001,020100,1,010010201001
1,"Block Group 2, Census Tract 201, Autauga Count...",66313.0,,,01,001,020100,2,010010201002
2,"Block Group 1, Census Tract 202, Autauga Count...",42288.0,,,01,001,020200,1,010010202001
3,"Block Group 2, Census Tract 202, Autauga Count...",52609.0,,,01,001,020200,2,010010202002
4,"Block Group 1, Census Tract 203, Autauga Count...",75074.0,,,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...,...,...
3920,"Block Group 4, Census Tract 9658, Winston Coun...",55375.0,,,01,133,965800,4,011339658004
3921,"Block Group 5, Census Tract 9658, Winston Coun...",-666666666.0,,,01,133,965800,5,011339658005
3922,"Block Group 1, Census Tract 9659, Winston Coun...",27232.0,,,01,133,965900,1,011339659001
3923,"Block Group 2, Census Tract 9659, Winston Coun...",35370.0,,,01,133,965900,2,011339659002


In [38]:
ga_df['B19013A_001M'].value_counts()

Series([], Name: count, dtype: int64)

In [39]:
ga_df['B19013A_001MA'].value_counts()

Series([], Name: count, dtype: int64)

In [40]:
ga_df

Unnamed: 0,NAME,B19013_001E,B19013A_001M,B19013A_001MA,state,county,tract,block group,GEOID21
0,"Block Group 1, Census Tract 201, Autauga Count...",41607.0,,,01,001,020100,1,010010201001
1,"Block Group 2, Census Tract 201, Autauga Count...",66313.0,,,01,001,020100,2,010010201002
2,"Block Group 1, Census Tract 202, Autauga Count...",42288.0,,,01,001,020200,1,010010202001
3,"Block Group 2, Census Tract 202, Autauga Count...",52609.0,,,01,001,020200,2,010010202002
4,"Block Group 1, Census Tract 203, Autauga Count...",75074.0,,,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...,...,...
3920,"Block Group 4, Census Tract 9658, Winston Coun...",55375.0,,,01,133,965800,4,011339658004
3921,"Block Group 5, Census Tract 9658, Winston Coun...",-666666666.0,,,01,133,965800,5,011339658005
3922,"Block Group 1, Census Tract 9659, Winston Coun...",27232.0,,,01,133,965900,1,011339659001
3923,"Block Group 2, Census Tract 9659, Winston Coun...",35370.0,,,01,133,965900,2,011339659002


In [41]:
ga_df = ga_df.drop(columns = ['NAME', 'state', 'county', 'tract', 'block group'])
ga_df

Unnamed: 0,B19013_001E,B19013A_001M,B19013A_001MA,GEOID21
0,41607.0,,,010010201001
1,66313.0,,,010010201002
2,42288.0,,,010010202001
3,52609.0,,,010010202002
4,75074.0,,,010010203001
...,...,...,...,...
3920,55375.0,,,011339658004
3921,-666666666.0,,,011339658005
3922,27232.0,,,011339659001
3923,35370.0,,,011339659002


In [42]:
ga_df[ga_df[VARIABLE_NAME+'_001E'] < 0].value_counts() # There are values less than zero?

Series([], Name: count, dtype: int64)

## Split the files by county, and save them as csv.xz inside the data repo

In [43]:
data_dir = '../../data/distribution/'+ VARIABLE_NAME
os.path.isdir(data_dir)

True

In [44]:
counties = sorted(ga_df['GEOID21'].str[:5].unique())

In [45]:
for county in tqdm(counties):
    pdf = ga_df[ga_df['GEOID21'].str[:5] == county]
    pdf.to_csv(os.path.join(data_dir, '{county}.csv.xz'.format(county=county)), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 352.93it/s]
