In [1]:
import pandas as pd
import geopandas as gpd
from census import Census
from us import states
import pathlib
import os
from tqdm import tqdm

In [2]:
# Load settings from the local `.env` file so that secrets (the Census API key
# is looked up from `config` when the client is created) stay out of the notebook.
# NOTE(review): ".env" is a relative path, so it resolves against the kernel's
# working directory — confirm the notebook is launched from the folder holding it.
from decouple import Config, RepositoryEnv
config = Config(RepositoryEnv(".env"))

In [3]:
# Data year handed to the Census client (`year=YEAR`).
# NOTE(review): the variable-description URL and the `GEOID21` column name used
# further down also hardcode 2021 — keep them in sync if this value changes.
YEAR = 2021

In [4]:
# Census API client: authenticated with the CENSUS_API_KEY entry read through
# `config`, and pinned to the data year chosen in YEAR.
c = Census(config("CENSUS_API_KEY"), year=YEAR)

### Look for variable descriptions here: https://api.census.gov/data/2021/acs/acs5/variables.html

In [6]:
# Query the ACS 5-year endpoint for NAME and B19013_001E at block-group level.
# The state is fixed to Georgia; county, tract and block group are all "*",
# so the response covers every block group in the state.
ga_census = c.acs5.state_county_blockgroup(
    fields=('NAME', 'B19013_001E'),
    state_fips=states.GA.fips,
    county_fips="*",
    tract="*",
    blockgroup='*',
)

# Turn the API response into a DataFrame and show it
ga_df = pd.DataFrame(ga_census)
ga_df

Unnamed: 0,NAME,B19013_001E,state,county,tract,block group
0,"Block Group 1, Census Tract 9501, Appling Coun...",29123.0,13,001,950100,1
1,"Block Group 2, Census Tract 9501, Appling Coun...",99643.0,13,001,950100,2
2,"Block Group 3, Census Tract 9501, Appling Coun...",68224.0,13,001,950100,3
3,"Block Group 1, Census Tract 9502.01, Appling C...",45243.0,13,001,950201,1
4,"Block Group 2, Census Tract 9502.01, Appling C...",38125.0,13,001,950201,2
...,...,...,...,...,...,...
7441,"Block Group 2, Census Tract 9505, Worth County...",26147.0,13,321,950500,2
7442,"Block Group 3, Census Tract 9505, Worth County...",53462.0,13,321,950500,3
7443,"Block Group 4, Census Tract 9505, Worth County...",65606.0,13,321,950500,4
7444,"Block Group 1, Census Tract 9506, Worth County...",84602.0,13,321,950600,1


In [7]:
# Build the block-group identifier by joining the geography code columns in
# order: state + county + tract + block group,
# e.g. '13' + '001' + '950100' + '1' -> '130019501001'.
ga_df['GEOID21'] = ga_df['state'].str.cat(
    [ga_df['county'], ga_df['tract'], ga_df['block group']]
)
ga_df

Unnamed: 0,NAME,B19013_001E,state,county,tract,block group,GEOID21
0,"Block Group 1, Census Tract 9501, Appling Coun...",29123.0,13,001,950100,1,130019501001
1,"Block Group 2, Census Tract 9501, Appling Coun...",99643.0,13,001,950100,2,130019501002
2,"Block Group 3, Census Tract 9501, Appling Coun...",68224.0,13,001,950100,3,130019501003
3,"Block Group 1, Census Tract 9502.01, Appling C...",45243.0,13,001,950201,1,130019502011
4,"Block Group 2, Census Tract 9502.01, Appling C...",38125.0,13,001,950201,2,130019502012
...,...,...,...,...,...,...,...
7441,"Block Group 2, Census Tract 9505, Worth County...",26147.0,13,321,950500,2,133219505002
7442,"Block Group 3, Census Tract 9505, Worth County...",53462.0,13,321,950500,3,133219505003
7443,"Block Group 4, Census Tract 9505, Worth County...",65606.0,13,321,950500,4,133219505004
7444,"Block Group 1, Census Tract 9506, Worth County...",84602.0,13,321,950600,1,133219506001


In [8]:
# Display the frame again; nothing has changed since the previous cell, so the
# output is the same table including the new GEOID21 column.
ga_df

Unnamed: 0,NAME,B19013_001E,state,county,tract,block group,GEOID21
0,"Block Group 1, Census Tract 9501, Appling Coun...",29123.0,13,001,950100,1,130019501001
1,"Block Group 2, Census Tract 9501, Appling Coun...",99643.0,13,001,950100,2,130019501002
2,"Block Group 3, Census Tract 9501, Appling Coun...",68224.0,13,001,950100,3,130019501003
3,"Block Group 1, Census Tract 9502.01, Appling C...",45243.0,13,001,950201,1,130019502011
4,"Block Group 2, Census Tract 9502.01, Appling C...",38125.0,13,001,950201,2,130019502012
...,...,...,...,...,...,...,...
7441,"Block Group 2, Census Tract 9505, Worth County...",26147.0,13,321,950500,2,133219505002
7442,"Block Group 3, Census Tract 9505, Worth County...",53462.0,13,321,950500,3,133219505003
7443,"Block Group 4, Census Tract 9505, Worth County...",65606.0,13,321,950500,4,133219505004
7444,"Block Group 1, Census Tract 9506, Worth County...",84602.0,13,321,950600,1,133219506001


In [9]:
# Keep only the income estimate and the block-group identifier: the separate
# geography code columns are redundant once GEOID21 exists, and NAME is not
# needed downstream.
# errors='ignore' keeps this cell re-runnable. `ga_df` is overwritten here, so
# without it a second execution would raise KeyError on the columns that were
# already removed the first time.
ga_df = ga_df.drop(
    columns=['NAME', 'state', 'county', 'tract', 'block group'],
    errors='ignore',
)
ga_df

Unnamed: 0,B19013_001E,GEOID21
0,29123.0,130019501001
1,99643.0,130019501002
2,68224.0,130019501003
3,45243.0,130019502011
4,38125.0,130019502012
...,...,...
7441,26147.0,133219505002
7442,53462.0,133219505003
7443,65606.0,133219505004
7444,84602.0,133219506001


In [11]:
# Sanity check: list the rows whose B19013_001E estimate is below zero.
# The rows shown in the output all carry -666666666.0, which looks like a
# placeholder for "estimate not available" rather than a real value.
# NOTE(review): confirm against the Census notes on ACS estimate/annotation
# values, and consider masking these rows before the data is exported.
ga_df.loc[ga_df['B19013_001E'].lt(0)].value_counts()

B19013_001E   GEOID21     
-666666666.0  130019502022    1
              131350506173    1
              131350506231    1
              131350506251    1
              131350506314    1
                             ..
              130890235063    1
              130890237012    1
              130890237021    1
              130890238014    1
              133219504003    1
Name: count, Length: 669, dtype: int64

## Split the data by county and save each county as a `csv.xz` file inside the data repo

In [None]:
# Folder that receives the per-county files; the path is relative, so it is
# resolved against the kernel's working directory.
data_dir = '../../data/distribution'

# Evaluates to True only when that folder already exists
pathlib.Path(data_dir).is_dir()

In [None]:
# The first five characters of GEOID21 are the state code followed by the
# county code, so the distinct prefixes enumerate the counties in the data.
counties = ga_df['GEOID21'].str.slice(stop=5).unique().tolist()
counties.sort()

In [None]:
for county in tqdm(counties):
    pdf = ga_df[ga_df['GEOID20'].str[:5] == county]
    pdf.to_csv(os.path.join(data_dir, '{county}.csv.xz'.format(county=county), inde