# Fetch a suite of Census data about Texas counties

#### Data source: Census Bureau QuickFacts pages, e.g. https://www.census.gov/quickfacts/fact/table/andrewscountytexas

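* Input: `texas_counties.csv` (read from the data.world project set in the Configuration section)
* Output: `census_data_by_county.csv` (written back to the same data.world project)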

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## Configuration

In [1]:
# data.world project key; used both to load the input dataset and to write the output file.
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
# Name of the dataframe inside that project whose 'county' column lists the counties to fetch.
COUNTY_INPUT_DATAFRAME_NAME = 'texas_counties'
# Name of the remote file that receives the final table.
OUTPUT_FILENAME = 'census_data_by_county.csv'

## Run

In [2]:
from bs4 import BeautifulSoup
import bs4
import datadotworld as dw
import numpy as np
import pandas as pd
import requests

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-05-08 17:08:27 CDT

datadotworld 1.6.0
numpy 1.14.3
pandas 0.22.0
watermark 1.6.0


In [3]:
print("Reading from data.world")
# NOTE(review): force_update=True presumably bypasses any locally cached copy — confirm in datadotworld docs.
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
county_frame = datasets.dataframes[COUNTY_INPUT_DATAFRAME_NAME]
# Alphabetical list of county names; this fixes the order in which pages are fetched.
COUNTY_NAMES = sorted(county_frame['county'])

Reading from data.world


In [4]:
def extract_metrics(soup):
    '''Given the BeautifulSoup for a census page, extract a metrics dictionary.

    The page is walked in three levels: every <caption> is a section, every
    'qf-header' row found under the caption's parent is a subsection, and every
    'fact' row found under that header's parent is one metric.

    Args:
        soup: parsed page (a BeautifulSoup object, or anything exposing the
            same find_all / find / parent / text / attrs / children API).

    Returns:
        dict mapping (section title, subsection title, metric name) tuples to
        a float. Cells whose 'data-isnumeric' attribute is not 1 are stored as
        np.nan.

    Raises:
        ValueError: if a cell flagged as numeric has an unparseable 'data-value'.
    '''
    metrics = {}
    for sec in soup.find_all('caption'):
        sec_title = sec.text.strip()
        for sub in sec.parent.find_all('tr', attrs={'class': 'qf-header'}):
            sub_title = sub.text.strip()
            for row in sub.parent.find_all('tr', attrs={'class': 'fact'}):
                cells = row.find_all('td')
                # Build the metric name from the label's direct text nodes only;
                # nested tags inside the <span> are skipped.
                metric = ' '.join(elt for elt in cells[0].find('span').children
                                  if isinstance(elt, bs4.element.NavigableString))
                key = (sec_title, sub_title, metric)
                value_attrs = cells[1].attrs
                if int(value_attrs['data-isnumeric']) != 1:
                    # Non-numeric cell: do not even try to parse 'data-value',
                    # since a placeholder there would make float() raise.
                    value = np.nan
                else:
                    value = float(value_attrs['data-value'])
                    assert not np.isnan(value), f'numeric cell for {key} parsed to NaN'
                metrics[key] = value
    return metrics

In [5]:
def fetch_one(location, is_county=True, return_soup=False, session=None, timeout=60):
    '''Fetch and parse the census page for a given Texas county (or other location).

    Args:
        location: place name; it is lower-cased and stripped of spaces to build the URL.
        is_county: when True, 'countytexas' is appended to the URL slug.
        return_soup: when True, the parsed page is returned alongside the metrics.
        session: optional object with a requests-style .get(); when omitted, the
            notebook-level session `s` is used.
        timeout: seconds passed to .get() so that a stalled connection raises
            instead of hanging the notebook.

    Returns:
        The dict produced by extract_metrics(), or (soup, dict) if return_soup is True.

    Raises:
        requests.HTTPError: for an error status code (via raise_for_status()).
        Exception: if no non-empty 'qf-geobox' title is found on the page, or
            the title is 'UNITED STATES'.
    '''
    url = 'https://www.census.gov/quickfacts/fact/table/' + location.lower().replace(' ', '')
    if is_county:
        url = url + 'countytexas'
    http = s if session is None else session
    res = http.get(url, timeout=timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, "lxml")
    # Title = text of the first non-empty geobox. It stays None when there is
    # none, so the check below raises a clear error rather than an
    # UnboundLocalError.
    title = None
    for elt in soup.find_all('div', attrs={'class': 'qf-geobox'}):
        text = elt.text.strip()
        if text:
            title = text.upper()
            break
    # NOTE(review): a 'UNITED STATES' title is treated as a miss — presumably the
    # site falls back to the national page for an unknown location; confirm.
    if title is None or title == 'UNITED STATES':
        raise Exception("Could not find data for %s" % (location))
    metrics = extract_metrics(soup)
    if return_soup:
        return soup, metrics
    return metrics

In [6]:
# Accumulates county name -> metrics dict. It lives in its own cell so that re-running
# the fetch loop below skips counties that were already fetched instead of starting over.
county_metrics = {}

In [7]:
s = requests.Session()  # shared HTTP session; fetch_one() refers to it by this name
failures = []

print(f"Fetching census data for {len(COUNTY_NAMES)} counties\n")

for position, county in enumerate(COUNTY_NAMES, start=1):
    # Counties already in county_metrics were fetched by an earlier run of this
    # cell, which makes it safe to interrupt and restart.
    if county not in county_metrics:
        print(f"[{position}]{county}", end=' ')
        fetched = fetch_one(county)
        if fetched:
            county_metrics[county] = fetched
        else:
            # An empty result means the page yielded no metrics at all.
            failures.append(county)

print("\n***** %d failure(s):" % len(failures), failures)

Fetching census data for 254 counties

[1]ANDERSON [2]ANDREWS [3]ANGELINA [4]ARANSAS [5]ARCHER [6]ARMSTRONG [7]ATASCOSA [8]AUSTIN [9]BAILEY [10]BANDERA [11]BASTROP [12]BAYLOR [13]BEE [14]BELL [15]BEXAR [16]BLANCO [17]BORDEN [18]BOSQUE [19]BOWIE [20]BRAZORIA [21]BRAZOS [22]BREWSTER [23]BRISCOE [24]BROOKS [25]BROWN [26]BURLESON [27]BURNET [28]CALDWELL [29]CALHOUN [30]CALLAHAN [31]CAMERON [32]CAMP [33]CARSON [34]CASS [35]CASTRO [36]CHAMBERS [37]CHEROKEE [38]CHILDRESS [39]CLAY [40]COCHRAN [41]COKE [42]COLEMAN [43]COLLIN [44]COLLINGSWORTH [45]COLORADO [46]COMAL [47]COMANCHE [48]CONCHO [49]COOKE [50]CORYELL [51]COTTLE [52]CRANE [53]CROCKETT [54]CROSBY [55]CULBERSON [56]DALLAM [57]DALLAS [58]DAWSON [59]DEAF SMITH [60]DELTA [61]DENTON [62]DEWITT [63]DICKENS [64]DIMMIT [65]DONLEY [66]DUVAL [67]EASTLAND [68]ECTOR [69]EDWARDS [70]EL PASO [71]ELLIS [72]ERATH [73]FALLS [74]FANNIN [75]FAYETTE [76]FISHER [77]FLOYD [78]FOARD [79]FORT BEND [80]FRANKLIN [81]FREESTONE [82]FRIO [83]GAINES [84]GALVESTON [8

In [8]:
print("Fetching data for the full state of Texas")
texas = fetch_one('TX', is_county=False)
if not texas:
    raise Exception('Could not get Texas state-wide data')

# The state-level data has some extra measurements, which we'll drop. The reference
# is every metric seen in *any* county (not one hard-coded county), so this cell does
# not depend on a particular county being present or reporting every metric.
county_keys = {
    key
    for county, county_values in county_metrics.items()
    if county != '_ALL_TEXAS'  # ignore the state entry left by an earlier run of this cell
    for key in county_values
}
if not county_keys:
    raise Exception('No county metrics available to compare the Texas data against')
texas = {key: value for key, value in texas.items() if key in county_keys}

county_metrics['_ALL_TEXAS'] = texas

Fetching data for the full state of Texas


In [9]:
# One column per entry of county_metrics; the (section, subsection, metric) tuple keys
# form the row index, which is then sorted.
df = pd.DataFrame.from_records(county_metrics)
df = df.sort_index()

#### The next two cells just unpack the multi-level index into three distinct columns (with sensible names)

In [10]:
def insert_col_front(df, s, name):
    '''Return a copy of `df` with `s` added as a new first column called `name`.

    The input frame is not modified.

    Args:
        df: source DataFrame.
        s: column values; anything accepted by column assignment
            (Index, list, Series, ...).
        name: label of the new column; must not already be a column of `df`.

    Returns:
        A new DataFrame whose columns are [name] followed by the original columns.

    Raises:
        ValueError: if `name` is already a column of `df` (raised by DataFrame.insert).
    '''
    result = df.copy()
    result.insert(0, name, s)
    return result

In [11]:
# Copy each index level into a regular column. The deepest level goes first so
# that, after all three insertions, the category ends up as the left-most column.
for level, column_name in [(2, 'metric_description'),
                           (1, 'metric_subcategory'),
                           (0, 'metric_category')]:
    df = insert_col_front(df, df.index.get_level_values(level), column_name)
# The index values now live in columns, so swap the index for a plain 0..n-1 one.
df = df.reset_index(drop=True)
df.columns = [label.upper() for label in df.columns]

In [12]:
# Preview the first rows of the reshaped table (rendered as the cell's output).
df.head()

Unnamed: 0,METRIC_CATEGORY,METRIC_SUBCATEGORY,METRIC_DESCRIPTION,ANDERSON,ANDREWS,ANGELINA,ARANSAS,ARCHER,ARMSTRONG,ATASCOSA,...,WILLIAMSON,WILSON,WINKLER,WISE,WOOD,YOAKUM,YOUNG,ZAPATA,ZAVALA,_ALL_TEXAS
0,Businesses,Businesses,"All firms, 2012",3107.0,1324.0,6330.0,2712.0,986.0,62.0,4106.0,...,38356.0,3893.0,689.0,4846.0,3800.0,637.0,2635.0,1964.0,1232.0,2356748.0
1,Businesses,Businesses,"Men-owned firms, 2012",1602.0,695.0,3487.0,1510.0,528.0,27.0,2197.0,...,20610.0,2444.0,307.0,2923.0,1931.0,393.0,1414.0,1003.0,674.0,1251696.0
2,Businesses,Businesses,"Minority-owned firms, 2012",722.0,469.0,1358.0,778.0,151.0,,1950.0,...,10020.0,1265.0,252.0,668.0,357.0,243.0,123.0,1680.0,1062.0,1070392.0
3,Businesses,Businesses,"Nonminority-owned firms, 2012",2209.0,726.0,4707.0,1835.0,817.0,57.0,2029.0,...,27023.0,2560.0,387.0,4019.0,3315.0,338.0,2349.0,235.0,159.0,1224845.0
4,Businesses,Businesses,"Nonveteran-owned firms, 2012",2471.0,1075.0,5020.0,2281.0,781.0,57.0,3591.0,...,32883.0,3202.0,601.0,4045.0,3081.0,518.0,2054.0,1738.0,1178.0,2057218.0


## Write

In [13]:
print("Writing to data.world")
# Stream the CSV straight into the remote file; the row index is omitted from the output.
with dw.open_remote_file(DTW_PROJECT_KEY, OUTPUT_FILENAME) as remote_file:
    df.to_csv(remote_file, index=False)

Writing to data.world


In [14]:
# Visible end-of-notebook marker.
print("done")

done
