# Geocode NFIRS datapoints using Census Geocoder API

It took roughly 30 minutes to geocode 10,000 rows, which would mean it would take on the order of 100 hours to geocode the whole dataset, so I decided not to bother with geocoding the whole thing at this point. The code to do it is here though.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
import requests

%matplotlib inline
pd.set_option('display.max_columns',500)
sns.set()

In [2]:
def top_value_counts(df, n=5, only_categories = True, cols_to_include = None, cols_to_exclude = None):
    """ Function to generate summary information for string or categorical
    data in dataframes"""
    if cols_to_include:
        df = df[cols_to_include]
    if cols_to_exclude:
        df = df[df.columns[~df.columns.isin(cols_to_exclude)]]
    if only_categories:
        df = df.select_dtypes(include=['O','category'])
    cols = df.columns
    df_value_counts = pd.DataFrame()
    i_name = -1
    for col in cols:
        i_name += 1
        counts = df[col].value_counts(dropna=False)[:n]
        top_n_names = list(counts.index)
        top_n = list(counts)
        if len(top_n) < n+1:
            for i in range(n-len(top_n)):
                top_n.append('-')
                top_n_names.append('-')
        top_n_names.insert(0,'n_unique')
        top_n.insert(0,df[col].nunique())
        df_value_counts[col] = top_n_names
        df_value_counts[i_name] = top_n
    return(df_value_counts)

In [43]:
import os

In [47]:
os.chdir('../data/02_interim/10000row_chunks')
os.getcwd()

'C:\\Users\\thwhi\\Dropbox\\Personal\\Programming\\DataKind Red Cross Project\\data\\02_interim\\10000row_chunks'

In [69]:
def get_chunk_data(filename,output_filename):
    url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
    payload = {'benchmark':'Public_AR_Current','vintage':'Current_Current'}
    files = {'addressFile':(filename,open(filename,'rb'), 'text/csv')}
    response = requests.post(url, files=files, data=payload)
    
    cols = ['rcp2_id','original_address','match','match_type','matched_address','latlon','tigerline','tigerside',
            'geo_state_id','geo_county_id','geo_census_tract_id','geo_census_block_id']
    results = pd.read_csv(StringIO(response.text),names=cols,dtype=str)
    
    results['geo_id'] = results['geo_state_id'] + results['geo_county_id'] + results['geo_census_tract_id'] + results['geo_census_block_id']
    results[['lon','lat']] = results['latlon'].str.split(',',expand=True)
    results.to_csv(output_filename,index=False)
    return

In [84]:
# Iterate through all the 10,000 row address files and return a csv with the associated geo_ids
# This took ~ 30 minutes per file with 10,000 rows, and since there are 196 rows, this will take on the order of 100 hours to complete

FLAG = False

if Flag:
    for filename in os.listdir():
        if filename.endswith('.csv'):
            print('Working on: ', filename)
            output_filename = '../10000row_geoid_output/output_' + filename
            get_chunk_data(filename,output_filename)

Working on:  nfirs_addresses0.csv
Working on:  nfirs_addresses1.csv
Working on:  nfirs_addresses10.csv
Working on:  nfirs_addresses100.csv
Working on:  nfirs_addresses101.csv
Working on:  nfirs_addresses102.csv
Working on:  nfirs_addresses103.csv
Working on:  nfirs_addresses104.csv
Working on:  nfirs_addresses105.csv
Working on:  nfirs_addresses106.csv
Working on:  nfirs_addresses107.csv
Working on:  nfirs_addresses108.csv
Working on:  nfirs_addresses109.csv
Working on:  nfirs_addresses11.csv
Working on:  nfirs_addresses110.csv
Working on:  nfirs_addresses111.csv
Working on:  nfirs_addresses112.csv
Working on:  nfirs_addresses113.csv
Working on:  nfirs_addresses114.csv
Working on:  nfirs_addresses115.csv
Working on:  nfirs_addresses116.csv
Working on:  nfirs_addresses117.csv
Working on:  nfirs_addresses118.csv
Working on:  nfirs_addresses119.csv
Working on:  nfirs_addresses12.csv
Working on:  nfirs_addresses120.csv
Working on:  nfirs_addresses121.csv
Working on:  nfirs_addresses122.csv

# Old Testing Code

In [51]:
url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
payload = {'benchmark':'Public_AR_Current','vintage':'Current_Current'}
files = {'addressFile':('nfirs_addresses0.csv',open('nfirs_addresses0.csv','rb'), 'text/csv')}
response = requests.post(url, files=files, data=payload)

In [58]:
cols = ['rcp2_id','original_address','match','match_type','matched_address','latlon','tigerline','tigerside',
        'geo_state_id','geo_county_id','geo_census_tract_id','geo_census_block_id']
results = pd.read_csv(StringIO(response.text),names=cols,dtype=str)

In [59]:
results.head()

Unnamed: 0,rcp2_id,original_address,match,match_type,matched_address,latlon,tigerline,tigerside,geo_state_id,geo_county_id,geo_census_tract_id,geo_census_block_id
0,rcp2_id5196,"London Springs Rd., Pyatt, AR, 72672",No_Match,,,,,,,,,
1,rcp2_id5197,"358 Cambridge Lane, Pyatt, AR, 72672",No_Match,,,,,,,,,
2,rcp2_id5198,"322 Loblolly Trail, Pyatt, AR, 72672",No_Match,,,,,,,,,
3,rcp2_id5199,"761 mc 6040, Yellville, AR, 72687",No_Match,,,,,,,,,
4,rcp2_id645,"1701 2ND AVE, FAIRBANKS, AK, 99701",Match,Exact,"1701 2ND AVE, FAIRBANKS, AK, 99701","-147.74654,64.84433",190893232.0,L,2.0,90.0,200.0,2004.0


In [61]:
results['geo_id'] = results['geo_state_id'] + results['geo_county_id'] + results['geo_census_tract_id'] + results['geo_census_block_id']

In [62]:
results.head()

Unnamed: 0,rcp2_id,original_address,match,match_type,matched_address,latlon,tigerline,tigerside,geo_state_id,geo_county_id,geo_census_tract_id,geo_census_block_id,geo_id
0,rcp2_id5196,"London Springs Rd., Pyatt, AR, 72672",No_Match,,,,,,,,,,
1,rcp2_id5197,"358 Cambridge Lane, Pyatt, AR, 72672",No_Match,,,,,,,,,,
2,rcp2_id5198,"322 Loblolly Trail, Pyatt, AR, 72672",No_Match,,,,,,,,,,
3,rcp2_id5199,"761 mc 6040, Yellville, AR, 72687",No_Match,,,,,,,,,,
4,rcp2_id645,"1701 2ND AVE, FAIRBANKS, AK, 99701",Match,Exact,"1701 2ND AVE, FAIRBANKS, AK, 99701","-147.74654,64.84433",190893232.0,L,2.0,90.0,200.0,2004.0,20900002002004.0


In [66]:
results[['lon','lat']] = results['latlon'].str.split(',',expand=True)

In [67]:
results.head()

Unnamed: 0,rcp2_id,original_address,match,match_type,matched_address,latlon,tigerline,tigerside,geo_state_id,geo_county_id,geo_census_tract_id,geo_census_block_id,geo_id,lat,lon
0,rcp2_id5196,"London Springs Rd., Pyatt, AR, 72672",No_Match,,,,,,,,,,,,
1,rcp2_id5197,"358 Cambridge Lane, Pyatt, AR, 72672",No_Match,,,,,,,,,,,,
2,rcp2_id5198,"322 Loblolly Trail, Pyatt, AR, 72672",No_Match,,,,,,,,,,,,
3,rcp2_id5199,"761 mc 6040, Yellville, AR, 72687",No_Match,,,,,,,,,,,,
4,rcp2_id645,"1701 2ND AVE, FAIRBANKS, AK, 99701",Match,Exact,"1701 2ND AVE, FAIRBANKS, AK, 99701","-147.74654,64.84433",190893232.0,L,2.0,90.0,200.0,2004.0,20900002002004.0,64.84433,-147.74654


In [68]:
results.to_csv('nfirs_addresses_result0.csv',index=False)

In [4]:
blah = get_json('https://geocoding.geo.census.gov/geocoder/locations/onelineaddress?address=4600+Silver+Hill+Rd%2C+Suitland%2C+MD+20746&benchmark=9&format=json')

In [5]:
blah['result']['addressMatches'][0]['coordinates']

{'x': -76.92691, 'y': 38.846542}

In [28]:
parameters = {'returntype':'geographies','searchtype':'onelineaddress','benchmark':'Public_AR_Current',
              'address':'7423 chummley ct, falls church, VA 22043','format':'json',
             'vintage':'Current_Current'}

In [7]:
response = requests.get('https://geocoding.geo.census.gov/geocoder/locations/onelineaddress?',params=parameters)

In [29]:
response = requests.get('https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress?',params=parameters)
print(response.status_code)
response.json()

200


{'result': {'input': {'benchmark': {'id': '4',
    'benchmarkName': 'Public_AR_Current',
    'benchmarkDescription': 'Public Address Ranges - Current Benchmark',
    'isDefault': False},
   'vintage': {'id': '4',
    'vintageName': 'Current_Current',
    'vintageDescription': 'Current Vintage - Current Benchmark',
    'isDefault': True},
   'address': {'address': '7423 chummley ct, falls church, VA 22043'}},
  'addressMatches': [{'matchedAddress': '7423 CHUMMLEY CT, FALLS CHURCH, VA, 22043',
    'coordinates': {'x': -77.200806, 'y': 38.89857},
    'tigerLine': {'tigerLineId': '75978166', 'side': 'L'},
    'addressComponents': {'fromAddress': '7401',
     'toAddress': '7423',
     'preQualifier': '',
     'preDirection': '',
     'preType': '',
     'streetName': 'CHUMMLEY',
     'suffixType': 'CT',
     'suffixDirection': '',
     'suffixQualifier': '',
     'city': 'FALLS CHURCH',
     'state': 'VA',
     'zip': '22043'},
    'geographies': {'2010 Census Blocks': [{'SUFFIX': '',
     

In [26]:
data = response.json()['result']['addressMatches'][0]
data.keys()

dict_keys(['matchedAddress', 'coordinates', 'tigerLine', 'addressComponents', 'geographies'])

In [27]:
geo = data['geographies']
geo

{'Unified School Districts': [{'LOGRADE': 'PK',
   'GEOID': '5101260',
   'CENTLAT': '+38.8344842',
   'AREAWATER': 40107717,
   'STATE': '51',
   'BASENAME': 'Fairfax County Public Schools',
   'OID': 28490241127450,
   'SDTYP': '',
   'LSADC': '00',
   'FUNCSTAT': 'E',
   'INTPTLAT': '+38.8295203',
   'SDUNI': '01260',
   'NAME': 'Fairfax County Public Schools',
   'OBJECTID': 2695,
   'CENTLON': '-077.2761104',
   'HIGRADE': '12',
   'AREALAND': 1012703532,
   'INTPTLON': '-077.2732524',
   'MTFCC': 'G5420'}]}

In [24]:
len(data)

1

In [9]:
response.json()

{'result': {'input': {'benchmark': {'id': '4',
    'benchmarkName': 'Public_AR_Current',
    'benchmarkDescription': 'Public Address Ranges - Current Benchmark',
    'isDefault': False},
   'address': {'address': '7423 chummley ct, falls church, VA 22043'}},
  'addressMatches': [{'matchedAddress': '7423 CHUMMLEY CT, FALLS CHURCH, VA, 22043',
    'coordinates': {'x': -77.200806, 'y': 38.89857},
    'tigerLine': {'tigerLineId': '75978166', 'side': 'L'},
    'addressComponents': {'fromAddress': '7401',
     'toAddress': '7423',
     'preQualifier': '',
     'preDirection': '',
     'preType': '',
     'streetName': 'CHUMMLEY',
     'suffixType': 'CT',
     'suffixDirection': '',
     'suffixQualifier': '',
     'city': 'FALLS CHURCH',
     'state': 'VA',
     'zip': '22043'}}]}}

In [10]:
blah3 = requests.get('https://geocoding.geo.census.gov/geocoder/geographies/address?street=4600+Silver+Hill+Rd&city=Suitland&state=MD&benchmark=Public_AR_Census2010&vintage=Census2010_Census2010&layers=14&format=json')


In [11]:
blah3.json()

{'result': {'input': {'benchmark': {'id': '9',
    'benchmarkName': 'Public_AR_Census2010',
    'benchmarkDescription': 'Public Address Ranges - Census 2010 Benchmark',
    'isDefault': False},
   'vintage': {'id': '910',
    'vintageName': 'Census2010_Census2010',
    'vintageDescription': 'Census2010 Vintage - Census2010 Benchmark',
    'isDefault': True},
   'address': {'street': '4600 Silver Hill Rd',
    'city': 'Suitland',
    'state': 'MD'}},
  'addressMatches': [{'matchedAddress': '4600 Silver Hill Rd, SUITLAND, MD, 20746',
    'coordinates': {'x': -76.92691, 'y': 38.846542},
    'tigerLine': {'tigerLineId': '613199520', 'side': 'L'},
    'addressComponents': {'fromAddress': '4600',
     'toAddress': '4712',
     'preQualifier': '',
     'preDirection': '',
     'preType': '',
     'streetName': 'Silver Hill',
     'suffixType': 'Rd',
     'suffixDirection': '',
     'suffixQualifier': '',
     'city': 'SUITLAND',
     'state': 'MD',
     'zip': '20746'},
    'geographies': {'C