In [2]:
import requests
from bs4 import BeautifulSoup as bs
import re
from zipfile import ZipFile
import os
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import numpy as np


In [71]:
# Listing page that the archive links are scraped from. Keep the trailing
# slash: each link's href is appended directly to this string to build the
# download URL.
census_url = "https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/"

In [72]:
def get_zip_links(url):
    """Return the download URLs of the tract archives listed at ``url``.

    The page at ``url`` is fetched and parsed, and every anchor whose ``href``
    contains ``tl_2010_<two digits>_`` is kept.

    Parameters
    ----------
    url : str
        Address of the listing page. Each matching ``href`` is appended to it
        verbatim, so it should end with a ``/``.

    Returns
    -------
    list of str
        ``url + href`` for every matching anchor, in page order.

    Raises
    ------
    requests.HTTPError
        If the listing page responds with a 4xx/5xx status. Without this check
        an error page would be parsed and silently produce an empty list.
    """
    r = requests.get(url)
    r.raise_for_status()

    soup = bs(r.content, 'html5lib')

    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    pattern = re.compile(r'tl_2010_\d{2}_')

    zip_links = []
    for link in soup.find_all('a'):
        # Anchors without an href are skipped explicitly rather than through a
        # bare ``except`` that would also hide unrelated errors.
        href = link.get('href')
        if href and pattern.search(href):
            zip_links.append(url + href)

    return zip_links
 

In [75]:
def download_zip_files(links, dest_dir='data/zipfiles/'):
    """Stream every URL in ``links`` to a file inside ``dest_dir``.

    Parameters
    ----------
    links : iterable of str
        URLs to download. The text after the last ``/`` is used as the local
        file name.
    dest_dir : str, optional
        Directory the files are written to. It is created if it does not
        exist. Defaults to the location the notebook has always used.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If any URL responds with a 4xx/5xx status. The check runs before the
        local file is opened, so an error body is never saved as an archive.
    """
    os.makedirs(dest_dir, exist_ok=True)

    for link in links:
        file_name = link.split('/')[-1]
        target = os.path.join(dest_dir, file_name)

        # The context manager releases the streamed connection even when the
        # download is interrupted part-way through.
        with requests.get(link, stream=True) as r:
            r.raise_for_status()

            with open(target, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        print('Downloaded file:%s'%file_name)

    print("All files downloaded!")
    return

In [76]:
# Scrape the listing page once and keep the matching archive URLs for the
# download step.
zip_links = get_zip_links(census_url)

In [77]:
# Fetch every archive in zip_links; one "Downloaded file:" line is printed per
# file as it completes.
download_zip_files(zip_links)

Downloaded file:tl_2010_01_tract10.zip
Downloaded file:tl_2010_02_tract10.zip
Downloaded file:tl_2010_04_tract10.zip
Downloaded file:tl_2010_05_tract10.zip
Downloaded file:tl_2010_06_tract10.zip
Downloaded file:tl_2010_08_tract10.zip
Downloaded file:tl_2010_09_tract10.zip
Downloaded file:tl_2010_10_tract10.zip
Downloaded file:tl_2010_11_tract10.zip
Downloaded file:tl_2010_12_tract10.zip
Downloaded file:tl_2010_13_tract10.zip
Downloaded file:tl_2010_15_tract10.zip
Downloaded file:tl_2010_16_tract10.zip
Downloaded file:tl_2010_17_tract10.zip
Downloaded file:tl_2010_18_tract10.zip
Downloaded file:tl_2010_19_tract10.zip
Downloaded file:tl_2010_20_tract10.zip
Downloaded file:tl_2010_21_tract10.zip
Downloaded file:tl_2010_22_tract10.zip
Downloaded file:tl_2010_23_tract10.zip
Downloaded file:tl_2010_24_tract10.zip
Downloaded file:tl_2010_25_tract10.zip
Downloaded file:tl_2010_26_tract10.zip
Downloaded file:tl_2010_27_tract10.zip
Downloaded file:tl_2010_28_tract10.zip
Downloaded file:tl_2010_2

In [80]:
# Extract Zip Files
# Every member of every archive is unpacked into ./data/shapefiles. Nothing is
# filtered by extension, so all files shipped next to each .shp are extracted
# as well.
files = sorted(os.listdir("./data/zipfiles"))  # sorted -> stable processing order

for file in files:
    # Require the real ".zip" extension; a bare 'zip' suffix would also match
    # names such as "notazip".
    if not file.endswith('.zip'):
        continue

    print(file)

    with ZipFile('./data/zipfiles/' + file, 'r') as zipObj:
        # extractall() unpacks each member in namelist(), which is what the
        # hand-written per-member loop did.
        zipObj.extractall('./data/shapefiles')

tl_2010_01_tract10.zip
tl_2010_02_tract10.zip
tl_2010_04_tract10.zip
tl_2010_05_tract10.zip
tl_2010_06_tract10.zip
tl_2010_08_tract10.zip
tl_2010_09_tract10.zip
tl_2010_10_tract10.zip
tl_2010_11_tract10.zip
tl_2010_12_tract10.zip
tl_2010_13_tract10.zip
tl_2010_15_tract10.zip
tl_2010_16_tract10.zip
tl_2010_17_tract10.zip
tl_2010_18_tract10.zip
tl_2010_19_tract10.zip
tl_2010_20_tract10.zip
tl_2010_21_tract10.zip
tl_2010_22_tract10.zip
tl_2010_23_tract10.zip
tl_2010_24_tract10.zip
tl_2010_25_tract10.zip
tl_2010_26_tract10.zip
tl_2010_27_tract10.zip
tl_2010_28_tract10.zip
tl_2010_29_tract10.zip
tl_2010_30_tract10.zip
tl_2010_31_tract10.zip
tl_2010_32_tract10.zip
tl_2010_33_tract10.zip
tl_2010_34_tract10.zip
tl_2010_35_tract10.zip
tl_2010_36_tract10.zip
tl_2010_37_tract10.zip
tl_2010_38_tract10.zip
tl_2010_39_tract10.zip
tl_2010_40_tract10.zip
tl_2010_41_tract10.zip
tl_2010_42_tract10.zip
tl_2010_44_tract10.zip
tl_2010_45_tract10.zip
tl_2010_46_tract10.zip
tl_2010_47_tract10.zip
tl_2010_48_

In [3]:
# Merge shapefiles into single GeoPandas DataFrame
# https://stackoverflow.com/questions/48874113/concat-multiple-shapefiles-via-geopandas

# Sort the listing so the row order of the merged frame does not depend on the
# order the filesystem happens to return entries in.
file = sorted(os.listdir('./data/shapefiles'))
path = [os.path.join('./data/shapefiles', i) for i in file if i.endswith('.shp')]

if not path:
    raise FileNotFoundError("No .shp files found in ./data/shapefiles")

# Read each shapefile exactly once. The CRS is taken from the first frame
# instead of reading path[0] from disk a second time.
frames = [gpd.read_file(i) for i in path]

gdf = gpd.GeoDataFrame(pd.concat(frames, ignore_index = True), crs = frames[0].crs)

In [24]:
# Keep only US states
# Rows whose STATEFP10 is one of the codes below are removed; every other row
# is kept unchanged.
# NOTE(review): the codes look like the FIPS codes for AS, GU, MP, PR and VI - confirm.
is_excluded = gdf['STATEFP10'].isin(['60','66','69','72','78'])
gdf = gdf.loc[~is_excluded]

In [25]:
# Report how much memory each column of the merged frame occupies, in MiB.
# deep=True asks pandas to measure the contents of object columns too.
#
# Saving the merged frame as a shapefile is left disabled:
#gdf.to_file('./data/USA_gdf.shp')
bytes_per_column = gdf.memory_usage(deep=True)
bytes_per_column / (1024 * 1024)


Index         0.557381
STATEFP10     4.110682
COUNTYFP10    4.180355
TRACTCE10     4.389373
GEOID10       4.737736
NAME10        4.302261
NAMELSAD10    5.198243
MTFCC10       4.319700
FUNCSTAT10    4.319700
ALAND10       0.557381
AWATER10      0.557381
INTPTLAT10    4.737736
INTPTLON10    4.807408
geometry      0.557381
dtype: float64

In [26]:
# Take an explicit copy of the identifier and internal-point columns. New
# columns are added to this frame afterwards, and assigning into a bare column
# subset of gdf can raise SettingWithCopyWarning.
us_tracts = gdf[['STATEFP10', 'COUNTYFP10', 'GEOID10', 'INTPTLAT10', 'INTPTLON10']].copy()

In [27]:
# Convert to numerics
# The numeric casts already understand leading zeros and an explicit "+" or "-"
# sign, so the strings are converted as they are. The previous version stripped
# "-" from the longitude and multiplied by -1, which flipped the sign of any
# positive (eastern-hemisphere) longitude.
# assign() builds a new frame, so nothing is written into a slice of gdf.
us_tracts = us_tracts.assign(
    tract=us_tracts['GEOID10'].astype(np.int64),
    lat=us_tracts['INTPTLAT10'].astype(float),
    lon=us_tracts['INTPTLON10'].astype(float),
)


In [28]:
# Save File
# Only the three numeric columns are written; the frame's index is written as
# the first CSV column because to_csv is called with its defaults.
output_columns = ['tract', 'lat', 'lon']
us_tracts.loc[:, output_columns].to_csv('./data/us_tracts.csv')