Wendy's Replicated Notebook and Process to Download HPMS data

# Procedure:
    1. need list of states and districts to include in the analysis
    2. generate list of URLs to download from HPMS website
    3. run loop to download data
        correct any filename problems
    4. quality check the shapefiles to ensure polylines
    5. join shapefiles to geodatabase
    6. zip HPMS file geodatabase for each year

In [1]:
# Import Modules
import us
import arcpy
import os
import requests
import zipfile
import shutil
import io
import datetime
import pandas as pd

## User inputs include a directory to store downloaded files, and the range of years to include in the study

In [2]:
# Define workspace to hold new files
workspace = input("Input the file directory where you want to store the HPMS data")
years = []
print("What is the first year in your study range?")
year_st = int(input())
print("What is the last year in your study range?")
year_end = int(input())
for year in range(year_st,year_end+1):
    years.append(year)

Input the file directory where you want to store the HPMS data C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\


What is the first year in your study range?


 2011


What is the last year in your study range?


 2017


In [3]:
# Create the workspace file geodatabase
if not os.path.exists(os.path.join(workspace, "hpms_workspace.gdb")):
    os.makedirs(os.path.join(workspace,"hpms_workspace.gdb"))
else: 
    print("Workspace already exists")

Workspace already exists


## Prepare the lists to aid in downloading the shapefiles
In future versions, a list of states can be included with checkboxes for subsets of data

In [4]:
# Create list of states to include in the analysis
states_list = [state.name.lower().replace(" ","") for state in us.states.STATES]

In [5]:
# Generate a list of URLs for the download files
# First, the base of the URL
base_download_url = r"https://www.fhwa.dot.gov/policyinformation/hpms/shapefiles/"

# Create dataframe of URLs for each state_year download

rows = []
for state in states_list:
    for year in years:
        if state == 'missouri' and year == 2015:
            rows.append([state, year, f'{base_download_url}/{state}{year}t.zip'])
        elif state == 'districtofcolumbia':
            rows.append([state, year, f'{base_download_url}/district{year}.zip'])
        else:
            rows.append([state, year, f'{base_download_url}/{state}{year}.zip'])
urls = pd.DataFrame(rows, columns = ['state','year','URL'])

# Create dataframe of shapefile extensions for each state_year download
rows = []
Sy_years = [2011, 2012, 2017]
sy_years = [2013]
S_years = [2014, 2016]
S_Sections_years = [2015] 

for state in states_list:
    for year in years:
        if year in Sy_years:
            rows.append([state, year, f'{state.capitalize()}{year}.shp'])
        elif year in sy_years:
            rows.append([state, year, f'{state.lower()}{year}.shp'])
        elif year in S_years:
            rows.append([state, year, f'{state.capitalize()}.shp'])
        elif year in S_Sections_years:
            if state == 'missouri' and year == 2015:
                rows.append([state, year, f'{state.capitalize()}_Sectionst.shp'])
            else:
                rows.append([state, year, f'{state.capitalize()}_Sections.shp'])
                         
shapefiles = pd.DataFrame(rows, columns = ['state', 'year', 'extension'])

In [6]:
# Create the path folder for the geodatabase
gdb = os.path.join(workspace,"hpms_workspace.gdb")
gdb
if not os.path.exists(gdb):
    os.makedirs(gdb)
else:
    print("GDB already exists")

GDB already exists


## Download the shapefiles from HPMS website
https://www.fhwa.dot.gov/policyinformation/hpms/shapefiles.cfm/

In [7]:
# Iteration Loop to download each state_year shapefile (skip if it already exists), extract contents, and place in a directory that can be deleted or archived later, add name of directories to list
shapefiles_list = []

for year in years:
    for state in states_list:
        file_url = urls['URL'].loc[(urls['state']== state) & (urls['year']==year)].get_values()[0]
        yr_state_path = f'{os.path.join(gdb, str(year), state)}'
        path = shapefiles['extension'].loc[(shapefiles['state']==state) & (shapefiles['year']==year)].get_values()[0]
        shapefile_path = f'{yr_state_path}\\{path}'

        if os.path.exists(yr_state_path):
            print(f'{yr_state_path} already exists...skipping to next state')
        else:
            os.makedirs(yr_state_path)
            print(f'Directory created for {yr_state_path}')

        if os.path.exists(shapefile_path):
            print(f'{shapefile_path} already exists...skipping to next state')
            shapefiles_list.append(shapefile_path)
        else:
            print(f'Requesting {file_url} from website')
            response = requests.get(file_url)
            print(f'Extracting {file_url}')
            if response
            zipfile.ZipFile(io.BytesIO(response.content)).extractall(path=yr_state_path)
            shapefiles_list.append(shapefile_path)
            print(f'{shapefile_path} added to list')

print('Downloads complete')

C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alabama already exists...skipping to next state
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alabama\Alabama2011.shp already exists...skipping to next state
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alaska already exists...skipping to next state
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alaska\Alaska2011.shp already exists...skipping to next state
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\arizona already exists...skipping to next state
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\arizona\Arizona2011.shp already exists...skipping to 

BadZipFile: File is not a zip file

In [413]:
# QC Check the shapefile for valid geometry and shape type before merging. Remove any shapefiles without Polyline shape types.
print(f'Pre-QC shapefile count: {len(shapefiles_list)}')

for shapefile in shapefiles_list:
    if arcpy.Describe(shapefile_path).shapeType != "Polyline":
        print(shapefile, arcpy.Describe(shapefile_path).shapeType)
        shapefiles_list.remove(shapefile)
    else:
        print(f'{shapefile} is valid')

print(f'Pose-QC shapefile count: {len(shapefiles_list)}')

Pre-QC shapefile count: 21
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alabama\Alabama2011.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\alaska\Alaska2011.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2011\arizona\Arizona2011.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2012\alabama\Alabama2012.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2012\alaska\Alaska2012.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspace.gdb\2012\arizona\Arizona2012.shp is valid
C:\Users\wen10109\Documents\Projects\NHTSA\Onboarding\2016Replication\DataDownloads\HPMS\hpms_workspa

In [425]:
gdb

'C:\\Users\\wen10109\\Documents\\Projects\\NHTSA\\Onboarding\\2016Replication\\DataDownloads\\HPMS\\hpms_workspace.gdb'

In [None]:
# Merge all data files into zipped files (one for each year)
for year in years:
    output_path = os.path.join(workspace, f'HPMS_Nat_{year}')
    if not os.path.exists(output_path):
        os.path.makedirs(output_path)
        print(f'Directory for {year} output created')
    output_hpms_year = arcpy.Merge_management(inputs=shapefiles_list, output=output_path).getOutput(0)
    output_hpms_year


### Now that files are zipped and merged into one geodatabase, the temporary folders can either be deleted to save disk space, or archived for storage.

In [25]:
# Prompt user to decide whether to keep archived files (they may take up storage space)
archive = "Do you want to delete or archive the downloaded files? Type Y to delete, otherwise, files will be archived."
if input().lower() == "y":
    if arcpy.Exists(os.path.join(workspace, "hpms_workspace.gdb")):
        shutil.rmtree(path=gdb)

 Y


In [52]:
os.path.exists(os.path.join(workspace,"hpms_workspace.gdb"))

True