In [None]:
#importing required libraries
import re
import warnings
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
#import time

#start_time = time.time()

In [None]:
def find_phone(park_soup):
  """Return the phone number shown on a park's page (page 3).

  Args:
    park_soup: parsed page (BeautifulSoup object) of a single park.

  Returns:
    The phone number as a string, or '' when the page has no "Phone:"
    heading or the heading is not followed by a <p> sibling.
  """
  phone_loc = park_soup.find("h4", string="Phone:")                               #locate the <h4> whose text is exactly 'Phone:'
  if phone_loc is None:
    return ''                                                                     #no phone section on this page

  phone_tag = phone_loc.find_next_sibling('p')                                    #the number is read from the next <p> sibling
  if phone_tag is None:
    return ''                                                                     #heading present but no number paragraph

  park_ph = phone_tag.get_text().strip('\n')                                      #drop leading/trailing newlines
  return park_ph.split("\n", 1)[0]                                                #some numbers are followed by extra text lines; keep only the first line

In [None]:
def find_add(soup):
  """Return the mailing address shown on a park's page (page 3).

  Args:
    soup: parsed page (BeautifulSoup object) of a single park.

  Returns:
    A list of six strings:
    [address line 1, address line 2, address line 3, city, state, zip code].
    Any piece that is not present on the page is returned as ''. When the
    page has no street address, line 1 is 'P.O. Box <number>' if a PO box
    number is present.
  """
  no_address = ['', '', '', '', '', '']

  add_loc = soup.find("h4", string="Mailing Address:")                            #locate the <h4> whose text is exactly 'Mailing Address:'
  if add_loc is None:
    return no_address

  add_div = add_loc.find_next_sibling('div')                                      #all address <span>s are looked up inside this <div>
  if add_div is None:
    return no_address

  def span_text(itemprop):
    #text of the <span> carrying this itemprop, or '' if there is no such span
    span = add_div.find('span', itemprop=itemprop)
    return span.text.strip('\n') if span is not None else ''

  street_tag = add_div.find('span', itemprop="streetAddress")
  if street_tag is not None:
    park_add_list = street_tag.text.strip('\n').split('\n')                       #one list element per address line
  elif add_div.find('span', itemprop="postOfficeBoxNumber") is not None:
    park_add_list = ['P.O. Box ' + span_text("postOfficeBoxNumber")]              #fall back to the PO box number when there is no street address
  else:
    park_add_list = ['']                                                          #neither street address nor PO box

  #pad with '' (or truncate) so there are always exactly three address lines
  park_add_1, park_add_2, park_add_3 = (park_add_list + ['', '', ''])[:3]

  return [park_add_1, park_add_2, park_add_3,
          span_text('addressLocality'),                                           #city
          span_text('addressRegion'),                                             #state
          span_text('postalCode')]                                                #zip code

In [None]:
def find_park_data(state_link, timeout=30):
  """Scrape every park listed on one state's page (page 2) into a DataFrame.

  Name, category and description are read from the state page itself; the
  mailing address and phone number are read from each park's own page
  (page 3) through find_add() and find_phone().

  Args:
    state_link: absolute URL of the state's page.
    timeout: seconds to wait for each HTTP response before requests gives up.

  Returns:
    pandas.DataFrame with one row per park and the columns Name, Category,
    Description, Street Address Line 1, Line 2, Line 3, City, State,
    Zip Code, Phone Number. If the page has no <h2> "Parks" heading a
    warning is issued and an empty frame with those columns is returned.

  Raises:
    ValueError: raised by pandas when the page yields different numbers of
      names, categories and descriptions.
  """
  columns = ['Name','Category','Description',
             'Street Address Line 1','Line 2','Line 3',
             'City','State','Zip Code','Phone Number']

  #a single Session lets the state page and all park pages share connections
  with requests.Session() as session:
    state_page = session.get(state_link, timeout=timeout)
    state_soup = BeautifulSoup(state_page.content, 'html.parser')                 ###Retrieve each state's page (page 2)
    park_loc = state_soup.find("h2", string="Parks")                              #the "Parks" <h2>; the park categories follow as further <h2> tags
    if park_loc is None:
      warnings.warn('No "Parks" heading found on ' + str(state_link))
      return pd.DataFrame(columns=columns)

    listing = park_loc.parent.parent                                              #container searched for every park's <h2>, <h3> and <p>

###Retrieve each park's name, category, and description from
###the list of parks in each state.
    #categories: every <h2> in the container after the first one
    #(the first <h2> is the "Parks" heading itself, or whichever <h2> precedes it in the container)
    park_cats = [heading.get_text() for heading in listing.find_all('h2')[1:]]

    #names and links: each <h3> holds an <a> with the park name and its href
    park_names = []
    park_links = []
    for heading in listing.find_all('h3'):
      link = heading.find('a')
      park_names.append(link.text)                                                ###Retrieve the list of all parks from page 2 for each state
      park_links.append(urljoin(state_link, link['href']))                        #resolve the href against the state page URL

    #descriptions: text of every <p> in the container
    #NOTE(review): assumes the container holds exactly one <p> per park - confirm against the live page
    park_descs = [para.get_text().strip('\n') for para in listing.find_all('p')]

    #fetch and parse each park's page once, then pull phone and address from it
    park_phs = []
    park_adds = []
    for park_link in park_links:
      park_page = session.get(park_link, timeout=timeout)                         ###Retrieve each park's page (page 3)
      park_soup = BeautifulSoup(park_page.content, 'html.parser')
      park_phs.append(find_phone(park_soup))                                      ###Retrieve each park's contact information from page 3 for each park.
      park_adds.append(find_add(park_soup))                                       #[line 1, line 2, line 3, city, state, zip]

  #storing all the attributes to a dataframe
  return pd.DataFrame({'Name': park_names,
                       'Category': park_cats,
                       'Description': park_descs,
                       'Street Address Line 1': [add[0] for add in park_adds],
                       'Line 2': [add[1] for add in park_adds],
                       'Line 3': [add[2] for add in park_adds],
                       'City': [add[3] for add in park_adds],
                       'State': [add[4] for add in park_adds],
                       'Zip Code': [add[5] for add in park_adds],
                       'Phone Number': park_phs},
                      columns=columns)

In [None]:
# Driver: collect the link of every state from the NPS home page (page 1),
# scrape each state with find_park_data() and combine the results.
weblink='https://www.nps.gov'
page = requests.get(weblink, timeout=30)                                          #timeout so a stalled connection cannot hang the notebook
soup = BeautifulSoup(page.content, 'html.parser')                                 ###Retrieve the page with the drop-down list (page 1)

# The anchor for Alabama is used as the starting point; the remaining states
# are read from the <li> siblings that follow its parent element.
state_loc = soup.find("a", string="Alabama")
if state_loc is None:
  raise RuntimeError('Could not find the "Alabama" link on ' + weblink)
state_pages = state_loc.parent.find_next_siblings('li')

links = [state_loc['href']]
for e_page in state_pages:
  link = e_page.find('a')
  if link is not None and link.has_attr('href'):                                  #skip list items that carry no link
    links.append(link['href'])
state_links=[weblink+line for line in links]                                      ###Retrieve the links to all states from page 1

# one DataFrame per state, then a single concatenation
park_data = [find_park_data(state_link) for state_link in state_links]

parks_data = pd.concat(park_data)
# If duplicate rows are not wanted:
#   parks_data = parks_data.drop_duplicates()
# To inspect the duplicated rows instead:
#   dup = parks_data[parks_data.duplicated()]

In [None]:
# Renumber the rows 0..n-1 (the per-state indexes are discarded), then write
# the combined table to disk.
parks_data = parks_data.reset_index(drop=True)
parks_data.to_csv('nation_parks.csv')                                             ###results stored in csv file
#print("%s seconds" % (time.time() - start_time))

127.91666388511658 seconds
