In [67]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import re
import pandas as pd
import json
from selenium import webdriver

In [2]:
pd.options.display.float_format = '{:,.1f}'.format

In [3]:
url = 'https://www.alltrails.com/parks/us/virginia/shenandoah-national-park'
response = requests.get(url)

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
def search_text(soup_use, search_pattern, group_number=0):
    text = []
    for index, string in enumerate(soup_use.find_all('div')):
        pattern = re.search(search_pattern,str(string))
        if pattern is not None:
            text.append(pattern.group(group_number))
    return text

In [6]:
text = search_text(soup, '/.*/&quot;')

In [7]:
len(text)

2

In [8]:
text_use = text[0]

In [9]:
trail_list = []
for v in text[1].split(','):
    try:
        find_text = re.search('/virginia/(.*)&quot;',v).group(1)
        if find_text.find('-') != -1 and find_text.find('park') == -1:
            trail_list.append(find_text)
    except:
        pass

In [10]:
trail_list

['old-rag-mountain-loop-trail',
 'white-oak-canyon-and-cedar-run-trails',
 'dark-hollow-falls-trail',
 'hawksbill-gap-loop',
 'rose-river-trail',
 'bearfence-mountain-trail',
 'south-river-falls-trail',
 'stony-man-via-appalachian-trail',
 'little-stony-man-loop-via-appalachian-trail',
 'white-oak-canyon-trail',
 'marys-rock-via-appalachian-trail-north-approach',
 'little-devils-stairs-trail',
 'hawksbill-summit',
 'riprap-trail',
 'blackrock-summit-via-trayfoot-mountain-and-appalachian-trail',
 'upper-hawksbill-trail',
 'jones-run-falls-trail',
 'dark-hollow-via-appalachian-trail-and-story-of-the-forest-trail',
 'doyles-river-falls-trail',
 'doyles-river-loop-via-appalachian-trail',
 'port-republic',
 'overall-run-loop',
 'turk-mountain-trail',
 'marys-rock-south-trail',
 'snead-farm-dickey-ridge-loop-trail',
 'front-royal']

In [11]:
link_prefix = 'https://www.alltrails.com/trail/us/virginia/'
link_list = [link_prefix+v.lower().replace(' ','-').replace('(','').replace(')','').replace("'",'') for v in trail_list]

In [12]:
link_list

['https://www.alltrails.com/trail/us/virginia/old-rag-mountain-loop-trail',
 'https://www.alltrails.com/trail/us/virginia/white-oak-canyon-and-cedar-run-trails',
 'https://www.alltrails.com/trail/us/virginia/dark-hollow-falls-trail',
 'https://www.alltrails.com/trail/us/virginia/hawksbill-gap-loop',
 'https://www.alltrails.com/trail/us/virginia/rose-river-trail',
 'https://www.alltrails.com/trail/us/virginia/bearfence-mountain-trail',
 'https://www.alltrails.com/trail/us/virginia/south-river-falls-trail',
 'https://www.alltrails.com/trail/us/virginia/stony-man-via-appalachian-trail',
 'https://www.alltrails.com/trail/us/virginia/little-stony-man-loop-via-appalachian-trail',
 'https://www.alltrails.com/trail/us/virginia/white-oak-canyon-trail',
 'https://www.alltrails.com/trail/us/virginia/marys-rock-via-appalachian-trail-north-approach',
 'https://www.alltrails.com/trail/us/virginia/little-devils-stairs-trail',
 'https://www.alltrails.com/trail/us/virginia/hawksbill-summit',
 'https://

In [13]:
len(link_list)

26

In [14]:
col_name = 'name'
col_popularity = 'popularity'
col_type = 'type'
col_length = 'length'
col_difficulty = 'difficulty_rating'
col_route_type = 'route_type'
col_avg_rating = 'avg_rating'
col_features = 'features'
col_activities = 'activities'
col_num_reviews = 'num_reviews'
col_duration_minutes = 'duration_minutes'
col_link = 'col_link'
col_list = [col_name, col_avg_rating, col_popularity, col_difficulty, col_length, col_duration_minutes, col_num_reviews,
           col_route_type, col_features, col_activities, col_type, col_link]

In [15]:
class GetTrailDetail:
    def __init__(self, link):
        self.link = link
        self.soup = self.get_soup()
        self.searched_text = None
        self.text_dict = None
    
    def get_soup(self):
        #request html data
        sub_response = requests.get(self.link)
        sub_soup = BeautifulSoup(sub_response.text, 'html.parser')
        return sub_soup

    def convert2df(self, pattern='("trail".*),"profile_photo_url":', group_number=1):
        try:
            sub_text = '{' + search_text(self.soup, pattern,group_number)[0]+'}}'.replace("'",'"')
            self.searched_text = sub_text 
        except:
            print(f'{self.link} Failed')
        #convert to json dictionary
        try:
            sub_dict = json.loads(sub_text)['trail']

            #convert to dataframe
            df_sub = pd.DataFrame.from_dict(sub_dict, orient='index').T
            self.text_dict = sub_dict
            print(f'{self.link} Done')
            df_sub[col_link] = self.link
            return df_sub
        except:
            print(f"{self.link} Failed")
            pass

In [16]:
df_concat = pd.DataFrame()
for link in link_list:
    D = GetTrailDetail(link)
    df_concat = pd.concat([df_concat, D.convert2df()], sort=False, axis=0)

https://www.alltrails.com/trail/us/virginia/old-rag-mountain-loop-trail Done
https://www.alltrails.com/trail/us/virginia/white-oak-canyon-and-cedar-run-trails Done
https://www.alltrails.com/trail/us/virginia/dark-hollow-falls-trail Done
https://www.alltrails.com/trail/us/virginia/hawksbill-gap-loop Done
https://www.alltrails.com/trail/us/virginia/rose-river-trail Done
https://www.alltrails.com/trail/us/virginia/bearfence-mountain-trail Done
https://www.alltrails.com/trail/us/virginia/south-river-falls-trail Done
https://www.alltrails.com/trail/us/virginia/stony-man-via-appalachian-trail Done
https://www.alltrails.com/trail/us/virginia/little-stony-man-loop-via-appalachian-trail Done
https://www.alltrails.com/trail/us/virginia/white-oak-canyon-trail Done
https://www.alltrails.com/trail/us/virginia/marys-rock-via-appalachian-trail-north-approach Done
https://www.alltrails.com/trail/us/virginia/little-devils-stairs-trail Done
https://www.alltrails.com/trail/us/virginia/hawksbill-summit Do

In [19]:
df_final = df_concat[col_list].copy()

df_final[col_length] = (df_final[col_length]*0.000621371).map(lambda x: round(x,1))

df_final[col_duration_minutes] = df_final[col_duration_minutes].map(lambda x: f'{int(x//60)}hr {int(x%60)}mins')

In [20]:
nps_link = 'https://www.nps.gov/shen/planyourvisit/downloadable-guides.htm#CP_JUMP_5709005'
N = GetTrailDetail(nps_link)
nps_soup = N.soup

In [21]:
nps_trail_link = []
nps_trail_name = []
nps_prefix = 'https://www.nps.gov'
nps_dict = {}
for string in nps_soup.find_all('td'):
    for a in string.find_all('a', href=True):
        pdf_file = a['href']
        if pdf_file.find('Trail') != -1:
            trail_link = nps_prefix+pdf_file
            trail_name = re.search('upload/(.*)_', trail_link).group(1)
            with open('C:/Users/huang/Dropbox/Travel/Shenandoah/'+trail_name+'.pdf', 'wb') as f:
                f.write(requests.get(trail_link).content)
            print(f'Download {trail_link}')
            nps_trail_link.append(trail_link)
            nps_trail_name.append(trail_name)
            nps_dict[trail_name] = trail_link
            

Download https://www.nps.gov/shen/planyourvisit/upload/DickeyRidge_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/ComptonGap_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/MathewsArm_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/KeyserRun_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/ThorntonGap_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/Skyland_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/WhiteoakCanyon_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/OldRag_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/Hawksbill_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/BigMeadows_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/RapidanCamp_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourvisit/upload/Bearfence_RoadTrail.pdf
Download https://www.nps.gov/shen/planyourv

In [22]:
nps_dict

{'DickeyRidge': 'https://www.nps.gov/shen/planyourvisit/upload/DickeyRidge_RoadTrail.pdf',
 'ComptonGap': 'https://www.nps.gov/shen/planyourvisit/upload/ComptonGap_RoadTrail.pdf',
 'MathewsArm': 'https://www.nps.gov/shen/planyourvisit/upload/MathewsArm_RoadTrail.pdf',
 'KeyserRun': 'https://www.nps.gov/shen/planyourvisit/upload/KeyserRun_RoadTrail.pdf',
 'ThorntonGap': 'https://www.nps.gov/shen/planyourvisit/upload/ThorntonGap_RoadTrail.pdf',
 'Skyland': 'https://www.nps.gov/shen/planyourvisit/upload/Skyland_RoadTrail.pdf',
 'WhiteoakCanyon': 'https://www.nps.gov/shen/planyourvisit/upload/WhiteoakCanyon_RoadTrail.pdf',
 'OldRag': 'https://www.nps.gov/shen/planyourvisit/upload/OldRag_RoadTrail.pdf',
 'Hawksbill': 'https://www.nps.gov/shen/planyourvisit/upload/Hawksbill_RoadTrail.pdf',
 'BigMeadows': 'https://www.nps.gov/shen/planyourvisit/upload/BigMeadows_RoadTrail.pdf',
 'RapidanCamp': 'https://www.nps.gov/shen/planyourvisit/upload/RapidanCamp_RoadTrail.pdf',
 'Bearfence': 'https://ww

In [23]:
import numpy as np

In [24]:
def map_pdf_link(value):
    for trail, link in nps_dict.items():
        if value.lower().replace(' ','').strip().find(trail.lower()) != -1:
            c = link
            break
        else:
            c = np.nan
    return c

In [27]:
pd.set_option('display.width',4000)
pd.set_option('display.max_colwidth', 4000)

In [28]:
col_pdf = 'map_pdf'
df_final[col_pdf] = df_final[col_name].map(lambda x: map_pdf_link(x))

In [30]:
col_district = 'ditrict'
col_skyline_drive_mile = 'skyline_drive_mile'
col_nps_features = 'nps_features'
col_nps_difficulty = 'nps_difficulty'

In [44]:
nps_df_html = pd.read_html('https://www.nps.gov/shen/planyourvisit/hiking-opportunities.htm')

In [47]:
col_map_name = 'nps_name'

In [45]:
df_north = nps_df_html[1]
df_central = nps_df_html[2]
df_south = nps_df_html[3]
for df, col in zip([df_north, df_central, df_south],['North','Central','South']):
    df.columns = [col_map_name, col_skyline_drive_mile, col_nps_features, col_nps_difficulty, col_length]
    df[col_district] = col

In [46]:
df_nps = pd.concat([df_north, df_central, df_south], axis=0, sort=False)

In [48]:
def map_trail_name(value):
    split_str = value.lower().split(' ')
    try:
        abbr_value = split_str[0] + split_str[1]
    except IndexError:
        abbr_value = split_str[0]
    abbr_value_0 = split_str[0]
#     print(abbr_value)
    for name in df_nps[col_map_name].values:
        clean_name = name.lower().replace(' ','')
        if clean_name.find(abbr_value.replace('trail','').replace('loop','')) != -1:
            map_name = name
#             print(name)
            break
        elif clean_name.find(abbr_value_0.replace('trail','').replace('loop','')) != -1:
            map_name = name
            break
        else:
            map_name = np.nan
    return map_name

In [49]:
df_final[col_map_name] = df_final[col_name].map(lambda x: map_trail_name(x))

In [50]:
df_nps.head()

Unnamed: 0,nps_name,skyline_drive_mile,nps_features,nps_difficulty,length,ditrict
0,Fox Hollow Loop,4.6,"Forest trail with cemetery, self-guided hike booklet available, no pets allowed","easy, Track Trail for Kids",1.2,North
1,Snead Farm Loop,5.1,Trail to old barn and a viewpoint,moderate,3.0,North
2,Lands Run,9.2,Fire road which crosses a stream,moderate,1.2,North
3,"Compton Peak, West and East",10.4,"Viewpoint and geology, columnar jointing","moderate, steep section to geologic feature",2.4,North
4,Fort Windham Rocks,10.4,Appalachian Trail to geologic feature,easy,0.8,North


In [52]:
df_final.reset_index(drop=True, inplace=True)

df_final.loc[df_final[col_map_name].isna(),[col_name, col_map_name]]

In [54]:
df_nps.head()

Unnamed: 0,nps_name,skyline_drive_mile,nps_features,nps_difficulty,length,ditrict
0,Fox Hollow Loop,4.6,"Forest trail with cemetery, self-guided hike booklet available, no pets allowed","easy, Track Trail for Kids",1.2,North
1,Snead Farm Loop,5.1,Trail to old barn and a viewpoint,moderate,3.0,North
2,Lands Run,9.2,Fire road which crosses a stream,moderate,1.2,North
3,"Compton Peak, West and East",10.4,"Viewpoint and geology, columnar jointing","moderate, steep section to geologic feature",2.4,North
4,Fort Windham Rocks,10.4,Appalachian Trail to geologic feature,easy,0.8,North


In [61]:
df_final_merge = df_final.merge(df_nps[[col_map_name, col_skyline_drive_mile, col_nps_features, col_nps_difficulty, col_district]],how='left', on=col_map_name)

df_final_merge.drop(col_map_name, axis=1, inplace=True)

df_final_merge.to_excel('C:/Users/huang/Dropbox/Travel/Shenandoah/shenandoah_trail.xlsx',index=False)