This notebook was used for testing the web scraping funcitons written in src/scraper.py

In [80]:
import time
import pprint
import pandas as pd
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.common.exceptions import TimeoutException, WebDriverException
from fake_useragent import UserAgent
    

In [82]:
def get_driver(url, class_name, headless=True, retries=0):
    
    try:
        options = Options()
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)
        options.add_argument(f'user-agent={userAgent}')
        if headless:
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920,1080')
        options.add_argument("--incognito")
        options.add_argument("--enable-javascript")
        options.add_experimental_option("excludeSwitches", ['enable-automation'])
#         options.add_argument("user-data-dir=selenium")
        
        driver = webdriver.Chrome("/home/tim/DSI/capstones/alltrails/chromedriver",options=options)
        driver.get(url)
        WebDriverWait(driver,5).until(
                EC.presence_of_element_located((By.CLASS_NAME,class_name)))
    except (TimeoutException,WebDriverException):
        retries+=1
        print(f'Retry #: {retries}')
        if retries<5:
            driver.close()
            return get_driver(url,class_name,headless=False,retries=retries)
        else:
            driver.quit()
    else:
        page = driver.page_source
        driver.close()
        return page
        
#         try:
#             WebDriverWait(driver,30).until(
#                 EC.presence_of_element_located((BY.CLASS_NAME,class_name)))
#         except:
            
#         else:
#             page = driver.page_source
#             driver.close()
#             return page
        
#     except (TimeoutException,WebDriverException):
#         retries+=1
#         print(f'Retry #: {retries}')
#         if retries<10:
#             return get_driver(url,class_name,retries=retries)
#         else:
#             driver.quit()
        
        
        

In [5]:
def create_df(soup):
    df = pd.DataFrame()
    cards = soup.findAll('div', attrs = {'class':'styles-module__containerDescriptive___3aZqQ styles-module__trailCard___2oHiP'})
    for trail in cards:
        url = 'https://www.alltrails.com/'+trail.attrs['itemid']
        name = trail.find('a', attrs = {'class':'xlate-none styles-module__link___12BPT'})['title']
        loc = trail.find('a', attrs = {'class':'xlate-none styles-module__location___11FHK styles-module__info___1Mbn6 styles-module__link___3T9FO'})['title']
        row = pd.Series([name,loc,url])
        df = df.append(row, ignore_index=True)
    df.columns=['Name','Location','URL']
    return df
        
    

In [6]:
def create_db(soup):
    urls=[]
    cards = soup.findAll('div', attrs = {'class':'styles-module__containerDescriptive___3aZqQ styles-module__trailCard___2oHiP'})
    for trail in cards:
        urls.append('https://www.alltrails.com/'+trail.attrs['itemid'])

    df = get_trail_info(urls)
    return df

In [58]:
def page_parser(page,url):
    
    #header
    header = page.find('div',id='title-and-menu-box')
    trail_name = header.find('h1').text
    difficulty = header.find('span').text
    reviews = header.find('meta', attrs = {'itemprop':'reviewCount'})['content']
    loc = header.find('a', attrs = {'class':'xlate-none styles-module__location___11FHK styles-module__location___3wEnO'})['title']
    photos = header.find('span',attrs = {'class':'styles-module__title___skfpX'}).text
    
    #trail stats section
    trail_stats = page.findAll('span', attrs={'class':'styles-module__detailData___kQ-eK'})
    length = trail_stats[0].text
    elev_gain = trail_stats[1].text
    route = trail_stats[2].text
    
    #metadata
    lat = page.find('meta', attrs = {'itemprop':'latitude'})['content']
    long = page.find('meta', attrs = {'itemprop':'longitude'})['content']
    
    #tags
    tags = page.find_all('span', attrs = {'class':'big rounded active'})
    tags = [tag.text for tag in tags]
    
    description = page.find('p',id='text-container-description').text
#     rating = page.find('div', attrs={'class':'styles-module__ratingDisplay___1vR1p'}).text
    rating=0

    
    
    
    row = pd.Series([trail_name,loc,rating,reviews,difficulty,length,elev_gain,route,photos,lat,long,tags,description,url], 
                    index=['Name','Location','Rating','Reviews','Difficulty','Length','Elevation gain','Route type','Photos','lat','long','tags','description','URL'])
    return row

In [78]:
def get_trail_info(urls):
    class_name='styles-module__content___1GUwP'
    df = pd.DataFrame(columns=['Name','Location','Rating','Reviews','Difficulty','Length','Elevation gain','Route type','Photos','lat','long','tags','description','URL'])
    for url in urls:
#         try:
#             page = get_driver(url,class_name, headless=False)
#             time.sleep(5)
#             row = page_parser(BeautifulSoup(page,'html.parser'),url)
#             df = df.append(row, ignore_index=True)
#             name = df['Name']
#             print(f'scraped: {name}')
#         except:
#             print(f'Error scraping {url}')
#             break
            page = get_driver(url,class_name, headless=False)
            row = page_parser(BeautifulSoup(page,'html.parser'),url)
            df = df.append(row, ignore_index=True)
            name = df['Name']
            print(f'scraped: {name}')
        
#     driver.get(urls[0])
# #     time.sleep(2)
#     row = page_parser(BeautifulSoup(driver.page_source,'lxml'),urls[0])
#     df=df.append(row, ignore_index=True)
    return df
            
        

In [9]:
def get_trails(driver):
    url = 'https://www.alltrails.com/us/colorado'
    driver.get(url)
    wait = WebDriverWait(driver,20)

    while True:
        try:
            show_more = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.styles-module__button___1nuva')))
            show_more.click()
            time.sleep(1)
        except:
            break
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return soup

In [10]:
def go_login(un, pw):
    options = Options()
    options.add_argument("start-maximized")
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver=webdriver.Chrome("/home/tim/DSI/capstones/alltrails/chromedriver", options=options)
    url = 'https://www.alltrails.com/login?ref=header'
    driver.get(url)
    time.sleep(5)
    
    username = driver.find_element_by_name('userEmail')
    username.send_keys(un)
    password = driver.find_element_by_name('userPassword')
    password.send_keys(pw)

    login = driver.find_element_by_css_selector(".styles-module__submit___2REmT")
    login.click()
    time.sleep(5)


In [29]:
trails = pd.read_csv('data/trails.csv')
urls = trails.URL.tolist()

In [83]:
df = get_trail_info(urls)

Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36
scraped: 0    Emerald Lake Trail
Name: Name, dtype: object
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
Retry #: 1


WebDriverException: Message: chrome not reachable
  (Session info: chrome=91.0.4472.77)


In [None]:
db = create_db(soup)

Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10
Error scraping https://www.alltrails.com//trail/us/colorado/emerald-lake-trail
Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10
Error scraping https://www.alltrails.com//trail/us/colorado/royal-arch-trail
Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10
Error scraping https://www.alltrails.com//trail/us/colorado/sky-pond-via-glacier-gorge-trail
Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10
Error scraping https://www.alltrails.com//trail/us/colorado/beaver-brook-chavez-trail-loop
Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10
Error scraping https://www.alltrails.com//trail/us/colorado/the-loch-lake-trail-via-glacier-gorge-trail
R

Error scraping https://www.alltrails.com//trail/us/colorado/crater-lakes-trail-via-south-boulder-creek-trail
Retry #: 1
Retry #: 2
Retry #: 3
Retry #: 4
Retry #: 5
Retry #: 6
Retry #: 7
Retry #: 8
Retry #: 9
Retry #: 10


In [50]:
un = 'timrchilders@gmail.com'
pwd = 'galvanize'
go_login(un,pwd)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[name="userEmail"]"}
  (Session info: chrome=91.0.4472.77)


In [18]:
# un = 'timrchilders@gmail.com'
# pwd = 'galvanize'

options = Options()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# # options.add_argument("--headless")
# # firefox_profile = webdriver.FirefoxProfile()
# # firefox_profile.set_preference("browser.privatebrowsing.autostart", True)


driver=webdriver.Chrome("/home/tim/DSI/capstones/alltrails/chromedriver", options=options)

soup = get_trails(driver)
driver.quit()

In [19]:
print(soup.title)

<title class="xlate-none">  Best trails in Colorado
 | AllTrails</title>


In [20]:
trails = create_df(soup)
print(f"Scraped {len(trails)} trails")

Scraped 1000 trails


In [125]:
!ls

alltrails.ipynb  chromedriver  data  geckodriver.log  README.md  selenium  src


In [126]:
trails.to_csv('data/trails.csv', index=True)

In [121]:
PROXY = 'http://gate.smartproxy.com:7000'


useragent=UserAgent()
options = Options()
# options.add_argument("--headless")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ['enable-automation'])
options.add_argument("user-data-dir=selenium")
ua = UserAgent()
userAgent = ua.random
print(userAgent)
options.add_argument(f'user-agent={userAgent}')


driver=webdriver.Chrome("/home/tim/DSI/capstones/alltrails/chromedriver", options=options)

# go_login(driver,un,pwd)
db = create_db(soup, driver)

driver.quit()

Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36


AttributeError: 'NoneType' object has no attribute 'find'

In [117]:
db

Unnamed: 0,Name,Location,Rating,Reviews,Difficulty,Length,Elevation gain,Route type,Photos,lat,long,tags,description,URL
0,Emerald Lake Trail,Rocky Mountain National Park,4.7,5976,moderate,3.2 mi,698 ft,Out & back,"Photos (6,456)",40.31195,-105.64567,"[Kid friendly, Hiking, Snowshoeing, Forest, La...",Rocky Mountain National Park charges a fee to ...,https://www.alltrails.com//trail/us/colorado/e...


<title class="xlate-none">  Best trails in Colorado
 | AllTrails</title>