In [34]:
# This tutorial covers:
#
# Building a basic web scraper
# Scraping HTML tags using the class attribute
# Finding the nth child with Beautiful Soup (TODO: revisit this section)
# Finding tags using regex
# Using ChromeDriver with Selenium
# Getting iframe content using Selenium & Beautiful Soup
# Handling Ajax calls
# Handling & deleting cookies
#
# Running headless Chrome with Selenium

In [2]:
# Source article this notebook follows (kept for reference).
tutorial_url = "https://likegeeks.com/python-web-scraping/"

In [38]:
# Reference article for the headless-Chrome section at the end of the notebook.
headless_url = (
    "https://medium.com/@pyzzled/"
    "running-headless-chrome-with-selenium-in-python-3f42d1f5ff1d"
)

In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

Web Scraper

In [4]:
# Fetch the python.org home page and parse it with the html5lib parser.
# The `with` block closes the HTTP response once the body has been read;
# the name `html` stays bound afterwards (as a closed response object).
with urlopen('https://python.org/') as html:
    res = BeautifulSoup(html.read(), 'html5lib')

# `res.title` is None when the document has no <title> tag.
if res.title is None:
    print('Tag not found')
else:
    print(res.title)

<title>Welcome to Python.org</title>


In [5]:
# Show the concrete types of the raw response and of the parsed document.
for fetched in (html, res):
    print(type(fetched))

<class 'http.client.HTTPResponse'>
<class 'bs4.BeautifulSoup'>


In [6]:
# urlopen may fail, so the request is wrapped in exception handling.

from urllib.error import HTTPError, URLError

try:
    htmlzzz = urlopen('https://pythonzzz.org/')
except URLError as request_error:
    # HTTPError is a subclass of URLError, so it is told apart explicitly:
    # an HTTP error is printed as-is, anything else gets the generic message.
    if isinstance(request_error, HTTPError):
        print(request_error)
    else:
        print('Server down or incorrect domain')
else:
    # Only reached when the request succeeded.
    reszzz = BeautifulSoup(htmlzzz.read(),'html5lib')
    print(reszzz.title)

Server down or incorrect domain


Scrape HTML tags using class attribute

In [7]:
# Re-fetch the page; the `with` block closes the HTTP response after reading.
with urlopen('https://python.org/') as html:
    res = BeautifulSoup(html.read(), 'html5lib')

# All <h2> tags whose class is "widget-title". find_all / get_text are the
# current Beautiful Soup 4 method names (findAll / getText are legacy aliases).
tags = res.find_all('h2', class_='widget-title')
for tag in tags:
    print(tag.get_text())

Get Started
Download
Docs
Jobs
Latest News
Upcoming Events
Success Stories
Use Python for…

                        >>> Python Enhancement Proposals (PEPs): The future of Python is discussed here.
                         RSS
                    

    >>> Python Software Foundation



In [8]:
# Printing a tag itself (instead of its getText()) shows the full markup.
for markup in map(str, tags):
    print(markup)

<h2 class="widget-title"><span aria-hidden="true" class="icon-get-started"></span>Get Started</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-download"></span>Download</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-documentation"></span>Docs</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-jobs"></span>Jobs</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-news"></span>Latest News</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-calendar"></span>Upcoming Events</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-success-stories"></span>Success Stories</h2>
<h2 class="widget-title"><span aria-hidden="true" class="icon-python"></span>Use Python for…</h2>
<h2 class="widget-title">
                        <span class="prompt">&gt;&gt;&gt;</span> <a href="/dev/peps/">Python Enhancement Proposals<span class="say-no-more"> (PEPs)</span></a>: The future of Python<span class="say-no-more">

In [9]:
# Get all span, anchor & image tags from the scraped HTML.
# Several tag names must be passed as ONE list: extra positional arguments are
# interpreted as `attrs` and `recursive`, not as additional tag names.
tags = res.find_all(['span', 'a', 'img'])

In [10]:
# Extract all anchor tags that carry the 'url' class OR the 'readmorebtn'
# class: a list filter matches a tag when any one of its items matches.
tags = res.find_all("a", class_=["url", "readmorebtn"])

Finding nth child with Beautiful Soup

In [11]:
# find() returns the first matching tag, or None when nothing matches.
tags = res.find("nav", {"role": "navigation"})

# Guard against a missing <nav> the same way the title lookup does.
if tags is None:
    print('Tag not found')
else:
    print(tags.get_text())



                
                
                    Skip to content
                

                
                
                    ▼ Close
                

                


    
    
        Python
    
    
    
        PSF
    
    
    
        Docs
    
    
    
        PyPI
    
    
    
        Jobs
    
    
    
        Community
    
    



                
                    ▲ The Python Network
                

            


In [12]:
# Look up the <div> whose class attribute is 'skip-link screen-reader-text'
# and print the text it contains.
skip_link_attrs = {'class' : 'skip-link screen-reader-text'}
test = res.find('div', skip_link_attrs)
skip_link_text = test.getText()
print(skip_link_text)


                    Skip to content
                


Find tags using Regex

In [13]:
import re

In [14]:
# Match <img> tags whose src ends in ".png".
# The previous pattern "/*.png" meant "zero or more '/', then ANY character,
# then 'png'", so it matched any src merely containing e.g. "xpng".
# The dot is now escaped and the match is anchored to the end of the value.
tags = res.find_all('img', {'src': re.compile(r"\.png$")})

print(tags)

[<img alt="python™" class="python-logo" src="/static/img/python-logo.png"/>]


In [15]:
# Collect every <img> tag from the parsed page.
tags = res.findAll(name='img')

# Bare last expression: the notebook renders the list as the cell output.
tags

[<img alt="python™" class="python-logo" src="/static/img/python-logo.png"/>]

Using ChromeDriver with Selenium

In [16]:
from selenium import webdriver

In [17]:
# Imported in this cell so the cell runs on its own.
from selenium.webdriver.common.by import By

# Drive a real Chrome window through ChromeDriver.
browser = webdriver.Chrome()
try:
    browser.get('https://www.python.org/')
    # find_element(By.ID, ...) is the locator form that works on both old and
    # current Selenium releases (the find_element_by_* helpers were removed).
    nav = browser.find_element(By.ID, "mainnav")

    print(nav.text)
finally:
    # Always shut the browser and the ChromeDriver process down, even when the
    # lookup above fails; otherwise the session is leaked when `browser` is
    # rebound by a later cell.
    browser.quit()

Python is a programming language that lets you work more quickly and integrate your systems more effectively.
You can learn to use Python and see almost immediate gains in productivity and lower maintenance costs. Learn more about Python..
Download for Windows
Python 3.7.0
Note that Python 3.5+ cannot be used on Windows XP or earlier.
Not the OS you are looking for? Python can be used on many operating systems and environments. View the full list of downloads.
Python’s standard documentation: download, browse or watch a tutorial.
Get started below, or visit the Documentation page to browse by version.

Python 3.x Docs Python 2.x Docs


In [18]:
# Same idea with the headless PhantomJS driver.
browser = webdriver.PhantomJS()
try:
    browser.get("https://www.python.org/")
    print(browser.find_element_by_class_name("introduction").text)
finally:
    # quit() (rather than close()) also shuts down the PhantomJS executable,
    # and the finally block guarantees it runs even if the lookup raises.
    browser.quit()



Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More


In [19]:
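# Other ways to access elements.
# find_element_* returns the first matching element;
# find_elements_* returns a list of all matching elements: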

# browser.find_element_by_id("id")
 
# browser.find_element_by_css_selector("#id")
 
# browser.find_element_by_link_text("Click Here")
 
# browser.find_element_by_name("Home")

# browser.find_elements_by_id("id")
 
# browser.find_elements_by_css_selector("#id")
 
# browser.find_elements_by_link_text("Click Here")
 
# browser.find_elements_by_name("Home")

In [20]:
# Let PhantomJS render the page, then hand the resulting HTML to Beautiful Soup.
browser = webdriver.PhantomJS()
try:
    browser.get("https://www.python.org/")
    page = BeautifulSoup(browser.page_source, "html5lib")
finally:
    # The browser is only needed to obtain page_source; release it right away
    # (and even on failure). quit() also ends the PhantomJS process.
    browser.quit()

# Every anchor tag in the rendered document.
links = page.find_all("a")

for link in links:
    print(link)



<a href="#content" title="Skip to content">Skip to content</a>
<a aria-hidden="true" class="jump-link" href="#python-network" id="close-python-network">
                    <span aria-hidden="true" class="icon-arrow-down"><span>▼</span></span> Close
                </a>
<a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>
<a href="/psf-landing/" title="The Python Software Foundation">PSF</a>
<a href="https://docs.python.org" title="Python Documentation">Docs</a>
<a href="https://pypi.python.org/" title="Python Package Index">PyPI</a>
<a href="/jobs/" title="Python Job Board">Jobs</a>
<a href="/community/" title="Python Community">Community</a>
<a aria-hidden="true" class="jump-link" href="#top" id="python-network">
                    <span aria-hidden="true" class="icon-arrow-up"><span>▲</span></span> The Python Network
                </a>
<a href="/"><img alt="python™" class="python-logo" src="/static/img/python-logo.png

Get iframe content using Selenium

In [21]:
# Read the document that lives inside the first <iframe> of the MDN page.
browser = webdriver.PhantomJS()
try:
    browser.get("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/iframe")
    iframe = browser.find_element_by_tag_name("iframe")
    # Reset to the top-level document, then move the driver's context into
    # the iframe so page_source refers to the framed document.
    browser.switch_to.default_content()
    browser.switch_to.frame(iframe)
    iframe_source = browser.page_source
    print(iframe_source)
    print(browser.current_url)
finally:
    # Previously the browser was never closed; always release the driver.
    browser.quit()



<!DOCTYPE html><html lang="en"><head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <title>HTML Demo: &lt;iframe&gt;</title>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link href="../../css/codemirror-tabbed-5-31-0.css" rel="stylesheet">
    <link href="../../css/tabbed-editor.css" rel="stylesheet">
    <script>"use strict";function postToKuma(e){window.parent.postMessage(e,"https://developer.mozilla.org")}postToKuma({markName:"interactive-editor-loading"}),document.addEventListener("readystatechange",function(e){switch(e.target.readyState){case"interactive":postToKuma({markName:"interactive-editor-interactive",measureName:"ie-time-to-interactive",startMark:"interactive-editor-loading",endMark:"interactive-editor-interactive"});break;case"complete":postToKuma({markName:"interactive-editor-complete",measureName:"ie-time-to-complete",startMark:"interactive-editor-loading",endMark:"interactive-editor-complete"})}

Get iframe content using Beautiful Soup

In [23]:
# Without a browser we can only read the iframe's src attribute.
# The `with` block closes the HTTP response after the body has been read.
with urlopen("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/iframe") as html:
    res = BeautifulSoup(html.read(), 'html5lib')

# find() returns None when the page contains no <iframe>; guard before
# subscripting, consistent with the title lookup earlier in the notebook.
tag = res.find("iframe")
if tag is None:
    print('Tag not found')
else:
    print(tag['src'])

https://interactive-examples.mdn.mozilla.net/pages/tabbed/iframe.html


Handle Ajax calls using Selenium + PhantomJS

In [24]:
import time

In [30]:
# Start a PhantomJS session and load the Ajax demo page.
# The browser is deliberately left open: the following cell uses `browser`.
ajax_demo_url = "https://www.w3schools.com/xml/ajax_intro.asp"
browser = webdriver.PhantomJS()
browser.get(ajax_demo_url)



In [31]:
# Trigger the Ajax request, give it time to finish, then capture the result.
try:
    browser.find_element_by_tag_name("button").click()
    time.sleep(2)  # fixed pause: wait for the Ajax response to be rendered
    browser.get_screenshot_as_file("image.png")
finally:
    # Release the driver even if the click or screenshot fails; quit() also
    # shuts down the PhantomJS process, which close() alone does not.
    browser.quit()

In [32]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [33]:
# Same Ajax scenario, but with an explicit wait instead of a fixed sleep.
browser = webdriver.PhantomJS()
try:
    browser.get("https://resttesttest.com/")
    browser.find_element(By.ID, "submitajax").click()

    try:
        # Block for up to 10 seconds until the status element shows the text;
        # raises a timeout error otherwise.
        WebDriverWait(browser, 10).until(
            EC.text_to_be_present_in_element((By.ID, "statuspre"), "HTTP 200 OK")
        )
    finally:
        # Take the screenshot whether or not the wait succeeded.
        browser.get_screenshot_as_file("image.png")
finally:
    # Previously the close call sat after the try/finally and was skipped
    # whenever the wait timed out, leaking the driver. Always quit.
    browser.quit()



Handling & deleting cookies

In [35]:
# Load the site and print the cookies the browser session holds for it.
browser = webdriver.PhantomJS()
try:
    browser.get("https://likegeeks.com/")
    print(browser.get_cookies())
finally:
    # The session is not reused afterwards (the next cell starts its own),
    # so release the driver instead of leaking it.
    browser.quit()



[{'domain': 'likegeeks.com', 'expires': 'Tue, 10 Jul 2018 10:47:28 GMT', 'expiry': 1531219648, 'httponly': False, 'name': 'pll_language', 'path': '/', 'secure': False, 'value': 'en'}, {'domain': '.likegeeks.com', 'expires': 'Wed, 10 Jul 2019 02:01:52 GMT', 'expiry': 1562724112, 'httponly': True, 'name': '__cfduid', 'path': '/', 'secure': True, 'value': 'dde31ac03ad6957197c0c3b5a5830aefd1531188112'}]


In [36]:
# Load the site in a fresh session and clear every cookie of that session.
browser = webdriver.PhantomJS()
try:
    browser.get("https://likegeeks.com/")
    browser.delete_all_cookies()
finally:
    # Nothing uses this session afterwards; always release the driver.
    browser.quit()



Running headless Chrome with Selenium 

In [39]:
from selenium.webdriver.chrome.options import Options
import os

In [40]:
# Command-line switches for Chrome: run headless with a fixed window size.
chrome_options = Options()
for chrome_flag in ("--headless", "--window-size=1920x1080"):
    chrome_options.add_argument(chrome_flag)

In [50]:
# Start Chrome with the headless options configured above.
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://www.google.com")
    # Locate the "I'm Feeling Lucky" button by its name attribute.
    lucky_button = driver.find_element(By.CSS_SELECTOR, "[name=btnI]")
    lucky_button.click()

    # get_screenshot_as_file returns a boolean success flag.
    screenshot_saved = driver.get_screenshot_as_file("capture.png")
finally:
    # Always shut down Chrome and the ChromeDriver process.
    driver.quit()

# Bare last expression so the notebook still displays the result.
screenshot_saved

True