# Exploring the requests-html capabilities

In [None]:
# Official documentation of the requests-html package: https://requests-html.readthedocs.io/en/latest/

### Initial setup

In [1]:
# Loading the necessary packages
from requests_html import HTMLSession

In [2]:
# Open a session; it is reused for the requests below and closed once we are done with this page
session = HTMLSession()

In [3]:
# Submit a GET request
# The timeout (in seconds) prevents the cell from hanging forever if the server never answers
r = session.get("https://en.wikipedia.org/wiki/Association_football", timeout=30)
# Display the HTTP status code of the response (200 means the request succeeded)
r.status_code

200

In [4]:
# The HTML of the response is available through the '.html' attribute (an attribute, not a method call)
r.html

<HTML url='https://en.wikipedia.org/wiki/Association_football'>

### Links

In [5]:
# We can extract all link addresses directly with the '.links' attribute
# The addresses are returned as they are written in the page, so relative and absolute URLs are mixed
urls = r.html.links
urls

{'/wiki/File:Yellow_card.svg',
 '/wiki/Flag_football',
 'https://id.loc.gov/authorities/subjects/sh85123840',
 '/wiki/List_of_association_football_rivalries',
 '/wiki/Help:Authority_control',
 '/wiki/2005_in_association_football',
 'https://www.fifa.com/worldcup/news/y=2015/m=12/news=2014-fifa-world-cuptm-reached-3-2-billion-viewers-one-billion-watched--2745519.html',
 '/wiki/Shrewsbury_School',
 'https://wikimediafoundation.org/',
 '/wiki/Association_football#Players,_equipment,_and_officials',
 '/wiki/Mixed-sex_sports',
 'https://stq.wikipedia.org/wiki/Foutbal',
 'http://www.heraldscotland.com/sport/football/no-longer-the-game-of-two-halves.19185657',
 'https://web.archive.org/web/20050314003412/http://www.fifa.com/en/marketing/newmedia/index/0%2C3509%2C10%2C00.html',
 '/wiki/Biribol',
 '/wiki/1918_in_association_football',
 '/wiki/Association_football_headgear',
 'https://gan.wikipedia.org/wiki/%E8%85%B3%E7%90%83',
 '/wiki/Eight-man_football',
 '/wiki/Exhibition_game',
 'https://en.

In [6]:
# Note that many of those are relative URLs (e.g. '/wiki/Flag_football'): '.links' returns each address as it is written in the page

In [7]:
# To get every link as an absolute URL, use '.absolute_links' instead of '.links'
full_path_urls = r.html.absolute_links
full_path_urls

{'https://en.wikipedia.org/wiki/List_of_men%27s_national_association_football_teams',
 'https://id.loc.gov/authorities/subjects/sh85123840',
 'https://en.wikipedia.org/wiki/Footbag_net',
 'https://www.fifa.com/worldcup/news/y=2015/m=12/news=2014-fifa-world-cuptm-reached-3-2-billion-viewers-one-billion-watched--2745519.html',
 'https://en.wikipedia.org/wiki/File:Ousmane_Demb%C3%A9l%C3%A9_World_Cup_Trophy.jpg',
 'https://en.wikipedia.org/wiki/Boules',
 'https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_country',
 'https://en.wikipedia.org/wiki/1932_in_association_football',
 'https://wikimediafoundation.org/',
 'https://en.wikipedia.org/wiki/Wikipedia:Featured_articles',
 'https://en.wikipedia.org/wiki/Throwback_uniform',
 'https://en.wikipedia.org/wiki/Template_talk:Team_sports',
 'https://en.wikipedia.org/wiki/Rabona',
 'https://stq.wikipedia.org/wiki/Foutbal',
 'http://www.heraldscotland.com/sport/football/no-longer-the-game-of-two-halves.19185657',
 'https://foun

In [8]:
# An important thing to note is that these links (from both '.links' and '.absolute_links') are returned in a SET, not a LIST
# A set holds each address only once and has no defined order
type(urls)

set

## Searching for elements

In [9]:
# A quick note: requests-html uses CSS selectors for searching
# We will cover them in the next section,
# but here is a more thorough look into it: https://www.w3schools.com/cssref/css_selectors.asp

In [10]:
# We can search for elements similarly to Beautiful Soup using the find() method
# By default it returns a list of ALL matching elements, i.e. it behaves like Beautiful Soup's find_all()

# find all 'a' tags
links = r.html.find("a")
links

[<Element 'a' id='top'>,
 <Element 'a' href='/wiki/Wikipedia:Featured_articles' title='This is a featured article. Click here for more information.'>,
 <Element 'a' href='/wiki/Wikipedia:Protection_policy#semi' title='This article is semi-protected.'>,
 <Element 'a' href='/wiki/File:Football_(soccer)_Part_One.ogg' title='Listen to this article'>,
 <Element 'a' class=('mw-jump-link',) href='#mw-head'>,
 <Element 'a' class=('mw-jump-link',) href='#p-search'>,
 <Element 'a' class=('mw-disambig',) href='/wiki/Soccer_(disambiguation)' title='Soccer (disambiguation)'>,
 <Element 'a' href='/wiki/Football' title='Football'>,
 <Element 'a' class=('image',) href='/wiki/File:Ronaldinho_and_Khedira.jpg'>,
 <Element 'a' href='/wiki/Ronaldinho' title='Ronaldinho'>,
 <Element 'a' href='/wiki/UEFA_Champions_League' title='UEFA Champions League'>,
 <Element 'a' href='/wiki/A.C._Milan' title='A.C. Milan'>,
 <Element 'a' class=('mw-redirect',) href='/wiki/Real_Madrid_C.F.' title='Real Madrid C.F.'>,
 <El

In [11]:
# The result is a regular list, so a single element can be picked by index
links[4]

<Element 'a' class=('mw-jump-link',) href='#mw-head'>

In [12]:
# To get the raw HTML of an element use its '.html' attribute
links[4].html

'<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>'

In [13]:
# The raw HTML of an element is a plain string
type(links[4].html)

str

In [14]:
# To extract only the text inside an element (without the surrounding tag), use '.text', just like in Beautiful Soup
links[4].text

'Jump to navigation'

In [15]:
# To obtain a dictionary of the element's attributes, use '.attrs' (exactly as in Beautiful Soup)
# The keys are the attribute names and the values are the attribute values
links[10].attrs

{'href': '/wiki/UEFA_Champions_League', 'title': 'UEFA Champions League'}

In [16]:
# This package offers a couple of ways to filter tags based on their text

# Keep only those tags that contain the string 'wikipedia' in their text (not in the 'href' attribute)
# Note: the match is not case-sensitive
r.html.find("a", containing = "wikipedia")

[<Element 'a' href='//en.wikipedia.org/wiki/Wikipedia:Contact_us'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Wikipedia:About'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Find out about Wikipedia'>,
 <Element 'a' href='//shop.wikimedia.org' title='Visit the Wikipedia store'>,
 <Element 'a' href='https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en' title='Support us'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NDL_identifiers' title='Category:Wikipedia articles with NDL identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NARA_identifiers' title='Category:Wikipedia articles with NARA identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_LCCN_identifiers' title='Category:Wikipedia articles with LCCN identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_HDS_identifiers' title='Category:Wikipedia articles 

In [17]:
# Display the text of those tags to confirm that each one mentions 'wikipedia'
[tag.text for tag in r.html.find("a", containing = "wikipedia")]

['Contact Wikipedia',
 'About Wikipedia',
 'About Wikipedia',
 'Wikipedia store',
 'Donate to Wikipedia',
 'Wikipedia articles with NDL identifiers',
 'Wikipedia articles with NARA identifiers',
 'Wikipedia articles with LCCN identifiers',
 'Wikipedia articles with HDS identifiers',
 'Wikipedia articles with GND identifiers',
 'Wikipedia articles with BNF identifiers',
 'Wikipedia indefinitely semi-protected pages',
 'https://en.wikipedia.org/w/index.php?title=Association_football&oldid=934524737']

In [18]:
# To get only the first matching element (similarly to Beautiful Soup's .find()), set the 'first' parameter to True
# The result is then a single element instead of a list
r.html.find("p", first = True)

<Element 'p' class=('mw-empty-elt',)>

### Searching for text

In [19]:
# The package also offers searching text based on the parse library
# The search() method can be thought of as the opposite of str.format():
# it finds the text instead of inserting it in the specified place

# For further details see https://pypi.org/project/parse/ 
# and https://docs.python.org/3/library/string.html#format-string-syntax

In [20]:
# search() looks for text that matches the template
# Each '{}' is a placeholder that captures whatever text is found in its place
r.html.search("known{}soccer")

<Result (' as football field, football ground, ',) {}>

In [21]:
# To access the captured text, index the result (index 0 corresponds to the first '{}' placeholder)
r.html.search("known{}soccer")[0]

' as football field, football ground, '

In [22]:
# search() returns only the FIRST match found in the HTML
# To get every match use search_all(), which returns a list of results
# Note that the search runs on the raw HTML, so the captured text may contain tags
r.html.search_all("known{}soccer")

[<Result (' as football field, football ground, ',) {}>,
 <Result (' as <b>football</b> or <b>',) {}>,
 <Result (' as the <a href="/wiki/Laws_of_the_Game_(association_football)" title="Laws of the Game (association football)">Laws of the Game</a>. The game is played using a spherical ball of 68–70&#160;cm (27–28&#160;in) circumference, known as the <i><a href="/wiki/Ball_(association_football)" title="Ball (association football)">football</a></i>. Two teams of eleven players each compete to get the ball into the other team\'s goal (between the posts and under the bar), thereby scoring a goal. The team that has scored more goals at the end of the game is the winner; if both teams have scored an equal number of goals then the game is a draw. Each team is led by a <a href="/wiki/Captain_(association_football)" title="Captain (association football)">captain</a> who has only one official responsibility as mandated by the Laws of the Game: to represent their team in the coin toss prior to ki

In [23]:
# Count how many matches search_all() found
len(r.html.search_all("known{}soccer"))

19

In [24]:
# Further details at:
# -- https://pypi.org/project/parse/
# -- https://docs.python.org/3/library/string.html#format-string-syntax

# CSS selectors

In [25]:
# CSS selectors are a notation for selecting (filtering) different HTML elements (aka tags)
# The name stems from the styling language CSS - 
# in order for a style to be applied, you first need a way to specify (or 'select') the element the style will be applied to

In [26]:
# You can find a complete CSS selectors reference at: https://www.w3schools.com/cssref/css_selectors.asp
# Let's showcase some CSS selectors below, with examples from the same wiki page

### Select elements based on tag name

In [27]:
# To select by tag name, simply pass the tag name as the selector

# Select all 'span' tags
r.html.find("span")

[<Element 'span' id='Etymology'>,
 <Element 'span' id='Names'>,
 <Element 'span' class=('toctogglespan',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber

In [28]:
# Another example of selecting by tag name: select all 'div' tags
r.html.find("div")

[<Element 'div' class=('noprint',) id='mw-page-base'>,
 <Element 'div' class=('noprint',) id='mw-head-base'>,
 <Element 'div' class=('mw-body',) id='content' role='main'>,
 <Element 'div' class=('mw-body-content',) id='siteNotice'>,
 <Element 'div' class=('mw-indicators', 'mw-body-content')>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-featured-star'>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-pp-default'>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-spoken-icon'>,
 <Element 'div' class=('mw-body-content',) id='bodyContent'>,
 <Element 'div' class=('noprint',) id='siteSub'>,
 <Element 'div' id='contentSub'>,
 <Element 'div' id='jump-to-nav'>,
 <Element 'div' class=('mw-content-ltr',) dir='ltr' id='mw-content-text' lang='en'>,
 <Element 'div' class=('mw-parser-output',)>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=(

In [29]:
# That was the same as in Beautiful Soup
# However, CSS selectors now diverge a bit in style

### Select elements based upon ID   

In [30]:
# Remember that the 'id' attribute is unique - no two elements may have the same 'id'
# Thus, filtering by 'id' will return either one or no results

In [31]:
# To search by id use '#' (hash sign) followed by the id value

In [32]:
# Select the tag with id='Name'
# find() still returns a list, even though at most one element can match
r.html.find("#Name")

[<Element 'span' class=('mw-headline',) id='Name'>]

In [33]:
# Note that the id value is case-sensitive: '#name' does not match id='Name', so nothing is found
r.html.find("#name")

[]

In [34]:
# Another example
# Since we know that at most one element will be returned, we can get the element itself (instead of a list) by setting 'first' to True
r.html.find("#Duration_and_tie-breaking_methods", first=True)

<Element 'span' class=('mw-headline',) id='Duration_and_tie-breaking_methods'>

### Selecting by class

In [35]:
# We can filter by class name with '.' (a dot)
# .class_name_here

In [36]:
# Select all tags whose class is 'mw-headline', regardless of the tag name
r.html.find(".mw-headline")

[<Element 'span' class=('mw-headline',) id='Name'>,
 <Element 'span' class=('mw-headline',) id='History'>,
 <Element 'span' class=('mw-headline',) id="Women's_association_football">,
 <Element 'span' class=('mw-headline',) id="Early_women's_football">,
 <Element 'span' class=('mw-headline',) id='20th_and_21st_century'>,
 <Element 'span' class=('mw-headline',) id='Gameplay'>,
 <Element 'span' class=('mw-headline',) id='Laws'>,
 <Element 'span' class=('mw-headline',) id='Players,_equipment,_and_officials'>,
 <Element 'span' class=('mw-headline',) id='Ball'>,
 <Element 'span' class=('mw-headline',) id='Pitch'>,
 <Element 'span' class=('mw-headline',) id='Duration_and_tie-breaking_methods'>,
 <Element 'span' class=('mw-headline',) id='90-minute_ordinary_time'>,
 <Element 'span' class=('mw-headline',) id='Tie-breaking'>,
 <Element 'span' class=('mw-headline',) id='Ball_in_and_out_of_play'>,
 <Element 'span' class=('mw-headline',) id='Misconduct'>,
 <Element 'span' class=('mw-headline',) id=

In [37]:
# Select all tags that have 'metadata' among their classes (a tag may have other classes as well)
r.html.find(".metadata")

[<Element 'table' class=('metadata', 'mbox-small') role='presentation' style='background-color:#f9f9f9;border:1px solid #aaa;color:#000'>,
 <Element 'div' aria-labelledby='sister-projects' class=('metadata', 'plainlinks', 'sistersitebox', 'plainlist', 'mbox-small') role='navigation' style='border:1px solid #aaa; padding:0; background:#f9f9f9;'>]

In [38]:
# We can stack different CSS selectors one after the other

In [39]:
# For instance, we can search for elements that have BOTH class values
r.html.find(".metadata.plainlinks")  # Note there is no space between the two selectors

[<Element 'div' aria-labelledby='sister-projects' class=('metadata', 'plainlinks', 'sistersitebox', 'plainlist', 'mbox-small') role='navigation' style='border:1px solid #aaa; padding:0; background:#f9f9f9;'>]

### Selecting based on other attributes

In [40]:
# If we want to search for tags with attributes beside 'class' and 'id' we should use this notation:

# [attribute] -- selects all tags that have defined the attribute
# [attribute=value] -- selects all tags with that particular value of the attribute
# [attribute*=value] -- attribute contains the SUBSTRING 'value'
# [attribute~=value] -- attribute contains the WORD 'value'
# [attribute|=value] -- attribute starts with 'value', followed with a dash '-', or is 'value' itself
# [attribute^=value] -- attribute begins with 'value'
# [attribute$=value] -- attribute ends with 'value'

In [41]:
# Select all tags that have a 'target' attribute, whatever its value
r.html.find("[target]")

[<Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>]

In [42]:
# Select all tags whose 'role' attribute is set exactly to 'note'
r.html.find("[role=note]")

[<Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,

In [43]:
# Select all tags that contain the string 'wikipedia' in their 'href' attribute
r.html.find("[href*=wikipedia]")

[<Element 'link' href='android-app://org.wikipedia/http/en.m.wikipedia.org/wiki/Association_football' rel=('alternate',)>,
 <Element 'link' href='/static/apple-touch/wikipedia.png' rel=('apple-touch-icon',)>,
 <Element 'link' href='/static/favicon/wikipedia.ico' rel=('shortcut', 'icon')>,
 <Element 'link' href='//en.wikipedia.org/w/api.php?action=rsd' rel=('EditURI',) type='application/rsd+xml'>,
 <Element 'link' href='https://en.wikipedia.org/wiki/Association_football' rel=('canonical',)>,
 <Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/a/a1/Football_%28soccer%29_Part_One.ogg' title='Football (soccer) Part One.ogg'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/c/cb/Football_%28soccer%29_Part_Two.ogg' title='Football (soccer) Part Two.ogg'>,
 <Element 'a' class=('external', 

In [44]:
# Note that this technique works for 'class' and 'id', as well

### Combining different filters together into a compound selector

In [45]:
# We can stack all the different selectors we looked at up until now for a more precise filtering

In [46]:
# Looking at the last example, we see that there are 5 'link' elements selected alongside the 'a' tags
# We can keep only the 'a' tags by using a compound selector

# We have a selector that filters 'a' tags and a selector that filters tags with 'href' containing 'wikipedia'
# By combining those we can select only the 'a' tags containing 'wikipedia' in their 'href' attribute
r.html.find("a[href*=wikipedia]")

[<Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/a/a1/Football_%28soccer%29_Part_One.ogg' title='Football (soccer) Part One.ogg'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/c/cb/Football_%28soccer%29_Part_Two.ogg' title='Football (soccer) Part Two.ogg'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_football&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:International_football&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_football_laws&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_footba

In [47]:
# Combine a tag name with a class: select all 'a' tags with class 'internal'
r.html.find("a.internal")

[<Element 'a' class=('internal',) href='/wiki/File:AstonVilla1896-97.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Mia1997.JPG' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Women%27s_football_match_Menai_Bridge_against_Penrhos_(24622680915).jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:U20-WorldCup2007-Okotie-Onka_edit2.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Slidetackle.JPG' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Howard_Webb3.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Football_pitch_metric.svg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Didier_Drogba_Manuel_Neuer_last_penalty_kick_Champions_League_Final_2012.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Shunsuke1_20080622.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:FIFA-Headquarte

In [48]:
# Select all 'div' tags that have both classes 'thumb' and 'tright' (additional classes are allowed)
r.html.find("div.thumb.tright")

[<Element 'div' class=('thumb', 'tmulti', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tmulti', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>]

In [49]:
# Select all 'div' notes again, but this time filter by the class as well
r.html.find("div[role=note][class='hatnote navigation-not-searchable']")

# Some important notes:
# - when using the [] syntax for class with '=', instead of '.', you need to specify the whole value, i.e. all classes
# - when an attribute value contains a space, you need to enclose it in quotes

[<Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,

### Incorporating tag hierarchy

In [50]:
# Sometimes we may want to search for an element nested inside another tag
# We can achieve this with a space

In [51]:
# Select all 'span' tags that are nested inside an 'h2' tag
r.html.find("h2 span")

[<Element 'span' class=('mw-headline',) id='Name'>,
 <Element 'span' class=('mw-headline',) id='History'>,
 <Element 'span' class=('mw-headline',) id='Gameplay'>,
 <Element 'span' class=('mw-headline',) id='Laws'>,
 <Element 'span' class=('mw-headline',) id='Governing_bodies'>,
 <Element 'span' class=('mw-headline',) id='International_competitions'>,
 <Element 'span' class=('mw-headline',) id='Domestic_competitions'>,
 <Element 'span' class=('mw-headline',) id='Professionalism'>,
 <Element 'span' class=('mw-headline',) id='Hooliganism'>,
 <Element 'span' class=('mw-headline',) id='Variants_and_casual_play'>,
 <Element 'span' class=('mw-headline',) id='See_also'>,
 <Element 'span' class=('mw-headline',) id='Notes'>,
 <Element 'span' class=('mw-headline',) id='References'>,
 <Element 'span' class=('mw-headline',) id='External_links'>]

In [52]:
# If we use '>' instead of a space, the first tag must be the DIRECT parent of the second one

# Select only paragraphs that are directly contained in a 'div' (their immediate parent is a 'div')
r.html.find("div > p")

[<Element 'p' class=('mw-empty-elt',)>,
 <Element 'p' class=('mw-empty-elt',)>,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <

In [53]:
# When we are done with scraping the page, it is a good idea to close the session object we opened at the beginning
# If not, and we open new sessions, a lot of background processes may draw memory and processor resources

In [54]:
# Close the session object now that we are done with this page
session.close()

# Scraping data generated by JavaScript

In [38]:
# When coding in Jupyter and Spyder, we need to use the class AsyncHTMLSession to make JavaScript work
# In other environments you can use the normal HTMLSession
from requests_html import AsyncHTMLSession

In [39]:
# Establish a new asynchronous session (the name 'session' is reused for it)
session = AsyncHTMLSession()

# The only difference we will experience between the regular HTMLSession and the asynchronous one
# is the need to write the keyword 'await' in front of some statements

In [40]:
# In this example we're going to use Nike's homepage: https://www.nike.com/
# Several of the links on this page, as well as other elements, are generated by JavaScript
# We will compare the result of scraping those before and after running the JavaScript code

In [41]:
# Since we used async session, we need to use the keyword 'await'
# If you use the regular HTMLSession, there is no need for 'await'
r = await session.get("https://www.nike.com/")
r.status_code

200

In [42]:
# So far, nothing different from our previous example has happened
# The JavaScript code has not yet been executed

In [43]:
# Collect some tags BEFORE rendering the JavaScript code, i.e. extracted from the raw HTML
# One find() call is made per tag name and the results are unpacked in the same order
divs, p, list_items, links = (r.html.find(tag_name) for tag_name in ("div", "p", "li", "a"))
urls = r.html.absolute_links

In [44]:
# Now, we need to execute the JavaScript code that will generate additional tags

In [45]:
# The requests-html package provides a very simple interface for that - just use the 'render()' method
# ('arender()' when using an async session)
# It runs the JavaScript code and updates the HTML, which may take a while
# The updated HTML replaces the content of 'r.html' - there is no need to assign the result to a new variable
# As before, the 'await' keyword is needed only because of the async session
await r.html.arender()

In [46]:
# NOTE: The first time you run 'a/render()' Chromium will be downloaded and installed on your computer

In [47]:
# The HTML has now been updated, so run the same searches again and store the results under new names
# One find() call is made per tag name and the results are unpacked in the same order
new_divs, new_p, new_li, new_links = (r.html.find(tag_name) for tag_name in ("div", "p", "li", "a"))
new_urls = r.html.absolute_links

In [48]:
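# We can see the difference in the number of found elements before and after the JavaScript executed
# NOTE: rendering is expected to ADD elements; if the counts after rendering are lower (or even zero),
# the rendered page probably did not load properly (for example, the site may have blocked the headless browser)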

In [49]:
# Number of 'div' tags: (before rendering, after rendering)
len(divs), len(new_divs)

(442, 0)

In [50]:
# Number of 'p' tags: (before rendering, after rendering)
len(p), len(new_p)

(115, 1)

In [51]:
# Number of 'li' tags: (before rendering, after rendering)
len(list_items), len(new_li)

(113, 0)

In [52]:
# Number of 'a' tags: (before rendering, after rendering)
len(links), len(new_links)

(410, 0)

In [53]:
# Number of distinct absolute URLs: (before rendering, after rendering)
len(urls), len(new_urls)

(343, 0)

In [54]:
# Remember that 'urls' is a set, and not a list?
# Well, there is a useful feature of sets that we will now take advantage of
# It takes two sets and selects only those items from the first set that are not present in the second one

In [55]:
# Keep only the URLs that are present after rendering but were not present before it
new_urls.difference(urls)

set()

In [56]:
# Finally, close the session
session.close()

<coroutine object AsyncHTMLSession.close at 0x00000222CC0F29C8>

In [57]:
# You can check the documentation directly inside Jupyter by printing a method's docstring
print(r.html.render.__doc__)

Reloads the response in Chromium, and replaces HTML content
        with an updated version, with JavaScript executed.

        :param retries: The number of times to retry loading the page in Chromium.
        :param script: JavaScript to execute upon page load (optional).
        :param wait: The number of seconds to wait before loading the page, preventing timeouts (optional).
        :param scrolldown: Integer, if provided, of how many times to page down.
        :param sleep: Integer, if provided, of how many long to sleep after initial render.
        :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
        :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``.

        If ``scrolldown`` is specified, the page will scrolldown the specified
        number of times, after sleeping the specified amount of time
        (e.g. ``scrolldown=10, sleep=1``).

        If just ``sleep