In [4]:
# import BeautifulSoup and SoupStrainer from bs4, 
#as seen in the following code:
from bs4 import BeautifulSoup
from bs4 import SoupStrainer #,BeautifulSoup

In [5]:
# Use the HTML as shown in the following snippet or html_doc as a sample to
# explore some of the fundamental features of Beautiful Soup. 
# The response obtained for any chosen URL, using requests or urllib, 
# can also be used for content 
# in real scraping cases:
html_doc="""<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and
their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<h1>Secret agents</h1>
<ul>
 <li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>
 <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>
 <li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>
</ul>
</body>
</html>"""

In [6]:
# Create the soupA object using lxml as a parser, along with the SoupStrainer object 
# tagsA (parsing only <a>). 
# We can obtain partial content to parse using SoupStrainer. 
# soupA, an object of Beautiful Soup, presents all of the <a> elements found for
# the SoupStrainer object tagsA as used in the following code:
tagsA = SoupStrainer("a")
soupA = BeautifulSoup(html_doc,'lxml',parse_only=tagsA)
print(soupA)
print("-"*45)
print(type(soupA))

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a><a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
---------------------------------------------
<class 'bs4.BeautifulSoup'>


In [7]:
# prettify() function returns a Unicode string, 
# presents the string in a clean, formatted structure.
# the prettify() function also accepts the parameter encoding:
print(soupA.prettify())

<a class="sister" href="http://example.com/elsie" id="link1">
 Elsie
</a>
<a class="sister" href="http://example.com/lacie" id="link2">
 Lacie
</a>
<a class="sister" href="http://example.com/tillie" id="link3">
 Tillie
</a>


In [8]:
# use the has_attr() function to confirm the attribute keys by name, 
# if it exists inside the parsed document as follows:
print(soupA.a.has_attr('class'))
print(40*"--")
print(soupA.a.has_attr('name'))


True
--------------------------------------------------------------------------------
False


# Searching, traversing, and iterating

In [9]:
# implement the find() function with different, allowed parameters in the code:
print(soupA.find("a")) #print(soupA.find(name="a"))
print(40*"--")
print(soupA.find("a",attrs={'class':'sister'}))
print(40*"--")
print(soupA.find("a",attrs={'class':'sister'},text="Lacie"))
print(40*"--")
print(soupA.find("a",attrs={'id':'link3'}))
print(40*"--")
print(soupA.find('a',id="link2"))
print(40*"--")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
--------------------------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
--------------------------------------------------------------------------------
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
--------------------------------------------------------------------------------
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
--------------------------------------------------------------------------------
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
--------------------------------------------------------------------------------


In [10]:
# The find_all() is similar to the find() function with the
# additional attrs and text as a parameters and returns a list of matched (multiple)
# elements for the provided criteria or name attribute as follows: 
# find all <a> can also be written as #print(soupA.find_all(name="a"))
print(soupA.find_all("a"))
print(40*"--")
#find all <a>, but return only 1 of them
print(soupA.find_all("a",limit=1)) #attrs, text
print(40*"--")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
--------------------------------------------------------------------------------
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
--------------------------------------------------------------------------------


In [11]:
# The string, list of strings, regular expression objects, or any of these, 
# can be provided to the name and text attributes as a value for attrs parameters
import re
print(soupA.find("a",text=re.compile(r'cie'))) #import re
print(40*"--")
print(soupA.find_all("a",attrs={'id':re.compile(r'3')}))
print(40*"--")
print(soupA.find_all(re.compile(r'a')))
print(40*"--")

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
--------------------------------------------------------------------------------
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
--------------------------------------------------------------------------------
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
--------------------------------------------------------------------------------


In [12]:
# The find_all() function has in-built support for global attributes 
# such as class name along with a name
soup = BeautifulSoup(html_doc,'lxml')
print(soup.find_all("p","story")) #class=story
print(40*"--")
print(soup.find_all("p","title"))
#soup.find_all("p",attrs={'class':"title"})
print(40*"--")

[<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
--------------------------------------------------------------------------------
[<p class="title"><b>The Dormouse's story</b></p>]
--------------------------------------------------------------------------------


In [13]:
# Multiple name and attrs values can also be passed through a list 
# as shown in the following syntax:
# The preceding syntax can be observed in the following code:
print(soup.find_all("p",attrs={'class':["title","story"]}))
print(40*"--")
print(soup.find_all(["p","li"]))
print(40*"--")

[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
--------------------------------------------------------------------------------
[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>, <li data-id="10784">Jason Walters, 003: Found dead in "A 

In [14]:
# Use element text to search and list the content. A string parameter, similar to
# a text parameter, is used for such cases; it can also be used with, or without, 
# any tag names as in the following code:
print(soup.find_all(string="Elsie")) #text="Elsie"
print(45*"--")
print(soup.find_all(text=re.compile(r'Elsie'))) #import re
print(45*"--")
print(soup.find_all("a",string="Lacie")) #text="Lacie"

['Elsie']
------------------------------------------------------------------------------------------
['Elsie']
------------------------------------------------------------------------------------------
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


In [15]:
# using the find_all() function to achieve iteration through elements. 
# The source code below is used to retrieve all of the <li> elements 
# found inside the <ul> element and printing their tag name, 
# attribute data, ID, and text:
for li in soup.ul.find_all('li'):
    print(li.name, ' > ',li.get('data-id'),' > ', li.text)

li  >  10784  >  Jason Walters, 003: Found dead in "A View to a
Kill".
li  >  97865  >  Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".
li  >  45732  >  James Bond, 007: The main man; shaken but not
stirred.


In [16]:
# Element traversing can also be done with just a tag name, 
# and with, or without, using the find() or find_all() functions 
print(soupA.a) #tag a
print(45*"--")
print(soup.li) #tag li
print(45*"--")
print(soup.p)
print(45*"--")
print(soup.p.b) #tag p and b
print(45*"--")
print(soup.ul.find('li',attrs={'data-id':'45732'}))
print(45*"--")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
------------------------------------------------------------------------------------------
<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>
------------------------------------------------------------------------------------------
<p class="title"><b>The Dormouse's story</b></p>
------------------------------------------------------------------------------------------
<b>The Dormouse's story</b>
------------------------------------------------------------------------------------------
<li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>
------------------------------------------------------------------------------------------


In [17]:
# The text and string attributes or the get_text() method can be used with the elements
# to extract their text while traversing through the elements used in the following code.
# There's also a parameter text and string in the find() or find_all() functions, which
# are used to search the content
print(soup.ul.find('li',attrs={'data-id':'45732'}).text)
print(45*"--")
print(soup.p.text) #get_text()
print(45*"--")
print(soup.li.text)
print(45*"--")
print(soup.p.string)
print(45*"--")

James Bond, 007: The main man; shaken but not
stirred.
------------------------------------------------------------------------------------------
The Dormouse's story
------------------------------------------------------------------------------------------
Jason Walters, 003: Found dead in "A View to a
Kill".
------------------------------------------------------------------------------------------
The Dormouse's story
------------------------------------------------------------------------------------------


# Using children and parents

In [18]:
html_doc="""<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and
their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<h1>Secret agents</h1>
<ul>
 <li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>
 <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>
 <li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>
</ul>
</body>
</html>"""
soup = BeautifulSoup(html_doc,'lxml')

In [43]:
# The preceding list showed the features that can also be used for iteration. 
print(list(soup.find('p','story').children))
print(45*"--")
print(list(soup.find('p','story').contents))
print(45*"--")
print(list(soup.find('p','story').descendants))


['Once upon a time there were three little sisters; and\ntheir names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
------------------------------------------------------------------------------------------
['Once upon a time there were three little sisters; and\ntheir names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
------------------------------------------------------------------------------------------
['Once upon a time there were three little sisters; and\ntheir names were\n', <a class="sister" href

In [44]:
# Selected children and descendants tag names can be obtained using the name attribute.
# Parsed strings and the \n function (newline) are returned as None
# using List Comprehension Technique
print([a.name for a in soup.find('p','story').children])
print(45*"--")
print([a.name for a in soup.find('p','story').descendants])
print(45*"--")
print(list(filter(None,[a.name for a in soup.find('p','story').descendants])))


[None, 'a', None, 'a', None, 'a', None]
------------------------------------------------------------------------------------------
[None, 'a', None, None, 'a', None, None, 'a', None, None]
------------------------------------------------------------------------------------------
['a', 'a', 'a']


In [45]:
# Similar to the find() and find_all() functions, we can also traverse child elements
# using the findChild() and findChildren() functions. The findChild() function is
# used to retrieve the single child and the findChildren() function retrieves a list of
# children
print(soup.find('p','story').findChildren())
print(45*"--")
print(soup.find('p','story').findChild()) #soup.find('p','story').find()

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
------------------------------------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [46]:
# Similar to the children element, the parent element returns the parent object found 
# for the searched criteria. The main difference here is that the parent element returns 
# the single parent object from the tree.
#print parent element of <a> with class=sister
print(soup.find('a','sister').parent)
print(45*"--")
#print parent element name of <a> with class=sister
print(soup.find('a','sister').parent.name)
print(45*"--")
#print text from parent element of <a> with class=sister
print(soup.find('a','sister').parent.text)

<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
------------------------------------------------------------------------------------------
p
------------------------------------------------------------------------------------------
Once upon a time there were three little sisters; and
their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.


In [48]:
# The limitation of the single parents returned can be overcome by using
# the parents element; this returns multiple existing parent elements and 
# matches the searched criteria provided in the find() function.
for element in soup.find('a','sister').parents:
    print(element.name)

p
body
html
[document]


In [53]:
# Similar to the functions that exist for child traversing, parents can also be 
# traversed and retrieved using the findParent() and findParents() search functions.
# The findParent() function traverses to the immediate parent, while the findParents() 
# function returns all parents found for the criteria provided.
# It must also be noted that the children and parent traversing functions are used with
# the find() function where necessary arguments and conditions are provided

#find single Parent for selected <a> with class=sister
print(soup.find('a','sister').findParent())
print(45*"--")
#find Parents for selected <a> with class=sister
print(soup.find('a','sister').findParents())

<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
------------------------------------------------------------------------------------------
[<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="lin

# Use next and previous

In [54]:
# The properties next and next_element return the immediately parsed content 
# for the selected criteria. We can also append the next and next_element 
# functions to create a chain of code for traversal.
print(soup.find('p','story').next)
print(45*"--")
print(soup.find('p','story').next.next)
print(45*"--")
print(soup.find('p','story').next_element)
print(45*"--")
print(soup.find('p','story').next_element.next_element)
print(45*"--")
print(soup.find('p','story').next_element.next_element.next_element)

Once upon a time there were three little sisters; and
their names were

------------------------------------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
------------------------------------------------------------------------------------------
Once upon a time there were three little sisters; and
their names were

------------------------------------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
------------------------------------------------------------------------------------------
Elsie


In [57]:
# Similar to the next and next_elements functions, there also exist properties with
# traversal result that returns results from prior or previous parsed elements, such as
# the previous and previous_element, which are opposite to work reversely when compared 
# to the next and next_element functions.
# As seen in the following code, the previous and previous_element can also be
# appended to themselves to create a traversal series:

print(soup.find('p','story').previous) #returns empty or new-line.
print(soup.find('p','title').next.next.next) #returns empty or newline
print(45*"-")
print(soup.find('p','story').previous.previous)
print(45*"-")
print(soup.find('p','story').previous_element) #returns empty or new-line.
print(soup.find('p','story').previous_element.previous_element)
print(45*"-")
print(soup.find('p','story').previous_element.previous_element.previous_element)






---------------------------------------------
The Dormouse's story
---------------------------------------------


The Dormouse's story
---------------------------------------------
<b>The Dormouse's story</b>


In [58]:
# We now combine the next or next_element and previous or
# previous_element elements together to traverse
print(soup.find('p','title').next.next.previous.previous)

<p class="title"><b>The Dormouse's story</b></p>


In [59]:
# Iterating features for the next_element and previous_element are obtained using
# the next_elements and previous_elements, respectively. These iterators are used to
# move to the next or previous parsed content 
for element in soup.find('ul').next_elements:
 print(element)
print(40*"--")



<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>
Jason Walters, 003: Found dead in "A View to a
Kill".


<li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>
Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".


<li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>
James Bond, 007: The main man; shaken but not
stirred.






--------------------------------------------------------------------------------


In [60]:
# The find_next() function implements the next_elements but returns only a single
# element that is found after the next or next_element element. There's also an advantage
# of using the find_next() function over the next_elements as we can implement
# additional search logic for elements.
# The following code demonstrates the use of the find_next() function, with, and without,
# search conditions; it also displays the outputs from the next element and
# next_elements to compare the actual usage.
print(soup.find('p','story').next)
print(40*"--")
print(soup.find('p','story').next_element)
print(40*"--")
print(soup.find('p','story').find_next()) #element after next_element
print(40*"--")



Once upon a time there were three little sisters; and
their names were

--------------------------------------------------------------------------------
Once upon a time there were three little sisters; and
their names were

--------------------------------------------------------------------------------
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
--------------------------------------------------------------------------------


In [61]:
# The find_all_next() function works in a similar way to the find_next() function, but
# returns all of the next elements. It's also used as an iterating version of
# the find_next() function. Additional search criteria and arguments such as limit 
# can be used to search and control the results returned
print(soup.find('p','story').find_all_next())
print(40*"--")
print(soup.find('p','story').find_all_next('li',limit=2))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, <p class="story">...</p>, <h1>Secret agents</h1>, <ul>
<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>
<li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>
<li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>
</ul>, <li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>, <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>, <li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>]
--------------------------------------------------------------------------------
[<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>, <li data-id="97865">

In [62]:
# The find_previous() function implements previous_elements but returns only the
# single element that was found before the previous or previous_element. 
# It also has an advantage over the previous_elements as we can implement additional 
# search logic for elements.
print(soup.find('ul').previous.previous.previous)
print(40*"--")
print(soup.find('ul').find_previous())
print(40*"--")
print(soup.find('ul').find_previous('p','title'))

<h1>Secret agents</h1>
--------------------------------------------------------------------------------
<h1>Secret agents</h1>
--------------------------------------------------------------------------------
<p class="title"><b>The Dormouse's story</b></p>


In [63]:
# The find_all_previous() function is an iterated version of the find_previous(); it
# returns all previous elements satisfied with the available criteria as seen in the 
# following code:
print(soup.find('ul').find_all_previous('p'))
print(40*"--")

[<p class="story">...</p>, <p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="title"><b>The Dormouse's story</b></p>]
--------------------------------------------------------------------------------


In [67]:
# next_sibling and previous_sibling are yet another way of traversing along the
# parsed tree looking for next and previous siblings. 
# A sibling or siblings are termed to the element that appears or is found 
# on the same level, in the parsed tree or those elements that share the same parent.
print(soup.find('p','title').next_sibling) #returns empty or new-line
print(soup.find('p','title').next_sibling.next_sibling)
#print(soup.find('p','title').next_sibling.next)
print(40*"--")
print(soup.find('ul').previous_sibling) #returns empty or new-line
print(soup.find('ul').previous_sibling.previous_sibling)



<p class="story">Once upon a time there were three little sisters; and
their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
--------------------------------------------------------------------------------


<h1>Secret agents</h1>


In [66]:
# Iteration is also possible with siblings, 
# using the next_siblings and previous_siblings elements
#using List Comprehension
title = [ele.name for ele in soup.find('p','title').next_siblings]
print(list(filter(None,title)))
print(40*"--")
ul = [ele.name for ele in soup.find('ul').previous_siblings]
print(list(filter(None,ul)))

['p', 'p', 'h1', 'ul']
--------------------------------------------------------------------------------
['h1', 'p', 'p', 'p']


# Example 1 – listing \<li\> elements with the data-id attribute 

In [71]:
# use the select() function to list the <li> element with the data-id attribute:
print(soup.select('li[data-id]'))

[<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>, <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>, <li data-id="45732">James Bond, 007: The main man; shaken but not
stirred.</li>]


In [69]:
# Indexes can be applied to fetch the exact elements 
print(soup.select('ul li[data-id]')[1]) #fetch index 1 only from resulted

<li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader;
James' nemesis in "Goldeneye".</li>


In [70]:
# To extract the first match that has resulted the from CSS query, we can use either
# the list index, that is, 0 (zero) or the select_one() function in place of
# the select() function. The select_one() function returns the string of objects, 
# not the list: 
print(soup.select_one('li[data-id]'))

<li data-id="10784">Jason Walters, 003: Found dead in "A View to a
Kill".</li>


# Example 2 – traversing through elements

In [3]:
# CSS Selectors have various combinators such as +, >, a space character, 
# and so on, which show relationships between the elements. 
print(soup.select('p.story > a.sister'))#Selects all <a> with
print(45*"--")
print(soup.select('p b'))#Selects <b> inside <p>
print(45*"--")
print(soup.select('p + h1'))#Selects immediate <h1> after <p>
print(45*"--")
print(soup.select('p.story + h1'))#Selects immediate <h1> after <p> with
print(45*"--")
print(soup.select('p.title + h1'))#Selects immediate <h1> after <p> with

NameError: name 'soup' is not defined

# Example 3 – searching elements based on attribute values

# Building a web crawler

In [76]:
# Listing Quotes from first 5 or less pages found from 'http://quotes.toscrape.com/'
import requests
import re
from bs4 import BeautifulSoup
import csv
sourceUrl = 'http://quotes.toscrape.com/'
keys = ['quote_tags','author_url',
        'author_name','born_date',
        'born_location','quote_title']

In [2]:
# The read_url() function will be used to make a request and receive a response 
# using the requests function. This function will accept a url argument for pages:
def read_url(url):
#Read given Url, Returns requests object for page content
 response = requests.get(url)
 return response.text

In [1]:
# The implemented get_details() function is being coded for pagination and scraping
# logic. The read_url() function is supplied with a dynamically generated page URL to
# manage the pagination.
def get_details(page, dataWriter):
 """Get 'response' for first 5 pages, parse it and collect data for
'keys' headers"""
 nextPage = True
 pageNo = 1
 while (nextPage and pageNo <= 5):
 response = read_url(page + 'page/' + str(pageNo))
 soup = BeautifulSoup(response, 'lxml')
 rows = soup.find_all('div', 'quote')
 if (len(rows) > 0):
 print("Page ",pageNo," Total Quotes Found ",len(rows))
 for row in rows:
 if row.find('span',attrs={'itemprop':'text'}):
 title =
row.find(attrs={'itemprop':'text'}).text.strip()
 author =
row.find(attrs={'itemprop':'author'}).text.strip()
 authorLink =
row.find('a',href=re.compile(r'/author/')).get('href')
 tags =
row.find('div','tags').find(itemprop="keywords").get('content')
 print(title, ' : ', author,' : ',authorLink, ' :
',tags)
 if authorLink:
 authorLink = 'http://quotes.toscrape.com' + # dataSet is a handle defined to manage the external file quotes.csv. csv.writer() file
# handle is use for accessing CSV-based properties. 
# The writerow() function is passed with keys for writing a row containing 
# the column names from the list keys to the external file.
if __name__ == '__main__':
 dataSet = open('quotes.csv', 'w', newline='', encoding='utf-8')
 dataWriter = csv.writer(dataSet)
 # Write a Header or Column_names to CSV
 dataWriter.writerow(keys)
 #load details for provided URL
 get_details(sourceUrl, dataWriter)
 dataSet.close()
authorLink
 linkDetail = read_url(authorLink)
 soupInner = BeautifulSoup(linkDetail, 'lxml')
 born_date = soupInner.find('span','author-borndate').text.strip()
 born_location = soupInner.find('span','author-bornlocation').text.strip()
 # Write a list of values in file
dataWriter.writerow(
[tags,authorLink,author,born_date,born_location.replace('in ',''),title])
 nextPage = True
 pageNo += 1
 else:
 print("Quotes Not Listed!")

IndentationError: expected an indented block (2964536685.py, line 10)

IndentationError: unexpected indent (3925389348.py, line 14)