In [1]:
# Examples follow the Beautiful Soup quick-start guide: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#quick-start

In [4]:
# Sample document queried by the rest of the notebook. The literal never closes <body> or <html>.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [5]:
from bs4 import BeautifulSoup

In [6]:
# Parse html_doc with the "html.parser" backend; the result is only displayed here, not stored.
BeautifulSoup(html_doc, "html.parser")


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [10]:
# Keep the parsed document as `soup`; the query cells below all read from it.
soup = BeautifulSoup(html_doc, "html.parser")

In [13]:
# Pretty-print the parsed tree with nested tags indented.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [14]:
# Attribute access by tag name returns the matching tag, here the <title> element.
soup.title

<title>The Dormouse's story</title>

In [15]:
# .name is the tag's name as a string.
soup.title.name

'title'

In [16]:
# .string gives the text held by <title>.
soup.title.string

"The Dormouse's story"

In [17]:
# .text yields the same text as .string for this tag (compare the two outputs).
soup.title.text

"The Dormouse's story"

In [18]:
# .parent is the enclosing tag; for <title> that is <head>.
soup.title.parent.name

'head'

In [19]:
# Only the first <p> is returned, even though the document contains three.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [20]:
# Subscripting a tag reads one of its attributes; class comes back as a list.
soup.p["class"]

['title']

In [21]:
# As with soup.p, only the first <a> is returned.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [22]:
# find_all collects every <a> tag in the document.
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [24]:
# Look up a single tag by its id attribute using a keyword filter.
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [28]:
# Walk every anchor in the document and print its link target.
anchors = soup.find_all("a")
for link in anchors:
    print(link.get("href"))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [30]:
# Index into the result list, then subscript the tag for its href.
soup.find_all("a")[0]["href"]

'http://example.com/elsie'

In [31]:
# Same lookup through .get(); the output matches the subscript form.
soup.find_all("a")[0].get("href")

'http://example.com/elsie'

In [33]:
# get_text() returns the document's text as one string, newlines included.
soup.get_text()

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [34]:
# .text produces the same string as get_text() here (the two outputs are identical).
soup.text

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [36]:
# Repeats the id lookup from earlier in the notebook.
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [35]:
# .attrs exposes the tag's attributes as a dict; class is stored as a list.
soup.find(id="link3").attrs

{'href': 'http://example.com/tillie', 'class': ['sister'], 'id': 'link3'}

In [37]:
# Restrict the id lookup to <a> tags by passing the tag name first.
soup.find("a", id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [39]:
# Same lookup with the attribute filter supplied as a dict.
soup.find("a", {"id":"link3"})

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [40]:
# The document has a single <b> tag, so the result holds one element.
soup.find_all("b")

[<b>The Dormouse's story</b>]

In [41]:
import re 

In [43]:
# A compiled regex is matched against tag names; "^b" picks up <body> and <b>.
b_tags = soup.find_all(re.compile("^b"))
for tag in b_tags:
    print(tag.name)

body
b


In [46]:
# A list of names matches any of them: the <b> and all <a> tags, in document order.
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [47]:
import re 
def not_lacie(href):
    """Attribute filter that keeps hrefs which do not mention "lacie".

    A falsy ``href`` (``None`` or an empty string) is handed back unchanged,
    so tags without a usable href are rejected. Otherwise the result is
    ``True`` when "lacie" is absent from ``href`` and ``False`` when present.
    """
    if not href:
        return href
    return re.search("lacie", href) is None

In [48]:
# Filter on href with a predicate function: the lacie link is left out of the result.
soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [49]:
# The second positional argument filters by CSS class: <p> tags with class "title".
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [None]:
# Child-combinator chain: the <b> directly inside <p class="title">, itself directly inside <body>.
soup.select("body > p.title > b")

In [None]:
# Select by id using the CSS "#" prefix.
soup.select("#link3")

In [None]:
# select_one() returns the first element matching the selector rather than a list.
soup.select_one("body > p:nth-child(3)")

In [50]:
# Separate one-tag document whose <p> carries two CSS classes.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')

In [51]:
# Display the parsed snippet.
css_soup

<p class="body strikeout"></p>

In [52]:
# class_ matches when the given name is one of the tag's classes; the <p> here also has "body".
css_soup.find_all("p", class_="strikeout")

[<p class="body strikeout"></p>]

In [53]:
# CSS selector requiring both classes; their order differs from the attribute's and it still matches.
css_soup.select("p.strikeout.body")

[<p class="body strikeout"></p>]

In [54]:
# Redisplay the main document before the CSS-selector examples.
soup


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [55]:
# select() takes a CSS selector and returns the matching tags as a list.
soup.select("title")

[<title>The Dormouse's story</title>]

In [57]:
# :nth-of-type(3) picks the third <p> among its siblings.
soup.select("p:nth-of-type(3)")

[<p class="story">...</p>]

In [None]:
# CSS selector exercised in the next cell: body > p:nth-child(3)

In [58]:
# Child combinator plus :nth-child(3): the <p> that is the third child of <body>.
soup.select("body > p:nth-child(3)")

[<p class="story">...</p>]

In [59]:
# Descendant combinator: every <a> anywhere under <body>.
soup.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [60]:
# Descendant selectors can be chained across several levels.
soup.select("html head title")

[<title>The Dormouse's story</title>]

In [62]:
# Same match with a shorter ancestor chain.
soup.select("head title")

[<title>The Dormouse's story</title>]

In [63]:
# `>` requires each <a> to be a direct child of a <p>.
soup.select("p > a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [64]:
# Descendant form of the same query; the doubled space in the selector gives the same three matches.
soup.select("p  a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [65]:
# The element with id link1 that is a direct child of a <p>.
soup.select("p > #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [66]:
# The element with id link1 anywhere under a <p>.
soup.select("p #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [73]:
# Re-run of the preceding selector (note the later execution count).
soup.select("p #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [72]:
# Any element with class "sister" under a <p>.
soup.select("p .sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]