In [1]:
# Sample markup (the "three sisters" document) parsed into `soup` below.
# Note that <body> and <html> are never explicitly closed in this string.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class='boldest'>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<b> This is Bold example </b>
<b> Another Bold one </b>
<p class="story">...</p>
"""

from bs4 import BeautifulSoup
# Build the parse tree using the 'html.parser' backend; the cells that follow
# navigate and search this `soup` object.
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# prettify() renders the parse tree as an indented string, one tag or text
# node per line, which makes the nesting easy to see.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b class="boldest">
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <b>
   This is Bold example
  </b>
  <b>
   Another Bold one
  </b>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [8]:
# Accessing a tag name as an attribute returns the first tag with that name.
soup.title

<title>The Dormouse's story</title>

In [13]:
# The returned tag includes everything nested inside it (here, the <title>).
soup.head

<head><title>The Dormouse's story</title></head>

In [9]:
# .name is the tag's own name, as a plain string.
soup.title.name

'title'

In [10]:
# .string is the text held directly inside the tag.
soup.title.string

"The Dormouse's story"

In [14]:
# .parent steps up one level: <title> is nested inside <head>.
print(soup.title.parent.name)

# Attribute access returns only the first <p> tag in the document.
print(soup.p)

# Indexing a tag looks up an attribute; `class` comes back as a list.
print(soup.p['class'])

# First <a> tag only.
print(soup.a)

# find_all() returns a list of every matching tag.
print(soup.find_all('a'))

# find() returns a single tag; keyword arguments filter on attributes.
print(soup.find(id="link3"))

head
<p class="title"><b class="boldest">The Dormouse's story</b></p>
['title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [16]:
# Print the target URL of every link in the document.
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [17]:
# get_text() returns all the text in the document with the markup removed;
# the original whitespace and newlines between elements are kept.
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
 This is Bold example 
 Another Bold one 
...



In [18]:
# Keep a handle on the first <b> tag (the one with class "boldest") for the
# attribute lookup in the next cell.
tag = soup.b
tag

<b class="boldest">The Dormouse's story</b>

In [19]:
# Dictionary-style access reads a tag attribute; `class` is returned as a list.
tag['class']

['boldest']

In [20]:
# .string of a tag holding a single piece of text (repeats an earlier lookup,
# shown here for contrast with .strings in the next cell).
soup.title.string

"The Dormouse's story"

In [22]:
# .strings is a generator, so displaying it shows only the generator object;
# iterate over it (next cell) to see the actual text pieces.
soup.strings

<generator object _all_strings at 0x1119782b0>

In [24]:
# Iterate over every text node in document order. repr() makes the
# whitespace-only nodes (such as '\n') visible in the output.
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
' This is Bold example '
'\n'
' Another Bold one '
'\n'
'...'
'\n'


In [21]:
# Same walk as .strings, but whitespace-only nodes are skipped and leading /
# trailing whitespace is removed. Whitespace inside a string is left alone.
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'This is Bold example'
'Another Bold one'
'...'


In [23]:
# Walk upward from the first <a> tag through each of its ancestors.
link = soup.a
print(link)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# .parents yields each ancestor in turn and ends after the BeautifulSoup
# object itself (whose name is "[document]"). It never yields None, so no
# None check is needed inside the loop.
for parent in link.parents:
    print(parent.name)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
p
body
html
[document]


In [26]:
import re
# find_all() accepts a compiled regular expression and matches it against tag
# names. "^b" anchors at the start, so this selects tags whose name starts
# with "b" (<body> and <b>).
# NOTE: this loop rebinds the notebook-level name `tag` that an earlier cell
# set to soup.b.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b
b
b


In [36]:
# Anchored at both ends: the name must start with "b" (^b) and end with "y" (y$).
# In between, "." matches any single character and "*" repeats the preceding
# element zero or more times.
for tag in soup.find_all(re.compile("^b.*y$")):
    print(tag.name)

body


In [37]:
# With no anchors the pattern may match anywhere in the name, so this selects
# every tag whose name contains a "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title


In [38]:
# A list filter matches a tag whose name equals any item in the list; results
# come back in document order, not grouped by name.
soup.find_all(["a", "b"])

[<b class="boldest">The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
 <b> This is Bold example </b>,
 <b> Another Bold one </b>]

In [39]:
def has_class_but_no_id(tag):
    """Filter predicate: True for tags that carry a `class` attribute but no `id`.

    `tag` must provide a `has_attr(name)` method. Intended to be passed to
    `find_all()` as a function filter.
    """
    # Without a class attribute the tag can never qualify.
    if not tag.has_attr('class'):
        return False
    # Has a class; qualifies only when there is no id.
    return not tag.has_attr('id')

In [40]:
# A function filter is called on each tag and keeps those it returns True for.
# Only the tag's own attributes are tested: a <p class="story"> is kept even
# though the <a> tags nested inside it have ids.
soup.find_all(has_class_but_no_id)

[<p class="title"><b class="boldest">The Dormouse's story</b></p>,
 <b class="boldest">The Dormouse's story</b>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [41]:
# The second argument selects the parser. "xml" parses the markup as XML
# rather than HTML: the empty <b /> stays self-closing and an XML declaration
# is added to the output.
# NOTE(review): per the Beautiful Soup docs the "xml" parser needs lxml installed.
BeautifulSoup("<a><b /></a>", "xml")

<?xml version="1.0" encoding="utf-8"?>
<a><b/></a>

In [42]:
import requests

# Download the Beautiful Soup documentation page to use as real-world input.
# (Alternative page previously tried here:
#  http://www.pythonforbeginners.com/python-on-the-web/web-scraping-with-beautifulsoup/)
# requests has no default timeout; without one an unresponsive server would
# hang the kernel indefinitely.
r = requests.get("https://www.crummy.com/software/BeautifulSoup/bs4/doc/", timeout=10)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
r.raise_for_status()
# Response body decoded to text; parsed in a later cell.
data = r.text

In [18]:
# Show the raw HTML exactly as downloaded, before any parsing.
# NOTE: this prints the whole page, so the output is long.
print(data)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">


<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    
    <title>Beautiful Soup Documentation &mdash; Beautiful Soup 4.4.0 documentation</title>
    
    <link rel="stylesheet" href="_static/default.css" type="text/css" />
    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    
    <script type="text/javascript">
      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    './',
        VERSION:     '4.4.0',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  true
      };
    </script>
    <script type="text/javascript" src="_static/jquery.js"></script>
    <script type="text/javascript" src="_static/underscore.js"></script>
    <script type="text/javascript" src="_static/doctools.js"></script>
    <link rel="top" title="Beautiful Soup 

In [19]:
# Parse the downloaded page and print only its text content.
# Note that get_text() also returns the contents of <script> elements (the
# DOCUMENTATION_OPTIONS JavaScript appears in the output), along with the
# blank lines left between elements.
new_soup = BeautifulSoup(data, 'html.parser')
print(new_soup.get_text())





Beautiful Soup Documentation — Beautiful Soup 4.4.0 documentation



      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    './',
        VERSION:     '4.4.0',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  true
      };
    







Navigation


index
Beautiful Soup 4.4.0 documentation »







Beautiful Soup Documentation¶

Beautiful Soup is a
Python library for pulling data out of HTML and XML files. It works
with your favorite parser to provide idiomatic ways of navigating,
searching, and modifying the parse tree. It commonly saves programmers
hours or days of work.
These instructions illustrate all major features of Beautiful Soup 4,
with examples. I show you what the library is good for, how it works,
how to use it, how to make it do what you want, and what to do when it
violates your expectations.
The examples in this documentation should work the same way in Python
2.7 and Python 3.2.
You might be looking for the documentation for Beautif