In [1]:
from bs4 import BeautifulSoup

# Mock HTML document used by the first batch of examples: a heading with an
# inline style, plus a content <div> holding a paragraph, a link and an image.
# The root element is closed explicitly so the markup is well-formed on its
# own rather than relying on the parser to close <html> at end of input.
html = """
<html>
<head>
    <title>Mock Html Page</title>
</head>
<body>
    <h1 style="font-weight: bold; color: red;">Heading</h1>
    <div class="content">
        <p>Paragraph text goes here</p>
        <a href="http://google.com">
            Search on Google
        </a>
        <img src="/images/image.gif" />
    </div>
</body>
</html>
"""

In [2]:
# Build the parse tree for the mock document; the parser is named explicitly
# via the `features` keyword.
soup = BeautifulSoup(html, features="html.parser")

In [3]:
# prettify() renders the parse tree as an indented string with each tag and
# each piece of text on its own line. print() is used so the newlines are
# rendered rather than shown escaped in the cell's repr.
print( soup.prettify() )

<html>
 <head>
  <title>
   Mock Html Page
  </title>
 </head>
 <body>
  <h1 style="font-weight: bold; color: red;">
   Heading
  </h1>
  <div class="content">
   <p>
    Paragraph text goes here
   </p>
   <a href="http://google.com">
    Search on Google
   </a>
   <img src="/images/image.gif"/>
  </div>
 </body>
</html>


In [4]:
# A tag name used as an attribute of the soup returns that element;
# here the whole <html> element.
soup.html

<html>
<head>
<title>Mock Html Page</title>
</head>
<body>
<h1 style="font-weight: bold; color: red;">Heading</h1>
<div class="content">
<p>Paragraph text goes here</p>
<a href="http://google.com">
            Search on Google
        </a>
<img src="/images/image.gif"/>
</div>
</body>
</html>

In [5]:
# Same attribute-style navigation, this time returning the <head> element.
soup.head

<head>
<title>Mock Html Page</title>
</head>

In [6]:
# .name gives the tag's name as a plain string.
soup.body.name

'body'

In [7]:
# Attribute access is not limited to direct children: the <p> nested inside
# the content <div> is found straight from the top of the tree.
soup.p

<p>Paragraph text goes here</p>

In [8]:
# .contents is the list of the tag's direct children. The whitespace between
# elements shows up as '\n' string entries alongside the <h1> and <div> tags.
soup.body.contents

['\n',
 <h1 style="font-weight: bold; color: red;">Heading</h1>,
 '\n',
 <div class="content">
 <p>Paragraph text goes here</p>
 <a href="http://google.com">
             Search on Google
         </a>
 <img src="/images/image.gif"/>
 </div>,
 '\n']

In [9]:
# .text joins every string found inside the tag into one string, keeping the
# original newlines and indentation, which is why the result looks noisy.
soup.body.text

'\nHeading\n\nParagraph text goes here\n\n            Search on Google\n        \n\n\n'

In [10]:
# The node right after <h1> is the newline text between the tags, not the
# <div>. next_sibling is the BS4 name; nextSibling is the legacy BS3 alias.
soup.h1.next_sibling

'\n'

In [11]:
# Stepping over the whitespace text node reaches the content <div>.
# Uses the BS4 name next_sibling instead of the legacy nextSibling alias.
soup.h1.next_sibling.next_sibling

<div class="content">
<p>Paragraph text goes here</p>
<a href="http://google.com">
            Search on Google
        </a>
<img src="/images/image.gif"/>
</div>

In [12]:
# Let's use a richer HTML sample: an element with an id, paragraphs with no
# class, one class and several classes, a link nested inside a paragraph and
# a second link outside the content <div>.
# NOTE: this rebinds the name `html`; the earlier `soup` object was already
# built from the first document.
html = """
<html>
<head><title>Another Version</title></head>
<body>
    <div class="content">
        <h1 id="heading">This is a heading</h1>
        <p class="strapline">This will be more prominent</p>
        <p>Here is yet another paragraph about something</p>
        <p>It goes on, and on, and on</p>
        <p>This one even includes a link <a href="/path">to another page</a> so you can link out</p>
        <p class="class1 class2 class3">Multiple Classes here, mate</p>
    </div>
    <a class="footerlink" href="//www.google.com">Go to Google</a>
</body>
</html>"""

In [13]:
# Parse the second sample into its own tree, kept separate from `soup`.
soup2 = BeautifulSoup(html, features='html.parser')

In [14]:
# Pretty-print the second document to check how it was parsed.
print( soup2.prettify() )

<html>
 <head>
  <title>
   Another Version
  </title>
 </head>
 <body>
  <div class="content">
   <h1 id="heading">
    This is a heading
   </h1>
   <p class="strapline">
    This will be more prominent
   </p>
   <p>
    Here is yet another paragraph about something
   </p>
   <p>
    It goes on, and on, and on
   </p>
   <p>
    This one even includes a link
    <a href="/path">
     to another page
    </a>
    so you can link out
   </p>
   <p class="class1 class2 class3">
    Multiple Classes here, mate
   </p>
  </div>
  <a class="footerlink" href="//www.google.com">
   Go to Google
  </a>
 </body>
</html>


In [15]:
# Attribute-style access returns only the FIRST matching tag: the document
# has two <a> elements, but only the one inside the paragraph comes back.
soup2.a

<a href="/path">to another page</a>

In [16]:
# Same here: of the five <p> elements only the first one is returned.
soup2.p

<p class="strapline">This will be more prominent</p>

In [17]:
# find_all() returns every matching tag as a list.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup2.find_all('a')

[<a href="/path">to another page</a>,
 <a class="footerlink" href="//www.google.com">Go to Google</a>]

In [18]:
# find() returns just the first match, here the first <p> in the document.
soup2.find(name='p')

<p class="strapline">This will be more prominent</p>

In [19]:
# First <a> only: the same element that soup2.a returned earlier.
soup2.find(name='a')

<a href="/path">to another page</a>

In [20]:
# find() treats its first argument as a tag NAME, not a CSS selector. No tag
# is named 'a.footerlink', so this returns None (hence no output).
# CSS selectors are supported through select() / select_one() instead,
# e.g. soup2.select_one('a.footerlink').
soup2.find('a.footerlink')

In [21]:
# Collect the text of every <p> and join the pieces with single spaces.
# find_all() replaces the legacy findAll() alias, and join() consumes the
# generator directly so no intermediate list is built.
paras = ' '.join( p.text for p in soup2.find_all('p') )
print( paras )

This will be more prominent Here is yet another paragraph about something It goes on, and on, and on This one even includes a link to another page so you can link out Multiple Classes here, mate


In [22]:
# Class lookup: pass an attribute dict as the second argument to restrict the
# <a> search to tags carrying the given class.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup2.find_all('a', {"class":"footerlink"})

[<a class="footerlink" href="//www.google.com">Go to Google</a>]

In [23]:
# IDs are simpler: attributes can be passed as keyword filters, with no tag
# name required. Uses find_all, the BS4 name for the legacy findAll alias.
soup2.find_all(id='heading')

[<h1 id="heading">This is a heading</h1>]

In [24]:
# Back to the first document for a style lookup. A string filter is compared
# with the whole attribute value, so this partial style matches nothing.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup.find_all(style="font-weight: bold;")

[]

In [25]:
# With the complete attribute value the <h1> is found: a string filter needs
# an exact match of the whole attribute.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup.find_all(style="font-weight: bold; color: red;")

[<h1 style="font-weight: bold; color: red;">Heading</h1>]

In [26]:
# A list of tag names matches any of them. Results come back in document
# order, so the nested <a> appears right after the <p> that contains it.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup2.find_all( ['p', 'a'] )

[<p class="strapline">This will be more prominent</p>,
 <p>Here is yet another paragraph about something</p>,
 <p>It goes on, and on, and on</p>,
 <p>This one even includes a link <a href="/path">to another page</a> so you can link out</p>,
 <a href="/path">to another page</a>,
 <p class="class1 class2 class3">Multiple Classes here, mate</p>,
 <a class="footerlink" href="//www.google.com">Go to Google</a>]

In [27]:
# A dictionary is accepted as the name filter too; judging by the output it
# matches the same tags as the list ['p', 'a'].
# NOTE(review): presumably only the keys are used; the list form is the
# documented one, so prefer it in real code.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup2.find_all({ 'p': True, 'a': True })

[<p class="strapline">This will be more prominent</p>,
 <p>Here is yet another paragraph about something</p>,
 <p>It goes on, and on, and on</p>,
 <p>This one even includes a link <a href="/path">to another page</a> so you can link out</p>,
 <a href="/path">to another page</a>,
 <p class="class1 class2 class3">Multiple Classes here, mate</p>,
 <a class="footerlink" href="//www.google.com">Go to Google</a>]

In [28]:
# Grab a link to inspect. find() returns only the first match, so despite
# the plural name `links` holds a single <a> tag, not a list.
links = soup2.find('a')

print( links )

<a href="/path">to another page</a>


In [29]:
# limit caps the number of results: only the first two <p> tags are returned.
# (find_all is the BS4 name; findAll is the legacy BS3 alias.)
soup2.find_all('p', limit=2)

[<p class="strapline">This will be more prominent</p>,
 <p>Here is yet another paragraph about something</p>]

In [30]:
# Nothing special about href: any attribute of a tag is read with dict-style
# indexing (links['href']). .text is the tag's text and .name its tag name.
# str.format() assembles the sentence without chained `+` concatenation.
print( "{} is captioned {} and is a tag of type {}".format( links['href'], links.text, links.name ) )

/path is captioned to another page and is a tag of type a


In [31]:
# find_next() returns the first matching tag that appears after this one in
# the document; here that is the footer link.
# (find_next is the BS4 name; findNext is the legacy BS3 alias.)
links.find_next('a')

<a class="footerlink" href="//www.google.com">Go to Google</a>

In [33]:
# Calling a tag like a function is shorthand for find_all() on that tag,
# so this lists every <p> inside <body>.
soup2.body('p')

[<p class="strapline">This will be more prominent</p>,
 <p>Here is yet another paragraph about something</p>,
 <p>It goes on, and on, and on</p>,
 <p>This one even includes a link <a href="/path">to another page</a> so you can link out</p>,
 <p class="class1 class2 class3">Multiple Classes here, mate</p>]

In [35]:
# Searching for one class also matches a tag whose class attribute lists
# several classes: 'class1' finds the <p class="class1 class2 class3">.
soup2.find('p', attrs={'class': 'class1'})

<p class="class1 class2 class3">Multiple Classes here, mate</p>