In [1]:
from bs4 import BeautifulSoup as bs

In [2]:
soup = bs("<html>data</html>")
print(soup)

<html><body><p>data</p></body></html>


In [7]:
# Tag
soup = bs("<b class='oldeat'>Extremely Bold</b>")
tag = soup.b
print(type(tag))
print(tag)

<class 'bs4.element.Tag'>
<b class="oldeat">Extremely Bold</b>


In [9]:
print(tag.name)
print(tag.attrs)
print(tag['class'])

b
{'class': ['oldeat']}
['oldeat']


In [10]:
tag['class'] = 'verybold'
tag['id'] = 1
print(tag)

<b class="verybold" id="1">Extremely Bold</b>


In [15]:
css_soup = bs('<p class="body strikeout"></p>')
tag = css_soup.p
print(tag.attrs)
print(tag['class'])

{'class': ['body', 'strikeout']}
['body', 'strikeout']


In [16]:
id_soup = bs('<p id="my id"></p>')
id_soup.p['id']

'my id'

In [17]:
# NavigableString
soup = bs("<b class='oldeat'>Extremely Bold</b>")
tag = soup.b
print(tag.string)

Extremely Bold


In [19]:
# tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,
tag.string.replace_with("No longer bold")
print(tag)

<b class="oldeat">No longer bold</b>


In [21]:
# BeautifulSoup 对象表示的是一个文档的全部内容
# 大部分时候,可以把它当作 Tag 对象
print(soup)
print(soup.name)

<html><body><b class="oldeat">No longer bold</b></body></html>
[document]


In [23]:
# 注释部分comment
# 一个特殊的NavigableString的子类
soup = bs("<b><!--Hey, buddy. Want to buy a used parser?--></b>")
comment = soup.b.string
type(comment)

bs4.element.Comment

In [25]:
html_doc = """
    <html><head><title>The Dormouse's story</title></head>
        <body>
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
"""
soup = bs(html_doc, "html.parser")

In [28]:
# 通过点取属性只能获得当前名字的第一个tag
print(soup.head)
print(soup.body)
print(soup.title)
print(soup.a)

<head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
<title>The Dormouse's story</title>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [29]:
# 获取所有同样tag的内容
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [35]:
# 将tag的子节点以列表的形式输出
print(soup.body.contents)
for child in soup.body.children:
    print(child)

['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n']


<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>


<p class="story">...</p>




In [36]:
# .descendants 属性可以对所有tag的子孙节点进行递归循环
for child in soup.head.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [37]:
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n    '
'Elsie'
',\n    '
'Lacie'
' and\n    '
'Tillie'
';\n    and they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [39]:
# 去掉换行
# 全部是空格的行会被忽略掉,段首和段末的空白会被删除
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\n    and they lived at the bottom of a well.'
'...'


In [41]:
# 使用parten, parents 得到父节点
title_tag = soup.title
print(title_tag)
print(title_tag.parent)

<title>The Dormouse's story</title>
<head><title>The Dormouse's story</title></head>


In [46]:
# 回退和前进
last_a_tag = soup.find("a", id="link3")
print(last_a_tag)
print(last_a_tag.next_sibling)  # 兄弟节点
print(last_a_tag.next_element)  # 下一个解析对象、
for element in last_a_tag.next_elements:
    print(repr(element))

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
;
    and they lived at the bottom of a well.
Tillie
'Tillie'
';\n    and they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'


In [47]:
# 搜索文档树
soup.find_all('b')

[<b>The Dormouse's story</b>]

In [48]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [49]:
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [51]:
for tag in soup.find_all(True):  # 不会返回字符节点
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [52]:
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
     <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
     and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

find_all: 

支持tag、属性、css、函数等方式搜索。

通过 string 参数可以搜搜文档中的字符串内容.与 name 参数的可选值一样, string 参数接受 字符串 , 正则表达式 , 列表, True . 

如果只想搜索tag的直接子节点,可以使用参数 recursive=False

In [55]:
# CSS选择器
# Beautiful Soup支持大部分的CSS选择器
# 在 Tag 或 BeautifulSoup 对象的 .select() 方法中传入字符串参数
# 即可使用CSS选择器的语法找到tag
print(soup.select("title"))
print(soup.select("body a"))

[<title>The Dormouse's story</title>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [60]:
# 修改文档树
tag = soup.blockquote
print(tag)
print(tag.name)
tag.name = "title"
tag['class'] = 'verybold'
tag['id'] = 1
print(tag)
tag.string = "New link text."
print(tag)

<blockquote class="verybold" id="1">The Dormouse's story</blockquote>
blockquote
<title class="verybold" id="1">The Dormouse's story</title>
<title class="verybold" id="1">New link text.</title>


In [63]:
# Tag.clear() 方法移除当前tag的内容:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = bs(markup)
tag = soup.a

tag.clear()
tag

<a href="http://example.com/"></a>

In [65]:
# 输出
# prettify() 方法
# 将Beautiful Soup的文档树格式化后以Unicode编码输出
# 每个XML/HTML标签都独占一行
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = bs(markup)
print(soup.prettify())

<html>
 <body>
  <a href="http://example.com/">
   I linked to
   <i>
    example.com
   </i>
  </a>
 </body>
</html>



In [67]:
# 编码
# 任何HTML或XML文档都有自己的编码方式
# 比如ASCII 或 UTF-8
# 但是使用Beautiful Soup解析后,文档都被转换成了Unicode:
markup = "<h1>Sacr\xc3\xa9 bleu!</h1>"
soup = bs(markup)
soup.h1

<h1>SacrÃ© bleu!</h1>