# BeautifulSoup相关属性的使用

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Parse a one-tag snippet with the lxml parser; the next cells inspect it.
soup = BeautifulSoup('<b class="boldest">西北工业大学</b>', features='lxml')

In [3]:
# Attributes of the BeautifulSoup object: its Python type, then its name
print(type(soup), soup.name, sep="\n")

<class 'bs4.BeautifulSoup'>
[document]


In [4]:
# Attributes of a Tag: take the first <b> element, then show its Python type
# and its tag name
tag = soup.b
print(type(tag), tag.name, sep="\n")

<class 'bs4.element.Tag'>
b


In [5]:
# Change the tag's name in place, then show the tag and its attribute dict
tag.name = "p"
print(tag, tag.attrs, sep="\n")

<p class="boldest">西北工业大学</p>
{'class': ['boldest']}


In [6]:
# A Tag supports dict-style access to its attributes.
# Read the value of the class attribute
print(tag["class"])
# Overwrite the value of the class attribute
tag["class"] = "nwpu"
# Add an id attribute
tag["id"] = "1"
print(tag)
# Delete the id attribute again
del tag['id']
print(tag)

['boldest']
<p class="nwpu" id="1">西北工业大学</p>
<p class="nwpu">西北工业大学</p>


In [7]:
# NavigableString: the text held by the tag, followed by its Python type
print(tag.string, type(tag.string), sep="\n")

西北工业大学
<class 'bs4.element.NavigableString'>


In [8]:
# Using replace_with(): swap the tag's text for a new string, then print the
# tag to show the change
tag.string.replace_with("西北工业大学计算机学院")
print(tag)

<p class="nwpu">西北工业大学计算机学院</p>


In [9]:
# Comment: the .string of a tag whose only content is an HTML comment
markup = "<b><!--I am a student of NWPU--></b>"
soup = BeautifulSoup(markup, features='lxml')
comment = soup.b.string
# Show the node's Python type, then its text
print(type(comment), comment, sep="\n")

<class 'bs4.element.Comment'>
I am a student of NWPU


# 遍历文档树

In [10]:
# Sample HTML document: a title, a heading paragraph, and a "story" paragraph
# containing three <a class="sister"> links with ids link1..link3.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

In [11]:
from bs4 import BeautifulSoup
import re
# Parse the sample document with the lxml parser
soup = BeautifulSoup(html_doc, features='lxml')

In [12]:
# .contents and .children: the direct children of a tag
head_tag = soup.head
print(head_tag, head_tag.contents, sep="\n")

# The first child of <head> is the <title> tag
title_tag = head_tag.contents[0]
print(title_tag, title_tag.contents, sep="\n")

# Iterating .children visits the same direct children one by one
for node in title_tag.children:
    print(node)

<head><title>The Dormouse's story</title></head>
[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
["The Dormouse's story"]
The Dormouse's story


In [13]:
# .descendants: show the type of the iterable, then every node below <head>
print(type(head_tag.descendants))
for node in head_tag.descendants:
    print(node)


# .parent: the element that directly encloses a node
print(title_tag.parent)
# The parent of <html> is the BeautifulSoup object, and the parent of the
# BeautifulSoup object is None.
html_tag = soup.html
print(type(html_tag.parent), soup.parent, sep="\n")

<class 'generator'>
<title>The Dormouse's story</title>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<class 'bs4.BeautifulSoup'>
None


In [14]:
# .parents: walk from the first <a> tag up through all of its ancestors to
# the root of the document.
link = soup.a
print(link)
# .parents only yields ancestor nodes and stops after the BeautifulSoup
# object (named "[document]"); it never yields None, so no None check is
# needed inside the loop.
for parent in link.parents:
    print(parent.name)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
p
body
html
[document]


In [21]:
# .next_sibling and .previous_sibling
# <b> and <c> are the two children of <a>, so they are each other's sibling.
# The markup is well-formed: every end tag closes a tag that is actually open.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'lxml')
print(sibling_soup.b.next_sibling)
print(sibling_soup.c.previous_sibling)

# Nothing comes before <b> or after <c>, so both of these print None
print(sibling_soup.b.previous_sibling)
print(sibling_soup.c.next_sibling)

<c>text2</c>
<b>text1</b>
None
None


In [15]:
# .next_siblings and .previous_siblings: iterate over every sibling after the
# first <a> tag, then every sibling before the tag with id="link3".
# repr() keeps whitespace such as "\n" visible in the text nodes.
for line in map(repr, soup.a.next_siblings):
    print(line)
for line in map(repr, soup.find(id="link3").previous_siblings):
    print(line)

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'
' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


# 搜索文档树

In [16]:
# Using find_all()
# Filter by tag name
print(soup.find_all(name='a'))

# Filter by attributes. attrs must be a dict mapping attribute name to the
# wanted value; a set literal such as {'class', 'sister'} is not an
# attribute filter.
print(soup.find_all(attrs={'class': 'sister'}))

# The recursive argument
# Default: look for <title> among all descendants
print(soup.find_all("title"))
# recursive=False: look for <title> only among the direct children of <html>
print(soup.html.find_all("title", recursive=False))

# Filter by text: an exact string, a list of strings, or a compiled regex
print(soup.find_all(text="Elsie"))
print(soup.find_all(text=["Tillie", "Elsie", "Lacie"]))
print(soup.find_all(text=re.compile("Dormouse")))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[<title>The Dormouse's story</title>]
[]
['Elsie']
['Elsie', 'Lacie', 'Tillie']
["The Dormouse's story", "The Dormouse's story"]


In [17]:
# Predicate: a string whose content equals its parent's .string
def is_the_only_string_within_a_tag(s):
    """Return whether *s* compares equal to ``s.parent.string``."""
    parent_string = s.parent.string
    return s == parent_string
# Pass the function itself as the text filter and print the matching strings
print(soup.find_all(text=is_the_only_string_within_a_tag))

["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...']


In [18]:
# Using find(): the result is the first matching Tag itself, not a list
print(soup.find('p'), type(soup.find('p')), sep="\n")

# find_all() with limit=1 gives the same first match, wrapped in a list
print(soup.find_all('p', limit=1))

<p class="title"><b>The Dormouse's story</b></p>
<class 'bs4.element.Tag'>
[<p class="title"><b>The Dormouse's story</b></p>]


In [19]:
# Compare how the two handle a tag that does not exist:
# find() prints None, find_all() prints an empty list
print(soup.find("NWPU"), soup.find_all("NWPU"), sep="\n")

None
[]


# 利用Beautiful Soup分析豆瓣电影榜

In [20]:
from bs4 import BeautifulSoup
from lxml import etree
import os
import re

# Build the path with os.path.join so the cell also runs on systems where
# "\" is not a path separator.
html_path = os.path.join("data", "豆瓣电影排行榜.html")
html = etree.parse(html_path, etree.HTMLParser())
html = etree.tostring(html).decode("utf-8")
soup = BeautifulSoup(html, 'lxml')
# Collect all films: every <div class="pl2"> element
films = soup.find_all(name='div', class_='pl2')
# Runs of whitespace and "/" around the title. The quantifier must be "+":
# with "*" the pattern also matches the empty string, and re.sub then inserts
# the replacement between every pair of characters of the title.
separators = re.compile(r'[\s/]+')
# Film titles and ratings are stored in a dict: title -> rating
result = {}
# Each element of films is a Tag object
for film in films:
    # The first child node of the film's <a class=""> link, as a plain string
    string = str(film.find('a', class_='').contents[0])
    # Collapse the separator runs and trim both ends to leave only the title
    film_name = separators.sub(' ', string).strip()
    # The rating is the text of <span class="rating_nums"> inside the
    # <div class="star clearfix"> block
    star_num_div = film.find(name='div', class_='star clearfix')
    star_num = star_num_div.find(name='span', class_='rating_nums').string
    result[film_name] = float(str(star_num))
print(result)

{'地久天长': 7.9, '绿皮书': 8.9, '孟买酒店': 8.4, '调音师': 8.3, '雪暴': 6.2, '我们': 6.5, '海市蜃楼': 7.8, '五尺天涯': 8.0, '我的一级兄弟': 8.2, '反贪风暴4': 6.0}
