In [1]:
import re
from bs4 import BeautifulSoup
# Sample document used by the parsing cells below.
# - The closing </body> and </html> tags are omitted; the parsing cell notes
#   that BeautifulSoup fills in such missing tags.
# - The body of the first link is an HTML comment (<!-- Elsie -->), not text.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

In [2]:
"""simple demo"""

# BeautifulSoup repairs incomplete markup while parsing: missing closing tags
# such as </body> are filled in automatically.
soup = BeautifulSoup(html, features='lxml')
# Tip: print(soup.prettify()) dumps the tree with standard indentation.
title_tag = soup.title
print(title_tag.string)  # `.string` gives the text content of the tag

The Dormouse's story


In [3]:
"""bs: select node as attributes"""

soup = BeautifulSoup(html, features='lxml')

# Look each tag up once and reuse it for the prints below.
title_tag = soup.title
head_tag = soup.head

print(title_tag)
print(type(title_tag))  # bs4.element.Tag
print(title_tag.string)
print(head_tag)
print(head_tag.title.string)  # nested selection: a Tag can be drilled into the same way
print(soup.p)  # only the first <p> in the document is returned
print(title_tag.name)

<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
The Dormouse's story
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
title


In [4]:
"""bs: get node attrs"""

first_p = soup.p
p_attrs = first_p.attrs  # dict of every attribute on the tag: {name: value}
print(p_attrs)
print(p_attrs['name'])
# Shorthand: index the tag directly.  Note that an attribute which can hold
# several values (e.g. 'class') comes back as a list of strings.
print(first_p['name'])

{'class': ['title'], 'name': 'dromouse'}
dromouse
dromouse


In [5]:
"""Association selection"""
# Related-node accessors: content(s), children, descendant(s), parent(s),
# next/previous_sibling(s)

first_p = soup.p  # attribute access only matches the first <p> found
print(first_p.contents)  # direct children as a list
print(first_p.children)  # the same nodes, exposed as an iterator
for idx, node in enumerate(first_p.children):
    print(idx, node)
# .descendants also yields the nodes nested inside each child.
for idx, node in enumerate(first_p.descendants):
    print(idx, node)

[<b>The Dormouse's story</b>]
<list_iterator object at 0x7fc07029f340>
0 <b>The Dormouse's story</b>
0 <b>The Dormouse's story</b>
1 The Dormouse's story


In [6]:
"""find_all or find: method seletor"""
# Method selectors: find, find_all, find_parent(s), find_next_sibling(s), ...
html = """
<title id='list-1'>link 猜猜猜</title>
<div>
<ul class='element'> 你是？ </ul>
<ul class='element'> 我是？ </ul>
<ul class='element'> 他是？ </ul>
<ul class='element'> 谁是 link？</ul>
<a href='www.hao123.com'>click this link</a>
</div>
"""
soup = BeautifulSoup(html, features='lxml')

# By tag name: returns a list of bs4.element.Tag objects.
tags_by_name = soup.find_all(name='ul')
print(tags_by_name)

# By attribute, using an explicit attrs dict.
tags_by_attrs = soup.find_all(attrs={'id': 'list-1'})
print(tags_by_attrs)

# Shorthand: the attribute name is passed as a bare keyword (no quotes).
tags_by_id = soup.find_all(id='list-1')
print(tags_by_id)

# Note the trailing underscore: `class` is a reserved word in Python.
tags_by_class = soup.find_all(class_='element')
print(tags_by_class)

# `string` accepts a compiled regex or a plain string; the result is a list
# holding the full text of every matching node.
texts_matching = soup.find_all(string=re.compile('link'))
print(texts_matching)

[<ul class="element"> 你是？ </ul>, <ul class="element"> 我是？ </ul>, <ul class="element"> 他是？ </ul>, <ul class="element"> 谁是 link？</ul>]
[<title id="list-1">link 猜猜猜</title>]
[<title id="list-1">link 猜猜猜</title>]
[<ul class="element"> 你是？ </ul>, <ul class="element"> 我是？ </ul>, <ul class="element"> 他是？ </ul>, <ul class="element"> 谁是 link？</ul>]
['link 猜猜猜', ' 谁是 link？', 'click this link']


In [7]:
"""4.CSS选择器"""
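# Just call the select() method and pass it a CSS selector; this assumes familiarity with CSS selectors from web development.
# This is a very powerful way to select nodes; the pyquery library offers a corresponding approach.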

'4.CSS选择器'