In [13]:
from bs4 import BeautifulSoup
# Sample HTML used by the cells below: a title paragraph, a "story"
# paragraph holding three <a class="sister"> links (ids link1..link3), and a
# second "story" paragraph. The literal contains no closing </body> or
# </html> tag.
html = """
    <html><head><title>The Dormouse's story</title></head>
    <body><p class="title"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little
sisters; and their names were
    <a href="https://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="https://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="https://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

# Parse the string with the 'lxml' parser backend; `soup` is the parsed
# document that the later cells query.
soup = BeautifulSoup(html,'lxml')
# Bare last expression so the notebook displays the parsed document.
soup

<html><head><title>The Dormouse's story</title></head>
<body><p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little
sisters; and their names were
    <a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="https://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [14]:
# Print the string returned by soup.prettify() for the parsed document.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little
sisters; and their names were
   <a class="sister" href="https://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="https://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="https://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
    and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [3]:
# Find every <a> tag and print its attribute dict and its text.
for a in soup.find_all(name='a'):  # find_all is the current name; findAll is a legacy alias
    print('attrs: ',a.attrs)
    # Text taken from a tag can include the spaces and newlines that surround
    # it in the markup, so it is stripped before printing.
    # get_text(strip=True) is used rather than a.string.strip() because
    # .string is None when a tag has more than one child, and calling
    # .strip() on None would raise AttributeError.
    print('string:',a.get_text(strip=True))
    print('--------------------')

attrs:  {'href': 'https://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
string: Elsie
--------------------
attrs:  {'href': 'https://example.com/lacie', 'class': ['sister'], 'id': 'link2'}
string: Lacie
--------------------
attrs:  {'href': 'https://example.com/tillie', 'class': ['sister'], 'id': 'link3'}
string: Tillie
--------------------


In [4]:
# Find every tag that has class="sister" and id="link1".
for tag in soup.find_all(attrs={"class":"sister","id":"link1"}):
    print('tag:',tag.name)
    print('attrs:',tag.attrs)
    # get_text(strip=True) instead of tag.string.strip(): .string is None for
    # a tag with several children, which would raise AttributeError.
    print('string:',tag.get_text(strip=True))

tag: a
attrs: {'href': 'https://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
string: Elsie


In [5]:
# Find every <a> tag whose text is exactly "Elsie".
# `string=` is the current keyword for this filter; `text=` is its deprecated
# older spelling.
for tag in soup.find_all(name='a',string="Elsie"):
    print('tag:',tag.name)
    print('attrs:',tag.attrs)
    print('string:',tag.string.strip())

tag: a
attrs: {'href': 'https://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
string: Elsie


In [6]:
import re

# Use a regular expression to find every tag whose id contains "link"
# followed by a digit. The pattern is a raw string so that \d reaches the
# regex engine as-is instead of being an invalid string escape sequence.
for tag in soup.find_all(attrs={'id':re.compile(r'link\d')}):
    print(tag)

<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>


In [7]:
# Use a regular expression to find every <a> tag whose text ends with "ie".
# Compiled patterns are applied with search(), so the pattern is anchored
# with $; an unanchored pattern would also match text that merely contains
# "ie" somewhere in the middle.
for tag in soup.find_all('a',string=re.compile(r'ie$')):
    print(tag)

<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="https://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="https://example.com/tillie" id="link3">Tillie</a>


In [8]:
def parser(tag):
    '''
    Custom filter: accept only tags named 'a' whose id attribute is 'link1'.

    :param tag: object exposing ``name`` (tag name) and ``attrs`` (mapping of
        attribute names to values)
    :return: True if the tag is an <a> with id == 'link1', otherwise False
    '''
    # .get() instead of ['id'] so that an <a> tag carrying attributes but no
    # id is rejected rather than raising KeyError.
    return tag.name == 'a' and bool(tag.attrs) and tag.attrs.get('id') == 'link1'

In [9]:
"""
在解析后的 BeautifulSoup 对象 soup 上调用 findAll() 方法（find_all() 的旧别名），并将自定义解析函数 parser 作为第一个参数传入。该方法会遍历文档中的所有标签，把每个标签依次传给 parser；凡是 parser 返回 True 的标签都会被收集起来并返回。
"""
# Pass the custom filter function to find_all(): it is called with each tag
# in the document, and the tags for which it returns True are returned.
for tag in soup.find_all(parser):
    print(tag)

<a class="sister" href="https://example.com/elsie" id="link1">Elsie</a>
