In [1]:
from bs4 import BeautifulSoup

# Sample markup shared by every cell below: a <title id="one">, one
# <p class="title">, two <p class="story"> paragraphs (the first contains
# only an HTML comment) and three <a class="sister"> links, link1..link3.
html_doc = """
<html><head>
<title id="one">The Dormouse's story</title>
</head>
<body>
<p class="story"><!--...--></p>
<p class="title">
    p标签的内容
    <b>The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


"""

# Parse the markup with the lxml backend; `soup` is reused by all later cells.
soup = BeautifulSoup(html_doc, 'lxml')

# Given a tag name, find() yields only the first <p> in document order,
# which here is the comment-only "story" paragraph.
first_paragraph = soup.find('p')

print(first_paragraph)

<p class="story"><!--...--></p>


# find() -- returns only the first matching tag

In [2]:
# No tag name is supplied, so find() returns the first tag of any kind
# whose class attribute is "title".
title_filter = {"class": "title"}
title_paragraph = soup.find(attrs=title_filter)
print(title_paragraph)

<p class="title">
    p标签的内容
    <b>The Dormouse's story</b>
</p>


In [3]:
# Search the document's text nodes instead of its tags: `string=` matches a
# string whose content is exactly "Tillie", and the matched string itself
# (not the enclosing <a> tag) is what gets returned.
# `string` is the current name of this filter; the older `text=` spelling
# is deprecated in Beautiful Soup.
result = soup.find(string="Tillie")
print(result)

Tillie


In [5]:
# Combine both filters: the first <p> tag that also carries class "story".
first_story = soup.find("p", attrs={"class": "story"})
print(first_story)

<p class="story"><!--...--></p>


# find_all() -- returns a list of every matching tag

In [6]:
# find_all() collects every <a> tag in the document instead of stopping at the first.
all_links = soup.find_all(name='a')
print(all_links)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [7]:
# limit=1 caps the number of matches at one, but the result is still a
# one-element list rather than a bare tag.
capped_links = soup.find_all(name="a", limit=1)
print(capped_links)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [8]:
# Index into the one-element result to get the tag itself instead of a list.
capped_links = soup.find_all("a", limit=1)
first_link = capped_links[0]
print(first_link)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [11]:
# Attribute-only search: every tag with class "sister", whatever its tag name.
sisters = soup.find_all(attrs={"class": "sister"})
# Show the matches, then the container type that find_all() hands back.
print(sisters, type(sisters), sep="\n")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<class 'bs4.element.ResultSet'>


# select_one() -- CSS selector, returns the first match

In [10]:
# select_one() takes a CSS selector and returns just the first matching
# element (a single Tag, not a list).
first_sister = soup.select_one('.sister')
print(first_sister, type(first_sister), sep="\n")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<class 'bs4.element.Tag'>


# select() -- CSS selector, returns a list of all matches

In [12]:
# Class selector: select() returns every element with class "sister" as a list.
sister_links = soup.select('.sister')
print(sister_links)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [13]:
# Id selector: still returns a list, here holding the single <title id="one">.
title_matches = soup.select('#one')
print(title_matches)

[<title id="one">The Dormouse's story</title>]


In [14]:
# Descendant combinator: <title> elements located anywhere inside <head>.
head_titles = soup.select('head title')
print(head_titles)

[<title id="one">The Dormouse's story</title>]


In [15]:
# Selector list (comma): elements matching either part, i.e. the <title>
# plus every element with class "story".
title_and_stories = soup.select('title, .story')
print(title_and_stories)

[<title id="one">The Dormouse's story</title>, <p class="story"><!--...--></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>]


In [16]:
# Attribute selector: <a> tags whose id attribute equals "link3".
tillie_links = soup.select('a[id="link3"]')
print(tillie_links)

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


In [17]:
# Class selector for "title": a one-element list holding the <p class="title">.
title_paragraphs = soup.select('.title')
print(title_paragraphs)

[<p class="title">
    p标签的内容
    <b>The Dormouse's story</b>
</p>]


In [18]:
# select() always returns a list, so take element 0 to get the tag itself.
title_paragraphs = soup.select('.title')
title_paragraph = title_paragraphs[0]
print(title_paragraph)

<p class="title">
    p标签的内容
    <b>The Dormouse's story</b>
</p>


# Text content inside a tag: get_text()

In [19]:
# select_one() returns the first element matching the selector directly,
# so there is no need to build the full list of matches and index into it.
# get_text() then gathers all text inside the tag, including the text of
# the nested <b> element, with the original whitespace preserved.
result = soup.select_one('.title').get_text()
print(result)


    p标签的内容
    The Dormouse's story



In [20]:
# Id selector for "link1": a one-element list containing the first <a> tag.
link1_matches = soup.select('#link1')
print(link1_matches)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


# Get a link's URL: get('href')

In [23]:
# An id selector targets a single element, so use select_one() to get the
# tag directly instead of indexing into the list that select() returns.
# get('href') then reads the tag's href attribute, i.e. the link's URL.
result = soup.select_one('#link1').get('href')
print(result)

http://example.com/elsie
