## Beautiful Soup

- CSS 선택자 등으로 HTML 요소를 쉽게 찾기 위해서 나온 모듈 (XPath는 bs4가 직접 지원하지 않으며, 필요하면 lxml에서 사용)
- html 파싱할 때 꼭 사용
- 내부 파서로 lxml을 선택해서 사용할 수 있음 (별도 설치 필요, 아래 예제는 'html.parser' 사용)

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Test HTML (stands in for response data or data read from a file)
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<h1>this is h1 area</h1>
<h2>this is h2 area</h2>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
<a data-io="link3" href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</p>
<p class="story">story...</p>
</body>
</html>
"""

In [3]:
# Initialize bs4: parse the HTML string using the 'html.parser' parser
soup = BeautifulSoup(html, 'html.parser')

# Check the type of the parsed document object
print('soup', type(soup))

soup <class 'bs4.BeautifulSoup'>


In [4]:
# Tidy up the markup: prettify() returns the document re-indented,
# with each tag and text node on its own line
print('prettify', soup.prettify())

prettify <html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <h1>
   this is h1 area
  </h1>
  <h2>
   this is h2 area
  </h2>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
  </p>
  <p class="story">
   story...
  </p>
 </body>
</html>



In [5]:
# Walk down the tree by tag name (html -> body -> h1) using attribute access
h1 = soup.html.body.h1
print('h1', h1)

h1 <h1>this is h1 area</h1>


In [7]:
# Attribute access yields the first matching tag: of the three <p> tags
# in the body, the class="title" one is returned
p1 = soup.html.body.p
print('p1', p1)

p1 <p class="title"><b>The Dormouse's story</b></p>


In [11]:
# The first next_sibling is the "\n" text node that follows </p>;
# the second next_sibling is the next <p> element
p2 = p1.next_sibling.next_sibling
print('p2', p2)

p2 <p class="story">Once upon a time there were three little sisters
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
</p>


In [13]:
# Print the text of p1 (its only child is <b>, and .string yields that text here)
print("p >> ", p1.string)

p >>  The Dormouse's story


In [14]:
# Inspect the elements that come after p2 in document order: its own
# descendants first, then the nodes after it (the last <p> and the
# trailing newline text nodes show up in the result as well)
print(list(p2.next_elements))

# Same traversal, printing one node per iteration
for v in p2.next_elements:
    print(v)

['Once upon a time there were three little sisters\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 'Elsie', '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 'Lacie', '\n', <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>, 'Tillie', '\n', '\n', <p class="story">story...</p>, 'story...', '\n', '\n', '\n']
Once upon a time there were three little sisters

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie


<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie


<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie




<p class="story">story...</p>
story...








In [18]:
# Re-parse the HTML into a new soup object
soup = BeautifulSoup(html, 'html.parser')

# Select every <a> tag
links1 = soup.find_all("a")  # optionally pass limit=2 to cap the number of results

# Check the type of the result
print('links', type(links1))

# Show the matched elements
print(links1)

links <class 'bs4.element.ResultSet'>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]


In [16]:
# Multiple conditions: the <a> must have class="sister" and data-io="link3"
link2 = soup.find("a", attrs={"class": "sister", "data-io": "link3"})
# Show the matching element
print(link2)

<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>


In [21]:
# CSS selectors (important)
# tag + class + child combinator
link1 = soup.select_one("p.title > b")
# tag + id selector
link2 = soup.select_one("a#link1")
# tag + attribute selector
link3 = soup.select_one("a[data-io='link3']")

# For each match, print the full element and then its text on the next line
print(link1, link1.string, sep="\n")
print(link2, link2.string, sep="\n")
print(link3, link3.string, sep="\n")

<b>The Dormouse's story</b>
The Dormouse's story
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
<a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie


In [22]:
# Select every element that matches the selector
# tag + class + child combinator
link4 = soup.select("p.story > a")
# tag + class + child combinator + tag + position (second <a> among its siblings)
link5 = soup.select("p.story > a:nth-of-type(2)")
# tag + class
link6 = soup.select("p.story")

# Print both result lists, then the second matched <p class="story">
print(link4, link5, link6[1], sep="\n")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" data-io="link3" href="http://example.com/tillie" id="link3">Tillie</a>]
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
<p class="story">story...</p>
