# 3.BeautifulSoup 기초
* HTML과 XML문서를 파싱하기 위한 파이썬 패키지

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="this title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""


In [4]:
# Parse the sample document with Python's built-in 'html.parser'.
# The parser argument may be omitted, but 'html.parser' is not a fixed default:
# BeautifulSoup then picks the best parser installed (lxml, then html5lib, then
# html.parser) and emits a warning, so name it explicitly for reproducible output.
soup = BeautifulSoup(html_doc, 'html.parser')
print(f"{type(soup)}\n", soup)

<class 'bs4.BeautifulSoup'>
 <html><head><title>The Dormouse's story</title></head>
<body>
<p class="this title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>


In [5]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="this title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


### find 함수
* 조건에 만족하는 첫번째 tag 검색

In [6]:
type(soup.find('p'))

bs4.element.Tag

In [7]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [8]:
soup.find('a',id = 'link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [9]:
soup.find(id = 'link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [10]:
soup.find('a',class_ = 'sister',id = 'link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [11]:
# Attribute filters can also be supplied as a dict: the second positional
# argument of find() is `attrs`, so this is the dict form of filtering on
# class 'sister' and id 'link3'.
attrs = {
    'class':'sister',
    'id':'link3'
}
soup.find('a',attrs)

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

### find_all 함수
* 조건에 맞는 모든 tag를 리스트로 반환

In [12]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [34]:
for tag in soup.find_all('a'):
    print(tag)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


### get_text 함수
* tag안의 value
* 부모 tag의 경우 모든 자식 tag의 value까지 추출

In [14]:
soup.get_text()

"The Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [15]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="this title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [16]:
soup.find('p').get_text()

"The Dormouse's story"

In [17]:
for name in soup.find_all('a'):
    print(name.get_text())

Elsie
Lacie
Tillie


### attribute 값 추출하기
* 검색한 tag에서 attribute 값 추출

In [18]:
soup.find('p').attrs

{'class': ['this', 'title']}

In [19]:
soup.find('p')['class']

['this', 'title']

In [20]:
for tag in soup.find_all('a'):
    print(tag['href'])
    

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


### select 함수
* select는 CSS selector로 tag 찾기
* 아이디 찾기: #id명
* 클래스 찾기: .class명
* 자손 찾기: tag1 tag2
* 직계 자식 찾기: tag1>tag2
* 속성값 찾기: [name = 'test']

In [21]:
soup.select('p')

[<p class="this title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [22]:
soup.select_one('p')

<p class="this title"><b>The Dormouse's story</b></p>

In [23]:
print(soup.prettify())
print("토토미 잘생김")

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="this title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
토토미 잘생김


In [24]:
# 자손 태그 찾기 - 자손 관계 (tag1 tag2)
soup.select('html a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [25]:
# Direct-child combinator (tag1 > tag2).
# Returns an empty list here: <title> is a child of <head>, so it is a
# descendant of <html> but not a direct child of it.
soup.select('html > title')

[]

In [26]:
#id 선택자
soup.select('#link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [27]:
#class 선택자
soup.select('.sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [28]:
#속성값 찾기
soup.select('[href="http://example.com/elsie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [29]:
# 시작 문자열 
soup.select('[href^="http"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [30]:
# 종료 문자열
soup.select('[href$="ie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [31]:
# 포함 문자열
soup.select('[href*="example"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]