# 3. BeautifulSoup 기초
* HTML과 XML 문서를 파싱하기 위한 파이썬 패키지

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
html_doc

'<html><head><title>The Dormouse\'s story</title></head>\n<body>\n<p class="title"><b>The Dormouse\'s story</b></p>\n\n<p class="story">Once upon a time there were three little sisters; and their names were\n<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,\n<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and\n<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>\n\n<p class="story">...</p>\n'

In [3]:
# Parse the HTML string into a BeautifulSoup tree using Python's built-in 'html.parser'.
soup = BeautifulSoup(html_doc,'html.parser') # alternative parser names: 'lxml', 'html5lib' (third-party, installed separately)
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [4]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


### find 함수
* 조건을 만족하는 첫번째 tag 검색

In [5]:
soup.find('p')

<p class="title"><b>The Dormouse's story</b></p>

In [6]:
type(soup.find('p'))

bs4.element.Tag

In [7]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [8]:
soup.find('a',id='link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [9]:
# `class` is a reserved word in Python, so the class filter is passed as `class_`.
soup.find('a',class_='sister',id='link3')

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [10]:
# Attribute filters collected in a dict and passed through `attrs`;
# inside the dict the key is the literal attribute name 'class' (no trailing underscore).
attrs = {
    'class' : 'sister',
    'id' : 'link3'
}
soup.find('a',attrs=attrs)

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

### find_all 함수
* 조건에 맞는 모든 tag를 리스트로 반환

In [11]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [12]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [13]:
for tag in soup.find_all('a'):
    print(tag)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


### get_text 함수
* tag 안의 value를 추출
* 부모 tag의 경우 모든 자식 tag의 value를 추출

In [14]:
soup.get_text()

"The Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [15]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [16]:
soup.find('p').get_text()

"The Dormouse's story"

In [17]:
for tag in soup.find_all('a'):
    print(tag.get_text())

Elsie
Lacie
Tillie


### attribute 값 추출
* 검색한 tag에서 attribute값 추출
* tag['attr명']

In [18]:
soup.find('p').attrs

{'class': ['title']}

In [19]:
soup.find('p')['class']

['title']

In [20]:
for tag in soup.find_all('a'):
    print(tag['href'], tag['class'],tag['id'])

http://example.com/elsie ['sister'] link1
http://example.com/lacie ['sister'] link2
http://example.com/tillie ['sister'] link3


### select 함수
* select는 CSS selector로 tag 찾기
* 아이디 찾기 : #id명
* 클래스 찾기 : .class명
* 자손 찾기 : tag1 tag2
* 직계 자식 찾기 : tag1 > tag2
* 속성값 찾기 : [name='test']

In [25]:
soup.find('p')

<p class="title"><b>The Dormouse's story</b></p>

In [26]:
soup.select('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [27]:
soup.select_one('p')

<p class="title"><b>The Dormouse's story</b></p>

In [28]:
# Descendant combinator (tag1 tag2): matches tag2 at any depth inside tag1.
soup.select('html title ')

[<title>The Dormouse's story</title>]

In [29]:
soup.select('html a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [30]:
# Child combinator (tag1 > tag2): matches tag2 only when it is a direct child of tag1.
# <title> sits inside <head>, not directly under <html>, so this returns an empty list.
soup.select('html > title')

[]

In [33]:
# ID selector (#id): matches the element whose id attribute is "link1".
soup.select('#link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [34]:
# Class selector (.class): matches every element whose class list contains "sister".
soup.select('.sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [37]:
# Attribute selector [attr="value"]: href must equal the given value exactly.
soup.select('[href = "http://example.com/elsie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [38]:
# Attribute selector [attr^="value"]: href starts with the given string.
soup.select('[href^="http"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [39]:
# Attribute selector [attr$="value"]: href ends with the given string.
soup.select('[href$="ie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [43]:
# Attribute selector [attr*="value"]: href contains the given substring.
soup.select('[href*="example"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]