# Beautiful Soup 사용법

In [1]:
from bs4 import BeautifulSoup

### 파일로부터 가져오기

In [2]:
# Parse the local example page into a BeautifulSoup tree.
# The encoding is given explicitly because the document declares charset="utf-8";
# without it, open() decodes with the platform's locale encoding instead.
with open("00_Example.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [3]:
# Print the parsed document as text to inspect the structure used in the examples that follow.
print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Web Crawling Example</title>
</head>
<body>
<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>
<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>
<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>
<h1>This is a heading.</h1>
<p>This is a paragraph.</p>
<p>This is another paragraph.</p>
<a class="a sample" href="www.naver.com">Naver</a>
</body>
</html>


### 인터넷에서 가져오기

In [4]:
import urllib.request
import urllib.parse

web_url = 'https://www.genie.co.kr/chart/top200'

# Fetch the page with urllib and no custom headers. As the next cell's output shows,
# the site answers this request with a "blocked by security policy" notice instead of
# the chart (presumably because no browser-like User-Agent is sent).
# The timeout (seconds) keeps the cell from hanging indefinitely if the server never responds.
with urllib.request.urlopen(web_url, timeout=10) as response:
    html = response.read()

# Parsing only needs the downloaded bytes, so it runs after the connection is closed.
soup = BeautifulSoup(html, 'html.parser')

In [5]:
soup

<br/>
<br/>
<center>
<img src="http://www.geniemusic.co.kr/images/common/logo_r1.png"/><br/>
<h2> <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/> 접속요청이 보안정책에 의해 차단되었습니다. 당사 고객센터로 문의해주십시오.<br/><br/>
The security policy of the connection request is blocked. Contact your customer service representative.<br/><br/>
지니뮤직 고객센터 1577-5337<br/><br/>
</h2>
</center>
<br/>

In [6]:
import requests

# Retry the same URL with a browser-like User-Agent header, since the request made
# without one was answered with a "blocked" notice.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'}

# requests applies no timeout unless one is given, so set one (seconds) to avoid a hung cell.
req = requests.get(web_url, headers=header, timeout=10)

# Stop here on an HTTP error status rather than silently parsing an error page.
req.raise_for_status()

soup = BeautifulSoup(req.text, 'html.parser')

### 태그를 이용해서 가져오는 방법

In [8]:
# Reload the local example page: `soup` was rebound to the downloaded web page in the
# previous section, and the examples below rely on the small local document.
# The encoding is explicit because the document declares charset="utf-8";
# without it, open() decodes with the platform's locale encoding instead.
with open("00_Example.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [9]:
# find() returns only the first element with the given tag name
# (the page contains three <div> blocks; this is the first one).
first_div = soup.find('div')
first_div

<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>

In [12]:
# find_all() collects every element with the given tag name into a list.
div_list = soup.find_all('div')
len(div_list)

3

In [13]:
# Searching from a tag instead of the whole soup limits the matches to that tag's contents:
# only the <p> elements inside the first <div> are returned.
first_div_all_p = first_div.find_all('p')
first_div_all_p

[<p>a</p>, <p>b</p>, <p>c</p>]

In [14]:
# get_text() gives the text inside each <p>, without the surrounding markup.
for p_tag in first_div_all_p:
    print(p_tag.get_text())

a
b
c


In [15]:
# Searching from the document root returns every <p> in document order,
# including those nested inside the <div> blocks.
all_ps = soup.find_all('p')
all_ps

[<p>a</p>,
 <p>b</p>,
 <p>c</p>,
 <p>1</p>,
 <p>2</p>,
 <p>3</p>,
 <p>X</p>,
 <p>Y</p>,
 <p>Z</p>,
 <p>This is a paragraph.</p>,
 <p>This is another paragraph.</p>]

### 태그와 속성을 이용해서 가져오기

In [16]:
# Narrow a tag search with an attribute filter: only the <div> whose id is "ex_id" matches.
ex_id_div = soup.find('div', attrs={'id': 'ex_id'})
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [17]:
# Find the same element with a CSS selector: tag name followed by "#<id>".
# select_one() returns a single element rather than a list.
ex_id_div = soup.select_one('div#ex_id')
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [18]:
# The tag name can be left out of the selector; "#ex_id" alone matches the same <div>.
ex_id_div = soup.select_one('#ex_id')
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [19]:
# select() returns a list of matches, so the element itself is taken with [0].
# NOTE: from this cell on, ex_id_div is bound to that list, not to a single tag
# as in the preceding select_one()/find() cells.
ex_id_div = soup.select('#ex_id')
ex_id_div[0]

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [22]:
# Chained class selectors (".ex_class.sample") require an element to carry both classes,
# so the <a class="a sample"> link is not matched.
ex_class_div = soup.select('.ex_class.sample')
ex_class_div

[<div class="ex_class sample">
 <p>1</p>
 <p>2</p>
 <p>3</p>
 </div>]

In [23]:
# A single class selector matches any element carrying that class, whatever its tag:
# here both the <div> and the <a>.
sample_class = soup.select('.sample')
sample_class

[<div class="ex_class sample">
 <p>1</p>
 <p>2</p>
 <p>3</p>
 </div>,
 <a class="a sample" href="www.naver.com">Naver</a>]

In [27]:
# ".a" selects by class name "a", not by the <a> tag name; the link matches
# because its class attribute is "a sample".
soup.select('.a')

[<a class="a sample" href="www.naver.com">Naver</a>]

### 결과 가져오기
- `get_text()` method
- `string` attribute

In [28]:
# Extract text with the get_text() method, reusing the <p> list
# collected from the first <div> in an earlier cell.
for p_tag in first_div_all_p:
    print(p_tag.get_text())

a
b
c


In [30]:
# The same kind of text is also exposed through the .string attribute (no method call);
# here it yields the link's text.
a_tag = soup.find('a')
a_tag.string

'Naver'

In [31]:
# HTML attributes of a tag are read with dictionary-style indexing on the tag.
a_tag['href']

'www.naver.com'