In [73]:
from bs4 import BeautifulSoup

### 파일 가져오기

In [74]:
# Parse the local example page. The file declares <meta charset="utf-8">, so
# decode it as UTF-8 explicitly instead of relying on the platform's default
# locale encoding, which can differ from UTF-8 (e.g. on Windows).
with open("00_Example.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [75]:
# Print the parsed document to confirm the example file was loaded.
print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Web Crawling Example</title>
</head>
<body>
<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>
<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>
<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>
<h1>This is a heading.</h1>
<p>This is a paragraph.</p>
<p>This is another paragraph.</p>
<a class="a" href="www.naver.com">Naver</a>
</body>
</html>


### 인터넷에서 가져오기

In [76]:
import urllib.request
import urllib.parse

web_url = 'https://www.genie.co.kr/chart/top200'
# Plain urllib request with no custom headers.
# NOTE(review): the captured output of the following cell is the site's
# "blocked by security policy" page, presumably because no browser-like
# User-Agent is sent with this request.
with urllib.request.urlopen(web_url) as response:
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

In [77]:
# Inspect what was fetched: the captured output is a "connection request blocked
# by security policy" page, not the chart.
soup

<br/>
<br/>
<center>
<img src="http://www.geniemusic.co.kr/images/common/logo_r1.png"/><br/>
<h2> <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/> 접속요청이 보안정책에 의해 차단되었습니다. 당사 고객센터로 문의해주십시오.<br/><br/>
The security policy of the connection request is blocked. Contact your customer service representative.<br/><br/>
지니뮤직 고객센터 1577-5337<br/><br/>
</h2>
</center>
<br/>

In [78]:
import requests

# Send a browser-like User-Agent with the request to the same URL.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'}
# requests has no default timeout; without one an unresponsive server would
# hang this cell indefinitely.
req = requests.get(web_url, headers=header, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')

### 태그를 이용해서 가져오는 방법

In [79]:
# Switch back to the local example page for the tag-lookup demos. The file
# declares <meta charset="utf-8">, so decode it as UTF-8 explicitly rather than
# with the platform's default locale encoding.
with open("00_Example.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [80]:
first_div = soup.find('div')        # find() returns only the first match; here, the first <div>
first_div

<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>

In [81]:
div_list = soup.find('div')        # find() returns only the first match; here, the first <div>
# NOTE(review): despite its name, div_list holds a single tag here (see the
# output), not a list of every <div>.
div_list

<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>

In [82]:
# find_all() returns every matching tag. len() of the single Tag returned by
# find() would count that tag's child nodes instead of the number of <div>s.
div_list = soup.find_all('div')
len(div_list)

3

In [83]:
# Search only inside the first <div>: collect all of its <p> tags as a list.
first_div_all_p = first_div.find_all('p')
first_div_all_p

[<p>a</p>, <p>b</p>, <p>c</p>]

In [84]:
# Print the text content of each <p> found in the first <div>.
for p_tag in first_div_all_p:
    print(p_tag.get_text())

a
b
c


### tag와 속성을 이용해서 가져오기

In [85]:
# Narrow the tag search with an attribute filter: the <div> whose id is "ex_id".
ex_id_div = soup.find('div', attrs={'id': 'ex_id'})
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [86]:
## Find with a CSS selector: tag name plus id ('div#ex_id')
ex_id_div = soup.select_one('div#ex_id')
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [87]:
## Find with a CSS selector: id only ('#ex_id')
ex_id_div = soup.select_one('#ex_id')
ex_id_div

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [88]:
# select() gives back a list-like result even for a single match, so index
# into it to get the tag itself.
ex_id_div = soup.select('#ex_id')
ex_id_div[0]

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [89]:
# Select by CSS class name.
ex_class_div = soup.select('.ex_class')
ex_class_div                    # the result is displayed wrapped in a list

[<div class="ex_class sample">
 <p>1</p>
 <p>2</p>
 <p>3</p>
 </div>]

In [90]:
# The div carries two classes ("ex_class sample"); selecting by the other
# class name yields the same element as '.ex_class'.
sample_class = soup.select('.sample')
sample_class

[<div class="ex_class sample">
 <p>1</p>
 <p>2</p>
 <p>3</p>
 </div>]

In [91]:
soup.select('.a')           # the link text 'Naver' can be read with get_text() or .string

[<a class="a" href="www.naver.com">Naver</a>]

### 결과 가져오기

- get_text()    method
- string        attribute

In [92]:
# get_text() method: print the text of each <p> collected from the first <div>.
for p_tag in first_div_all_p:
    print(p_tag.get_text())

a
b
c


In [93]:
# .string attribute (not the get_text() method): the text of the first <a> tag
a_tag = soup.find('a')
a_tag.string

'Naver'

In [97]:
# HTML attribute access: index the tag by attribute name to read its value.
a_tag['href']

'www.naver.com'

In [99]:
# Same lookup through .attrs; yields the same value as a_tag['href'].
a_tag.attrs['href']

'www.naver.com'