### 웹에 접속하여 태그를 크롤링
- SSL 인증서 검증 에러(`CERTIFICATE_VERIFY_FAILED`)가 발생하면 아래 두 줄을 먼저 실행 (인증서 검증을 끄는 방법이므로 학습·테스트 용도로만 사용)
- import ssl
- ssl._create_default_https_context = ssl._create_unverified_context


In [1]:
# Import the urlopen() function from the request module of the urllib package.
from urllib.request import urlopen

# Retrieve the page at the URL and print the raw response body.
# read() returns bytes (note the b'...' prefix in the output), so non-ASCII
# text such as Korean would not be shown as readable characters here.
html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [3]:
# Download the page body as bytes. The with-block closes the HTTP response
# as soon as the body has been read instead of leaving it open.
with urlopen("https://en.wikipedia.org/wiki/Kevin_Bacon") as response:
    html = response.read()
print(type(html))
print(str(html, 'utf-8')[:500])  # bytes -> str

<class 'bytes'>
<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Kevin Bacon - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"4b56cef3-9988-4c03-b44a-fe764444a337","wg


In [4]:
import sys
# Print the name of the interpreter's default string encoding.
print(sys.getdefaultencoding())

utf-8


In [None]:
# ERROR (intentional demo): the URL contains raw Korean characters, which
# cannot be represented in the request, so urlopen() fails here.
html = urlopen("https://ko.wikipedia.org/wiki/케빈_베이컨").read()
print(type(html))
print(str(html, 'utf-8')[:500])  # bytes -> str

In [6]:
# Chrome converts Korean input into percent-encoded character codes, so a URL
# copied from Chrome's address bar can be requested without errors.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://ko.wikipedia.org/wiki/%EC%BC%80%EB%B9%88_%EB%B2%A0%EC%9D%B4%EC%BB%A8") as response:
    html = response.read()
print(type(html))
print(str(html, 'utf-8')[:500])  # bytes -> str

<class 'bytes'>
<!DOCTYPE html>
<html class="client-nojs" lang="ko" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>케빈 베이컨 - 위키백과, 우리 모두의 백과사전</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ko","wgMonthNames":["","1월","2월","3월","4월","5월","6월","7월","8월","9월","10월","11월","12월"],"wgRequestId":"484689b2-6bd0-41ad-a031-416bdf74a2d9","wgCSPNonce":false,"wgCanonicalNamespace":"","wg


In [None]:
# UnicodeEncodeError (intentional demo): concatenating the raw Korean title
# onto the base URL still leaves non-ASCII characters in the final URL.
html = urlopen("https://ko.wikipedia.org/wiki/" + "케빈_베이컨").read()
print(type(html))
print(str(html, 'utf-8')[:500])  # bytes -> str

In [7]:
from urllib.parse import quote # percent-encodes non-ASCII text (e.g. Korean) for use in a URL
# The cell result below shows the percent-encoded form of the Korean title.
quote("케빈_베이컨")

'%EC%BC%80%EB%B9%88_%EB%B2%A0%EC%9D%B4%EC%BB%A8'

In [8]:
# Percent-encode the Korean title with quote() before appending it to the URL.
# The with-block closes the HTTP response once the body has been read.
with urlopen("https://ko.wikipedia.org/wiki/" + quote("케빈_베이컨")) as response:
    html = response.read()
print(type(html))
print(str(html, 'utf-8')[:500])  # bytes -> str

<class 'bytes'>
<!DOCTYPE html>
<html class="client-nojs" lang="ko" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>케빈 베이컨 - 위키백과, 우리 모두의 백과사전</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"ko","wgMonthNames":["","1월","2월","3월","4월","5월","6월","7월","8월","9월","10월","11월","12월"],"wgRequestId":"484689b2-6bd0-41ad-a031-416bdf74a2d9","wgCSPNonce":false,"wgCanonicalNamespace":"","wg


In [None]:
# Intentional demo: requesting a URL that does not exist produces a long error
# message; in a server program the unhandled error would bring down the
# request being processed.
html = urlopen("http://www.pythonscraping.com/exercises/exercise1000.html")

In [10]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import quote  # 한글 처리 함수
from urllib.error import HTTPError

def getbs(url):
    """Download *url* and parse the response into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch. Non-ASCII characters must already
            be percent-encoded (e.g. with urllib.parse.quote).

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend, or None
        when the page could not be retrieved (the error is printed first).
    """
    # Local import so the function does not rely on an extra file-level import.
    from urllib.error import URLError

    try:
        # Keep the try body to the network call only; the with-block closes
        # the HTTP response even if reading the body fails.
        with urlopen(url) as response:
            markup = response.read()
    except (HTTPError, URLError) as e:
        # HTTPError: the server answered with an error status (e.g. 404).
        # URLError: the server could not be reached at all.
        print(e)
        return None
    return BeautifulSoup(markup, 'html.parser')
In [11]:
# getbs() returns None when the page cannot be fetched; test identity with
# "is None" so the check never goes through an object's own == implementation.
bs = getbs("http://www.pythonscraping.com/exercises/exercise1000.html")
if bs is None:
    print('주소가 없습니다.')

HTTP Error 404: Not Found
주소가 없습니다.


In [None]:
# CSS class 기반 검색, 여러개의 태그 출력
# http://www.pythonscraping.com/pages/warandpeace.html 접속
# Chrome -> F12 -> Elements tab -> Ctrl + F -> class="green" 검색 안됨.
# Chrome -> F12 -> Elements tab -> Ctrl + F -> .green 검색됨.

In [14]:
# Select <span> tags by CSS class and show the first five matches.
bs = getbs("http://www.pythonscraping.com/pages/warandpeace.html")
tags = bs.select('span.green')

for green_span in tags[:5]:
    print(green_span)

<span class="green">Anna
Pavlovna Scherer</span>
<span class="green">Empress Marya
Fedorovna</span>
<span class="green">Prince Vasili Kuragin</span>
<span class="green">Anna Pavlovna</span>
<span class="green">St. Petersburg</span>


In [15]:
# [...] is a CSS attribute selector: match <span> tags by their class attribute.
tags = bs.select("span[class='green']")
for green_span in tags[:5]:
    print(green_span)

<span class="green">Anna
Pavlovna Scherer</span>
<span class="green">Empress Marya
Fedorovna</span>
<span class="green">Prince Vasili Kuragin</span>
<span class="green">Anna Pavlovna</span>
<span class="green">St. Petersburg</span>


In [17]:
# '#text' is an id selector; select() returns a ResultSet whose items are Tags.
tags = bs.select('#text')
print(type(tags))    # <class 'bs4.element.ResultSet'>
print(type(tags[0])) # <class 'bs4.element.Tag'>

<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>


In [18]:
# str() turns a Tag into its HTML markup as a plain string, which can be sliced.
print(type(str(tags[0]))) 
print(str(tags[0])[0:100])

<class 'str'>
<div id="text">
"<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of t


In [19]:
# Searching for child tags.
# The <tbody> that Chrome generates automatically is not included.
# In Chrome: F12 -> Elements tab -> Ctrl + F -> "#giftList tr" finds the rows.
bs = getbs("http://www.pythonscraping.com/pages/page3.html")
tags = bs.select('#giftList tr')
print(tags)

[<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>, <tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>, <tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>, <tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td

In [20]:
# Header row ("Item Title", ...).
# select() can be chained on a <class 'bs4.element.Tag'> object.
header_row = tags[0]
ths = header_row.select('th')  # collect the <th> cells of the first <tr>
for header_cell in ths:
    print(header_cell)

<th>
Item Title
</th>
<th>
Description
</th>
<th>
Cost
</th>
<th>
Image
</th>


In [22]:
# Compare the two ways of reading a cell's text, then show what strip() does.
print(type(tags[1].select('td')[0].string))# <class 'bs4.element.NavigableString'>
print(type(tags[1].select('td')[0].text)) # <class 'str'>
print('   > test <   '.strip())

<class 'bs4.element.NavigableString'>
<class 'str'>
> test <


In [23]:
# Walk the data rows, skipping tags[0] (the header row made of <th> cells).
for row in tags[1:]:
    # Take only the first <td> of each row and strip the surrounding newlines.
    print(row.select('td')[0].text.strip())

Vegetable Basket
Russian Nesting Dolls
Fish Painting
Dead Parrot
Mystery Box


In [24]:
# Every <img> tag in the document.
imgs = bs.select('img')
for image_tag in imgs:
    print(image_tag)

<img src="../img/gifts/logo.jpg" style="float:left;"/>
<img src="../img/gifts/img1.jpg"/>
<img src="../img/gifts/img2.jpg"/>
<img src="../img/gifts/img3.jpg"/>
<img src="../img/gifts/img4.jpg"/>
<img src="../img/gifts/img6.jpg"/>


In [25]:
# <tbody> does not actually exist in the page source, so it must not be
# written into the selector.
imgs = bs.select('#giftList > tr > td > img')  # <img> tags inside the table cells
for image_tag in imgs:
    print(image_tag['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [27]:
# Select the element with id "giftList"; print how many matched and the result.
tags = bs.select('#giftList')
print('갯수:', len(tags))
print(tags)

갯수: 1
[<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../i

In [28]:
print(type(tags)) # <class 'bs4.element.ResultSet'>; select() cannot be called on it
print(type(tags[0])) # <class 'bs4.element.Tag'>, <table id="giftList">

<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>


In [29]:
# '>' matches direct children only; the <img> sits inside a <td>, not directly
# under #gift1, so this prints an empty list.
imgs = bs.select('#gift1 > img')
print(imgs)

[]


In [30]:
imgs = bs.select('#gift1 > td > img') # direct children: no level of the parent/child chain may be skipped
print(imgs)

[<img src="../img/gifts/img1.jpg"/>]


In [31]:
imgs = bs.select('#gift1 img')  # a space matches every <img> nested under #gift1
print(imgs)

[<img src="../img/gifts/img1.jpg"/>]


In [34]:
# Take the first matching <img> and read its src attribute.
fname = bs.select('#gift1 > td > img')[0]['src']
print(fname)

../img/gifts/img1.jpg
