In [16]:
!pip install beautifulsoup4



In [2]:
# Small HTML fixture: div#result wraps p.row, which holds two <a> tags (classes "red" and "blue").
html = """
<html>
    <head></head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">go to page1</a>
                <a class="blue">go to page2</a>
            </p>
        </div>
    </body>
</html>
"""

In [3]:
from bs4 import BeautifulSoup

# Parse the fixture with the lxml parser into a navigable tree.
dom = BeautifulSoup(markup=html, features="lxml")

In [4]:
# Report the Python type of the <head> node and of the <html> root.
tuple(map(type, (dom.html.head, dom.html)))

(bs4.element.Tag, bs4.element.Tag)

In [5]:
# Attribute-style access (dom.a) and find('a') are shown side by side; both yield the first <a> tag.
dom.a, dom.find('a')

(<a class="red">go to page1</a>, <a class="red">go to page1</a>)

In [6]:
# Show the tag name, class list and visible text of every <a> in the document.
for anchor in dom.find_all('a'):
    print(anchor.name, anchor['class'], anchor.text)

a ['red'] go to page1
a ['blue'] go to page2


In [18]:
# Serialize the parsed tree back to a string, indented one tag per line.
dom.prettify()

'<html>\n <head>\n </head>\n <body>\n  <div>\n   <p>\n    <a>\n     go to page\n    </a>\n   </p>\n  </div>\n </body>\n</html>\n'

In [28]:
# The full path and the shortcut reach the same <p>; its children include whitespace text nodes.
dom.html.body.p, dom.p, list(dom.p.children)

(<p>
 <a>go to page</a>
 </p>, <p>
 <a>go to page</a>
 </p>, ['\n', <a>go to page</a>, '\n'])

In [7]:
# Search by attribute only. Passing attrs= explicitly avoids relying on how an empty-string
# tag name is interpreted (undocumented; it may be treated as a literal name and match nothing).
dom.find_all(attrs={'id':'result'})

[<div id="result">
 <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 </p>
 </div>]

In [29]:
# Looking up a tag that is absent gives None, so chaining an attribute onto it raises AttributeError.
type(dom.div), type(dom.span)
try:
    dom.span.attr
    dom.h1.text
except AttributeError:
    print("Not found")

Not found


### Test URL: http://pythonscraping.com/pages/page3.html

In [8]:
from urllib import parse
import requests

# Browser-like User-Agent sent with every request made by download().
header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

def download(url, params=None, retries=3):
    """Fetch ``url`` with a GET request, retrying when the server answers 5xx.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters; ``None`` sends none.
        retries: Number of additional attempts allowed after a 5xx response.

    Returns:
        The ``requests`` response of the last attempt. For a 4xx status, or a
        5xx status once ``retries`` is used up, this is the error response
        itself; its status code, reason and the request headers are printed.

    Raises:
        Only ``requests.exceptions.HTTPError`` is handled here; other request
        failures (for example connection errors) propagate to the caller.
    """
    resp = None

    try:
        resp = requests.get(url, params=params, headers=header)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and retries > 0:
            # Server-side error: show the remaining budget and try again.
            print(retries)
            resp = download(url, params, retries - 1)
        else:
            # Client error or retries exhausted: report and fall through with the error response.
            print(e.response.status_code)
            print(e.response.reason)
            print(e.request.headers)

    return resp

In [9]:
# Download the sample page, parse it, and locate the footer <div> by its id.
url = 'http://pythonscraping.com/pages/page3.html'
html = download(url)
dom = BeautifulSoup(markup=html.text, features='lxml')
footer = dom.find(name='div', attrs={'id':'footer'})

In [11]:
# find_parent() with no filter returns the tag directly enclosing the footer.
parent = footer.find_parent()
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [12]:
# find_all() with no arguments searches the whole subtree under parent, not just direct children.
children = parent.find_all()

In [13]:
# recursive=False limits the search to the direct child tags of parent.
children = parent.find_all(recursive=False)
for child in children:
    print(child.name, child.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [15]:
# Without recursive=False every nested tag under parent is listed, in document order.
descendants = parent.find_all()
for node in descendants:
    print(node.name, node.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
p {}
br {}
br {}
table {'id': 'giftList'}
tr {}
th {}
th {}
th {}
th {}
tr {'id': 'gift1', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img1.jpg'}
tr {'id': 'gift2', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img2.jpg'}
tr {'id': 'gift3', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img3.jpg'}
tr {'id': 'gift4', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img4.jpg'}
tr {'id': 'gift5', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img6.jpg'}
div {'id': 'footer'}
br {}


In [16]:
# children[2] is the third direct child of parent; list its own direct children with their text.
divChildren = children[2].find_all(recursive=False)
for element in divChildren:
    print(element.name, element.attrs, element.text)

p {} 
We haven't figured out how to make online shopping carts yet, but you can send us a check to:
123 Main St.
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.


In [17]:
# Compare the tag just before children[2] in the children list with its previous sibling tag.
divTag = children[2]
children[1].name, divTag.find_previous_sibling().name

('h1', 'h1')

In [18]:
# The first sibling tag that follows divTag.
divTag.find_next_sibling()

<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gift

In [19]:
# Check that the next sibling of divTag is the same tag as the next entry in the children list.
children[3] == divTag.find_next_sibling()

True

In [20]:
# Print the third cell of every <tr>, header row included.
alist = dom.find_all('tr')
for table_row in alist:
    cells = table_row.find_all(recursive=False)
    print(cells[2].text.strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [43]:
import requests
# Keep the raw response of a Google search for the query; parsing happens separately.
google = download(
    "https://www.google.com/search",
    params={"q":"박보영"},
)

In [50]:
# Re-download the sample page and parse it into a fresh soup for the exercises.
html = download("http://pythonscraping.com/pages/page3.html")
exercise = BeautifulSoup(markup=html.text, features="lxml")

In [54]:
# Locate the element with id="footer" regardless of tag name. attrs= is passed explicitly so
# the lookup does not depend on how an empty-string tag name is interpreted (undocumented).
footer = exercise.find(attrs={"id":"footer"})

In [65]:
footer.name, footer["id"], \
footer.find_parent().name, footer.find_parent()["id"], \
footer.find_parent().find_parent().name # (.name = tag name) (.attrs = attributes as key-value pairs)

('div', 'footer', 'div', 'wrapper', 'body')

In [22]:
# Tag names of the direct children of parent, in document order.
[child.name for child in parent.find_all(recursive=False)]

['img', 'h1', 'div', 'table', 'div']

In [70]:
# Tag names of the siblings before the footer, nearest first.
[sibling.name for sibling in footer.find_previous_siblings()]

['table', 'div', 'h1', 'img']

In [23]:
# Header labels: stripped text of every <th> under the fourth direct child of parent.
[
    th.text.strip()
    for th in parent.find_all(recursive=False)[3].find_all("th")
]

['Item Title', 'Description', 'Cost', 'Image']

In [76]:
# Every <td> cell in the page, in document order.
exercise.find_all("td")

[<td>
 Vegetable Basket
 </td>, <td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td>, <td>
 $15.00
 </td>, <td>
 <img src="../img/gifts/img1.jpg"/>
 </td>, <td>
 Russian Nesting Dolls
 </td>, <td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td>, <td>
 $10,000.52
 </td>, <td>
 <img src="../img/gifts/img2.jpg"/>
 </td>, <td>
 Fish Painting
 </td>, <td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td>, <td>
 $10,005.00
 </td>, <td>
 <img src="../img/gifts/img3.jpg"/>
 </td>, <td>
 Dead Parrot
 </td>, <td>
 This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
 </td>, <td>
 $0.

In [94]:
# Fetch the element with id="gift1" by attribute only. attrs= is passed explicitly so the
# lookup does not depend on how an empty-string tag name is interpreted (undocumented).
td = exercise.find(attrs={"id":"gift1"})

In [102]:
# The sibling immediately before the footer is used as the table; print the third cell of each gift row.
# A named loop variable is used because assigning to `_` at cell level overwrites
# IPython's last-output variable.
table = footer.find_previous_sibling()
tr = table.find_all("tr", {"class":"gift"})
for gift_row in tr:
    print(gift_row.find_all(recursive=False)[2].text.strip())

$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [103]:
import re
# Cells whose only string looks like a decimal number. The raw string keeps \d valid and \. matches
# a literal dot; string= is the current name of the deprecated text= keyword.
[cell.text.strip() for cell in exercise.find_all("td", string=re.compile(r"[0-9]+\.\d+"))]

['$15.00', '$10,000.52', '$10,005.00', '$0.50', '$1.50']

In [104]:
# src of every gift image. Dots are escaped so they match literal "." instead of any character.
[img["src"] for img in exercise.find_all("img", {"src": re.compile(r"\.\./img/gifts/img[0-9]+\.jpg")})]

['../img/gifts/img1.jpg',
 '../img/gifts/img2.jpg',
 '../img/gifts/img3.jpg',
 '../img/gifts/img4.jpg',
 '../img/gifts/img6.jpg']

In [116]:
# Resolve each gift image's relative src against the page URL to get absolute links.
# The pattern is a raw string (so \d is a valid escape) with literal dots escaped.
[
    requests.compat.urljoin(html.request.url, img["src"])
    for img in exercise.find_all("img", {"src": re.compile(r"\.\./img/gifts/img\d+\.jpg")})
]

['http://pythonscraping.com/img/gifts/img1.jpg',
 'http://pythonscraping.com/img/gifts/img2.jpg',
 'http://pythonscraping.com/img/gifts/img3.jpg',
 'http://pythonscraping.com/img/gifts/img4.jpg',
 'http://pythonscraping.com/img/gifts/img6.jpg']

In [113]:
# URL of the request that produced this response; used above as the base for urljoin.
html.request.url

'http://pythonscraping.com/pages/page3.html'

In [189]:
from urllib import request, error, parse
# Run a Google search for the query and parse whatever page comes back.
html = download("https://www.google.com/search", {"q":"박보영"})
practice = BeautifulSoup(markup=html.text, features="lxml")

In [190]:
# Inspect the URL actually requested; a /sorry/ path here means Google redirected to its block page.
html.request.url

'https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3D%25EB%25B0%2595%25EB%25B3%25B4%25EC%2598%2581&q=EgRzWzdDGLjxsOYFIhkA8aeDS_A3nNucx6i2cnS7v5MQm7snKi5hMgFy'

In [155]:
# Result-title headings: <h3> tags carrying the given class.
practice.find_all("h3", attrs={"class":"LC201b"})

[]

In [156]:
# Print each result title and the href of its enclosing tag.
# The parent lookup must use the loop variable h3, not `_` (IPython's last output).
for h3 in practice.find_all("h3", {"class":"LC201b"}):
    print(h3.text.strip())
    print(h3.find_parent()["href"])

In [None]:
# Map each result title to the href of its enclosing tag.
# `google` is the raw response returned by download(), so it is parsed before searching.
# NOTE(review): the class name "LC201b" may be a typo for "LC20lb" — confirm against the live markup.
result = dict()
google_dom = BeautifulSoup(google.text, "lxml")
for h3 in google_dom.find_all("h3", {"class":"LC201b"}):
    title = h3.text.strip()
    link = h3.find_parent()["href"]
    result[title] = link
    print(title)
    print(link)

In [176]:
# Run the same query against Naver search and parse the result page.
html = download("https://search.naver.com/search.naver", {"query":"박보영"})
test = BeautifulSoup(markup=html.text, features="lxml")

In [177]:
# Printing the response shows its status code.
print(html)

<Response [200]>


In [162]:
# Dump the parsed document as markup to inspect what was actually returned.
print(practice)

<!DOCTYPE html>
<html lang="ko"> <head> <meta charset="utf-8"/> <meta content="always" name="referrer"/> <meta content="telephone=no,address=no,email=no" name="format-detection"/> <meta content="width=device-width,initial-scale=1.0,maximum-scale=2.0" name="viewport"/> <meta content="박보영 : 네이버 통합검색" property="og:title"/> <meta content="https://ssl.pstatic.net/sstatic/search/common/og_v3.png" property="og:image"/> <meta content="'박보영'의 네이버 통합검색 결과입니다." property="og:description"/> <meta content="'박보영'의 네이버 통합검색 결과입니다." lang="ko" name="description"/> <title>박보영 : 네이버 통합검색</title> <link href="https://ssl.pstatic.net/sstatic/search/favicon/favicon_140327.ico" rel="shortcut icon"/> <link href="https://ssl.pstatic.net/sstatic/search/opensearch-description.https.xml" rel="search" title="Naver" type="application/opensearchdescription+xml"/><link href="https://ssl.pstatic.net/sstatic/search/pc/css/search1_190221.css" rel="stylesheet" type="text/css"/> <link href="https://ssl.pstatic.net/sstatic/s

In [165]:
# Every <a> tag in the parsed page.
practice.find_all("a")

[<a href="#lnb"><span>메뉴 영역으로 바로가기</span></a>,
 <a href="#content"><span>본문 영역으로 바로가기</span></a>,
 <a class="spnew logo_naver" href="http://www.naver.com" onclick="return goOtherCR(this, 'a=sta.naver&amp;r=&amp;i=&amp;u='+urlencode(this.href));">NAVER</a>,
 <a class="bt_atcp _btn_arw fold" href="#" onclick="return false;"><span class="blind _text">자동완성 펼치기</span><span class="ico_arrow spnew"></span></a>,
 <a class="spat ico_info" href="https://help.naver.com/support/alias/search/word/word_16.naver" onclick="__atcmpCR(event, this, 'plus.help', '','','');" target="_blank"><span class="blind">도움말 보기</span></a>,
 <a class="btn_turnon active" href="#" onclick="__atcmpCR(event, this, 'plus.use', '','','');">ON<span class="blind">선택됨</span></a>,
 <a class="btn_turnoff" href="#" onclick="__atcmpCR(event, this, 'plus.unuse', '','','');">OFF</a>,
 <a class="btn btn_login" href="https://nid.naver.com/nidlogin.login?url=https%3A%2F%2Fsearch.naver.com%2Fsearch.naver%3Fquery%3D%25EB%25B0%2595%25EB%2

In [170]:
# Every <dt> tag in the parsed page.
practice.find_all("dt")

[<dt><label for="inpop0">기본검색</label></dt>,
 <dt><label for="inpop1">상세검색</label></dt>,
 <dt><span class="tit_relate _related_keyword_lis">연관검색어</span><a class="link_help" href="https://help.naver.com/support/alias/search/word/word_1.naver" onclick="return goOtherCR(this, 'a=rsk.guide&amp;r=&amp;i=&amp;u='+urlencode(urlexpand(this.href)));" target="_blank"><i class="sprenew api_ico_help">도움말</i></a></dt>,
 <dt>출생</dt>,
 <dt>신체</dt>,
 <dt>소속사</dt>,
 <dt>학력</dt>,
 <dt>데뷔</dt>,
 <dt>수상</dt>,
 <dt>사이트</dt>,
 <dt><a class="sh_people_title sh_people_link" href="https://music.naver.com/album/index.nhn?albumId=2500698" nocr="" onclick="return goOtherCR(this, 'u='+urlencode(this.href)+'&amp;r=1&amp;a='+nco_area_126986_475608_1967805_264842+'.albumtitle&amp;i='+'1800000D_00000001F00A');" target="_blank" title="영화 너의결혼식 OST Part 1">영화 너의결혼식 ...</a></dt>,
 <dt><a class="sh_people_title sh_people_link" href="https://music.naver.com/album/index.nhn?albumId=565464" nocr="" onclick="return goOtherCR(t

In [24]:
# Print the text and href of the link inside each <dt> whose four nearest ancestors are
# dl, li, ul, div (nearest first).
# `test` is the soup parsed from the Naver results; `dom` holds the pythonscraping sample page.
for dt in test.find_all("dt"):
    # limit= caps the number of ancestors returned; a misspelled keyword would be treated
    # as an attribute filter and match nothing.
    ancestry = "-".join(ancestor.name for ancestor in dt.find_parents(limit=4))
    if ancestry == "dl-li-ul-div":
        a = dt.find("a")

        if a:
            print(a.text.strip())
            print(a["href"])

In [218]:
# Run the query against Daum search and parse the result page.
html = download("https://search.daum.net/search", {"q":"박보영"})
daum = BeautifulSoup(markup=html.text, features="lxml")

In [219]:
# Print the title and link of every div.wrap_tit that contains an anchor, then the count.
# A named loop variable is used because assigning to `_` at cell level overwrites
# IPython's last-output variable.
i = 0
for wrapper in daum.find_all("div", {"class":"wrap_tit"}):
    a = wrapper.find("a")

    if not a:
        continue

    i += 1
    print(a.text.strip())
    print(a["href"])

print(i)

"세젤흔녀로 변신한 박보영"..유제원 감독X박보영 '어비스' (종합)
http://v.media.daum.net/v/20190503153440722?f=o
'어비스' 이성재 "다시 태어난다면? 예쁜 박보영으로"..박보영 '폭소'
http://v.media.daum.net/v/20190503145429254?f=o
박보영이 가장 흔한 여자, 설득력 있을까 '어비스'
http://v.media.daum.net/v/20190503172414326?f=o
[현장]'어비스' 박보영 "김사랑과 차이? 커졌다 작아졌다.."
http://v.media.daum.net/v/20190503165134297?f=o
김영광 박보영 열애 터진 이유
http://adam24eve.tistory.com/858
박보영 실제 키는 도대체 몇일까?
http://papa0717.tistory.com/223
박보영 나이 키 몸매 대박
http://k3k2y.tistory.com/35
박보영 키 나이 인스타그램 드라마 어비스
http://listup.tistory.com/248
드라마 어비스 인물 소개, 예고편(박보영, 안효섭 주연)
http://cafe.daum.net/subdued20club/ReHf/2282606?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영과 역대급 케미뽐낸 상대배우 고르기
http://cafe.daum.net/subdued20club/ReHf/2280152?q=%EB%B0%95%EB%B3%B4%EC%98%81
런닝맨 나올 때마다 케미 보여준 송지효X박보영.jpgif
http://cafe.daum.net/ok1221/9Zdf/1524913?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영이 왜 못 오를 나무냐는 박수홍.jpg
http://cafe.daum.net/ASMONACOFC/gAVU/1243818?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영
https://ko.wikipedia.org/wiki/%EB%B0%95%EB%B3%B4

### Google

In [None]:
# Search Google for the query and print the <h3> found inside each element with class "r".
url = 'https://www.google.com/search'
params = {'q':'박보영'}
html = download(url, params)

dom = BeautifulSoup(html.text, 'lxml')

# attrs= is passed explicitly so the search does not depend on how an empty-string
# tag name is interpreted (undocumented).
for tag in dom.find_all(attrs={'class':'r'}):
    print(tag.find('h3'))