# DOM

### BeautifulSoup 설치

In [1]:
# !pip install beautifulsoup4
from bs4 import BeautifulSoup

In [2]:
# Small in-memory HTML document used to try out the BeautifulSoup API:
# one <div id="result"> wrapping a <p class="row"> that holds two <a> tags.
html = '''
<html>
    <head></head>
    <body>
        <div id = 'result'>
           <p class = 'row'>
               <a class = 'red'>Go to page1</a>
               <a class = 'blue'>Go to page2</a>
            </p>
        </div>
    </body>
</html>
'''

# Parse the markup with the lxml parser; `dom` is the tree the next cells query.
dom = BeautifulSoup(html, 'lxml')

In [3]:
# Attribute access walks down the tree by tag name; each step yields a bs4 Tag.
type(dom.html.head), type(dom.html)

(bs4.element.Tag, bs4.element.Tag)

In [4]:
# `dom.a` and `dom.find('a')` both give the first <a> tag in the document.
dom.a, dom.find('a')

(<a class="red">Go to page1</a>, <a class="red">Go to page1</a>)

In [5]:
# Walk every <a> tag and show its tag name, class list and text content.
for anchor in dom.find_all('a'):
    print(anchor.name, anchor['class'], anchor.text)

a ['red'] Go to page1
a ['blue'] Go to page2


In [6]:
# The second argument is an attribute filter: keep only <a> tags whose class is 'blue'.
dom.find_all('a', {'class':'blue'})

[<a class="blue">Go to page2</a>]

In [7]:
# A list of tag names matches any of them: here every <div> and every <a>.
dom.find_all(['div', 'a'])

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>, <a class="red">Go to page1</a>, <a class="blue">Go to page2</a>]

In [8]:
# Match on an attribute alone, whatever the tag name is. The filter is passed
# through the documented `attrs` keyword instead of using an empty string as a
# stand-in for "no tag name".
dom.find_all(attrs={'id': 'result'})

[<div id="result">
 <p class="row">
 <a class="red">Go to page1</a>
 <a class="blue">Go to page2</a>
 </p>
 </div>]

In [9]:
# Test URL: http://pythonscraping.com/pages/page3.html

In [10]:
from urllib import parse
import requests

# Request headers passed to requests.get() by getDownload(); the user-agent
# value is a desktop Chrome browser string.
header = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

def getDownload(url, params=None, retries=3):
    """Fetch ``url`` with an HTTP GET, retrying on 5xx server errors.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters handed to
            ``requests.get``. Defaults to ``None`` (no parameters) rather
            than a shared mutable ``{}``.
        retries: Number of additional attempts made when the server
            answers with a 5xx status.

    Returns:
        The response object of the last attempt. An error response is
        still returned (after its status code, reason and headers have
        been printed), so callers that care should check ``status_code``.

    Only ``requests.exceptions.HTTPError`` is handled here; any other
    exception raised by ``requests.get`` propagates to the caller.
    """
    resp = None

    # Each pass through the loop is one attempt; the loop replaces the
    # original self-recursion but performs the same sequence of requests.
    while True:
        try:
            resp = requests.get(url, params=params, headers=header)
            resp.raise_for_status()
            return resp
        except requests.exceptions.HTTPError as e:
            if 500 <= e.response.status_code < 600 and retries > 0:
                # Server-side failure: report the remaining budget and retry.
                print(retries)
                retries -= 1
                continue

            # Client error, or retry budget exhausted: report and give up.
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)
            return resp

In [11]:
# Download the test page and parse the response body (html.text) with lxml.
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload(url)
dom = BeautifulSoup(html.text, 'lxml')
# Locate the <div id="footer"> element as the starting point for tree navigation.
footer = dom.find('div', {'id':'footer'})


In [12]:
# find_parent() with no arguments returns the tag that encloses `footer`.
parent = footer.find_parent()
# Show which tag that is and what attributes it carries.
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [13]:
# With no arguments find_all() searches recursively, so this collects every
# tag nested anywhere below `parent`, not only its direct children.
children = parent.find_all()
# children  (uncomment to display the full list)

In [14]:
# recursive=False limits the search to the direct children of `parent`.
children = parent.find_all(recursive=False)
for child in children:
    print(child.name, child.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [15]:
# Default (recursive) search: every tag nested anywhere below `parent`.
descendants = parent.find_all()
for node in descendants:
    print(node.name, node.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
p {}
br {}
br {}
table {'id': 'giftList'}
tr {}
th {}
th {}
th {}
th {}
tr {'id': 'gift1', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img1.jpg'}
tr {'id': 'gift2', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img2.jpg'}
tr {'id': 'gift3', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img3.jpg'}
tr {'id': 'gift4', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img4.jpg'}
tr {'id': 'gift5', 'class': ['gift']}
td {}
td {}
span {'class': ['excitingNote']}
td {}
td {}
img {'src': '../img/gifts/img6.jpg'}
div {'id': 'footer'}
br {}


In [16]:
# Direct children of the third direct child of `parent` (children[2]).
divChildren = children[2].find_all(recursive=False)
for child in divChildren:
    print(child.name, child.attrs, child.text)

p {} 
We haven't figured out how to make online shopping carts yet, but you can send us a check to:
123 Main St.
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.


In [17]:
# Keep a handle on children[2] so its siblings can be explored.
divTag = children[2]
# The sibling just before divTag should be the same tag as children[1].
children[1].name, divTag.find_previous_sibling().name

('h1', 'h1')

In [18]:
divTag.find_next_siblings() # all sibling tags that come after divTag

[<table id="giftList">
 <tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005

In [19]:
# find_next_sibling() (singular) returns just the first following sibling,
# which is compared here with children[3].
children[3] == divTag.find_next_sibling()

True

### Cost 값만 모두 가져와 봅시다

In [21]:
# For every table row, take its direct child cells and print the third one
# (index 2) with surrounding whitespace removed.
aList = dom.find_all('tr')
for tr in aList:
    cells = tr.find_all(recursive=False)
    print(cells[2].text.strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


## Google

In [23]:
# Alternative kept for reference: request a complete search URL copied from the browser.
# url = 'https://www.google.com/search?ei=olCDXIG-EpGNr7wPucGkuAc&q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&oq=%ED%8C%8C%EC%9D%B4%EC%8D%AC&gs_l=psy-ab.3..35i39l2j0i67j0l6j0i20i263.9312.9312..9658...0.0..0.111.111.0j1......0....1..gws-wiz.y-nzWDsTdd0'
# html = getDownload(url)

# Let requests build and encode the query string from `params`.
url = 'https://www.google.com/search'
params = {'q':'성소'}
html = getDownload(url,params)

dom = BeautifulSoup(html.text, 'lxml')

# Each element with class "r" is treated as one search result: print the
# heading text, then the link target.
for tag in dom.find_all(attrs={'class': 'r'}):
    title = tag.find('h3')
    link = tag.find('a')
    # find() returns None when nothing matches; skip incomplete result blocks
    # instead of failing on `.text` / `['href']`.
    if title is None or link is None or link.get('href') is None:
        continue
    print(title.text)
    print(link['href'])

성소(우주소녀) - 나무위키
https://namu.wiki/w/%EC%84%B1%EC%86%8C(%EC%9A%B0%EC%A3%BC%EC%86%8C%EB%85%80)
성소 - 나무위키
https://namu.wiki/w/%EC%84%B1%EC%86%8C
성소 (가수) - 위키백과, 우리 모두의 백과사전
https://ko.wikipedia.org/wiki/%EC%84%B1%EC%86%8C_(%EA%B0%80%EC%88%98)
비주얼 파티 중이라는 우주소녀 성소 근황 | 1boon
https://1boon.daum.net/benter/sungso
[SC이슈]"컴백 NO, 예능은 OK"…우주소녀 성소, 中스케줄 논란 - 조선일보
http://news.chosun.com/site/data/html_dir/2018/09/03/2018090301347.html
우주소녀, 13명→10명 체제 컴백…"성소 등 3명은 참여NO" - SBS 뉴스
https://news.sbs.co.kr/news/endPage.do?news_id=N1004917577
우주소녀 성소, CG 아닌가요? | 1boon
https://1boon.kakao.com/newsade/WJSN-SS
우주소녀(WJSN) 성소, 중국에서의 근황 눈길 '더 예뻐진 외모' - MSN.com
https://www.msn.com/ko-kr/entertainment/music/%EC%9A%B0%EC%A3%BC%EC%86%8C%EB%85%80-wjsn-%EC%84%B1%EC%86%8C-%EC%A4%91%EA%B5%AD%EC%97%90%EC%84%9C%EC%9D%98-%EA%B7%BC%ED%99%A9-%EB%88%88%EA%B8%B8-%E2%80%98%EB%8D%94-%EC%98%88%EB%BB%90%EC%A7%84-%EC%99%B8%EB%AA%A8%E2%80%99/ar-BBRn6hi


## Naver

In [24]:
# Alternative kept for reference: request a complete search URL copied from the browser.
# url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=nexearch&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC&oquery=%ED%8C%8C%EC%9D%B4%EC%8D%AC+%EC%9E%90%EC%8A%B5%EC%84%9C&tqi=U45RzdpVuENssbhJI5Cssssstts-309327'
# html = getDownload(url)

# Let requests build and encode the query string from `params`.
url = 'https://search.naver.com/search.naver'
params = {'query':'성소'}
html = getDownload(url,params)

dom = BeautifulSoup(html.text, 'lxml')

# Result lists carry either the "type01" or the "sc_kindic" class; inside each
# list every <dt> holds one result title and its link.
for section in dom.find_all(attrs={'class': ['type01', 'sc_kindic']}):
    for entry in section.find_all('dt'):
        link = entry.find('a')
        # find() returns None when a <dt> has no anchor; skip such entries
        # instead of failing on `['href']`.
        if link is None or link.get('href') is None:
            continue
        print(entry.text)
        print(link['href'])

우주소녀(WJSN) 성소, 인스타그램 속 여전한 미모…한국 활동은 ...
https://m.post.naver.com/viewer/postView.nhn?volumeNo=22868844&memberNo=38506&vType=VERTICAL
창해일성소
https://m.post.naver.com/viewer/postView.nhn?volumeNo=24687927&memberNo=30850&vType=VERTICAL
성소 聖召
https://m.post.naver.com/viewer/postView.nhn?volumeNo=19991347&memberNo=2913016&vType=VERTICAL
수원가톨릭대학교 역사홍보관 개관
https://www.catholictimes.org/article/article_view.php?aid=318765
“완벽한 장소는 없다! 다만 완벽한 고독이 있을 뿐이다!”
http://www.cpbc.co.kr/CMS/newspaper/view_body.php?cid=763050&path=201909
[사제인사] 부산교구
https://www.catholictimes.org/article/article_view.php?aid=318793
[사제인사] 부산교구, 10월 4일 부
http://www.cpbc.co.kr/CMS/newspaper/view_body.php?cid=762923&path=201909
더 강해지는 영웅 무기, 킹스레이드 '소울웨폰' 영상 공개
https://www.gamemeca.com/mv.php?inflow=naver_s&gid=1583818
 부산성소병원   
http://www.seongso.co.kr/
 성소 - 나무위키   
https://namu.wiki/w/%EC%84%B1%EC%86%8C
 안동성소병원   
http://www.sungso.com/
 안동성소병원편도제거수술후기 
https://blog.naver.com/ccmjj?Redirect=Log&logNo=221440296682
 성소후원회 9월