In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.font_manager as fm
font_name = fm.FontProperties(fname='C:/Windows/Fonts/malgun.ttf').get_name()
plt.rc("font",family=font_name)

import matplotlib as mlp
mlp.rcParams['axes.unicode_minus'] = False

import seaborn as sns

plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

## XML(Extension Markup Language)

https://docs.python.org/3/library/xml.etree.elementtree.html?highlight=xml#xml.etree.ElementTree.XML

1.특징
    1) 메타 언어 : GML -> SGML -> XML
    2) 데이터를 위한 언어
    3) 데이터베이스 용도
    4) 데이터 표준화 : 이 기종 시스템간의 정보교환, 웹서비스, 유비쿼터스, 사물인터넷 기타등등...
 
2. 문법(물리적인 구성요소)
    1) XML(.xml)
    2) DTD(.dtd) : XML 작성을 위한 설계문서
    3) XML Schema(.xsd) : XML 작성을 위한 설계문서
    4) XSL(.xsl, xslt) : XML문서에 스타일을 입혀주는 언어
 
3. 결과 문서의 종류
    1) Well-Formed XML Document(잘 짜여진 문서)
    2) Valid XML Document(유효한 문서)
 
4. Element 문법
    - 작성 규칙
        a) 모든 XML문서는 반드시 단 하나의 루트 엘리먼트를 가진다.
        b) 루트 엘리먼트는 여러 개의 자식 엘리먼트를 가질 수 있고
            또 그 자식도 자신의 자식을 가질 수 있다.
        c) 시작태그와 끝태그는 반드시 짝을 이루어야 한다.
        d) "<"는 값으로 사용 불가. ">"는 사용할 수 있으나 가급적 사용 금지
        e) "<"와 ">" 다음에 공백문자가 올 수 없으며 반드시 시작태그와 끝태그의 이름이 같아야 한다.
 
    - 종류
        a) 내용을 가지는 엘리먼트
        b) 내용이 없는 엘리먼트
 
    - 내용에 대한 종류
        a) 문자 데이터
        b) 자식 엘리먼트
        c) 엔티티 또는 문자 참조
        d) CDATA Section
        e) 프로세싱 지시자
        f) 주석 : <!-- 주석 내용 -->
        g) 공백 문자열
 
5. DTD 
    - 종류
        내부 DTD
        외부 DTD
 
    - 구성 요소
        엘리먼트 선언
        속성(attribute) 선언
        Entity 선언
        Notation 선언
        프로세싱 지시자
        파라미터 엔티티 참조
        주석
        공백
        conditional section
 
    - 문서 유형 선언
        <!DOCTYPE 루트엘리먼트명 SYSTEM 또는 PUBLIC "식별자">
            DOCTYPE은 반드시 대문자
            SYSTEM은 특정 단체나 업체내부에서 사용되는 경우
            PUBLIC은 공개
            식별자는 다운로드 받을 수 있는 경로
            PUBLIC일 경우 추가된 식별자
            +-//DTD를 개발 및 유지보수 업체명//DTD명 및 버전번호//사용된 언어
 
    - 엘리먼트 선언
        <!ELEMENT 엘리먼트명 컨텐트 유형>
            컨텐트 유형
                #PCDATA    : 내용으로 문자데이터만 갖는 엘리먼트
                자식 엘리먼트
                    , : 작성 순서
                    | : 선택
                    ? : 생략하거나 한번만 작성
                    + : 한번 이상
                    * : 생략하거나 여러번 작성
                    기호 없음 : 단 한번만 작성
                EMPTY
                MIXED : 문자데이터 또는 자식엘리먼트를 혼합형태
                ANY
 
    - ATTRIBUTE 선언
        <!ATTLIST 엘리먼트명 속성명 속성유형 디폴트선언>
            속성 유형
                CDATA :  문자 데이터
                ENUMERATION    : dtd에 나열된 값 중 하나가 와야 함
                ID : 유일한 값을 지정
                IDREF/IDREFS : ID값을 참조
                NMTOKEN/NMTOKENS : 이름 작성 규칙을 준수하는 데이터만 사용
                NOTATION : dtd에 명시적으로 선언된 notation만 사용가능
                ENTITY : dtd에 명시적으로 선언된 entity만 사용 가능
 
    - validation check
        https://www.xmlvalidation.com/
 
6. 네임 스페이스
    CML
    -----
        <?xml version=1.0>
        <정보>
            <고유번호>111111-1111111</고유번호>
            <이름>홍길동</이름>
            ...
        </정보>
 
    PML
    -----
        <?xml version=1.0>
        <정보>
            <고유번호>LC100</고유번호>
            <이름>캠코더</이름>
            ...
        </정보>
 
    OML
    -----
        <?xml version=1.0>
        <주문정보 xmlns:고객="http://www.a.com/2017/Custom"
            xmlns:상품="http://www.a.com/2017/Product"
            xmlns="http://www.a.com/2017/Order">
            <주문번호>1</주문번호>
            <주문수량>10</주문수량>
            <결제>
                <방법>현금</방법>
                <금액>10000000</금액>
            </결제>
 
            <고객:고유번호>111111-1111111</고객:고유번호>
            <고객:이름>홍길동</고객:이름>
 
            <상품:고유번호>LC100</상품:고유번호>
            <상품:이름>캠코더</상품:이름>
            ...
        </주문정보>

## 1. XML 데이터 불러오기

In [26]:
import xml.etree.ElementTree as elemTree
tree1 = elemTree.parse('data/user.xml')
tree1

FileNotFoundError: [Errno 2] No such file or directory: 'data/user.xml'

In [53]:
xmlstr = '''<?xml version="1.0" encoding="utf-8" ?>
<users>
	<user grade="gold">
            <name>Kim Cheol Soo</name>
            <age>25</age>
            <birthday>19940215</birthday>
        </user>
        <user grade="diamond">
            <name>Kim Yoo Mee</name>
            <age>21</age>
            <birthday>19980417</birthday>
        </user>
</users>
'''
tree1 = elemTree.fromstring(xmlstr)
tree1

<Element 'users' at 0x00000146FA22D040>

## 2. XML 데이터 다루기

#### 1) 태그명 검색

In [55]:
#data = tree1.find('user')
data = tree1.find('user[1]')

##### Xpath : ./, ../, ...
data = tree1.find('./user[1]')

print(data)
print(type(data))
dir(data)
print(data.tag)
print(data.attrib)
print(data.get("grade"))

print('-----------------------------')

username = data.find('name')
print(username.tag)
print(username.attrib)
print(username.text)

<Element 'user' at 0x00000146FA230CC0>
<class 'xml.etree.ElementTree.Element'>
user
{'grade': 'gold'}
gold
-----------------------------
name
{}
Kim Cheol Soo


#### 2) 태그 조건으로 검색

In [56]:
#data = tree1.find('./user[@grade]')
data = tree1.find('./user[@grade][2]')


print(data.attrib)
print(data.keys())
print(data.items())

{'grade': 'diamond'}
['grade']
[('grade', 'diamond')]


#### 3) 여러개의 태그를 한꺼번에 가져오기

In [57]:
users = tree1.findall('./user')
users

for user in users:
    print(user.attrib)
    print(user.find('name').text)

{'grade': 'gold'}
Kim Cheol Soo
{'grade': 'diamond'}
Kim Yoo Mee


#### 4) document sample

In [12]:
str = '''<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>
'''

In [15]:
# singapore에 있는 이웃나라 이름은?
data = str.find(data[1])

# Panama에 있는 이웃나라 중 Costa Rica의 방향은?

# 각 나라의 gdppc를 조회

# 각 나라의 인접국가 조회

NameError: name 'data' is not defined

In [58]:
import urllib.request as req
import requests
health = req.get('http://www.khan.co.kr/rss/rssdata/kh_sports.xml')
health.text()

AttributeError: module 'urllib.request' has no attribute 'get'

---

## 2. JSON(Javascript Object Notation)

In [29]:
import json 

# dumps() : 데이터를 저장
# loads() : 데이터를 불러올 때



In [47]:
j1 = {'name':'홍길동','birth':'0519','age':20}
print(type(j1))
print(j1)

print('----------------------------------------')

#j2 = json.dumps(j1)
j2 = json.dumps(j1,indent=2)

print(j2)
print(type(j2))

print('----------------------------------------')

# 리스트나 튜플로 json으로 변환
#j3 = json.dumps([1,2,3])
j3 = json.dumps((1,2,3))
print(j3)
print(type(j3))

print('----------------------------------------')

j4 = json.loads(j2)
print(j4)

<class 'dict'>
{'name': '홍길동', 'birth': '0519', 'age': 20}
----------------------------------------
{
  "name": "\ud64d\uae38\ub3d9",
  "birth": "0519",
  "age": 20
}
<class 'str'>
----------------------------------------
[1, 2, 3]
<class 'str'>
----------------------------------------
{'name': '홍길동', 'birth': '0519', 'age': 20}


In [60]:
obj = """
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
"""
print(type(obj))

<class 'str'>


In [67]:
result = json.loads(obj)
print(type(result))
print(result)

print('---------------------------------')

print(result['batters']['batter'][0]['id'])

<class 'dict'>
{'id': '0001', 'type': 'donut', 'name': 'Cake', 'ppu': 0.55, 'batters': {'batter': [{'id': '1001', 'type': 'Regular'}, {'id': '1002', 'type': 'Chocolate'}, {'id': '1003', 'type': 'Blueberry'}, {'id': '1004', 'type': "Devil's Food"}]}, 'topping': [{'id': '5001', 'type': 'None'}, {'id': '5002', 'type': 'Glazed'}, {'id': '5005', 'type': 'Sugar'}, {'id': '5007', 'type': 'Powdered Sugar'}, {'id': '5006', 'type': 'Chocolate with Sprinkles'}, {'id': '5003', 'type': 'Chocolate'}, {'id': '5004', 'type': 'Maple'}]}
---------------------------------
1001


---

## BeautifulSoup

#### 1) 웹 소스 가져오기

In [1]:
import urllib.request as req 
from urllib.request import urlopen
from urllib.error import HTTPError,URLError


In [2]:
##### 텍스트 내용 읽어오기

html = urlopen('http://google.com')
print(type(html))
print(type(html.read()))

<class 'http.client.HTTPResponse'>
<class 'bytes'>


In [3]:
from urllib.error import HTTPError,URLError

try:
    naver = urlopen('https://www.naver.com/abc.jsp')
except HTTPError as e:
    print('HTTP error 입니다',e)
except URLError as e:
    print('URL error 입니다',e)
else:
    print(naver.read())

HTTP error 입니다 HTTP Error 404: Not Found


In [4]:
##### 이미지 가져오기

req.urlretrieve('https://t1.daumcdn.net/daumtop_chanel/op/20200723055344399.png','data/daum.png')
print('저장')

저장


In [12]:
img = urlopen('https://t1.daumcdn.net/daumtop_chanel/op/20200723055344399.png')
img_data = img.read()
img_data

f = open('data/daum2.png','wb')
f.write(img_data)
print('저장')
f.close()

저장


In [5]:
import urllib.parse

##### 요청 방식()
# 프로토콜://서버주소:포트번호/폴더 또는 파일명
# 프로토콜://서버주소:포트번호/파일명?변수명=값&변수명=값

# http://www.weather.go.kr/weather/lifenindustry/sevice_res.jsp

api ='http://www.weather.go.kr/weather/lifenindustry/sevice_rss.jsp'
value = {'stnld': 109}

params = urllib.parse.urlencode(value)
print(params)

url = api + '?' + params
print(url)

data = urlopen(url).read()
print(data)
data.decode('utf-8')
print(data)

stnld=109
http://www.weather.go.kr/weather/lifenindustry/sevice_rss.jsp?stnld=109


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 118: invalid start byte

#### 2) BeautifulSoup 사용법

In [6]:
from bs4 import BeautifulSoup

In [7]:
page = open('data/test_first.html').read()
page

soup = BeautifulSoup(page,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by Netsong7
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://netsong7.synology.me" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>


In [13]:
##### 원하는 위치에 접근하는 방법

list(soup.children)
#list(soup.children)[0]
#list(soup.children)[1]
#list(soup.children)[2]


['html',
 '\n',
 <html><head>
 <title>Very Simple HTML Code by Netsong7</title>
 </head><body>
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>
 </body>
 </html>]

In [35]:
html = list(soup.children)[2]
list(html.children)
list(html.children)[0]

body = list(html.children)[1]
list(body.children)

['\n',
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>,
 '\n',
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 '\n',
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>,
 '\n']

In [37]:
##### 태그명으로 접근
soup.head
soup.body
soup.body.div
soup.body.div.p


<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>

In [40]:
##### find(), findall()

soup.find('p')
soup.find_all('p')


[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [41]:
soup.find('p',class_='outer-text')

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

In [42]:
soup.find('p',id='second')

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

In [56]:
soup.head
soup.head.next_sibling

soup.body
soup.body.previous_sibling

soup.body.p.next_sibling.next_sibling.next_sibling

soup.body.parent

<html><head>
<title>Very Simple HTML Code by Netsong7</title>
</head><body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [64]:
##### 데이터(Text Node) 가져오기

soup.html.get_text()
soup.head.get_text()
soup.title.get_text()
soup.div.get_text()
soup.p.get_text()

for p in soup.find_all('p'):
    print(p.get_text())


                Happy PinkWink.
                PinkWink


                Happy Data Science.
                Python



                Data Science is funny.
            



                All I need is Love.
            



In [66]:
##### 속성에 접근하기

link = soup.find('a')
link

link['href']

links = soup.find_all('a')
for n in links:
    print(n['href'])

http://netsong7.synology.me
https://www.python.org


---
### 실습예제

#### 1) 네이버에서 환율정보 가져오기

+ https://finance.naver.com/marketindex/

In [80]:
from urllib.request import urlopen

url = 'https://finance.naver.com/marketindex/'
page = urlopen(url)
page

soup = BeautifulSoup(page,'html.parser')

In [82]:
span = soup.find_all("span", class_ = "value")
span

print('미 환율:',span[0].get_text())

미 환율: 1,115.30


In [92]:
div = soup.find_all('div',class_='head_info')
span = div[0].find_all('span')
span
print('미 환율:',span[0].get_text())

미 환율: 1,115.30


In [93]:
# CSS

soup.select_one('div.head_info > span.value')
span.get_text()

<span class="value">1,115.30</span>

#### 2) 스프래핑 연습

In [95]:
url = 'http://www.pythonscraping.com/pages/warandpeace.html'
page = urlopen(url)
soup = BeautifulSoup(page,'html.parser')

In [103]:
# 녹색 단어만 골라오기
span = soup.find_all('span',class_='green')
for w in span:
    print(w.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [106]:
green = soup.select('div#text > span.green')
green

for w in green:
    print(w.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [107]:
green = soup.find_all('span',{'class':'green'})

In [117]:
##### 제목 추출
# h1, h2, ,,, h6
h1=[]
h1 = soup.find_all(['h1','h2'])
for i in h1:
    print(i.get_text())
print([i.get_text() for i in h1])

War and Peace
Chapter 1
['War and Peace', 'Chapter 1']


In [122]:
green = soup.find_all('span',{'class':['green','red']})
for c in green:
    print(c.get_text())

Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
If you have nothing better to do, Count [or Prince], and if the
prospect of spending an evening with a poor invalid is not too
terrible, I shall be very charmed to see you tonight between 7 and 10-
Annette Scherer.
Heavens! what a virulent attack!
the prince
Anna Pavlovna
First of all, dear friend, tell me how you are. Set your friend's
mind at rest,
Can one be well while suffering morally? Can one be calm in times
like these if one

In [175]:
url = 'http://www.pythonscraping.com/pages/page3.html'
page = urlopen(url)
soup = BeautifulSoup(page,'html.parser')

In [161]:
##### 제목행은 건너뛰고 나머지 모든 행 리스트를 수집
# data = soup.find_all('tr',{'id':['gift1','gift2','gift3','gift4','gift5']})
# for i in data:
#     print(i.get_text())

for i in range(5):
    data = soup.find_all('tr',id = 'gift{}'.format(i))
    print(data)
    
    for j in data:
        print(j.get_text())

#print(a)


[]
[<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>]

Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00



[<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>]

Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!


In [134]:
tb = soup.find('table',{'id':'giftList'}).tr.next_siblings
#list(tb)

for tr in tb:
    print(tr)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [139]:
data = soup.find_all('tr',{'id':['gift1','gift2','gift3','gift4','gift5']})
for i in data:
    print(i.get_text())


Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00




Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!

$10,000.52




Fish Painting

If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!

$10,005.00




Dead Parrot

This is an ex-parrot! Or maybe he's only resting?

$0.50




Mystery Box

If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!

$1.50





In [174]:
tb = soup.find('table',{'id':'giftList'}).tr.next_siblings
#list(tb)

for tr in tb:
    print(tr)

for tr in tb:
    if(type(tr)==bs4.element.Tag):
        print(tr.get_text())
        print(list(tr.children)[3].img['scr'])



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [173]:
##### 가격중에 $15.00 수집

data = soup.find_all('td',)
for i in range(5):
    data = soup.find_all('tr',id = 'gift{}'.format(i))
    print(data)
    
    for j in data:
        if j.
        print(j.get_text())

SyntaxError: invalid syntax (<ipython-input-173-9a8c2f094c11>, line 9)

In [187]:
tb = soup.find('table',{'id':'giftList'}).td.next_sibling.next_sibling
print(tb.get_text())


$15.00



In [188]:
tb = soup.find('table',id='giftList').td.next_sibling.next_sibling
print(tb.get_text())


$15.00



___
## Selenium

+ PhantomJS 또는 Chrome Driver
    - 구글에서 'chrome driver'로 검색하여 현재 버전에 맞는 프로그램을 다운로드
    
+ pip install selenium


#### 1) 로그인 해야하는 경우

In [9]:
##### 접속 프로토콜이 http인경우
#session = 서버접속시 해당서버에 자동으로 생성되는 메모리 공간

import requests

url = 'https://www.hanbit.co.kr/member/login_proc.php'

session = requests.session()
sess = session.post(url,{'m_id':'xxx','m_passwd':'xxx','return_url':'https://www.hanbit.co.kr'})
print(sess)

my_url = 'https://www.hanbit.co.kr/myhanbit/myhanbit.html'
sess = session.get(my_url)
#print(sess.text)

soup = BeautifulSoup(sess.text,'html.parser')
mileage = soup.select_one('dl.mileage_section1').get_text()
print(mileage)

ecoin = soup.select_one('dl.mileage_section2').get_text()


<Response [200]>


NameError: name 'BeautifulSoup' is not defined

#### 2) selenium 사용법

In [10]:
from selenium import webdriver

In [13]:
driver = webdriver.Chrome('C:/Users/leewy/Downloads/chromedriver_win32/chromedriver')
driver.get('https://naver.com')

True

In [None]:
driver.save_screenshot('data/001.png')

In [32]:
driver = webdriver.Chrome('C:/Users/leewy/Downloads/chromedriver_win32/chromedriver')
driver.get('https://accounts.kakao.com/login?continue=https%3A%2F%2Flogins.daum.net%2Faccounts%2Fksso.do%3Frescue%3Dtrue%26url%3Dhttps%253A%252F%252Fwww.daum.net%252F')

elem_login = driver.find_element_by_id('id_email_2')
elem_login.clear()
elem_login.send_keys('leeus96@naver.com')

elem_pw = driver.find_element_by_id('id_password_3')
elem_pw.clear()
elem_pw.send_keys('l@2046192')

xpath = '''//*[@id="login-form"]/fieldset/div[8]/button[1]'''
driver.find_element_by_xpath(xpath).click()



In [33]:
driver.close()