In [None]:
Beautiful Soup은 HTML과 XML 파일에서 데이터를 추출하기 위한 파이썬 라이브러리이다.

사용자가 선호하는 파서(parser)와 함께 동작하며, 파스 트리(parse tree)를 탐색(navigating)하고 검색(searching)하고 수정(modifying)하는 방법을 제공한다.


In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Running the "three sisters" document through Beautiful Soup gives a
# BeautifulSoup object, which represents the document as a nested data structure:

In [2]:
from bs4 import BeautifulSoup
# Parse the sample document with Python's built-in "html.parser" backend.
soup = BeautifulSoup(html_doc, 'html.parser')

# prettify() renders the parse tree as an indented string, one node per line.
print(soup.prettify())


<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [3]:
# Here are some simple ways to navigate that data structure.
# Attribute access by tag name returns the first tag with that name.
soup.title

<title>The Dormouse's story</title>

In [4]:
# .name is the tag's own name as a string.
soup.title.name

'title'

In [5]:
# .string is the text contained in the <title> tag.
soup.title.string

"The Dormouse's story"

In [6]:
# .parent moves one level up the tree; <title> sits inside <head>.
soup.title.parent.name

'head'

In [7]:
# First <p> tag in the document (the one with class "title").
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# Attributes are read with dict-style indexing; "class" comes back as a list.
soup.p['class']


['title']

In [9]:
# First <a> tag in the document.
soup.a


<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [10]:
# find_all() returns every matching tag, here all three <a> tags.
soup.find_all('a')


[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [11]:
# find() returns a single match; a keyword argument filters on that attribute.
soup.find(id="link3")

# A common task is extracting all the URLs found within a page's <a> tags:

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [12]:
# Keep the list of <a> tags in a variable so it can be inspected below.
res = soup.find_all('a')

In [13]:
# Display the stored result of find_all('a').
res

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [14]:
# Print each anchor's URL, followed by that URL split on "/" into its parts.
# href=True limits the search to <a> tags that actually carry an href
# attribute, so link.get('href') is never None and .split() cannot fail with
# AttributeError on an anchor that has no link target.
for link in soup.find_all('a', href=True):
    href = link.get('href')
    print(href)

    # The empty second element comes from the "//" after the scheme.
    a = href.split('/')
    print('a=', a)

http://example.com/elsie
a= ['http:', '', 'example.com', 'elsie']
http://example.com/lacie
a= ['http:', '', 'example.com', 'lacie']
http://example.com/tillie
a= ['http:', '', 'example.com', 'tillie']


In [15]:
# Print the id attribute of every <a> tag, one per line.
for anchor in soup.find_all('a'):
    anchor_id = anchor.get('id')
    print(anchor_id)


link1
link2
link3


In [16]:
# Another common task is extracting all the text from a page.
# get_text() returns the document's text with the markup removed.


print(soup.get_text())



The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [17]:
from bs4 import BeautifulSoup

# Parse an HTML file from disk with the lxml parser.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes html_02.html is saved as UTF-8, as the
# "coding: utf-8" header at the top of the file declares; confirm.
with open("html_02.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, 'lxml')

soup

<html><body><p># -*- coding: utf-8 -*-
"""
Created on Mon Jan  6 12:11:50 2020

@author: Kosmo_24
"""

<!DOCTYPE html>

</p>
<style>
h1 {
  color: blue;
  font-family: verdana;
  font-size: 300%;
}
p  {
  color: red;
 font-family: courier;
  font-size: 160%;
}
</style>
<h1>This is a heading</h1>
<p>This is a paragraph.</p>

p {border: 1px solid powderblue;
  margin: 50px;} 


 </body></html>

In [76]:
# HTML entities such as &eacute; are converted to Unicode characters on parsing.
# The parser is named explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), and parsers disagree on how to
# wrap a bare string. The recorded <html><body><p> wrapper matches lxml.
BeautifulSoup("Sacr&eacute; bleu!", 'lxml')


<html><body><p>Sacré bleu!</p></body></html>

In [19]:
# Build a tiny document and take its <b> element; a Tag object corresponds to
# an XML or HTML tag in the original document.
# The parser is named explicitly so the result does not depend on which
# parsers are installed and no "guessed at parser" warning is raised.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b
type(tag)


bs4.element.Tag

In [20]:
# Same construction as the preceding cell; it (re)defines `soup` and `tag`,
# which the following cells go on to modify.
# The parser is named explicitly so the result does not depend on which
# parsers are installed and no "guessed at parser" warning is raised.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b
type(tag)


bs4.element.Tag

In [21]:
# Every tag has a name, available as .name.
tag.name

'b'

In [22]:
# Assigning to .name renames the tag; the change shows up in the markup
# Beautiful Soup generates for it.
tag.name = "blockquote"
tag


<blockquote class="boldest">Extremely bold</blockquote>

In [23]:
# Read one attribute by treating the tag like a dictionary.
tag['class']

['boldest']

In [24]:
# .attrs exposes all of the tag's attributes as a dict.
tag.attrs

{'class': ['boldest']}

In [25]:
# Attributes can be overwritten or added with dict-style assignment.
tag['class'] = 'verybold'
tag['another-attribute'] = 1
tag


<blockquote another-attribute="1" class="verybold">Extremely bold</blockquote>

In [26]:
# `del` removes attributes, leaving a bare <blockquote> tag.
del tag['class']
del tag['another-attribute']
tag


<blockquote>Extremely bold</blockquote>

In [27]:
# The text inside the tag.
tag.string

'Extremely bold'

In [28]:
# Text inside a tag is held in a NavigableString object, not a plain str.
type(tag.string)


bs4.element.NavigableString

In [29]:
# replace_with() swaps the tag's text for a new string.
tag.string.replace_with("No longer bold")
tag

<blockquote>No longer bold</blockquote>

In [30]:
# Replace a placeholder string inside one XML document with another parsed
# document. Both are parsed with the "xml" parser.
# Fixes: the closing tag was written "</document" without its final ">", and
# the lookup now uses `string=`, the same keyword the later
# soup.find(string=re.compile(...)) cell uses, instead of the older `text=`.
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document>", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
# replace_with() returns the element that was replaced, so the cell displays
# the placeholder text.
doc.find(string="INSERT FOOTER HERE").replace_with(footer)


'INSERT FOOTER HERE'

In [31]:
# The BeautifulSoup object stands for the whole document rather than a real
# tag, so it carries the special name "[document]".
soup.name

'[document]'

In [32]:
# An HTML comment is parsed into a Comment object.
# Note that this rebinds `soup` to a new one-tag document.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup,'lxml')
comment = soup.b.string
type(comment)


bs4.element.Comment

In [33]:
# The comment's value is its text, without the <!-- --> delimiters.
comment

'Hey, buddy. Want to buy a used parser?'

In [34]:
# When rendered as part of the document, the comment keeps its <!-- --> markup.
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [35]:
from bs4 import CData
# Replace the comment inside <b> with a CDATA section.
cdata = CData("A CDATA block")
comment.replace_with(cdata)

# The replacement is rendered with <![CDATA[ ... ]]> delimiters.
print(soup.b.prettify())


<b>
 <![CDATA[A CDATA block]]>
</b>


In [36]:
# Rebuild the "three sisters" document: `soup` was rebound to other small
# documents in the preceding cells, so it is re-created here for the
# navigation and search examples that follow.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')


In [37]:
# Navigate down the tree by tag name: the <head> tag.
soup.head

<head><title>The Dormouse's story</title></head>

In [38]:
# The <title> tag.
soup.title

<title>The Dormouse's story</title>

In [39]:
# Attribute access can be chained: the first <b> tag beneath <body>.
soup.body.b

<b>The Dormouse's story</b>

In [40]:
# Tag-name access yields only the first tag by that name.
soup.a


<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [41]:
# To get every <a> tag rather than just the first, use find_all().
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [42]:
# Keep a reference to <head> for the .contents examples below.
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [43]:
# .contents lists a tag's direct children; <head> has one, the <title> tag.
head_tag.contents

[<title>The Dormouse's story</title>]

In [44]:
# The first (and only) child of <head> is the <title> tag.
title_tag = head_tag.contents[0]
title_tag


<title>The Dormouse's story</title>

In [45]:
# The <title> tag's only child is its text string.
title_tag.contents

["The Dormouse's story"]

In [46]:
# Number of direct children of the BeautifulSoup object itself.
len(soup.contents)

2

In [49]:
# No output was recorded for this cell, i.e. the expression evaluated to None.
# NOTE(review): html_doc starts with a newline, so the first child is
# presumably that newline string rather than the <html> tag, and a text
# node's .name is None. soup.contents[1].name would likely give 'html'; verify.
soup.contents[0].name

In [52]:
# Search by tag name: all <title> tags.
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [54]:
# A second positional string filters on CSS class: <p> tags with class "title".
soup.find_all("p","title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [56]:
# All <a> tags.
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [58]:
# A keyword argument filters on the attribute of that name: tags with id "link2".
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [60]:
import re
# string= searches text nodes instead of tags; a compiled regex matches any
# string that contains "sisters". find() returns only the first match.
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [62]:
# Same id filter as the earlier find_all(id="link2") cell, repeated here.
soup.find_all(id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [64]:
# Attribute filters accept a regex: tags whose href contains "elsie".
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [66]:
# True matches any tag that has the attribute at all, whatever its value.
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [68]:
# Several keyword filters can be combined; a tag must satisfy all of them.
soup.find_all(href=re.compile("elsie"),id='link1')


[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [72]:
# Attribute names such as "data-foo" are not valid Python keyword-argument
# names, so they are passed through the attrs dictionary instead.
# The parser is named explicitly so the result does not depend on which
# parsers are installed and no "guessed at parser" warning is raised.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'lxml')
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [74]:
# Repeats the attrs-dictionary query from the cell that created data_soup.
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [75]:
# The parser is named explicitly so the result does not depend on which
# parsers are installed and no "guessed at parser" warning is raised.
name_soup = BeautifulSoup('<input name="email"/>', 'lxml')
# `name` is find_all()'s own argument for the tag name, so this looks for
# <email> tags rather than for a name="email" attribute, and finds nothing.
name_soup.find_all(name="email")
# []
# To filter on the HTML attribute called "name", pass it via attrs.
name_soup.find_all(attrs={"name": "email"})


[<input name="email"/>]