In [1]:
from bs4 import BeautifulSoup
import re

# get_news_section: extract article links and titles from a news list

In [2]:
# Sample page used by the cells below: a single <div class="section"> that
# holds an <h2> heading and a <ul> of three news links (/news/news1..3).
html_doc = """
<html>
    <head>
        <title>Home</title>
    </head>
    <body>
        <div class="section">
            <h2>영역 제목</h2>
                <ul> 
                    <li><a href="/news/news1">기사 제목1</a></li>
                    <li><a href="/news/news2">기사 제목2</a></li>
                    <li><a href="/news/news3">기사 제목3</a></li>
                </ul>
        </div>
    </body>
</html>
"""

In [3]:
# Build the parse tree with the 'html.parser' backend, then print an indented
# rendering of the document with one node per line.
soup = BeautifulSoup(html_doc, features='html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   Home
  </title>
 </head>
 <body>
  <div class="section">
   <h2>
    영역 제목
   </h2>
   <ul>
    <li>
     <a href="/news/news1">
      기사 제목1
     </a>
    </li>
    <li>
     <a href="/news/news2">
      기사 제목2
     </a>
    </li>
    <li>
     <a href="/news/news3">
      기사 제목3
     </a>
    </li>
   </ul>
  </div>
 </body>
</html>



In [4]:
# Accessing a tag name as an attribute returns that tag itself, markup included.
print(soup.title)

<title>Home</title>


In [5]:
# .string yields only the text inside the tag, without the surrounding markup.
print(soup.title.string)

Home


In [6]:
# .parent steps up to the enclosing tag and .name is that tag's name;
# <title> sits directly inside <head>.
print(soup.title.parent.name)

head


In [7]:
# soup.div is the <div class="section"> element together with everything nested in it.
print(soup.div)

<div class="section">
<h2>영역 제목</h2>
<ul>
<li><a href="/news/news1">기사 제목1</a></li>
<li><a href="/news/news2">기사 제목2</a></li>
<li><a href="/news/news3">기사 제목3</a></li>
</ul>
</div>


In [8]:
# Indexing a tag like a dict reads an attribute; class comes back as a list of class names.
print(soup.div['class'])

['section']


In [9]:
# The document has three <li> tags; attribute-style access returns only the first one.
print(soup.li)

<li><a href="/news/news1">기사 제목1</a></li>


In [10]:
# find_all returns every <li> in the document, in document order.
print(soup.find_all('li'))

[<li><a href="/news/news1">기사 제목1</a></li>, <li><a href="/news/news2">기사 제목2</a></li>, <li><a href="/news/news3">기사 제목3</a></li>]


In [11]:
# `class` is a reserved word in Python, so the filter is spelled class_;
# this returns the tags whose class is "section" (here the single <div>).
print(soup.find_all(class_="section"))

[<div class="section">
<h2>영역 제목</h2>
<ul>
<li><a href="/news/news1">기사 제목1</a></li>
<li><a href="/news/news2">기사 제목2</a></li>
<li><a href="/news/news3">기사 제목3</a></li>
</ul>
</div>]


In [12]:
# A compiled regex can serve as an attribute filter: every tag whose href
# matches "/news" is returned (the three article links).
print(soup.find_all(href=re.compile("/news")))

[<a href="/news/news1">기사 제목1</a>, <a href="/news/news2">기사 제목2</a>, <a href="/news/news3">기사 제목3</a>]


In [13]:
# Collect the article links and print each one's URL followed by its title.
# The search is limited to <a> tags so that other elements carrying an href
# (for example <link>) can never end up in the result.
news_list = soup.find_all('a', href=re.compile("/news"))
for news in news_list:
    print(news["href"])
    # get_text() joins all text inside the link; .string would be None as soon
    # as the anchor holds more than one child (e.g. <a><b>New</b> title</a>).
    print(news.get_text())

/news/news1
기사 제목1
/news/news2
기사 제목2
/news/news3
기사 제목3


# get_side: read column headers and cell values from a sidebar table

In [14]:
# Sample sidebar markup: a table whose <thead> has two column headers and whose
# <tbody> has two rows, each with a linked <th> row header and two <td> values.
html_table = """
<html>
    <div class="aside_section"> 
        <table class="tbl"> 
            <thead>
                <tr> 
                    <th scope="col">컬럼1</th> 
                    <th scope="col">컬럼2</th> 
                </tr> 
            </thead>
            <tbody>
            <tr> 
                <th><a href="/aside1">항목1</a></th> 
                <td>항목1값1</td> 
                <td>항목1값2</td> 
            </tr>
            <tr>
                <th><a href="/aside2">항목2</a></th> 
                <td>항목2값1</td> 
                <td>항목2값2</td> 
            </tr>
            </tbody>
        </table>
    </div>
</html>
"""

In [15]:
# Parse the table markup and print its indented tree. Note that this rebinds
# `soup`: from here on it refers to the table document, not the news page.
soup = BeautifulSoup(html_table, features='html.parser')
print(soup.prettify())

<html>
 <div class="aside_section">
  <table class="tbl">
   <thead>
    <tr>
     <th scope="col">
      컬럼1
     </th>
     <th scope="col">
      컬럼2
     </th>
    </tr>
   </thead>
   <tbody>
    <tr>
     <th>
      <a href="/aside1">
       항목1
      </a>
     </th>
     <td>
      항목1값1
     </td>
     <td>
      항목1값2
     </td>
    </tr>
    <tr>
     <th>
      <a href="/aside2">
       항목2
      </a>
     </th>
     <td>
      항목2값1
     </td>
     <td>
      항목2값2
     </td>
    </tr>
   </tbody>
  </table>
 </div>
</html>



In [16]:
# soup.table is the <table class="tbl"> element with its full <thead> and <tbody> contents.
print(soup.table)

<table class="tbl">
<thead>
<tr>
<th scope="col">컬럼1</th>
<th scope="col">컬럼2</th>
</tr>
</thead>
<tbody>
<tr>
<th><a href="/aside1">항목1</a></th>
<td>항목1값1</td>
<td>항목1값2</td>
</tr>
<tr>
<th><a href="/aside2">항목2</a></th>
<td>항목2값1</td>
<td>항목2값2</td>
</tr>
</tbody>
</table>


In [17]:
# Search only inside <thead>: tags whose scope attribute matches the regex "col",
# which here are the two column-header cells.
print(soup.thead.find_all(scope=re.compile("col")))

[<th scope="col">컬럼1</th>, <th scope="col">컬럼2</th>]


In [18]:
# Pull the text of every column-header cell in <thead> into a list.
header_cells = soup.thead.find_all(scope=re.compile("col"))
col_list = [header.string for header in header_cells]
print(col_list)

['컬럼1', '컬럼2']


In [19]:
# Gather the data rows: every <tr> found under <tbody>.
table_body = soup.tbody
tr_list = table_body.find_all("tr")
print("tr list", tr_list)

tr list [<tr>
<th><a href="/aside1">항목1</a></th>
<td>항목1값1</td>
<td>항목1값2</td>
</tr>, <tr>
<th><a href="/aside2">항목2</a></th>
<td>항목2값1</td>
<td>항목2값2</td>
</tr>]


In [20]:
# Walk the body rows and print each value cell's text; the <th> row headers
# are skipped because only <td> cells are requested.
for row in tr_list:
    value_cells = row.find_all("td")
    for cell in value_cells:
        print("tr td", cell.string)

tr td 항목1값1
tr td 항목1값2
tr td 항목2값1
tr td 항목2값2
