# ライブラリ

In [73]:
from bs4 import BeautifulSoup
import requests
import re

In [3]:
html = """
<!DOCTYPE html>
<html lang="ja">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <h1>タイトル</h1>
    <h2>演習内容</h2>
    <ol id="step1" class="study-list">
        <li>Python基礎</li>
        <li>HTML基礎</li>
    </ol>

    <ol id="step2" class="study-list">
        <li>Python応用</li>
        <li>HTML応用</li>
    </ol>
</body>
</html>
"""

In [7]:
soup = BeautifulSoup(html, "lxml")
print(soup.h1.text) # 解析したHTMLからh1タグのテキストを取得
print(soup.h2.text) # 解析したHTMLからh2タグのテキストを取得

タイトル
演習内容


In [None]:
# html.parserとは？
# html.parser -> python標準のHTML解析器(ライブラリのインストールが必要ない)
# lxml -> 高速なHTML解析器(ライブラリのインストールが必要)
# html5lib -> HTML5の仕様に準拠したHTML解析器, ただし処理が遅い(ライブラリのインストールが必要)

In [13]:
url = "https://www.anaconda.com/"
res = requests.get(url)

# res.text
soup = BeautifulSoup(res.content, "lxml") # res.textでもOK
# res.contentの方が良い -> res.textだと文字化けすることがある
print(soup.h1.text) # 解析したHTMLからh1タグのテキストを取得
print(soup.find("h1").text) # h1タグを取得

# 基本的にはfindメソッドを使う
# 追加要素class名とかも指定できる soup.find("h1", class_="class名")
# 全ての要素を取得したい場合はfind_allメソッドを使う

The Operating System for AI
The Operating System for AI


In [24]:
url = "https://www.anaconda.com/"
res = requests.get(url)

# res.text
soup = BeautifulSoup(res.content, "lxml") # res.textでもOK

print(soup.h1.text) 
# print(soup.find_all("span", class_ ="bs-pro-button").text)  # リストで返ってくるのでエラーになる
for span in soup.find_all("span", class_ ="bs-pro-button"):
    print(span.text) # すべてのspanタグのテキストを取得

# 一つだけ取得したい場合は
print("========================")
print(soup.find_all("span", class_ ="bs-pro-button")[0].text)

The Operating System for AI
Explore Anaconda Hub
Create Account
Learn More
Learn More
Learn More
Learn More
Get the Report
Press Releases
Blog
Docs
Learning Catalog
PyScript
EduBlocks
PythonAnywhere
Contact Sales
Explore Anaconda Hub


In [25]:
print(soup.find(attrs={"class": "bs-pro-button"}).text) # これでもOK

Explore Anaconda Hub


### 練習問題

In [52]:
url = "https://www.python.org/"
res = requests.get(url)
# アクセスできるか確認
# print(res.status_code)

soup = BeautifulSoup(res.content, "lxml") 
# Latest Newsがあるブロックを取得
latest_news = soup.find("div", class_="medium-widget blog-widget")
latest_news = latest_news.find("ul" , class_="menu")
# latest_news.text
latest_news_title = latest_news.find_all("a")
latest_news_date = latest_news.find_all("time")
# for tem in latest_news_title:
for i, (title, date) in enumerate(zip(latest_news_title, latest_news_date), 0):
    print(f"===================={i}====================")
    print(f"タイトル : {title.text}")
    print(f"日付 : {date.text}")



タイトル : Python 3.13.0RC2, 3.12.6, 3.11.10, 3.10.15, 3.9.20, and 3.8.20 are now available!
日付 : 2024-09-07
タイトル : Pallets projects added to scope of PSF CVE Numbering Authority
日付 : 2024-09-05
タイトル : Python Developers Survey 2023 Results
日付 : 2024-08-29
タイトル : Ask questions or tell us what you think: Introducing monthly PSF Board Office Hours!
日付 : 2024-08-27
タイトル : Announcing Python Software Foundation Fellow Members for Q1 2024! 🎉
日付 : 2024-08-13


In [55]:
# 回答
url = "https://www.python.org/"
res = requests.get(url)
# アクセスできるか確認
# print(res.status_code)

soup = BeautifulSoup(res.content, "lxml") 
# Latest Newsがあるブロックを取得
post = soup.find("div", class_="medium-widget blog-widget")

for i, li in enumerate(post.find_all("li")):
    title = li.find("a").text
    date = li.find("time").text
    print("="* 30, i, "="*30)
    print(f"タイトル : {title}")
    print(f"日付 : {date}")



タイトル : Python 3.13.0RC2, 3.12.6, 3.11.10, 3.10.15, 3.9.20, and 3.8.20 are now available!
日付 : 2024-09-07
タイトル : Pallets projects added to scope of PSF CVE Numbering Authority
日付 : 2024-09-05
タイトル : Python Developers Survey 2023 Results
日付 : 2024-08-29
タイトル : Ask questions or tell us what you think: Introducing monthly PSF Board Office Hours!
日付 : 2024-08-27
タイトル : Announcing Python Software Foundation Fellow Members for Q1 2024! 🎉
日付 : 2024-08-13


In [58]:
# 回答
url = "https://www.python.org/"
res = requests.get(url)
# アクセスできるか確認
# print(res.status_code)
soup = BeautifulSoup(res.content, "lxml") 
# Latest Newsがあるブロックを取得
post = soup.find("div", class_="medium-widget blog-widget")

# post.find("li").find("a") # 最初のliタグの中にあるaタグを取得

print(post.find("li").find("a").attrs) # aタグの属性を取得 (属性は辞書型で返ってくる)-> hrefが取得できる 複数だったら全部取得できる
print(post.find("li").find("a").attrs["href"]) # aタグの属性を取得 (属性は辞書型で返ってくる)-> hrefが取得できる 複数だったら全部取得できる




{'href': 'https://pythoninsider.blogspot.com/2024/09/python-3130rc2-3126-31110-31015-3920.html'}
https://pythoninsider.blogspot.com/2024/09/python-3130rc2-3126-31110-31015-3920.html


In [61]:
print(post.find("li").find("a").get("href")) # URL取得 getメソッドでも取得できる(なかったらNoneが返ってくる)
print(post.find("li").find("a").attrs["href"]) # URL取得 attrsメソッドでも取得できる(なかったらエラーになる)

https://pythoninsider.blogspot.com/2024/09/python-3130rc2-3126-31110-31015-3920.html


In [72]:
print(post.find(text="Latest News").parent) # Latest Newsの親要素を取得
print(soup.find(href="/about/"))

<h2 class="widget-title"><span aria-hidden="true" class="icon-news"></span>Latest News</h2>
<a class="" href="/about/" title="">About</a>


  print(post.find(text="Latest News").parent) # Latest Newsの親要素を取得


In [75]:
# print(soup.find_all(text=re.compile("Python"))) # Pythonという文字列が含まれている要素を取得

for text in soup.find_all(text=re.compile("Python")):
    print(text) # pythonという文字列が含まれている要素を取得


Welcome to Python.org
Python
 The Python Network
                
Python Brochure
Python Books
Python Essays
Python Conferences
Python Logo
Python Wiki
Python News
Python Events
Python Events Archive
# Python 3: Fibonacci series up to n
The core of extensible programming is defining functions. Python allows mandatory and optional arguments, keyword arguments, and even arbitrary argument lists. 
More about defining functions in Python 3
# Python 3: List comprehensions
Lists (known as arrays in other languages) are one of the compound data types that Python understands. Lists can be indexed, sliced and manipulated with other built-in functions. 
More about lists in Python 3
# Python 3: Simple arithmetic
Calculations are simple with Python, and expression syntax is straightforward: the operators 
More about simple math functions in Python 3
Python knows the usual control flow statements that other languages speak — 
More control flow tools in Python 3

>>> print("Hello, I'm Python!")

Hel

  for text in soup.find_all(text=re.compile("Python")):


In [80]:
for text in soup.find_all(text=re.compile("Python")):
    print(text.parent) # どんなHTML要素に含まれているかを取得

for text in soup.find_all("a", text=re.compile("Python")):
    print(text) # どんなHTML要素に含まれているかを取得

<title>Welcome to Python.org</title>
<a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>
<a aria-hidden="true" class="jump-link" href="#top" id="python-network">
<span aria-hidden="true" class="icon-arrow-up"><span>▲</span></span> The Python Network
                </a>
<a href="http://brochure.getpython.info/" title="">Python Brochure</a>
<a href="https://wiki.python.org/moin/PythonBooks" title="">Python Books</a>
<a href="/doc/essays/" title="">Python Essays</a>
<a href="/community/workshops/" title="">Python Conferences</a>
<a href="/community/logos/" title="">Python Logo</a>
<a href="https://wiki.python.org/moin/" title="">Python Wiki</a>
<a href="/blogs/" title="Python Insider Blog Posts">Python News</a>
<a href="/events/python-events/" title="">Python Events</a>
<a href="/events/python-events/past/" title="">Python Events Archive</a>
<span class="comment"># Python 3: Fibonacci series up to n</span>
<p>The core of ext

  for text in soup.find_all(text=re.compile("Python")):
  for text in soup.find_all("a", text=re.compile("Python")):


In [81]:
for text in soup.find_all("a", text=re.compile("Python")):
    print(text) # Pythonでいう文字列が書いているaタグを指定できる

<a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>
<a href="http://brochure.getpython.info/" title="">Python Brochure</a>
<a href="https://wiki.python.org/moin/PythonBooks" title="">Python Books</a>
<a href="/doc/essays/" title="">Python Essays</a>
<a href="/community/workshops/" title="">Python Conferences</a>
<a href="/community/logos/" title="">Python Logo</a>
<a href="https://wiki.python.org/moin/" title="">Python Wiki</a>
<a href="/blogs/" title="Python Insider Blog Posts">Python News</a>
<a href="/events/python-events/" title="">Python Events</a>
<a href="/events/python-events/past/" title="">Python Events Archive</a>
<a href="//docs.python.org/3/tutorial/controlflow.html#defining-functions">More about defining functions in Python 3</a>
<a href="//docs.python.org/3/tutorial/introduction.html#lists">More about lists in Python 3</a>
<a href="http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calcula

  for text in soup.find_all("a", text=re.compile("Python")):


In [82]:
for text in soup.find_all("a", href=re.compile("docs.python.org")): # docs.python.orgが含まれているURLを持つaタグを取得
    print(text) # Pythonでいう文字列が書いているaタグを指定できる

<a href="https://docs.python.org" title="Python Documentation">Docs</a>
<a href="https://docs.python.org/3/license.html" title="">License</a>
<a href="https://docs.python.org/faq/" title="">FAQ</a>
<a href="//docs.python.org/3/tutorial/controlflow.html#defining-functions">More about defining functions in Python 3</a>
<a href="//docs.python.org/3/tutorial/introduction.html#lists">More about lists in Python 3</a>
<a href="http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator">More about simple math functions in Python 3</a>
<a href="//docs.python.org/3/tutorial/controlflow.html">More control flow tools in Python 3</a>
<a href="//docs.python.org/3/tutorial/">Whet your appetite</a>
<a href="https://docs.python.org">docs.python.org</a>
<a href="https://docs.python.org/3/license.html" title="">License</a>
<a href="https://docs.python.org/faq/" title="">FAQ</a>
