# requests 模組：讀取網站檔案

## 讀取網頁原始碼

In [1]:
# References:
#   https://blog.gtwang.org/programming/python-requests-module-tutorial/
#   https://www.w3schools.com/python/module_requests.asp
#   https://steam.oxxostudio.tw/category/python/spider/requests.html

# get.py
import requests

url = 'http://www.ehappy.tw/demo.htm'
response = requests.get(url)
# Print only when the HTTP status code is 200 (requests.codes.ok):
# the type of the response object, the object itself, then the page source.
if response.status_code == requests.codes.ok:
    for item in (type(response), response, response.text):
        print(item)

<class 'requests.models.Response'>
<Response [200]>
<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>Hello</title>
  </head>
  <body>
    <p>Hello World!</p>
  </body>
</html>


In [2]:
import requests

url = 'http://www.ehappy.tw/demo.htm'
response = requests.get(url)
# Show the page source only when the HTTP status code is 200 (requests.codes.ok).
page_ok = response.status_code == requests.codes.ok
if page_ok:
    print(response.text)

<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>Hello</title>
  </head>
  <body>
    <p>Hello World!</p>
  </body>
</html>


## 加上 URL 參數

In [3]:
import requests

# The query parameters are defined as a dict and added to the GET request
# through the `params` argument.
# Test endpoint: http://httpbin.org/get
query = dict(key1='value1', key2='value2')
response = requests.get("http://httpbin.org/get", params=query)
print(response.text)

{
  "args": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.26.0", 
    "X-Amzn-Trace-Id": "Root=1-64ee8a37-078cb89417633b76763f1d04"
  }, 
  "origin": "218.164.152.206", 
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}



In [4]:
import requests

# Same as before, but one value is a number; it is still sent as a string.
# Test endpoint: http://httpbin.org/get
endpoint = "http://httpbin.org/get"
query = {'key1': 100, 'key2': 'value2'}
response = requests.get(endpoint, params=query)
print(response.text)

{
  "args": {
    "key1": "100", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.26.0", 
    "X-Amzn-Trace-Id": "Root=1-64ee8a43-6ff3eb292fb220843f0c12da"
  }, 
  "origin": "218.164.152.206", 
  "url": "http://httpbin.org/get?key1=100&key2=value2"
}



In [5]:
import requests

# Here the query parameters are written directly into the URL of the GET request.
endpoint = "http://httpbin.org/get?key1=value1&key2=value2"
response = requests.get(endpoint)
print(response.text)

{
  "args": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.26.0", 
    "X-Amzn-Trace-Id": "Root=1-64ee8ab8-5182b18352ef57a558abbb18"
  }, 
  "origin": "218.164.152.206", 
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}



## 發送POST請求

In [6]:
import requests

# The key/value pairs are sent in the body of a POST request via `data`.
# Test endpoint: http://httpbin.org/post
form_fields = {'key1': 'value1', 'key2': 'value2'}
response = requests.post("http://httpbin.org/post", data=form_fields)
print(response.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key1": "value1", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Content-Length": "23", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.26.0", 
    "X-Amzn-Trace-Id": "Root=1-64ee8adf-6b9817e92982409f1125dc72"
  }, 
  "json": null, 
  "origin": "218.164.152.206", 
  "url": "http://httpbin.org/post"
}



## 自訂HTTP Headers偽裝瀏覽器操作

In [7]:
###### Do not run (it will hang) ######

# Browse to http://httpbin.org/get in Chrome to look up your user-agent.
# Or browse to http://www.ehappy.tw/demo.htm in Chrome, open Developer Tools (Ctrl+Shift+I) / Network / Headers, reload, and read the user-agent from Request Headers.

import requests
url = 'https://irs.thsrc.com.tw/IMINT/'  # Taiwan High Speed Rail ticket-booking site
# Custom request headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'}
# Send the GET request with the custom headers attached
html = requests.get(url, headers=headers)
print(html)

<Response [200]>


In [7]:
# Custom request header (user-agent)
# Browse to http://httpbin.org/get in Chrome to look up your user-agent.

import requests

browser_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
# The GET request carries both the custom user-agent header and the query parameters.
r = requests.get(
    "http://httpbin.org/get",
    headers={'user-agent': browser_ua},
    params={'key1': 100, 'key2': 'value2'},
)
print(r.text)

{
  "args": {
    "key1": "100", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br, zstd", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-68c012e6-3f9e156a3d41bc973f836770"
  }, 
  "origin": "1.173.57.196", 
  "url": "http://httpbin.org/get?key1=100&key2=value2"
}



In [5]:
# Custom request header (user-agent)
# Browse to http://www.ehappy.tw/demo.htm in Chrome, open Developer Tools (Ctrl+Shift+I) / Network / Headers, reload, and read the user-agent from Request Headers.

import requests

browser_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
# The POST request carries the custom user-agent header and the form fields.
r = requests.post(
    "http://httpbin.org/post",
    headers={'user-agent': browser_ua},
    data={'key1': 100, 'key2': 'value2'},
)
print(r.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key1": "100", 
    "key2": "value2"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br, zstd", 
    "Content-Length": "20", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-68c01250-57d273117c18d9fe195c0667"
  }, 
  "json": null, 
  "origin": "1.173.57.196", 
  "url": "http://httpbin.org/post"
}



In [12]:
# Test endpoint: http://httpbin.org/user-agent
# With a custom user-agent header

chrome_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'

# The endpoint echoes back the user-agent it received.
r = requests.get("http://httpbin.org/user-agent", headers={'user-agent': chrome_ua})
print(r.text)

{
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
}



In [14]:
# Test endpoint: http://httpbin.org/user-agent
# Without a custom user-agent header

endpoint = "http://httpbin.org/user-agent"
r = requests.get(endpoint)
print(r.text)

{
  "user-agent": "python-requests/2.26.0"
}



In [None]:
###### Do not run (it will hang) ######

# No custom request headers
import requests
url = 'https://irs.thsrc.com.tw/IMINT/'  # Taiwan High Speed Rail ticket-booking site

# Send the GET request
html = requests.get(url)
print(html)

In [8]:
# Using try/except with requests
# https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
# (Correct way to try/except using Python requests module)

# try/except reports a failed request instead of ending with a traceback.
# No custom header (user-agent) is sent; this request produces a timeout error.

import requests
url = 'https://irs.thsrc.com.tw/IMINT/'

try:
    r = requests.get(url, timeout=3)
    r.raise_for_status()
    print(r)
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
# Timeout is handled before ConnectionError: ConnectTimeout inherits from
# both, so the opposite order would report a connect timeout as a
# connection error.
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)

Timeout Error: HTTPSConnectionPool(host='irs.thsrc.com.tw', port=443): Read timed out. (read timeout=3)


In [9]:
# Using try/except with requests
# https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
# (Correct way to try/except using Python requests module)

# A custom header (user-agent) is sent; the request still produces a timeout error.

import requests
url = 'https://irs.thsrc.com.tw/IMINT/'  # Taiwan High Speed Rail ticket-booking site
# Custom request headers
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'}

try:
    # Send the GET request with the custom headers attached
    r = requests.get(url, headers=headers, timeout=3)
    r.raise_for_status()
    print(r)
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
# Timeout is handled before ConnectionError: ConnectTimeout inherits from
# both, so the opposite order would report a connect timeout as a
# connection error.
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)

Timeout Error: HTTPSConnectionPool(host='irs.thsrc.com.tw', port=443): Read timed out. (read timeout=3)


In [10]:
# Using try/except with requests
# https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
# (Correct way to try/except using Python requests module)

# The page does not exist, so an error is reported.

import requests
url = 'http://www.google.com/blahblah'   # this page does not exist

try:
    r = requests.get(url, timeout=3)
    # A 404 answer is not raised as an exception by get() itself,
    # so raise_for_status() is called to turn it into an HTTPError.
    r.raise_for_status()
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
# Timeout is handled before ConnectionError: ConnectTimeout inherits from
# both, so the opposite order would report a connect timeout as a
# connection error.
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)

Http Error: 404 Client Error: Not Found for url: http://www.google.com/blahblah


In [None]:
# Using try/except with requests
# https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
# (Correct way to try/except using Python requests module)

# The address (web site) does not exist, so an error is reported.

import requests
url = 'http://www.google123456.com/blahblah'   # this address (web site) does not exist

try:
    r = requests.get(url, timeout=3)
    # A 404 answer is not raised as an exception by get() itself,
    # so raise_for_status() is called to turn it into an HTTPError.
    r.raise_for_status()
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
# Timeout is handled before ConnectionError: ConnectTimeout inherits from
# both, so the opposite order would report a connect timeout as a
# connection error.
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)

## 在requests請求時加入Cookie


In [18]:
# Test endpoint for sending cookie values

import requests

# The cookie values are passed as a dict through `cookies`.
# Test endpoint: http://httpbin.org/cookies
endpoint = "http://httpbin.org/cookies"
r = requests.get(endpoint, cookies={'over18': '1'})
print(r.text)

{
  "cookies": {
    "over18": "1"
  }
}



In [19]:
# Test of sending cookie values

import requests

# The cookie values are passed as a dict through `cookies`.
# Test endpoint: http://httpbin.org/get
endpoint = "http://httpbin.org/get"
r = requests.get(endpoint, cookies={'over18': '1'})
print(r.text)

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate, br", 
    "Cookie": "over18=1", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.26.0", 
    "X-Amzn-Trace-Id": "Root=1-64ee96be-2528c4ba78f275c33d5db57c"
  }, 
  "origin": "218.164.152.206", 
  "url": "http://httpbin.org/get"
}



In [20]:
# A real-world example of using cookies
# <<Chrome>> Developer Tools (Ctrl+Shift+I) / Application / Cookies

import requests

board_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
# Send the over18 cookie with the request.
response = requests.get(board_url, cookies={'over18': '1'})
print(response.text)

<!DOCTYPE html>
<html>
	<head>
		<meta charset="utf-8">
		

<meta name="viewport" content="width=device-width, initial-scale=1">

<title>看板 Gossiping 文章列表 - 批踢踢實業坊</title>

<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">




	</head>
    <body>
		
<div id="topbar-container">
	<div id="topbar" class="bbs-content">
		<a id="logo" href="/bbs/">批踢踢實業坊</a>
		<span>&rsaquo;</span>
		<a class="board" href="/bbs/Gossiping/index.html"><span class="board-label">看板 </span>Gossiping</a>
		<a class="right small" href="/about.html">關於我們</a>
		<a class="right small" href="/co

In [21]:
# A real-world example of using cookies
# <<Chrome>> Developer Tools (Ctrl+Shift+I) / Application / Cookies
# This time no cookie value is set.

# get_cookie.py
import requests

board_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
# No cookies argument is passed here (i.e. cookies = {'over18':'1'} is left out).
r = requests.get(board_url)
print(r.text)

<!DOCTYPE html>
<html>
	<head>
		<meta charset="utf-8">
		

<meta name="viewport" content="width=device-width, initial-scale=1">

<title>批踢踢實業坊</title>

<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-common.css">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-base.css" media="screen">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-custom.css">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/pushstream.css" media="screen">
<link rel="stylesheet" type="text/css" href="//images.ptt.cc/bbs/v2.27/bbs-print.css" media="print">




	</head>
    <body>
		
<div class="bbs-screen bbs-content">
    <div class="over18-notice">
        <p>本網站已依網站內容分級規定處理</p>

        <p>警告︰您即將進入之看板內容需滿十八歲方可瀏覽。</p>

        <p>若您尚未年滿十八歲，請點選離開。若您已滿十八歲，亦不可將本區之內容派發、傳閱、出售、出租、交給或借予年齡未滿18歲的人士瀏覽，或將本網站內容向該人士出示、播放或放映。</p>
    </div>
</div>

<div class="bbs-screen bbs-content center clear">
    <form action="/ask/over18"

# BeautifulSoup 模組：網頁解析

## 認識網頁的結構

In [None]:
# bsdemo1.htm
'''
<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>我是網頁標題</title>
  </head>
  <body>
    <h1 class="large">我是標題</h1>
    <div>
      <p>我是段落</p>
      <img src="https://www.w3.org/html/logo/downloads/HTML5_Logo_256.png" alt="我是圖片">
      <a href="http://www.e-happy.com.tw">我是超連結</a>
    </div>
  </body>
</html>
'''

## BeautifulSoup 常用的屬性

In [22]:
# BeautifulSoup module
# https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

import requests
from bs4 import BeautifulSoup

response = requests.get('http://ehappy.tw/bsdemo1.htm')
response.encoding = 'UTF-8'
sp = BeautifulSoup(response.text, 'lxml')
title_tag = sp.title
print(type(sp))          # a BeautifulSoup object
print(title_tag)
print(type(title_tag))   # a Tag object
print(title_tag.text)
print(sp.h1)             # the first tag that matches
print(sp.p)

<class 'bs4.BeautifulSoup'>
<title>我是網頁標題</title>
<class 'bs4.element.Tag'>
我是網頁標題
<h1 class="large">我是標題</h1>
<p>我是段落</p>


In [23]:
import requests
from bs4 import BeautifulSoup

SEPARATOR = "------------------------------------------"

response = requests.get('http://www.ehappy.tw/bsdemo1.htm')
response.encoding = 'UTF-8'
print(response.text)
print(SEPARATOR)
sp = BeautifulSoup(response.text, 'lxml')
print(sp.text)   # every text string in the document
print(SEPARATOR)
tags = (sp.title, sp.h1, sp.p, sp.a)
# .text covers the text strings of all of a tag's children and descendants.
for tag in tags:
    print(tag.text)
print(SEPARATOR)
# .string only works for a tag that has exactly one child.
for tag in tags:
    print(tag.string)

<!doctype html>
<html lang="zh">
  <head>
    <meta charset="UTF-8">
    <title>我是網頁標題</title>
  </head>
  <body>
    <h1 class="large">我是標題</h1>
    <div>
      <p>我是段落</p>
      <img src="https://www.w3.org/html/logo/downloads/HTML5_Logo_256.png" alt="我是圖片">
      <a href="http://www.e-happy.com.tw">我是超連結</a>
    </div>
  </body>
</html>
------------------------------------------



我是網頁標題


我是標題

我是段落

我是超連結



------------------------------------------
我是網頁標題
我是標題
我是段落
我是超連結
------------------------------------------
我是網頁標題
我是標題
我是段落
我是超連結


In [1]:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigating-the-tree
# .string only works for a tag that has exactly one child.

import requests
from bs4 import BeautifulSoup

divider = "------------------------------------------"
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify())
print(divider)
print(sibling_soup.text)    # .text: text of all children and descendants
print(sibling_soup.string)  # .string: needs a single child, so this prints None
print(divider)
b = sibling_soup.b
for value in (b.text, b.string):
    print(value)

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>
------------------------------------------
text1text2
None
------------------------------------------
text1
text1


## 找尋指定標籤的內容：find()、find_all()

In [24]:
# Sample HTML document: two <p> tags, the second one carrying class 'red'.
html = '''
<html>
  <head><meta charset="UTF-8"><title>我是網頁標題</title></head>
  <body>
      <p id="p1">我是段落一</p>
      <p id="p2" class='red'>我是段落二</p>
  </body>
</html>
'''

In [25]:
from bs4 import BeautifulSoup

sp = BeautifulSoup(html, 'lxml')
first_p = sp.find('p')      # find() returns a Tag: the first element that matches
print(first_p)
print(type(first_p))
all_p = sp.find_all('p')    # find_all() returns a ResultSet
print(all_p)
print(type(all_p))
# Tag attributes can be added as search conditions, as a dict ...
print(sp.find('p', {'id':'p2', 'class':'red'}))
# ... or as keyword arguments.
print(sp.find('p', id='p2', class_= 'red'))

<p id="p1">我是段落一</p>
<class 'bs4.element.Tag'>
[<p id="p1">我是段落一</p>, <p class="red" id="p2">我是段落二</p>]
<class 'bs4.element.ResultSet'>
<p class="red" id="p2">我是段落二</p>
<p class="red" id="p2">我是段落二</p>


## 利用CSS選擇器找尋內容：select()

In [26]:
from bs4 import BeautifulSoup

sp = BeautifulSoup(html, 'lxml')
# select() always answers in list form (a ResultSet), even for a single match.
for selector in ('title', 'p'):
    found = sp.select(selector)
    print(found)
    print(type(found))
# CSS selectors: '#p1' selects by id, '.red' selects by class.
print(sp.select('#p1'))
print(sp.select('.red'))

[<title>我是網頁標題</title>]
<class 'bs4.element.ResultSet'>
[<p id="p1">我是段落一</p>, <p class="red" id="p2">我是段落二</p>]
<class 'bs4.element.ResultSet'>
[<p id="p1">我是段落一</p>]
[<p class="red" id="p2">我是段落二</p>]


## *取得標籤的屬性內容*

In [28]:
# Sample HTML document with one <img> tag and one <a> tag.
html = '''
<html>
  <head><meta charset="UTF-8"><title>我是網頁標題</title></head>
  <body>
      <img src="http://www.ehappy.tw/python.png">
      <a href="http://www.e-happy.com.tw">超連結</a>
  </body>
</html>
'''

In [29]:
from bs4 import BeautifulSoup

sp = BeautifulSoup(html, 'lxml')
img_tag = sp.select('img')[0]
link_tag = sp.select('a')[0]
# Read the attribute values with the get() method ...
print(img_tag.get('src'))
print(link_tag.get('href'))
# ... and with dictionary-style indexing.
print(img_tag['src'])
print(link_tag['href'])

http://www.ehappy.tw/python.png
http://www.e-happy.com.tw
http://www.ehappy.tw/python.png
http://www.e-happy.com.tw


In [30]:
#取得Tag的屬性和相關值
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body>
</html>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

first_p = soup.p
print(first_p)
print(first_p.name)
print(first_p.contents)   # a tag's children are available in a list called .contents
print(first_p.string)
css_classes = first_p['class']   # dictionary-style attribute access
print(css_classes)
print(first_p.get('class'))      # the same attribute through get()

<p class="title"><b>The Dormouse's story</b></p>
p
[<b>The Dormouse's story</b>]
The Dormouse's story
['title']
['title']


In [31]:
#取得Tag的屬性和相關值
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body>
</html>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

first_link = soup.a
# The tag itself, its name, its children and its single text string.
for detail in (first_link, first_link.name, first_link.contents, first_link.string):
    print(detail)
# Attribute values by indexing; first_link.get(name) is the alternative spelling.
for attr_name in ('href', 'class', 'id'):
    print(first_link[attr_name])

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
a
['Elsie']
Elsie
http://example.com/elsie
['sister']
link1


In [2]:
#取得Tag的內容
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<ul>
  <li>Coffee</li>
  <li>Tea</li>
  <li>Milk</li>
</ul>
</body>
</html>
"""

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')

ul = soup.ul
print(ul)
print(ul.name)
print(ul.contents)  # a tag's children are available in a list called .contents
print(ul.text)      # the text inside the tag
# .children yields the same items as the .contents list, including the
# newline strings between the <li> tags; enumerate numbers them from 1.
for i, child in enumerate(ul.children, start=1):
    print(i, child)

<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
ul
['\n', <li>Coffee</li>, '\n', <li>Tea</li>, '\n', <li>Milk</li>, '\n']

Coffee
Tea
Milk

1 

2 <li>Coffee</li>
3 

4 <li>Tea</li>
5 

6 <li>Milk</li>
7 



## 專題：威力彩開獎號碼

In [32]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.taiwanlottery.com.tw/'
# requests does not time out on its own, so a limit is set explicitly.
r = requests.get(url, timeout=10)
# Fail on a 4xx/5xx answer instead of parsing an error page.
r.raise_for_status()
sp = BeautifulSoup(r.text, 'lxml')
# Locate the block that holds the Super Lotto (威力彩) results.
datas = sp.find('div', class_='contents_box02')
if datas is None:
    # find() returns None when nothing matches; report that instead of
    # failing later with an AttributeError on None.
    print('找不到威力彩區塊 (div.contents_box02)，網頁結構可能已變更')
else:
    # Draw date and draw number
    title = datas.find('span', 'font_black15').text
    print('威力彩期數：', title)
    # Winning numbers: the first six balls are in draw order,
    # the next six are the same numbers in ascending order.
    nums = datas.find_all('div', class_='ball_tx ball_green')
    print('開出順序：', end=' ')
    for ball in nums[:6]:
        print(ball.text, end=' ')
    print('\n大小順序：', end=' ')
    for ball in nums[6:12]:
        print(ball.text, end=' ')
    # Second-zone number
    num = datas.find('div', class_='ball_red').text
    print('\n第二區：', num)

威力彩期數： 112/8/28 第112000069期 
開出順序： 03  08  06  15  38  24  
大小順序： 03  06  08  15  24  38  
第二區： 05 


# 使用正規表達式

## 建立正規表達式物件

In [33]:
# References:
#   https://www.w3schools.com/python/python_regex.asp
#   https://www.runoob.com/python/python-reg-expressions.html
#   https://pythex.org/

import re

pattern = r'[0-9]+'
text = 'abc123xyz'
# search() looks for one match, which may be a stretch in the middle of the string.
m = re.search(pattern, text)
print(m)

<re.Match object; span=(3, 6), match='123'>


### match()方法

In [34]:
import re

pattern = r'[a-z]+'
text = 'abc123xyz'
# match() looks for one match that must begin at the start of the string.
m = re.match(pattern, text)
print(m)

<re.Match object; span=(0, 3), match='abc'>


In [35]:
# m is the result of the re.match() call above; it is None when the pattern
# did not match, so it is checked (with `is not None`) before use.
if m is not None:
    print(m.group())    # matched text: abc
    print(m.start())    # start index: 0
    print(m.end())      # end index: 3
    print(m.span())     # (start, end): (0, 3)

abc
0
3
(0, 3)


In [None]:
# The re.compile function
# compile() compiles a regular expression into a pattern object, whose match(), search() and findall() methods can then be used.
# https://www.runoob.com/python/python-reg-expressions.html

import re
p =  re.compile(r'[a-z]+')
m = p.match('abc123xyz') # look for one match; it must begin at the start of the string
print(m)

### search()方法

In [36]:
import re
m = re.search(r'[a-z]+', 'abc123xyz')
print(m)    # <re.Match object; span=(0, 3), match='abc'>
# search() gives None when nothing matches; test identity with `is not None`.
if m is not None:
    print(m.group())  # matched text: abc
    print(m.start())  # start index: 0
    print(m.end())    # end index: 3
    print(m.span())   # (start, end): (0, 3)

<re.Match object; span=(0, 3), match='abc'>
abc
0
3
(0, 3)


In [None]:
# The re.compile function
# compile() compiles a regular expression into a pattern object, whose match(), search() and findall() methods can then be used.

import re

digit_run = re.compile(r'[0-9]+')
# search() looks for one match, which may be a stretch in the middle of the string.
m = digit_run.search('abc123xyz')
print(m)

### findall()方法

In [37]:
import re

text = 'abc123xyz'
# findall() collects every match of the pattern into a list.
m = re.findall(r'[a-z]+', text)
print(m)    # ['abc', 'xyz']

['abc', 'xyz']


In [None]:
# The re.compile function
# compile() compiles a regular expression into a pattern object, whose match(), search() and findall() methods can then be used.

import re

lowercase_run = re.compile(r'[a-z]+')
m = lowercase_run.findall('abc123xyz')
print(m)    # ['abc', 'xyz']

## 使用正規表達式取代內容

In [38]:
import re

source = "Password:1234,ID:5678"
# \d+ matches a whole run of digits, so each run becomes a single '*'.
result = re.sub(r"\d+", "*", source)
print(result)  # Password:*,ID:*

Password:*,ID:*


In [39]:
import re
# \d (without +) matches one digit at a time, so every digit becomes its own '*'.
result = re.sub(r"\d", "*", "Password:1234,ID:5678")
print(result)		# Password:****,ID:****

Password:****,ID:****


## 範例：正規表示式練習

In [27]:
# Sample HTML fragment for the regular-expression exercise: two e-mail
# addresses, a price, two image URLs and two phone numbers.
html = """
<div class="content">
    E-Mail：<a href="mailto:mail@test.com.tw">
      mail</a><br>
    E-Mail2：<a href="mailto:mail2@test.com.tw">
      mail2</a><br>
    <ul class="price">定價：360元 </ul>
    <img src="http://test.com.tw/p1.jpg">
    <img src="http://test.com.tw/p2.png">
    電話：(04)-76543210、0937-123456
</div>
"""

In [28]:
import re

# E-mail addresses
pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
emails = re.findall(pattern, html)
for email in emails:
    print(email)

# Price: the capture group keeps only the digits in front of "元".
price = re.findall(r'(\d+)元', html)[0]
print(price)

# Image URLs. "http://" is matched literally and the extension must be
# exactly jpg or png. Writing [http://]+ or [jpgpng]+ would create character
# classes, which also accept endings such as ".p" or ".jjj".
imglist = re.findall(r'http://[a-zA-Z0-9-/.]+\.jpg|http://[a-zA-Z0-9-/.]+\.png', html)
for img in imglist:
    print(img)

# Phone numbers
phonelist = re.findall(r'\(?\d{2,4}\)?\-\d{6,8}', html)
for phone in phonelist:
    print(phone)

mail@test.com.tw
mail2@test.com.tw
360
http://test.com.tw/p1.jpg
http://test.com.tw/p2.png
(04)-76543210
0937-123456


In [40]:
import re

html = """
<div class="content">
    E-Mail：<a href="mailto:mail@test.com.tw">mail</a><br>
    E-Mail2：<a href="mailto:mail2@test.com.tw">mail2</a><br>
    <ul class="price">定價：360元 </ul>
    <img src="http://test.com.tw/p1.jpg">
    <img src="http://test.com.tw/p2.jpg">
    <img src="http://test.com.tw/p3.png">
    <img src="http://test.com.tw/p4.p">
    <img src="http://test.com.tw/p4.jjj">
    電話：(04)-76543210、0937-123456
</div>
"""

# Deliberately WRONG pattern, kept to show the mistake: [jpgpng]+ is a
# character class, so endings such as ".p" and ".jjj" are accepted as well.
wrong_pattern = r'[http://]+[a-zA-Z0-9-/.]+\.[jpgpng]+'
imglist = re.findall(wrong_pattern, html)
for img_url in imglist:
    print(img_url)   # print each image URL

http://test.com.tw/p1.jpg
http://test.com.tw/p2.jpg
http://test.com.tw/p3.png
http://test.com.tw/p4.p
http://test.com.tw/p4.jjj


In [41]:
html = """
<div class="content">
    E-Mail：<a href="mailto:mail@test.com.tw">mail</a><br>
    E-Mail2：<a href="mailto:mail2@test.com.tw">mail2</a><br>
    <ul class="price">定價：360元 </ul>
    <img src="http://test.com.tw/p1.jpg">
    <img src="http://test.com.tw/p2.jpg">
    <img src="http://test.com.tw/p3.png">
    <img src="http://test.com.tw/p4.p">
    <img src="http://test.com.tw/p4.jjj">
    電話：(04)-76543210、0937-123456
</div>
"""

import re

# Spelling out the extensions as two alternatives (\.jpg | \.png) no longer accepts ".p" or ".jjj".
# Note that [http://]+ is still a character class (one or more of h, t, p, ':' and '/'), not the literal text "http://".
imglist = re.findall(r'[http://]+[a-zA-Z0-9-/.]+\.jpg|[http://]+[a-zA-Z0-9-/.]+\.png',html)  # OK
for img in imglist:
    print(img) # print each image URL

http://test.com.tw/p1.jpg
http://test.com.tw/p2.jpg
http://test.com.tw/p3.png


In [42]:
import re

html = """
<div class="content">
    E-Mail：<a href="mailto:mail@test.com.tw">mail</a><br>
    E-Mail2：<a href="mailto:mail2@test.com.tw">mail2</a><br>
    <ul class="price">定價：360元 </ul>
    <img src="http://test.com.tw/p1.jpg">
    <img src="http://test.com.tw/p2.jpg">
    <img src="http://test.com.tw/p3.png">
    <img src="http://test.com.tw/p4.p">
    <img src="http://test.com.tw/p4.jjj">
    電話：(04)-76543210、0937-123456
</div>
"""

# OK: "http://" is matched as literal text and the extension is spelled out
# as either .jpg or .png.
img_pattern = r'http://[a-zA-Z0-9-/.]+\.jpg|http://[a-zA-Z0-9-/.]+\.png'
imglist = re.findall(img_pattern, html)
for img_url in imglist:
    print(img_url)   # print each image URL

http://test.com.tw/p1.jpg
http://test.com.tw/p2.jpg
http://test.com.tw/p3.png


In [None]:
#https://pythex.org/
#測試規則: http://[a-zA-Z0-9-/.]+\.(jpg|png)

In [43]:
#https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/

# 引入 Beautiful Soup 模組
from bs4 import BeautifulSoup

# 原始 HTML 程式碼
html_doc = """
<html><head><title>Hello World</title></head>
<body><h2>Test Header</h2>
<p>This is a test.</p>
<a id="link1" href="/my_link1">Link 1</a>
<a id="link2" href="/my_link2">Link 2</a>
<p>Hello, <b class="boldtext">Bold Text</b></p>
</body></html>
"""

# Parse the HTML source with Beautiful Soup
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the HTML source after re-formatting it
formatted = soup.prettify()
print(formatted)

<html>
 <head>
  <title>
   Hello World
  </title>
 </head>
 <body>
  <h2>
   Test Header
  </h2>
  <p>
   This is a test.
  </p>
  <a href="/my_link1" id="link1">
   Link 1
  </a>
  <a href="/my_link2" id="link2">
   Link 2
  </a>
  <p>
   Hello,
   <b class="boldtext">
    Bold Text
   </b>
  </p>
 </body>
</html>



In [44]:
#https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/

# 引入 Beautiful Soup 模組
from bs4 import BeautifulSoup

# 原始 HTML 程式碼
html_doc = """
<html><head><title>Hello World</title></head>
<body><h2>Test Header</h2>
<p>This is a test.</p>
<a id="link1" href="/my_link1">Link 1</a>
<a id="link2" href="/my_link2">Link 2</a>
<p>Hello, <b class="boldtext">Bold Text</b></p>
</body></html>
"""

# Parse the HTML source with Beautiful Soup
soup = BeautifulSoup(html_doc, 'html.parser')

# Search for the <a> nodes whose href attribute is /my_link1
matching_links = soup.find_all("a", href="/my_link1")
print(matching_links)

[<a href="/my_link1" id="link1">Link 1</a>]


In [45]:
#https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/

# 引入 Beautiful Soup 模組
from bs4 import BeautifulSoup

import re

# 原始 HTML 程式碼
html_doc = """
<html><head><title>Hello World</title></head>
<body><h2>Test Header</h2>
<p>This is a test.</p>
<a id="link1" href="/my_link1">Link 1</a>
<a id="link2" href="/my_link2">Link 2</a>
<p>Hello, <b class="boldtext">Bold Text</b></p>
</body></html>
"""

# Parse the HTML source with Beautiful Soup
soup = BeautifulSoup(html_doc, 'html.parser')

# Match the link URLs with a regular expression; this finds
# href="/my_link1" and href="/my_link2". The pattern is a raw string so that
# \d reaches the regex engine instead of being treated as an invalid string
# escape sequence.
links = soup.find_all(href=re.compile(r"^/my_link\d"))
print(links)

[<a href="/my_link1" id="link1">Link 1</a>, <a href="/my_link2" id="link2">Link 2</a>]
