In [1]:
# Make a very simple GET request with urllib.
import urllib.request

# The with-block closes the connection even if read()/decode() raises.
# `response` stays bound afterwards, so later cells can still inspect its
# status and headers.
with urllib.request.urlopen('https://www.python.org') as response:
    print(response.read().decode('utf-8'))

<!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en" dir="ltr">  <!--<![endif]-->

<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">
    <link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js">

    <meta name="application-name" content="Python.org">
    <meta name="msapplication-tooltip" content="The official home of the Python Programming Language">
    <meta name="apple-mobile-web-app-title" content="Python.org">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">

    <meta name="

In [5]:
# The response object exposes commonly needed metadata: the status code,
# the full list of headers, and lookup of a single header by name.
# NOTE: relies on `response` left in the kernel by an earlier urlopen() cell.
print(
    response.status,
    response.getheaders(),
    response.getheader('Server'),
    sep='\n',
)

200
[('Connection', 'close'), ('Content-Length', '50958'), ('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'DENY'), ('Via', '1.1 vegur, 1.1 varnish, 1.1 varnish'), ('Accept-Ranges', 'bytes'), ('Date', 'Mon, 06 Jun 2022 10:53:16 GMT'), ('Age', '814'), ('X-Served-By', 'cache-iad-kiad7000025-IAD, cache-hkg17926-HKG'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '465, 2167'), ('X-Timer', 'S1654512796.493615,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx


In [6]:
# Send parameters along with the request.
import urllib.parse
import urllib.request

# urlopen() needs the body as bytes: URL-encode the form fields, then
# encode the resulting string as UTF-8.
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
# Pass the encoded parameters as the request body; the with-block closes
# the connection once the body has been printed.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())

b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.7", \n    "X-Amzn-Trace-Id": "Root=1-629dddb0-0e634ed544755dee103ac7da"\n  }, \n  "json": null, \n  "origin": "210.3.248.165", \n  "url": "http://httpbin.org/post"\n}\n'


In [7]:
# Set a timeout (in seconds). urlopen() raises when the timeout expires, so
# the exception is caught and reported instead of aborting the notebook run.
import socket
import urllib.error
import urllib.request

try:
    with urllib.request.urlopen('http://httpbin.org/get', timeout=1) as response:
        print(response.read())
except socket.timeout as exc:
    # Timeout raised directly (e.g. while waiting for / reading the response).
    print('timeout:', exc)
except urllib.error.URLError as exc:
    # A timeout while connecting arrives wrapped in URLError.reason;
    # any other URLError is a different failure and is re-raised unchanged.
    if not isinstance(exc.reason, socket.timeout):
        raise
    print('timeout:', exc.reason)

timeout: timed out

In [1]:
# A Request object lets us send more than a bare URL: headers, body and method.
from urllib import request, parse

url = 'http://httpbin.org/post'
# Request headers.
headers = {
    'User-Agent': 'Mozila',
    'Host': 'httpbin.org'
}
# Form fields for the request body. Named `form` rather than `dict` so the
# built-in `dict` type is not shadowed for the rest of the kernel session.
form = {
    'name': 'Germey'
}
# URL-encode the form fields and convert them to bytes for the request body.
data = parse.urlencode(form).encode('utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# The with-block closes the connection once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germey"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozila", 
    "X-Amzn-Trace-Id": "Root=1-629e94da-7dc531372b0045a312c6380f"
  }, 
  "json": null, 
  "origin": "42.2.113.62", 
  "url": "http://httpbin.org/post"
}



对于需要身份验证的网站，我们可以这样处理：

![](.1_images/3b0354c1.png)

如果想使用代理，可以这样设置：

![](.1_images/36f1f39f.png)

In [2]:
# Next, try collecting the cookies that a website sets.
import http.cookiejar
import urllib.request

# The CookieJar holds the cookies; the opener built around
# HTTPCookieProcessor stores cookies from its responses into that jar.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies matter here, so the response body is never read; the
# with-block makes sure the connection is closed instead of left dangling.
with opener.open('http://www.baidu.com') as response:
    pass
for item in cookie:
    print('%s=%s' % (item.name, item.value))

BAIDUID=A9A3D368D5A258F4EA1D0EBABAC12930:FG=1
BIDUPSID=A9A3D368D5A258F4E7E2BAA208776459
H_PS_PSSID=36558_36463_36454_31253_34812_36421_36165_36488_36518_36569_36074_26350_36299_36469_22159
PSTM=1654560532
BDSVRTM=0
BD_HOME=1


In [4]:
# The previous example only printed the cookies held in memory; this one
# writes them to a file.
# Imports are repeated so this cell also runs on its own in a fresh kernel.
import http.cookiejar
import urllib.request

filename = 'cookies.txt'

# MozillaCookieJar saves the cookies in the Mozilla browser file format.
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# The body is not needed; close the connection as soon as the request is done.
with opener.open('http://www.baidu.com') as response:
    pass
# Also write cookies marked as discard (session cookies) and expired ones.
cookie.save(ignore_discard=True, ignore_expires=True)

# Read the saved file straight back. Reuse `filename` instead of repeating
# the literal, and close the file handle deterministically.
with open(filename) as cookie_file:
    print(cookie_file.read())

In [6]:
# Now handle request failures.
from urllib import error, request

# Requesting a page that does not exist (a 404 in this case) raises an
# exception; URLError catches it and exposes the cause through `.reason`.
try:
    response = request.urlopen('https://cuiqingcai.com/666')
except error.URLError as url_err:
    print(url_err.reason)

Not Found


In [9]:
# HTTPError is a subclass of URLError dedicated to failed HTTP requests.
# Besides the reason, it carries the status code and the response headers.
# NOTE: relies on `request` and `error` imported by an earlier cell.
try:
    response = request.urlopen('https://cuiqingcai.com/666')
except error.HTTPError as http_err:
    for detail in (http_err.reason, http_err.code, http_err.headers):
        print(detail)

Not Found
404
Server: GitHub.com
Content-Type: text/html; charset=utf-8
Access-Control-Allow-Origin: *
ETag: "6272da93-247b"
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; img-src data:; connect-src 'self'
x-proxy-cache: MISS
X-GitHub-Request-Id: 8A9A:0A7B:4C7843:9BC8A6:629E998F
Accept-Ranges: bytes
Date: Tue, 07 Jun 2022 00:19:28 GMT
Via: 1.1 varnish
Age: 0
X-Served-By: cache-hnd18730-HND
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1654561168.874018,VS0,VE165
Vary: Accept-Encoding
X-Fastly-Request-ID: a26163928e65fa5b8e07e7fc3cb7dd734698ef00
X-Cache-Lookup: Cache Miss
Content-Length: 9339
X-NWS-LOG-UUID: 5693790727468638669
Connection: close
X-Cache-Lookup: Cache Miss




In [10]:
# Recommended pattern: handle the more specific HTTPError first, then fall
# back to its parent class URLError for everything else.
# NOTE: relies on `request` and `error` imported by an earlier cell.
try:
    response = request.urlopen('https://cuiqingcai.com/666')
except error.HTTPError as http_err:
    print(http_err.reason)
    print(http_err.code)
    print(http_err.headers)
except error.URLError as url_err:
    print(url_err.reason)

Not Found
404
Server: GitHub.com
Content-Type: text/html; charset=utf-8
Access-Control-Allow-Origin: *
ETag: "6272da93-247b"
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; img-src data:; connect-src 'self'
x-proxy-cache: MISS
X-GitHub-Request-Id: 4508:51D5:548C18:104C342:629E996D
Accept-Ranges: bytes
Date: Tue, 07 Jun 2022 00:20:15 GMT
Via: 1.1 varnish
Age: 82
X-Served-By: cache-nrt-rjtf7700026-NRT
X-Cache: HIT
X-Cache-Hits: 1
X-Timer: S1654561216.562561,VS0,VE1
Vary: Accept-Encoding
X-Fastly-Request-ID: fe4e030021528e55d8c54bc7f3e0b3412cbc0b53
X-Cache-Lookup: Cache Miss
Content-Length: 9339
X-NWS-LOG-UUID: 3832935027658678788
Connection: close
X-Cache-Lookup: Cache Miss




## 解析链接

In [11]:
# urlparse() breaks a URL into its components:
# scheme, netloc, path, params, query and fragment.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
# Show the result's type followed by the parsed components.
print('{} {}'.format(type(result), result))

<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')


In [12]:
# A default scheme can be supplied as the second argument.
# Gotcha: this URL has no leading '//', so the host name ends up in `path`
# and `netloc` stays empty.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', 'https')
print(result)

ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')


In [15]:
# A component can be ignored: with allow_fragments=False the fragment is not
# split off, so '#comment' stays attached to the query and `fragment` is empty.
sample_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(sample_url, allow_fragments=False)
print(result)

ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')


In [16]:
# Individual components can be pulled out directly, either by attribute name
# or by position; both forms return the same value.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
for component in (result.scheme, result[0], result.netloc, result[1]):
    print(component)

http
http
www.baidu.com
www.baidu.com


In [17]:
# The cells above parse a URL; this one does the reverse and builds a URL
# from its parts.
from urllib.parse import urlunparse

data = [
    'http',
    'www.baidu.com',
    'index.html',
    'user',
    'a=6',
    'comment',
]
rebuilt_url = urlunparse(data)
print(rebuilt_url)

http://www.baidu.com/index.html;user?a=6#comment


In [18]:
# Split a URL into its parts with urlsplit(). The result has no separate
# `params` field: ';user' stays inside `path`.
from urllib.parse import urlsplit

sample_url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlsplit(sample_url)
print(result)

SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')


In [20]:
from urllib.parse import urlunsplit

# The sequence passed to urlunsplit() must contain exactly 5 items.
data = [
    'http',
    'www.baidu.com',
    'index.html',
    'a=6',
    'comment',
]
rebuilt_url = urlunsplit(data)
print(rebuilt_url)

http://www.baidu.com/index.html?a=6#comment


下面我们进行链接的合并

![](.1_images/be86dadd.png)

参数序列化和反序列化

![](.1_images/13a9492c.png)

![](.1_images/7ff20cbe.png)

![](.1_images/1780c68d.png)

![](.1_images/a2c70d20.png)

![](.1_images/4a6a751a.png)

In [22]:
# Next, parse a site's robots.txt rules.
from urllib.robotparser import RobotFileParser

# Passing the URL to the constructor is equivalent to calling set_url().
rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read()
# can_fetch() reports whether the given user agent ('*' here) is allowed to
# crawl the given URL.
allowed = rp.can_fetch('*', 'http://www.jianshu.com/p/456')
print(allowed)

False
