In [1]:
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.request import ProxyHandler
import http.cookiejar

In [2]:
"""urllib.request.urlopen: directly get the source code of website"""

response = urllib.request.urlopen('https://www.python.org') # http.client.HTTPResponse

web_content = response.read().decode('utf-8') # web content: html...
status_code = response.status
web_headers = response.getheaders()

In [3]:
"""urllib.request.urlopen: post request need data params"""

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.11", \n    "X-Amzn-Trace-Id": "Root=1-6553061d-79ca29c60efa84526c967739"\n  }, \n  "json": null, \n  "origin": "223.104.122.18", \n  "url": "http://httpbin.org/post"\n}\n'


In [4]:
"""urllib.request.urlopen: handle simple exception"""

try:
   response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.01)
except urllib.error.URLError as e:
    print(e.reason)

timed out


In [6]:
"""urllib.request: wrap Request body"""

url = 'http://httpbin.org/get'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'Germen'
}
data = bytes(urllib.parse.urlencode(dict), encoding='utf-8')
req = urllib.request.Request(url=url, data=data, headers=headers, method='GET')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

req = urllib.request.Request(url=url, data=data, method='GET')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)", 
    "X-Amzn-Trace-Id": "Root=1-65530629-4458026a5edffe4627f8e6a9"
  }, 
  "origin": "223.104.122.18", 
  "url": "http://httpbin.org/get"
}

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)", 
    "X-Amzn-Trace-Id": "Root=1-65530629-09a38ecb0bd776773683aac3"
  }, 
  "origin": "223.104.122.10", 
  "url": "http://httpbin.org/get"
}



In [11]:
"""urllib.request: build opener to add proxy"""

from urllib.request import build_opener

proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:50151', 
    'https': 'https://127.0.0.1:50151'
})
opener = build_opener(proxy_handler)    # 利用handler构造Opener
try:
    response = opener.open('https://www.google.com')
    google = response.read().decode('utf-8')
    print('connect successfully')
except urllib.error.URLError as e:
    print(e.reason)
# if no proxy exists, it'll return 'connection refuse'.Of course, even successful requests
# may return other errors, such as 'too many requests'

Too Many Requests


In [13]:
"""urllib.request: handle cookies"""
import http.cookiejar

cookie = http.cookiejar.CookieJar()     # 创建 CookieJar 对象
handler = urllib.request.HTTPCookieProcessor(cookie)    # 构造handler
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name+"="+item.value)

# save cookies as a file
filename = '../data/cookies.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
# if want to save as LWP, use 'cookie = http.cookiejar.LWPCookieJar(filename)'

# read local cookies
cookie = http.cookiejar.MozillaCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))

BIDUPSID=256F2DB4CEEAE718087EC9D1A85BDF57
PSTM=1699940766
BAIDUID=256F2DB4CEEAE718087EC9D1A85BDF57:FG=1
BAIDUID_BFESS=256F2DB4CEEAE718087EC9D1A85BDF57:FG=1
