# urllib

In [10]:
from urllib import request

with request.urlopen('https://api.douban.com/v2/book/2129650') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))

HTTPError: HTTP Error 400: Bad Request

## 模拟浏览器发送GET请求

In [11]:
from urllib import request

req = request.Request('http://www.douban.com/')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))

Status: 200 OK
Date: Wed, 27 Jul 2022 06:51:13 GMT
Content-Type: text/html; charset=utf-8
Transfer-Encoding: chunked
Connection: close
X-Xss-Protection: 1; mode=block
X-Douban-Mobileapp: 0
Expires: Sun, 1 Jan 2006 01:00:00 GMT
Pragma: no-cache
Cache-Control: must-revalidate, no-cache, private
X-DAE-App: talion
X-DAE-Instance: default
Set-Cookie: bid=cdMiazZbstk; Expires=Thu, 27-Jul-23 06:51:13 GMT; Domain=.douban.com; Path=/
X-DOUBAN-NEWBID: cdMiazZbstk
Server: dae
Strict-Transport-Security: max-age=15552000
X-Content-Type-Options: nosniff
X-Frame-Options: SAMEORIGIN
Data: 

<!DOCTYPE html>
<html itemscope itemtype="http://schema.org/WebPage" class="ua-safari ua-mobile ">
  <head>
      <meta charset="UTF-8">
      <title>豆瓣(手机版)</title>
      <meta name="google-site-verification" content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" />
      <meta name="viewport" content="width=device-width, height=device-height, user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.

In [13]:
from urllib import request, parse

print('Login to weibo.cn...')
email = input('Email:')
passwd = input('Password: ')
login_data = parse.urlencode([
    ('username', email),
    ('password', passwd),
    ('entry', 'mweibo'),
    ('client_id',''),
    ('savestate', '1'),
    ('ec',''),
    ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F')
])

req = request.Request('https://passport.weibo.cn/sso/login')
req.add_header('Origin', 'https://passport.weibo.cn')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
req.add_header('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')

with request.urlopen(req, data=login_data.encode('utf-8')) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))

Login to weibo.cn...
Email:janineflagship@163.com
Password: duoduo20201210@w
Status: 200 OK
Server: nginx
Date: Wed, 27 Jul 2022 07:20:31 GMT
Content-Type: text/html
Transfer-Encoding: chunked
Connection: close
Vary: Accept-Encoding
Cache-Control: no-cache, must-revalidate
Expires: Sat, 26 Jul 1997 05:00:00 GMT
Pragma: no-cache
Access-Control-Allow-Origin: https://passport.weibo.cn
Access-Control-Allow-Credentials: true
X-Via-SSL: ssl.138.sinag1.yf.lb.sinanode.com
Data: {"retcode":50050011,"msg":"\u8bf7\u5b8c\u6210\u9a8c\u8bc1","data":{"username":"janineflagship@163.com","errurl":"https:\/\/passport.weibo.cn\/verify\/index?id=2YjBi4Oc_AAMX7gWR5ssQN9Q7Beu8QABLBWxvZ2lu&showmenu=0&r=https%3A%2F%2Fpassport.weibo.cn%2Fsignin%2Fwelcome%3Fentry%3Dmweibo%26r%3Dhttp%253A%252F%252Fm.weibo.cn%252F","errline":15}}


In [15]:
import urllib

In [16]:
proxy_handler = urllib.request.ProxyHandler({'http': 'http://www.example.com:3128/'})
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'host', 'username', 'password')
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
with opener.open('http://www.example.com/login.html') as f:
    pass

URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>

# 利用urllib读取JSON，然后将JSON解析为Python对象

In [21]:
# -*- coding: utf-8 -*-
from urllib import request
import json

In [26]:
# -*- coding: utf-8 -*-
from urllib import request
import json


def fetch_data(url):
    with request.urlopen(url) as f:
        data = f.read()
        pobj = json.loads(data.decode('utf-8'))

    return pobj

In [27]:
# 测试
URL = 'https://yesno.wtf/api'
data = fetch_data(URL)
print(data)
assert data['answer']== 'yes' and (data['forced']== False)
print('ok')

{'answer': 'yes', 'forced': False, 'image': 'https://yesno.wtf/assets/yes/3-422e51268d64d78241720a7de52fe121.gif'}
ok


# SAX

In [28]:
from xml.parsers.expat import ParserCreate

class DefaultSaxHandler(object):
    
    def start_element(self, name, attrs):
        print('sax:start_element: %s, attrs: %s' % (name, str(attrs)))

    def end_element(self, name):
        print('sax:end_element: %s' % name)

    def char_data(self, text):
        print('sax:char_data: %s' % text)
        
xml = r'''<?xml version="1.0"?>
<ol>
    <li><a href="/python">Python</a></li>
    <li><a href="/ruby">Ruby</a></li>
</ol>
'''

handler = DefaultSaxHandler()
parser = ParserCreate()
parser.StartElementHandler = handler.start_element
parser.EndElementHandler = handler.end_element
parser.CharacterDataHandler = handler.char_data
parser.Parse(xml)

sax:start_element: ol, attrs: {}
sax:char_data: 

sax:char_data:     
sax:start_element: li, attrs: {}
sax:start_element: a, attrs: {'href': '/python'}
sax:char_data: Python
sax:end_element: a
sax:end_element: li
sax:char_data: 

sax:char_data:     
sax:start_element: li, attrs: {}
sax:start_element: a, attrs: {'href': '/ruby'}
sax:char_data: Ruby
sax:end_element: a
sax:end_element: li
sax:char_data: 

sax:end_element: ol


1

In [30]:
xml = '''<response>
    <status>1</status>
    <count>1</count>
    <info>OK</info>
    <infocode>10000</infocode>
    <lives type="list">
        <live>
            <province>北京</province>
            <city>北京市</city>
            <adcode>110000</adcode>
            <weather>多云</weather>
            <temperature>21</temperature>
            <winddirection>东</winddirection>
            <windpower>≤3</windpower>
            <humidity>72</humidity>
            <reporttime>2021-05-14 19:33:56</reporttime>
        </live>
    </lives>
</response>
'''

In [31]:
handler = DefaultSaxHandler()
parser = ParserCreate()
parser.StartElementHandler = handler.start_element
parser.EndElementHandler = handler.end_element
parser.CharacterDataHandler = handler.char_data
parser.Parse(xml)

sax:start_element: response, attrs: {}
sax:char_data: 

sax:char_data:     
sax:start_element: status, attrs: {}
sax:char_data: 1
sax:end_element: status
sax:char_data: 

sax:char_data:     
sax:start_element: count, attrs: {}
sax:char_data: 1
sax:end_element: count
sax:char_data: 

sax:char_data:     
sax:start_element: info, attrs: {}
sax:char_data: OK
sax:end_element: info
sax:char_data: 

sax:char_data:     
sax:start_element: infocode, attrs: {}
sax:char_data: 10000
sax:end_element: infocode
sax:char_data: 

sax:char_data:     
sax:start_element: lives, attrs: {'type': 'list'}
sax:char_data: 

sax:char_data:         
sax:start_element: live, attrs: {}
sax:char_data: 

sax:char_data:             
sax:start_element: province, attrs: {}
sax:char_data: 北京
sax:end_element: province
sax:char_data: 

sax:char_data:             
sax:start_element: city, attrs: {}
sax:char_data: 北京市
sax:end_element: city
sax:char_data: 

sax:char_data:             
sax:start_element: adcode, attrs: {}
sax:

1

In [None]:
def parseXml(xml_str):
    handlder = DefaultSaxHandler()
    parse = ParserCreate()
    parser.StartElementHandler = handler.start_element
    parse.EndElementHandler = handler.end_element
    
    data = handler .data
    
    return {
        'city': '?',
        'forecast': [
            {
                'date': '2017-11-17',
                'high': 43,
                'low' : 26
            },
            {
                'date': '2017-11-18',
                'high': 41,
                'low' : 20
            },
            {
                'date': '2017-11-19',
                'high': 43,
                'low' : 19
            }
        ]
    }
