# urllib 请求对象的定制

解决User-Agent反爬机制

In [None]:

import urllib.request

# Target page for the demo. Note the scheme is https, not http.
url = "https://www.baidu.com"


In [None]:

# url的组成
# https://www.baidu.com/s?wd=周杰伦

# http/https    www.baidu.com    80/443    s             wd=周杰伦    #
# 协议           主机            端口号      路径            参数        锚点
# http 80
# https 443
# mysql 3306
# oracle 1521
# redis 6379
# mongodb 27017

In [None]:


# First attempt: fetch the page with a bare urlopen() call and no custom
# headers. The response is used as a context manager so the underlying
# connection is closed as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')
print(content)

# Result: the request runs into the site's User-Agent anti-scraping check.
# Instead of the real page, only this small redirect stub came back:
# >>>>>>>>>>>>>>>>>>>>>>>
# <html>
# <head>
#         <script>
#                 location.replace(location.href.replace("https://","http://"));
#         </script>
# </head>
# <body>
#         <noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
# </body>
# </html>


In [None]:


# Fix: send a browser-like User-Agent header with the request.
# To get one: open Baidu in a browser, press F12, open the Network tab,
# refresh, select the first request, and copy the User-Agent value found at
# the bottom of its Headers panel.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# urlopen() has no parameter that accepts a headers dict, so the headers
# cannot be handed to it directly. Build a customized Request object instead.
# Request's positional order is (url, data, headers, ...): `data` sits between
# `url` and `headers`, so `headers` must be passed by keyword or it would be
# taken as `data`.
request = urllib.request.Request(url=url, headers=headers)
# Use the response as a context manager so the connection is closed once the
# body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
print(content)