In [None]:
"""网络爬虫

常用的网络请求模块：urllib, requests
urllib是Python的内置标准库
requests是基于urllib3封装的第三方库（并非对标准库urllib的直接封装），需要单独安装

常用的数据解析模块：lxml, pyquery, beautifulsoup

网络爬虫框架：scrapy
"""

In [2]:
"""urllib的基本使用

urllib提供的功能就是利用程序去执行各种HTTP请求
通过标识User-Agent等手段来伪装成浏览器请求
可以执行GET POST请求，或使用代理
POST请求可传递数据，data必须是bytes：request.urlopen(req, data=urllib.parse.urlencode([(xxx), (xxx)]).encode('utf-8'))
"""

from urllib import request

# Demo: fetch the CSDN home page with urllib and print the status line,
# every response header, and the decoded body.
req = request.Request('https://www.csdn.net/')
# Send a browser User-Agent string along with the request.
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
)

# urlopen() falls back to the global socket default (normally "no timeout"),
# so a stalled server would hang the kernel; bound the wait explicitly.
with request.urlopen(req, timeout=10) as r:
    print('Request Status: ', r.status, r.reason)

    print('Headers: ')
    for k, v in r.getheaders():
        print('%s: %s' % (k, v))

    print('Data: ')
    # Decode with the charset declared in the Content-Type header and only
    # fall back to UTF-8 when the server does not declare one.
    charset = r.headers.get_content_charset() or 'utf-8'
    # errors='replace' keeps a single malformed byte from aborting the demo.
    print(r.read().decode(charset, errors='replace'))

Request Status:  200 OK
Headers: 
Server: openresty
Date: Thu, 09 Jul 2020 02:28:28 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Accept-Encoding
Set-Cookie: uuid_tt_dd=10_9870921830-1594261707640-494345; Expires=Thu, 01 Jan 2025 00:00:00 GMT; Path=/; Domain=.csdn.net;
Set-Cookie: dc_session_id=10_1594261707640.328777; Expires=Thu, 01 Jan 2025 00:00:00 GMT; Path=/; Domain=.csdn.net;
Vary: Accept-Encoding
Strict-Transport-Security: max-age=31536000
Data: 
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">
    <meta name="referrer"content="always">
    <meta name="msvalidate.01" content="3189512127C34C46BC74BED5852D45E4" />
    <title>CSDN - 专业开发者社区</title>
    <meta data-n-head

In [5]:
"""requests的基本用法

支持HTTP连接保持和连接池
支持使用cookie保持会话
支持文件上传
支持自动响应内容的编码
支持国际化的URL和POST数据自动编码
"""

import requests

# Demo: fetch the CSDN home page with requests and print the final URL,
# the detected encoding, the status code, and the decoded body.
# NOTE: despite its name, `req` holds the Response returned by requests.get().
# requests never times out unless told to, so pass an explicit timeout to
# keep a stalled server from hanging the kernel.
req = requests.get(
    'https://www.csdn.net/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    },
    timeout=10,
)

print('Request URL: ', req.url)
print('Request Encode: ', req.encoding)
print('Request Status: ', req.status_code)

print('Text:')
print(req.text)

# Other views of the same response body:
#   req.content -> the raw, undecoded bytes
#   req.json()  -> the body parsed as JSON (raises if the body is not JSON,
#                  as is the case for this HTML page)

Request URL:  https://www.csdn.net/
Request Encode:  UTF-8
Request Status:  200
Text:
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">
    <meta name="referrer"content="always">
    <meta name="msvalidate.01" content="3189512127C34C46BC74BED5852D45E4" />
    <title>CSDN - 专业开发者社区</title>
    <meta data-n-head="true" data-hid="description" name="description" content="CSDN是全球知名中文IT技术交流平台,创建于1999年,包含原创博客、精品问答、职业培训、技术论坛、资源下载等产品服务,提供原创、优质、完整内容的专业IT技术开发社区.">
    <script src='//g.csdnimg.cn/tingyun/1.8.3/www.js' type='text/javascript'></script>
    <link ref="canonical"  href="https://www.csdn.net/">
    <link href="//csdnimg.cn/public/favicon.ico" rel="SHORTCUT ICON">
    <link rel="stylesheet" href="//csdnimg.cn/public/common/