# Ⅰ.静态网页请求

In [None]:
'''
功能：使用Urllib3库来完成请求
作者：吴小华
时间：2019-7-21 上午
'''
import urllib3

# 1. Send the URL request
http = urllib3.PoolManager()  # connection pool used to issue HTTP requests
# Bound the wait so an unresponsive server cannot hang the cell forever;
# 3 seconds matches the timeout used by the requests-based cells in this file.
resp = http.request('GET', 'http://www.163.com', timeout=3.0)
print(resp.status)  # status code; 200 means success

# 2. Decode the page body
# errors='replace' keeps one stray invalid byte sequence from aborting the
# whole decode with UnicodeDecodeError; such bytes become U+FFFD instead.
data = resp.data.decode('GBK', errors='replace')
print(data)  # print the decoded page

In [None]:
'''
功能：使用requests库来完成请求
作者：吴小华
时间：2019-7-21 上午
'''
import requests
import chardet

# 1. Send the URL request
url = 'http://www.163.com'
ua = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
resp = requests.get(url, headers=ua, timeout=3)
print(resp.status_code)  # HTTP status code of the response
print(resp.encoding)  # encoding currently set on the response

# 2. Fix the page encoding
# Run the detection once and reuse its result: chardet scans the whole
# body, so calling it a second time only repeats the same work.
detected = chardet.detect(resp.content)
print(detected)  # detection result for the raw bytes
resp.encoding = detected['encoding']  # used by resp.text from here on
print(resp.text)

In [None]:
'''
功能：使用re解析网页
作者：吴小华
时间：2019-7-21 上午
'''
import re

# 1. Match with search
# Lookbehind/lookahead pattern: matches only the text between "title>"
# and "</title", without the tags themselves.
title_str = r'(?<=title>).*?(?=</title)'
title_pattern = re.compile(title_str)  # compile the pattern string
title_search = title_pattern.search(resp.text)
# search() returns None when nothing matches; guard it so a page without
# a matching title yields None instead of raising AttributeError.
title = title_search.group() if title_search is not None else None
print(title)

# 2. Match with findall
# With one capture group, findall returns the list of captured title texts.
title_str1 = r'<title>(.*?)</title>'
title_pattern1 = re.compile(title_str1)  # compile the pattern string
title1 = title_pattern1.findall(resp.text)
print(title1)

# Ⅱ.静态网页解析

In [None]:
'''
功能：使用xpath解析网页
作者：吴小华
时间：2019-7-21 下午
'''
import requests
import chardet
import requests
from PIL import Image
from lxml import etree

# 1. Send the URL request
url = 'http://www.163.com'
resp = requests.get(url, timeout=3)
# Status code first, then the body decoded with whatever encoding the
# response carries at this point (the chardet correction happens in step 2).
print(resp.status_code, resp.text, sep='\n')

# 2. Fix the encoding
detection = chardet.detect(resp.content)
resp.encoding = detection['encoding']
print(resp.encoding)

# 3. Build an element tree that XPath queries can run against
html = resp.text  # page text decoded with the corrected encoding
html_parser = etree.HTMLParser(encoding='utf-8')
html_etree = etree.HTML(html, parser=html_parser)
# Serialized, pretty-printed copy of the tree; kept in `result`, not printed.
result = etree.tostring(
    html_etree,
    method='html',
    pretty_print=True,
    encoding='utf-8',
)
print(html_etree)  # prints the root element object itself

# 4. Extract elements
result1 = html_etree.xpath('head')  # <head> children of the root element
print(result1)
result2 = html_etree.xpath('//title/text()')  # text of every <title>
print(result2)
# Link texts and link targets under div elements whose class attribute
# starts with "news_".
results3 = html_etree.xpath('//div[starts-with(@class,"news_")]/ul/li/a/text()')
results4 = html_etree.xpath('//div[starts-with(@class,"news_")]/ul/li/a/@href')
for extracted in (results3, results4):
    print(extracted)

# 5. Save the text data
# Pair every link text with its target via zip() rather than indexing a
# fixed 41 entries: the number of matches depends on the live page, so a
# shorter result raised IndexError and a longer one was silently truncated.
# The explicit UTF-8 encoding keeps the write independent of the platform's
# default codec, which may be unable to encode some characters.
with open('C:/Users/op/Desktop/111/news.txt', 'w', encoding='utf-8') as pf:
    for text, link in zip(results3, results4):
        pf.write(text + ':' + link + '\n')

# 6. Save the image
url = 'http://cms-bucket.ws.126.net/2019/07/21/01d104df618a4da8bee48fbfd02507d3.gif'
pic_path = 'C:/Users/op/Desktop/111/pic1.gif'  # written in step 6, read in step 7
# The context manager closes the session once the download is done;
# the timeout keeps an unresponsive server from hanging the cell.
with requests.Session() as s:
    imag = s.get(url, timeout=3)
# Stop on an HTTP error status instead of saving the error body as a .gif.
imag.raise_for_status()
with open(pic_path, 'wb') as pf:
    pf.write(imag.content)

# 7. View the image
im = Image.open(pic_path)
im.show()

In [None]:
'''
功能：使用BeautifulSoup解析网页
作者：吴小华
时间：2019-7-21 下午
'''
import requests
from bs4 import BeautifulSoup
import chardet

# 1. Send the URL request
url = 'http://www.hao123.com'
ua = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}
resp = requests.get(url, headers=ua, timeout=3)
print(resp.encoding)  # encoding set on the response before correction

# 2. Fix the encoding
resp.encoding = chardet.detect(resp.content)['encoding']
print(resp.encoding)  # encoding after applying the chardet result

# 3. Parse page elements
html = resp.content  # raw response bytes are handed to the parser
soup = BeautifulSoup(html, 'html.parser')  # build the BeautifulSoup tree
# No prettify step is needed before querying: prettify() only returns a
# formatted string, and referencing it without calling it did nothing.
title_tag = soup.title
if title_tag is None:
    # A page without a <title> would otherwise raise AttributeError on None.
    print('no <title> element found')
else:
    print(title_tag.get_text())