# 网络爬虫

编码：utf-8；作者：王明智；Email：1765471602@qq.com；

## 导入第三方库

In [None]:
import csv
import json
import math
import re
import time

import geopandas
import jieba
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import colors
from shapely.geometry import MultiPolygon, Polygon
from wordcloud import WordCloud

## 修改全局变量

In [None]:
plt.rcParams['font.sans-serif'] = ['SimSun']
plt.rcParams['axes.unicode_minus'] = False

## 获取交叉口经纬度数据

根据交叉口名称，通过高德地图API接口爬取交叉口的经纬度

In [None]:
def getintersectionlatlng(temp_crossroadname, result, error):
    """
    Geocode one batch of intersection names through the AMap geocoding API.

    Successful lookups are appended to ``result`` as
    ``(name, formatted_address, adcode, location, level)`` tuples.  When the
    request times out or the API reports a failure, every name of the batch is
    appended to ``error`` instead.  Both lists are modified in place.

    @param temp_crossroadname: list of intersection names sent in a single batch request
    @param result: list collecting the rows of successfully geocoded names
    @param error: list collecting the names that could not be geocoded
    @return: None
    @raise requests.HTTPError: if the server answers with an HTTP error status
    """
    url = 'https://restapi.amap.com/v3/geocode/geo'
    # Passing the query as ``params`` lets requests escape the names, so a name
    # containing '&' or '#' cannot corrupt the query string.
    params = {
        'address': '|'.join(temp_crossroadname),
        'batch': 'true',
        'output': 'json',
        'key': 'eff48ee434d763609e59839fa946b9e1',
    }
    try:
        r_text = requests.get(url, params=params, timeout=25)
    except (TimeoutError, requests.exceptions.Timeout):
        # requests raises its own Timeout class, which is not a builtin TimeoutError.
        error.extend(temp_crossroadname)
        print('timeout error')
        return

    try:
        r_text.raise_for_status()  # surface HTTP errors immediately
        content = json.loads(r_text.content)
    finally:
        r_text.close()  # always release the connection

    if content.get("status") != "1":
        # A failed request carries no per-name entries, so the whole batch is recorded.
        error.extend(temp_crossroadname)
        print('error!')
        return

    for name, geocode in zip(temp_crossroadname, content.get("geocodes", [])):
        result.append((
            name,
            geocode["formatted_address"],
            geocode["adcode"],
            geocode["location"],
            geocode["level"],
        ))


# Both output lists start with a header entry.
result = [('Name', 'formatted_address', 'adcode', 'location', 'level')]  # geocoding rows
error = ['Name']  # intersections whose request failed

with open(r'F:\18120900\桌面\地理逆编码.txt', 'r', encoding='utf-8') as f:
    crossRoad = f.readlines()
print(len(crossRoad))

temp_crossRoadName = []  # names waiting to be sent as the next batch
for i, crossRoadName in enumerate(crossRoad):
    temp_crossRoadName.append(crossRoadName.replace('\n', ''))
    # Send a batch after every 10th name and once more for the trailing remainder.
    if i % 10 != 9 and i != len(crossRoad) - 1:
        continue
    print(i + 1)  # progress: number of intersections handled so far
    getintersectionlatlng(temp_crossRoadName, result, error)
    temp_crossRoadName.clear()
    time.sleep(2)

df = pd.DataFrame(result)
df.to_excel(r'F:\18120900\桌面\地理逆编码处理结果.xlsx')
print('Finished')

## 获取省市县区划Shape数据

参考链接：https://mp.weixin.qq.com/s/cUW7cm0_shipSs2_-3x5Ag
参考链接：https://mp.weixin.qq.com/s/JKP-Do8zR_hiW4qJrahgYQ

In [None]:
def crawler(_columns, _properties, _geometry, url, _code):
    """
    Download one GeoJSON boundary file and collect its features.

    The three list arguments are filled in place; nothing is returned.
    @param _columns: list receiving the property names (appended once, as one list, while it is still empty)
    @param _properties: list receiving one row of stringified property values per feature
    @param _geometry: list receiving one MultiPolygon per feature
    @param url: base url; ``url + _code + '.json'`` is the address that is requested
    @param _code: code of the area to download
    @return: None
    @raise requests.HTTPError: if the server answers with an HTTP error status
    """
    r_text = requests.get(url + _code + '.json', timeout=25)
    r_text.raise_for_status()  # surface HTTP errors immediately
    content = json.loads(r_text.content)
    for feature in content['features']:
        attributes = feature['properties']
        if not _columns:
            _columns.append(list(attributes.keys()))
        # Every value is stored as str; other value types cause problems when a shapefile is written.
        _properties.append([str(value) for value in attributes.values()])

        geom = feature['geometry']
        # GeoJSON "Polygon" coordinates are one list of rings, "MultiPolygon"
        # coordinates are a list of such lists; normalise to the latter.
        parts = [geom['coordinates']] if geom['type'] == 'Polygon' else geom['coordinates']
        # In each part the first ring is the outer boundary and the remaining rings are holes.
        _geometry.append(MultiPolygon([Polygon(rings[0], rings[1:]) for rings in parts]))


def draw(data, title="中国地图"):
    """
    Plot a GeoDataFrame after projecting it to EPSG:4524.

    @param data: GeoDataFrame with a CRS set; it is not modified
    @param title: figure title (defaults to the previously hard-coded text)
    @return: None
    """
    fig, ax = plt.subplots()
    # Project to epsg:4524 so the map does not look flattened.  The authority
    # string replaces the deprecated {'init': ...} dictionary form.
    data.to_crs("EPSG:4524").plot(ax=ax, alpha=0.85)
    plt.title(title, fontsize=12)
    plt.tight_layout()
    plt.show()
    

def geojson2shape(data, file_save, crs):
    """
    Reproject a GeoDataFrame and store it as an ESRI shapefile.

    @param data: GeoDataFrame; it is reprojected in place
    @param file_save: output path without the ``.shp`` extension
    @param crs: coordinate reference system used for the shapefile
    @return: None
    """
    data.to_crs(crs, inplace=True)
    shape_path = file_save + '.shp'
    data.to_file(shape_path, driver='ESRI Shapefile', encoding='utf-8')
    print(f"保存成功，文件存放在：{file_save}")


def geojson2file(data, file_save, crs):
    """
    Reproject a GeoDataFrame and store it as a GeoJSON file.

    @param data: GeoDataFrame; it is reprojected in place
    @param file_save: output path without the ``.json`` extension
    @param crs: coordinate reference system used for the output
    @return: None
    """
    data.to_crs(crs, inplace=True)
    # GeoDataFrame.to_json() only returns a string and does not take a path,
    # so the file has to be written through to_file() with the GeoJSON driver.
    data.to_file(file_save + '.json', driver='GeoJSON', encoding='utf-8')
    print("保存成功，文件存放在：" + file_save)


# Download the boundary features of every listed area code and plot them.
columns, properties, geometry = [], [], []
areacode_list = ['410000_full']
for code in areacode_list:
    crawler(columns, properties, geometry, r"https://geo.datav.aliyun.com/areas_v2/bound/", code)
df = pd.DataFrame(properties, columns=columns[0])
# Declare the geographic CRS of the GeoJSON data.  The authority string
# replaces the deprecated {'init': 'epsg:4326'} dictionary form.
gdf = geopandas.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
draw(gdf)  # plot the downloaded map
# Optional export as a shapefile:
# file_name = '省级行政区划'
# geojson2shape(gdf, file_name, "EPSG:4326")
# print("Done")

## 爬取高德地图收费站数据

In [None]:
def getstationinfo(url, params):
    """
    Page through a place-search query and collect the returned POIs in a table.

    Only string-valued fields are kept.  The column set is taken from the
    first POI of the first page; every other POI is read by those field names,
    so a POI whose field is missing or not a string gets ``None`` in that
    column instead of shifting the rest of its row.

    @param url: str, address of the search endpoint
    @param params: dict of query parameters; must contain 'offset' (page size). It is not modified.
    @return: Dataframe with one row per POI (empty when the query returns no POIs)
    """
    query = dict(params)  # work on a copy so the caller's dict is left untouched
    page_size = int(query['offset'])
    query['page'] = 1
    js = crawler(url, query)
    print('total station count:', js.get('count', 0))
    if js.get('status') != '1':
        print('meet an error in page:', 1)
        return pd.DataFrame()
    if not js.get('pois'):
        return pd.DataFrame()

    name = [key for key, value in js['pois'][0].items() if isinstance(value, str)]
    pagecount = -(-int(js['count']) // page_size)  # ceiling division
    temp_list = []
    for page in range(1, pagecount + 1):
        if page > 1:  # the first page has already been downloaded above
            query['page'] = page
            js = crawler(url, query)
        if js['status'] != '1':
            print('meet an error in page:', page)
            continue
        for poi in js['pois']:
            temp_list.append([poi[key] if isinstance(poi.get(key), str) else None for key in name])
        print('processing:', (page - 1) * page_size + len(js['pois']))
    return pd.DataFrame(columns=name, data=temp_list)


def crawler(url, params):
    """
    Send one GET request and return the decoded JSON body.

    @param url: str, address to request
    @param params: dict of query parameters
    @return: the parsed JSON document
    @raise requests.HTTPError: if the response status code signals an error
    """
    # Assigning requests.DEFAULT_RETRIES has no effect, because nothing in
    # requests reads that attribute.  Retries have to be configured on a
    # transport adapter that is mounted on a session.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        res = session.get(url, params=params, timeout=25)
        res.raise_for_status()  # raise if the status code signals an error
        js = json.loads(res.text)
    time.sleep(5)  # pause after every request
    return js


def GCJ2WGS(location):
    """
    Convert a GCJ-02 coordinate string to a WGS-84 coordinate string.

    @param location: "lon,lat" text, e.g. "113.923745,22.530824"
    @return: str: "wgsLon,wgsLat", both rounded to 6 decimals
    """
    # Official API: http://lbs.amap.com/api/webservice/guide/api/convert
    # Coordinate systems explained: http://lbs.amap.com/faq/top/coordinate/3
    # GCJ02->WGS84, Java version: http://www.cnblogs.com/xinghuangroup/p/5787306.html
    # Site for checking converted coordinates: http://www.gpsspg.com/maps.htm

    comma = location.find(",")
    lon = float(location[0:comma])
    lat = float(location[comma + 1:len(location)])

    a = 6378245.0  # semi-major axis of the Krasovsky ellipsoid
    ee = 0.00669342162296594323  # squared first eccentricity of the Krasovsky ellipsoid
    PI = 3.14159265358979324
    sin = math.sin

    x = lon - 105.0
    y = lat - 35.0

    # Term that appears unchanged in both the longitude and the latitude offset.
    shared_wave = (20.0 * sin(6.0 * x * PI) + 20.0 * sin(2.0 * x * PI)) * 2.0 / 3.0

    # Longitude offset
    lon_poly = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    lon_mid = (20.0 * sin(x * PI) + 40.0 * sin(x / 3.0 * PI)) * 2.0 / 3.0
    lon_long = (150.0 * sin(x / 12.0 * PI) + 300.0 * sin(x / 30.0 * PI)) * 2.0 / 3.0
    d_lon = lon_poly + shared_wave + lon_mid + lon_long

    # Latitude offset
    lat_poly = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    lat_mid = (20.0 * sin(y * PI) + 40.0 * sin(y / 3.0 * PI)) * 2.0 / 3.0
    lat_long = (160.0 * sin(y / 12.0 * PI) + 320 * sin(y * PI / 30.0)) * 2.0 / 3.0
    d_lat = lat_poly + shared_wave + lat_mid + lat_long

    # Scale both offsets to degrees at the given latitude.
    rad_lat = lat / 180.0 * PI
    sin_lat = sin(rad_lat)
    magic = 1 - ee * sin_lat * sin_lat
    sqrt_magic = math.sqrt(magic)
    d_lat = (d_lat * 180.0) / ((a * (1 - ee)) / (magic * sqrt_magic) * PI)
    d_lon = (d_lon * 180.0) / (a / sqrt_magic * math.cos(rad_lat) * PI)

    return f"{round(lon - d_lon, 6)},{round(lat - d_lat, 6)}"


# Download every toll station of the configured city and export the table to Excel.
url = 'https://restapi.amap.com/v3/place/text?'
params = {'keywords': '收费站', 'city': 'heilongjiang', 'page': '1', 'offset': 20, 'key': 'eff48ee434d763609e59839fa946b9e1'}
df = getstationinfo(url, params)

df['wgsLocation'] = df['location'].map(GCJ2WGS)
# The date must not be stored under the name ``time``: that rebinds the time
# module to a str and breaks every later time.sleep() / time.strftime() call.
date_tag = time.strftime('%Y.%m.%d')
save_path = 'F:\\18120900\\桌面\\%s-%s.xlsx' % (params['city'], date_tag)
df.to_excel(save_path, index=False)
print("Done!")

## 爬取OSM数据

In [None]:
def crawler(place="福田区,深圳", filepath='szft'):
    """
    Download the OSM graph of a place, display it and save it as shapefiles.

    @param place: place name passed to ``ox.graph_from_place`` (defaults to the previously hard-coded district)
    @param filepath: output location passed to ``ox.save_graph_shapefile``
    @return: None
    """
    city = ox.graph_from_place(place)  # download the map from OSM
    ox.plot_graph(city)  # display the map
    # NOTE(review): save_graph_shapefile appears to have been removed in OSMnx 2.0 - confirm the installed version.
    ox.save_graph_shapefile(city, filepath=filepath)  # save the map


crawler()
print('Done')

## 爬取豆瓣数据并生成词云

In [None]:
# Hex colours from which simpleWC3 builds the ListedColormap used for the word cloud.
color_list = [
    '#CD853F', '#DC143C', '#00FF7F', '#FF6347', '#8B008B', '#00FFFF',
    '#0000FF', '#8B0000', '#FF8C00', '#1E90FF', '#00FF00', '#FFD700',
    '#008080', '#008B8B', '#8A2BE2', '#228B22', '#FA8072', '#808080']


def getCommentsFromDouban(url, headers, comments):
    """
    Download one comment page, collect its short comments and print the star total.

    The text of every ``<span class="short">`` element is appended to
    ``comments``.  The numbers embedded in the ``allstarNN`` rating classes of
    the page are summed and printed.

    @param url: address of the comment page
    @param headers: dict of HTTP headers sent with the request
    @param comments: list that receives the comment texts (modified in place)
    @return: None
    """
    try:
        r_text = requests.get(url, headers=headers, timeout=25)
    except (TimeoutError, requests.exceptions.Timeout):
        # requests raises its own Timeout class, which is not a builtin TimeoutError.
        print('timeout error:', url)
        return
    if not r_text.ok:
        # An error page contains no comments; report it instead of parsing it silently.
        print('request failed with status', r_text.status_code, 'for', url)
        return

    soup = BeautifulSoup(r_text.text, 'lxml')
    for item in soup.find_all('span', 'short'):
        # ``item.string`` is None when the span has nested tags, which would
        # break a later str.join(); get_text() always returns a str.
        text = item.get_text()
        if text.strip():
            comments.append(text)

    # Only digits are captured so that int() cannot fail on an unexpected class name.
    stars = re.findall(r'<span class="user-stars allstar(\d+) rating"', r_text.text)
    print(sum(int(star) for star in stars))


def simpleWC3(sep=' ', back='black', freDictpath='data_fre.json', savepath='res.png'):
    """
    Word-cloud demo that colours the words with a custom colour table.

    @param sep: separator between word and count on each line of the frequency file
    @param back: background colour of the word cloud
    @param freDictpath: either a ready-made ``{word: count}`` dict, or the path of a
        text file holding one ``word<sep>count`` entry per line
    @param savepath: currently unused, because saving the image is disabled
    @return: None
    @raise FileNotFoundError: if ``freDictpath`` is a path that does not exist
    """
    # Build the colormap object from the custom colour table.
    colormap = colors.ListedColormap(color_list)
    if isinstance(freDictpath, dict):
        # open() raises TypeError (not FileNotFoundError) for a dict, so a
        # frequency dict has to be recognised before any file access.
        fre_dict = freDictpath
    else:
        fre_dict = {}
        with open(freDictpath) as f:
            for line in f:
                line = line.strip()
                if not line:  # skip blank lines instead of failing on them
                    continue
                parts = line.split(sep)
                fre_dict[parts[0]] = int(parts[1])
    wc = WordCloud(
        font_path='font/simhei.ttf',  # font file
        background_color=back,  # background colour
        max_words=2300,  # maximum number of words shown
        max_font_size=120,  # largest font size
        colormap=colormap,  # custom colormap object
        margin=2, width=3200, height=2400, random_state=42, prefer_horizontal=0.5)  # place vertically when horizontal does not fit
    wc.generate_from_frequencies(fre_dict)
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    # wc.to_file(savepath)


# Collect the book comments
pageNum = 10
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0;Win64;64) AppleWebKit/537.36(KHTML,like Gecko) Chrome/78.03904.108 Safari/537.36'}
comments = []
for index in range(1, pageNum + 1):
    url = 'https://book.douban.com/subject/1029553/comments/hot?p=' + str(index)
    getCommentsFromDouban(url, headers, comments)

# Build the word cloud
# Missing (None/empty) comments are skipped so that the join cannot fail.  The
# tokens are taken straight from jieba: joining them with '/' and splitting
# again would break up every token that itself contains a '/'.
word_list = [word for word in jieba.cut('。'.join(c for c in comments if c)) if word.strip()]
fre_dict = {}
for one in word_list:
    fre_dict[one] = fre_dict.get(one, 0) + 1
simpleWC3(sep=' ', back='black', freDictpath=fre_dict, savepath='simpleWC3.png')

## 爬取疫情数据

In [None]:
def str2json(str, code):
    """
    Strip a callback wrapper and escaping from a response text.

    The substitutions are applied one after another, in this order: remove
    every backslash, '(' and ')', remove every occurrence of ``code``, then
    turn '"{' into '{' and '}"' into '}'.

    @param str: response text
    @param code: callback name to remove from the text
    @return: the cleaned text
    """
    substitutions = (
        ('\\', ''),
        ('(', ''),
        (')', ''),
        (code, ''),
        ('"{', '{'),
        ('}"', '}'),
    )
    cleaned = str
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def get_json(url, code):
    """
    Download a callback-wrapped response and parse it as JSON.

    @param url: address to request
    @param code: callback name that ``str2json`` strips from the response text
    @return: the parsed JSON document
    @raise requests.HTTPError: if the server answers with an HTTP error status
    """
    # A timeout keeps the call from blocking forever on an unresponsive server.
    response = requests.get(url, timeout=25)
    response.raise_for_status()  # fail here rather than while parsing an error page
    text = response.content.decode('utf-8')
    res = str2json(text, code)
    data = json.loads(res)  # extract the data part
    return data


def get_china_data(data):
    """
    Write the per-city case figures of a parsed response to a CSV file.

    The file "中国各城市病例数据-new.csv" is created in the current working
    directory (an existing file is overwritten).  It holds a header row followed
    by one row per city, taken from ``data["data"]["areaTree"][0]["children"]``.

    @param data: parsed response with the keys "data" -> "lastUpdateTime" and "areaTree"
    @return: None
    """
    update_time = data["data"]["lastUpdateTime"]
    areaTree = data["data"]["areaTree"]  # figures of all areas

    filepath = "中国各城市病例数据-new.csv"
    header = [
        "province", "city_name", "today_confirm", "today_confirmCuts",
        "total_confirm", "total_dead", "total_heal", "total_nowConfirm",
        "total_suspect", "update_time"]
    with open(filepath, "w", newline="", encoding='utf_8_sig') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        for province in areaTree[0]["children"]:  # first tree entry: China, one child per province
            # A province without a city list contributes no rows instead of raising KeyError.
            for city in province.get("children", []):
                today = city["today"]
                total = city["total"]
                writer.writerow([
                    province["name"],      # province
                    city["name"],          # city name
                    today["confirm"],      # cases confirmed today
                    today["confirmCuts"],
                    total["confirm"],      # total confirmed cases
                    total["dead"],         # total deaths
                    total["heal"],         # total recovered
                    total["nowConfirm"],
                    total["suspect"],      # total suspected cases
                    update_time,
                ])


# Callback name embedded in the request; get_json strips it from the response again.
code_china = "jQuery34105039333360681013_1584838849613"
callback_china = "callback=jQuery34105039333360681013_1584838849613&_=1584838849614"
name_china = "name=disease_h5"
# Address template: slot 0 takes the data-set name, slot 1 the callback arguments.
url = "https://view.inews.qq.com/g2/getOnsInfo?{0}&{1}"
url_china = url.format(name_china, callback_china)
china_payload = get_json(url_china, code_china)
get_china_data(china_payload)
print("已完成中国数据的爬取")