In [1]:
import hashlib
import json
import os
import time
import traceback

import pymysql
import requests
from bs4 import BeautifulSoup



In [2]:
def get_tencent_data():
    """
    Fetch national history and per-city detail figures from the Tencent news API.

    Two endpoints are queried: one for the ``diseaseh5Shelf`` module (current
    per-city figures) and one for the day-by-day national lists.

    :return: tuple ``(history, details)``.
        ``history`` maps a ``YYYY-MM-DD`` string to a dict with the keys
        ``confirm``, ``confirm_now``, ``suspect``, ``heal``, ``dead`` and, for
        days present in ``chinaDayAddList``, ``confirm_add``, ``suspect_add``,
        ``heal_add``, ``dead_add``.
        ``details`` is a list of rows
        ``[update_time, province, city, confirm, confirm_add, confirm_now, heal, dead]``.
    """
    def _iso_day(entry):
        # Each entry stores the year in "y" and the rest of the date in "date";
        # join them and re-format as YYYY-MM-DD so the value can be stored in
        # the database date column.
        parsed = time.strptime(entry["y"] + "." + entry["date"], "%Y.%m.%d")
        return time.strftime("%Y-%m-%d", parsed)

    url_det = 'https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf'
    url_his = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # The second positional parameter of requests.get is `params`, so the
    # headers must be passed by keyword; otherwise the user-agent ends up in
    # the query string instead of the request headers.
    r_det = requests.get(url_det, headers=headers)
    r_his = requests.get(url_his, headers=headers)
    data_det = json.loads(r_det.text)['data']['diseaseh5Shelf']  # JSON text -> dict
    data_his = json.loads(r_his.text)['data']

    history = {}  # cumulative and daily-increment figures keyed by date
    for day in data_his["chinaDayList"]:
        history[_iso_day(day)] = {
            "confirm": day["confirm"],
            "confirm_now": day["nowConfirm"],
            "suspect": day["suspect"],
            "heal": day["heal"],
            "dead": day["dead"],
        }
    for day in data_his["chinaDayAddList"]:
        # setdefault: a day that only appears in the increment list must not
        # abort the whole fetch with a KeyError.
        history.setdefault(_iso_day(day), {}).update({
            "confirm_add": day["confirm"],
            "suspect_add": day["suspect"],
            "heal_add": day["heal"],
            "dead_add": day["dead"],
        })

    details = []  # current per-city figures
    update_time = data_det["lastUpdateTime"]
    # areaTree used to list 25 countries; it now only contains China.
    data_country = data_det["areaTree"]
    data_province = data_country[0]["children"]  # provinces of China
    for pro_infos in data_province:
        province = pro_infos["name"]
        for city_infos in pro_infos["children"]:
            total = city_infos["total"]
            details.append([
                update_time,
                province,
                city_infos["name"],              # city name
                total["confirm"],                # cumulative confirmed
                city_infos["today"]["confirm"],  # newly confirmed
                total["nowConfirm"],             # currently confirmed
                total["heal"],                   # cumulative recovered
                total["dead"],                   # cumulative deaths
            ])
    return history, details

In [3]:
def get_conn():
    """
    Open a connection to the MySQL database and create a cursor on it.

    Connection settings can be overridden with the environment variables
    ``COVID_DB_HOST``, ``COVID_DB_USER``, ``COVID_DB_PASSWORD`` and
    ``COVID_DB_NAME``. When a variable is unset, the value that used to be
    hard-coded here is used, so existing setups keep working unchanged.

    :return: tuple ``(conn, cursor)``
    """
    # NOTE(review): the fallback password is still stored in source; set
    # COVID_DB_PASSWORD and remove the default once every environment has it.
    conn = pymysql.connect(
        host=os.environ.get("COVID_DB_HOST", "127.0.0.1"),
        user=os.environ.get("COVID_DB_USER", "root"),
        password=os.environ.get("COVID_DB_PASSWORD", "wwj520hy"),
        db=os.environ.get("COVID_DB_NAME", "covid-19"),
        charset="utf8",
    )
    return conn, conn.cursor()
def close_conn(conn, cursor):
    """
    Close a cursor and its connection, skipping whichever is ``None``.

    The ``update_*`` functions call this from a ``finally`` block with
    ``conn``/``cursor`` still set to ``None`` when the failure happened before
    a connection was opened, so ``None`` must be accepted here.

    :param conn: connection object with a ``close()`` method, or ``None``
    :param cursor: cursor object with a ``close()`` method, or ``None``
    """
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

## 爬取tencent实时数据

In [4]:
def update_details():
    """
    Refresh the ``details`` table with the latest per-city figures.

    The fetched ``update_time`` is compared with the ``update_time`` of the
    newest stored row (highest ``id``); the rows are inserted only when that
    comparison is not truthy. Errors are printed as a traceback rather than
    raised, and the connection is closed in every case.
    """
    cursor = None
    conn = None
    try:
        li = get_tencent_data()[1]
        if not li:
            # Nothing was fetched, so there is no update_time to compare.
            print(f"{time.asctime()}——未获取到详细数据——")
            return
        conn, cursor = get_conn()
        sql = "insert into details(update_time,province,city,confirm,confirm_add,confirm_now,heal,dead) " \
              "values(%s,%s,%s,%s,%s,%s,%s,%s)"
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, li[0][0])
        if not cursor.fetchone()[0]:
            print(f"{time.asctime()}——开始更新——")
            for item in li:
                cursor.execute(sql, item)
            conn.commit()
            print(f"{time.asctime()}——更新完毕——")
        else:
            print(f"{time.asctime()}——已是最新数据——")
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        traceback.print_exc()
    finally:
        # conn stays None when the fetch or the connect itself failed.
        if conn is not None:
            close_conn(conn, cursor)

In [5]:
def update_history():
    """
    Insert the days that are not yet stored into the ``history`` table.

    For each fetched day a lookup by ``ds`` is run first; the row is inserted
    only when that lookup matches no rows. Errors are printed as a traceback
    rather than raised, and the connection is closed in every case.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]
        print(f"{time.asctime()}——更新历史数据——")
        conn, cursor = get_conn()
        sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        sql_query = "select confirm from history where ds=%s"
        for k, v in dic.items():
            # execute() returns the number of matched rows; 0 means the day is new.
            if not cursor.execute(sql_query, k):
                cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("confirm_now"),
                                     v.get("suspect"), v.get("suspect_add"), v.get("heal"),
                                     v.get("heal_add"), v.get("dead"), v.get("dead_add")])
        conn.commit()
        print(f"{time.asctime()}——历史数据更新完毕——")
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        traceback.print_exc()
    finally:
        # conn stays None when the fetch or the connect itself failed.
        if conn is not None:
            close_conn(conn, cursor)

## 爬取中高风险地区数据

In [6]:
def get_risk_area():
    """
    Fetch the current high- and medium-risk area lists.

    A signed POST request is sent to the risk-area endpoint; the response's
    ``highlist`` and ``middlelist`` are flattened to one row per community.

    :return: tuple ``(risk_h, risk_m)``; each is a list of rows
        ``[end_update_time, province, city, county, community, risk_label]``
        where ``risk_label`` is ``"高风险"`` or ``"中风险"``.
    """
    def _flatten(areas, label, utime):
        # One output row per community listed under each area entry.
        return [
            [utime, area['province'], area['city'], area['county'], community, label]
            for area in areas
            for community in area['communitys']
        ]

    # Current Unix time rounded to whole seconds, as a digit string.
    timestamp = ('%.3f' % (time.time() / 1e3)).replace('.', '')
    sign_key = "23y0ufFl5YxIyGrI8hWRUZmKkvtSjLQA"
    nonce = "123456789abcdefg"
    # Signature 1 goes into the request body.
    body_signature = hashlib.sha256(
        (timestamp + sign_key + nonce + timestamp).encode("utf8")
    ).hexdigest().upper()
    # Signature 2 goes into the x-wif-signature header.
    header_signature = hashlib.sha256(
        (timestamp + 'fTN2pfuisxTavbTuYVSsNJHetwq5bJvCQkjjtiLM2dCratiA' + timestamp).encode("utf8")
    ).hexdigest().upper()

    # POST payload
    post_dict = {
        'appId': 'NcApplication',
        'key': '3C502C97ABDA40D0A60FBEE50FAAD1DA',
        'nonceHeader': nonce,
        'paasHeader': 'zdww',
        'signatureHeader': body_signature,
        'timestampHeader': timestamp
    }
    headers = {
        'Content-Type': 'application/json; charset=utf-8',
        'Referer': 'http://bmfw.www.gov.cn/',
        'Origin': 'http://bmfw.www.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'x-wif-nonce': 'QkjjtiLM2dCratiA',
        'x-wif-paasid': 'smt-application',
        'x-wif-signature': header_signature,
        'x-wif-timestamp': timestamp,
    }
    url = "http://103.66.32.242:8005/zwfwMovePortal/interface/interfaceJson"
    response = requests.post(url=url, data=json.dumps(post_dict), headers=headers)
    data = json.loads(response.text)['data']

    utime = data['end_update_time']  # time of the last upstream update
    risk_h = _flatten(data['highlist'], "高风险", utime)
    risk_m = _flatten(data['middlelist'], "中风险", utime)
    return risk_h, risk_m

In [7]:
def update_risk_area():
    """
    Refresh the ``risk_area`` table with the current high/medium-risk areas.

    The fetched ``end_update_time`` is compared with that of the newest stored
    row (highest ``id``); rows are inserted only when that comparison is not
    truthy. Errors are printed as a traceback rather than raised, and the
    connection is closed in every case.
    """
    cursor = None
    conn = None
    try:
        risk_h, risk_m = get_risk_area()
        # High-risk rows first, then medium-risk rows, as before. Every row
        # carries the same end_update_time, so the first row of the combined
        # list works even when there are no high-risk areas.
        rows = risk_h + risk_m
        if not rows:
            print(f"{time.asctime()}——无中高风险地区数据——")
            return
        conn, cursor = get_conn()
        sql = "insert into risk_area(end_update_time,province,city,county,address,type) values(%s,%s,%s,%s,%s,%s)"
        sql_query = 'select %s=(select end_update_time from risk_area order by id desc limit 1)'
        cursor.execute(sql_query, rows[0][0])
        if not cursor.fetchone()[0]:
            print(f"{time.asctime()}——开始更新最新数据——")
            for item in rows:
                cursor.execute(sql, item)
            conn.commit()  # commit the inserts
            print(f"{time.asctime()}——更新最新数据完毕——")
        else:
            print(f"{time.asctime()}——已是最新数据——")
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        traceback.print_exc()
    finally:
        # conn stays None when the fetch or the connect itself failed.
        if conn is not None:
            close_conn(conn, cursor)

## 爬取百度热搜数据

In [8]:
def get_baidu_hot():
    """
    Scrape the Baidu real-time hot-search board.

    :return: list of strings, one per entry, each the entry title immediately
        followed by its hot-index text, with newlines removed.
    """
    url = "https://top.baidu.com/board?tab=realtime"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, features="html.parser")
    titles = soup.select("div.c-single-text-ellipsis")
    counts = soup.select("div.hot-index_1Bl1a")
    # zip pairs titles with counts and stops at the shorter list, so a page
    # where the two selectors match different numbers of nodes no longer
    # raises IndexError.
    return [
        f"{title.text.strip()}{count.text.strip()}".replace('\n', '')
        for title, count in zip(titles, counts)
    ]

In [9]:
def update_hotsearch():
    """
    Insert the current Baidu hot-search entries into the ``hotsearch`` table.

    Every entry of one run is stored with the same timestamp. Errors are
    printed as a traceback rather than raised, and the connection is closed
    in every case.
    """
    cursor = None
    conn = None
    try:
        context = get_baidu_hot()
        print(f"{time.asctime()}——开始更新热搜数据——")
        conn, cursor = get_conn()
        sql = "insert into hotsearch(dt,content) values(%s,%s)"
        # Explicit %H:%M:%S instead of %X: %X is the locale's time format and
        # is not guaranteed to be a 24-hour HH:MM:SS string.
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        for i in context:
            cursor.execute(sql, (ts, i))  # insert one entry
        conn.commit()  # commit the inserts
        print(f"{time.asctime()}——数据更新完毕——")
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C still interrupts.
        traceback.print_exc()
    finally:
        # conn stays None when the fetch or the connect itself failed.
        if conn is not None:
            close_conn(conn, cursor)

In [10]:
# Refresh the details table; progress messages are printed below.
update_details()

Mon Mar  7 13:00:38 2022——开始更新——
Mon Mar  7 13:00:38 2022——更新完毕——


In [11]:
# Insert any missing days into the history table; progress messages are printed below.
update_history()

Mon Mar  7 13:00:38 2022——更新历史数据——
Mon Mar  7 13:00:38 2022——历史数据更新完毕——


In [12]:
# Refresh the risk_area table; progress messages are printed below.
update_risk_area()

Mon Mar  7 13:00:39 2022——开始更新最新数据——
Mon Mar  7 13:00:39 2022——更新最新数据完毕——


In [13]:
# Store the current hot-search entries in the hotsearch table; progress messages are printed below.
update_hotsearch()

Mon Mar  7 13:00:39 2022——开始更新热搜数据——
Mon Mar  7 13:00:39 2022——数据更新完毕——


In [14]:
# Endpoint URLs gathered in one place. The same addresses also appear as
# literals inside the scraper functions defined earlier in this notebook.
base_url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?"
url_details = f"{base_url}modules=diseaseh5Shelf"
url_history = f"{base_url}modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare"
url_riskarea = "http://103.66.32.242:8005/zwfwMovePortal/interface/interfaceJson"
url_hotsearch = "https://top.baidu.com/board?tab=realtime"

In [15]:
# Ad-hoc check: fetch the per-city rows again and query the whole details table.
li = get_tencent_data()[1]  # second element of the returned tuple: per-city detail rows
conn, cursor = get_conn()
sql_query = 'select * from details'
# NOTE(review): the connection opened above is not closed in this cell —
# confirm a later cell calls close_conn(conn, cursor).
# The cell's displayed value is execute()'s return value (presumably the
# number of rows selected — TODO confirm).
cursor.execute(sql_query)

5276