In [359]:
import pandas as pd
from http import HTTPStatus
from collections import Counter
from tqdm import tqdm
import time
import requests
import json
import jionlp as jio
import networkx as nx
import matplotlib.pyplot as plt

In [360]:
df = pd.read_csv('./data/typhoon.csv')
df_sa = pd.read_csv('./data/sa340.csv')

xys = df_sa['center']
lngs = [i.split(',')[0] for i in xys]
lats = [i.split(',')[1] for i in xys]
df_sa['longitude'] = lngs
df_sa['latitude'] = lats

def get_typhoon_news(CN_NAME, start_time, end_time):
    temp = df[df['text'].str.contains(CN_NAME, na=False)].copy()
    temp['date'] = pd.to_datetime(temp['date'])
    filtered_df = temp[(temp['date'] >= start_time) & (temp['date'] <= end_time)]
    filtered_df = filtered_df.sort_values(by='date')
    return filtered_df

typhoon_time_TT = {
    '杜苏芮': ('Doksuri', '2023-07-15 00:00:00', '2023-08-04 23:59:59'),
    '苏拉': ('Saola', '2023-08-17 00:00:00', '2023-09-09 23:59:59'),
    '卡努': ('Khanun', '2023-07-21 00:00:00', '2023-08-20 23:59:59'),
    '海葵': ('Haikui', '2023-08-22 00:00:00', '2023-09-11 23:59:59'),
    '泰利': ('Talim', '2023-07-08 00:00:00', '2023-07-24 23:59:59'),
}

In [361]:
# 城市提取模型
# https://doi.org/10.1111/tgis.13249
# build stopwords
# 融合 City Network的城市提取 和 jionlp新闻地名提取
def disambiguation_locations():
    cities = [i.split('+')[0] for i in df_sa['city_shortname'].tolist()]
    
    data = []
    #===============common sense===================
    for city_name in cities:
        # common sense
        data.append(city_name + '路')
        data.append(city_name + '东路')
        data.append(city_name + '西路')
        data.append(city_name + '南路')
        data.append(city_name + '北路')
        data.append(city_name + '中路')
        data.append(city_name + '街')
        data.append(city_name + '镇')
        # jionlp做的挺好，这块可以省略
        # data.append(city_name + '县')
        data.append(city_name + '区')
        data.append(city_name + '村')
        data.append(city_name + '公园')
        data.append(city_name + '先生')
        data.append(city_name + '大道')
        data.append(city_name + '一路')
        data.append(city_name + '二路')
        data.append(city_name + '三路')
        data.append(city_name + '四路')
        data.append(city_name + '五路')
        data.append(city_name + '六路')
        data.append(city_name + '七路')
        data.append(city_name + '八路')
        data.append(city_name + '九路')
        data.append(city_name + '十路')
        data.append(city_name + '环路')
        data.append(city_name + '的路')
        data.append(city_name + '东一路')
        data.append(city_name + '西一路')
        data.append(city_name + '南一路')
        data.append(city_name + '北一路')
        data.append(city_name + '东二路')
        data.append(city_name + '西二路')
        data.append(city_name + '南二路')
        data.append(city_name + '北二路')
        data.append(city_name + '东街')
        data.append(city_name + '西街')
        data.append(city_name + '南街')
        data.append(city_name + '北街')
        data.append(city_name + '支路')
        data.append(city_name + '桥')
        data.append(city_name + '外国语学校')
        # 过滤底部无关新闻的超链接
        data.append('@' + city_name)
        data.append(city_name + '广场')
        data.append(city_name + '池')
        
    #===============special common sense===================
    data.append('吉林省')
    data.append('北京时间')
    data.append('入海口')
    data.append('海口岸')
    data.append('延安精神')
    data.append('重大理论')
    data.append('亚运城')
    data.append('日照时长')
    data.append('东西部')
    
    data.append('南海东') 
    data.append('海东西') 
    data.append('海东南')
    data.append('海东北')
    data.append('海南部')
    
    data.append('南海西')
    data.append('海西部')
    data.append('海西北')
    data.append('海西南')
    data.append('海西区')
    data.append('海西湖')
    
    data.append('海北部')
    data.append('渤海北')
    data.append('南海北')
    data.append('东海北')
    data.append('海口岸')
    data.append('出海口')
    
    #===============extra knowledge===================
    data.append('三明治')
    data.append('端午安康')
    data.append('孙中山')
    data.append('适中')
    data.append('重庆小面')
    data.append('重庆麻辣')
    data.append('重庆面庄')
    data.append('隆重庆祝')
    data.append('长春百子图')
    data.append('春分')
    data.append('兰州拉面')
    data.append('兰州牛肉面')
    data.append('兰州牛肉拉面')
    data.append('兰州传统牛肉面')
    data.append('宁德时代')
    data.append('山地') 
    # jionlp可以补充去掉外国地名
    # 因此删除外国同类地名

    #===============hundred family names===================
    # add hundred family names
    with open ('./data/family_name.txt') as f:
        family_name = f.read()
    family_names = family_name.split(',')
    family_names = family_names[0:100]
    
    for city in cities:
        for f in family_names:
            data.append(f + city)

    return data

def get_content(text):
    if '来源' in text:
        text = text.split('来源')[0]
    if '编辑' in text:
        text = text.split('编辑')[0]
    if '往期阅读' in text:
        text = text.split('往期阅读')[0]
    if '推荐阅读' in text:
        text = text.split('推荐阅读')[0]
    return text

def jio_loations(text):
    res = jio.recognize_location(text)
    return res['domestic']


def get_locations(text):
    # STEP1: 核心信息，正文内容
    # 是否需要保留转载
    text = get_content(text)
    # STEP2：去掉歧义词
    dis_locs = disambiguation_locations()
    for i in dis_locs:
        text = text.replace(i, '')
    # STEP3: 提取地名
    res = jio_loations(text)
    if res == None:
        return []

    cities = []
    for item in res:
        g = item[0]
        city = g['city']
        if city is not None:
            cities.append(city)
    return cities

In [370]:
# 按每一篇新闻提取城市对
def get_city_network(df):
    rows = []
    for i, r in tqdm(df.iterrows(), total=len(df)):
        source = r['city_name']
        cities = list(set(get_locations(r['text'])))
        # 不包含source名单
        clean_cties = []
        for j in cities:
            if source != j:
                clean_cties.append(j)
                
        targets = ','.join(clean_cties)
        row = [source, targets]
        rows.append(row)
    temp = pd.DataFrame(rows, columns=['source', 'targets'])
    return temp

def get_en_name(city_name_cn):
    en_name = df_sa[df_sa.name==city_name_cn]['en_name'].tolist()[0]
    return en_name

def get_graph(df_origin, FILE_NAME):
    df = get_city_network(df_origin)
    # 地级市名单340个以内
    cities_mustin_list = df_sa['name'].tolist()
    #===========Nodes==================
    t1 = df['source'].tolist()
    targets_str = df['targets'].tolist()
    t2 = []
    for i in targets_str:
        if i != '':
            t2 += i.split(',')
            
    t2 = list(set(t1 + t2))
    t3 = [i for i in t2 if i in cities_mustin_list]
    t4 = df_sa[['name', 'en_name', 'longitude', 'latitude']]
    nodes = t4[t4.name.isin(t3)]
    nodes['size'] = 1
    nodes.to_csv('./QGIS/graph/' + FILE_NAME + '_node.csv', index=None)
    
    #=============Edges=================
    # 1.取出来源城市，去掉重复
    city_list = list(set(df['source'].tolist()))
    # 2.往target中添加城市
    # {'钦州市': [], '阳江市': [], '嘉兴市': [], '海口市': [], '江门市': []}
    city_dict = {city: [] for city in city_list}
    for i, r in df.iterrows():
        start = r['source']
        end = r['targets']
        if end != '':
            city_dict[start].extend(end.split(','))
    # 3.统计目标城市的权重（次数）
    new_edges = []
    for city, target_list in city_dict.items():
        # 存在链接则有必要，否则不保留
        if len(target_list):
            cos = Counter(target_list)
            for target_city, weight in cos.items():
                if target_city in cities_mustin_list:
                    row = [city, target_city, weight, weight]
                    new_edges.append(row)
    temp = pd.DataFrame(new_edges, columns=['source', 'target', 'weight', 'QWeight'])
    temp.to_csv('./QGIS/graph/' + FILE_NAME + '_edge.csv', index=None)
    return temp

In [371]:
# Doksuri
Do_T1 = get_typhoon_news('杜苏芮', '2023-07-15 00:00:00', '2023-07-19 23:59:59')
Do_T2 = get_typhoon_news('杜苏芮', '2023-07-20 00:00:00', '2023-07-30 23:59:59')
Do_T3 = get_typhoon_news('杜苏芮', '2023-07-31 00:00:00', '2023-08-04 23:59:59')

get_graph(Do_T1, 'Doksuri_T1')
get_graph(Do_T2, 'Doksuri_T2')
get_graph(Do_T3, 'Doksuri_T3')

100%|███████████████████████████████████████████| 5/5 [00:00<00:00,  5.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes['size'] = 1
100%|█████████████████████████████████████| 1041/1041 [01:33<00:00, 11.15it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes['size'] = 1
100%|███████████████████████████████████████| 264/264 [00:24<00:00, 10.96it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

Unnamed: 0,source,target,weight,QWeight
0,晋城市,保定市,1,1
1,承德市,保定市,3,3
2,承德市,石家庄市,1,1
3,承德市,包头市,1,1
4,承德市,邢台市,3,3
...,...,...,...,...
155,眉山市,北京市,1,1
156,眉山市,泉州市,1,1
157,无锡市,保定市,1,1
158,无锡市,钦州市,1,1


In [372]:
# Saola
Sa_T1 = get_typhoon_news('苏拉', '2023-08-17 00:00:00', '2023-08-21 23:59:59')
Sa_T2 = get_typhoon_news('苏拉', '2023-08-22 00:00:00', '2023-09-04 23:59:59')
Sa_T3 = get_typhoon_news('苏拉', '2023-09-05 00:00:00', '2023-09-09 23:59:59')

get_graph(Sa_T1, 'Saola_T1')
get_graph(Sa_T2, 'Saola_T2')
get_graph(Sa_T3, 'Saola_T3')

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 12.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes['size'] = 1
100%|███████████████████████████████████████| 744/744 [01:09<00:00, 10.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes['size'] = 1
100%|█████████████████████████████████████████| 19/19 [00:01<00:00, 10.20it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-do

Unnamed: 0,source,target,weight,QWeight
0,云浮市,龙岩市,1,1
1,云浮市,潮州市,1,1
2,三明市,福州市,1,1
3,防城港市,贺州市,1,1
4,防城港市,玉林市,1,1
5,防城港市,梧州市,1,1
6,潮州市,茂名市,1,1
7,潮州市,清远市,1,1
8,玉林市,南宁市,1,1
9,玉林市,梧州市,1,1


# Networkx

In [391]:
# 计算图节点的入度，用于可视化节点的大小
# 更新Node size
def city_in_degree(FILE_NAME):
    # 非孤立的点
    df_edges = pd.read_csv('./QGIS/graph/' + FILE_NAME + '_edge.csv')
    G = nx.from_pandas_edgelist(df_edges, source='source', target='target', 
                            edge_attr="weight", create_using=nx.DiGraph())
    exist_nodes = list(G.nodes)

    # 添加孤立的点
    df_nodes = pd.read_csv('./QGIS/graph/' + FILE_NAME + '_node.csv')
    all_nodes = df_nodes['name'].tolist()
    isolated_nodes = [i for i in all_nodes if i not in exist_nodes]
    G.add_nodes_from(isolated_nodes)

    # 更新节点的 Size(度)
    in_degree_dict = dict(G.in_degree()) 
    sorted_nodes = sorted(in_degree_dict.items(), key=lambda x: x[1], reverse=True)
    rows = [[i[0], i[1] + 1]for i in sorted_nodes]
    temp1 = pd.DataFrame(rows, columns=['name', 'size'])
    
    temp2 = df_sa[['name', 'en_name', 'longitude', 'latitude']]
    temp3 = pd.merge(temp1, temp2, on='name')
    temp3.to_csv('./QGIS/graph/' + FILE_NAME + '_node.csv', index=None)
    return temp3

In [393]:
city_in_degree('Doksuri_T1')
city_in_degree('Doksuri_T2')
city_in_degree('Doksuri_T3')

Unnamed: 0,name,size,en_name,longitude,latitude
0,北京市,40,Beijing,116.407387,39.904179
1,保定市,24,Baoding,115.464523,38.874476
2,邢台市,10,Xingtai,114.49742,37.060227
3,晋城市,9,Jincheng,112.852022,35.491315
4,阳泉市,9,Yangquan,113.580426,37.857094
...,...,...,...,...,...
114,平顶山市,1,Pingdingshan,113.192595,33.766554
115,荆门市,1,Jingmen,112.199009,31.035445
116,深圳市,1,Shenzhen,114.057939,22.543527
117,江门市,1,Jiangmen,113.081548,22.578948


In [394]:
city_in_degree('Saola_T1')
city_in_degree('Saola_T2')
city_in_degree('Saola_T3')

Unnamed: 0,name,size,en_name,longitude,latitude
0,梧州市,3,Wuzhou,111.279022,23.476733
1,龙岩市,2,Longyan,117.017362,25.075884
2,潮州市,2,Chaozhou,116.621901,23.657662
3,福州市,2,Fuzhou,119.296411,26.074286
4,贺州市,2,Hezhou,111.567216,24.404182
5,玉林市,2,Yulin,110.18097,22.654001
6,茂名市,2,Maoming,110.925533,21.662728
7,清远市,2,Qingyuan,113.056098,23.682064
8,南宁市,2,Nanning,108.366407,22.8177
9,云浮市,1,Yunfu,112.044524,22.915163
