In [361]:
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [362]:
# Load the per-article citation records and the city reference table.
# The cells below read `city_name`, `source` and `link` from df_source, and
# `name`, `city_shortname`, `pro_shortname` and `gzh` from sa340.
df_source = pd.read_csv('./data/source-vilidation-version.csv')
sa340 = pd.read_csv('./data/sa340.csv')

In [364]:
# Disambiguate generic organisation names before the citation graph is built:
# a cited name starting with '市' (and not containing '都市') gets the citing
# city's short name appended in parentheses, a name starting with '省' gets the
# province short name, and every other name is kept unchanged.
data = []
for _, row in df_source.iterrows():
    city = row['city_name']
    # Filter `sa340` once per row and read all three fields from the first
    # match; `.iloc[0]` raises IndexError when the city is not listed.
    city_info = sa340[sa340['name'] == city].iloc[0]
    city_abb = city_info['city_shortname']
    city_2_pro = city_info['pro_shortname']
    gzh = city_info['gzh']

    new_names = []
    for name in row['source'].split(','):
        if name.startswith('市') and '都市' not in name:
            new_names.append(name + '(' + city_abb + ')')
        elif name.startswith('省'):
            new_names.append(name + '(' + city_2_pro + ')')
        else:
            new_names.append(name)
    data.append([city, gzh, ','.join(new_names), row['link']])

# Persist the normalised citations; later cells reload this file.
temp = pd.DataFrame(data, columns=['city', 'gzh', 'source', 'link'])
temp.to_csv('./data/gzh_citation.csv', index=None)

In [365]:
# Build nodes and edges
def build_edges():
    """Expand the saved citation table into a weighted, de-duplicated edge list.

    Reads './data/gzh_citation.csv' and emits one (city, origin, target)
    record per comma-separated name in each row's `source`, where origin is
    the row's `gzh`. `weight` is the number of times each (origin, target)
    pair occurs. The result is written to './data/citation_graph/_edges.csv'.

    Returns:
        pandas.DataFrame with columns city, origin, target, weight and one
        row per distinct (origin, target) pair; `city` is taken from the
        pair's first occurrence.
    """
    out_path = './data/citation_graph/_edges.csv'
    citation = pd.read_csv('./data/gzh_citation.csv')
    # An empty `source` cell is read back as NaN, which has no .split(); such
    # a row cites nothing, so it contributes no edges.
    citation = citation.dropna(subset=['source'])
    edges = [
        [row['city'], row['gzh'], node]
        for _, row in citation.iterrows()
        for node in row['source'].split(',')
    ]
    temp = pd.DataFrame(edges, columns=['city', 'origin', 'target'])
    temp["weight"] = temp.groupby(["origin", "target"])["target"].transform("count")
    result = temp.drop_duplicates(subset=["origin", "target"]).reset_index(drop=True)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    result.to_csv(out_path, index=None)
    return result

# Build the edge table once; later cells read it through the global `edges`.
edges = build_edges()

In [366]:
# Create the directed citation graph: one weighted edge per row of `edges`,
# pointing from the citing account (origin) to the cited name (target).
G = nx.DiGraph()
G.add_weighted_edges_from(
    zip(edges["origin"], edges["target"], edges["weight"])
)

In [367]:
# Rank nodes by weighted in-degree (sum of incoming edge weights).
weighted_in_degree = {node: degree for node, degree in G.in_degree(weight="weight")}
weighted_in_degree_ranking = sorted(
    weighted_in_degree.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
# The ten highest-ranked nodes are the only ones labelled in the Gephi export.
top_nodes = set(node for node, _ in weighted_in_degree_ranking[:10])

def build_4gephi():
    """Export the whole citation graph as Gephi edge and node CSV files.

    Reads the globals `edges`, `weighted_in_degree` and `top_nodes`.
    Writes './data/citation_graph/edges.csv' (Source, Target, Weight) and
    './data/citation_graph/nodes.csv' (Id, size, Label), printing a
    confirmation line after each file.
    """
    gephi_edges = edges[["origin", "target", "weight"]].copy()
    gephi_edges.columns = ["Source", "Target", "Weight"]
    gephi_edges.to_csv("./data/citation_graph/edges.csv", index=False)
    print("Edges file saved to './data/citation_graph/edges.csv'")

    node_ids = pd.concat([edges["origin"], edges["target"]]).unique()
    nodes = pd.DataFrame(node_ids, columns=["Id"]).assign(
        # +1 so that nodes with an in-degree of 0 are still drawn.
        size=lambda frame: frame["Id"].map(weighted_in_degree) + 1,
        # Only the top-ranked nodes keep a visible label.
        Label=lambda frame: frame["Id"].where(frame["Id"].isin(top_nodes), ""),
    )
    nodes.to_csv("./data/citation_graph/nodes.csv", index=False)
    print("Nodes file saved to './data/citation_graph/nodes.csv'")

# Write the Gephi edge and node files for the whole graph.
build_4gephi()

Edges file saved to './data/citation_graph/edges.csv'
Nodes file saved to './data/citation_graph/nodes.csv'


In [368]:
# Show the ten nodes with the highest weighted in-degree.
weighted_in_degree_ranking[:10]

[('中央气象台', 561),
 ('中国天气', 334),
 ('广东天气', 276),
 ('新华社', 186),
 ('应急管理部', 148),
 ('央视新闻', 136),
 ('南方+', 134),
 ('广东应急管理', 125),
 ('浙江天气', 119),
 ('江苏气象', 119)]

In [369]:
# Reload the normalised citation table; the helper functions below read it
# through the global `citation`.
citation = pd.read_csv('./data/gzh_citation.csv')

In [421]:
def get_Level_0():
    """Return the citation rows that cite at least one level-0 organisation.

    A cited name counts as level 0 when it contains one of the include
    keywords and does not contain '指挥部' (which would otherwise match via
    '部'). The check is made per comma-separated name, so a row that cites
    both a level-0 name and a '…指挥部' name is kept. Rows whose `source` is
    missing are never selected.

    Returns:
        pandas.DataFrame: the matching rows of the global `citation`.
    """
    include_keyword = ['国', '央', '部', '新华社', '人民日报', '人民网']
    exclude_keyword = '指挥部'

    def cites_level_0(source):
        # NaN (an empty CSV cell) is a float, not a string: nothing is cited.
        if not isinstance(source, str):
            return False
        return any(
            exclude_keyword not in org and any(k in org for k in include_keyword)
            for org in source.split(',')
        )

    temp = citation
    tt = temp[temp['source'].map(cites_level_0).astype(bool)]
    return tt

In [422]:
def get_Level_1():
    """Return the citation rows whose `source` mentions a province short name.

    Keywords come from the `pro_shortname` column of the global `sa340`.
    Missing values are dropped and repeated values are collapsed before the
    search pattern is built, and each keyword is matched literally. Rows whose
    `source` is missing are never selected; with no keywords at all, no row
    is selected.

    Returns:
        pandas.DataFrame: the matching rows of the global `citation`.
    """
    temp = citation
    # `pro_shortname` is a per-row column of the city table, so values can
    # repeat; dict.fromkeys de-duplicates while keeping first-seen order.
    include_keyword = list(dict.fromkeys(sa340['pro_shortname'].dropna().tolist()))
    if not include_keyword:
        # An empty alternation would match every string.
        return temp.iloc[0:0]
    pattern = '|'.join(re.escape(k) for k in include_keyword)
    tt = temp[temp['source'].str.contains(pattern, na=False)]
    return tt

In [423]:
def get_level_0_citation_count():
    """Print how many cited organisation names fall into each level.

    Every comma-separated name in the `source` column of the global
    `citation` is one citation. Four lines are printed: the total, the number
    of names containing a level-0 keyword, the number containing a province
    short name from `sa340['pro_shortname']`, and the number matching neither.

    The last figure counts names that match neither group directly, so a name
    that matches both groups is not subtracted twice.
    """
    all_orgs = [
        org
        for source in citation['source'].tolist()
        if isinstance(source, str)  # NaN (empty cell) cites nothing
        for org in source.split(',')
    ]
    print('总引用：', len(all_orgs))

    level_0_keys = ['国', '央', '部', '新华社', '人民日报', '人民网', '新华']
    # NOTE(review): get_Level_0 additionally excludes names containing
    # '指挥部'; this count does not. Confirm which is intended.
    is_level_0 = [any(k in o for k in level_0_keys) for o in all_orgs]
    level_0_count = sum(is_level_0)
    print('国家级，部级，被引用次数：', level_0_count)

    # Drop missing keywords (a NaN would raise in `k in o`) and duplicates.
    level_1_keys = set(sa340['pro_shortname'].dropna().tolist())
    is_level_1 = [any(k in o for k in level_1_keys) for o in all_orgs]
    level_1_count = sum(is_level_1)
    print('省级被引用次数：', level_1_count)

    other_count = sum(
        not (in_0 or in_1) for in_0, in_1 in zip(is_level_0, is_level_1)
    )
    print('其他引用次数：', other_count)
   

# Print the citation counts per level.
get_level_0_citation_count()

总引用： 7826
国家级，部级，被引用次数： 2098
省级被引用次数： 1693
其他引用次数： 4035


In [424]:
def get_count_by_keyword(word):
    """Count the cited organisation names that contain `word`.

    Each comma-separated entry in the `source` column of the global
    `citation` is checked separately with a plain substring test.

    Args:
        word: substring to look for.

    Returns:
        int: number of cited names containing `word`.
    """
    return sum(
        word in org
        for source in citation['source'].tolist()
        for org in source.split(',')
    )

In [425]:
# Number of cited names containing '气象台'.
get_count_by_keyword('气象台')

957

In [426]:
# Number of cited names containing '报'.
get_count_by_keyword('报')

1000

In [427]:
# Cited names containing '广播', as (count, percentage of all citations).
# The denominator is recomputed from `citation` instead of being hard-coded,
# so it cannot go stale when the data changes.
broadcast_count = get_count_by_keyword('广播')
total_citations = sum(len(source.split(',')) for source in citation['source'].tolist())
broadcast_count, broadcast_count / total_citations * 100

(73, 0.9327881420904677)

In [428]:
# Number of cited names containing '天气'.
get_count_by_keyword('天气')

1278

In [429]:
# Citations left after removing the four keyword groups counted above.
# Both the total and the per-keyword counts are recomputed from `citation`
# rather than copied from earlier outputs.
# NOTE(review): a name containing more than one keyword is subtracted once
# per keyword it matches.
total_citations = sum(len(source.split(',')) for source in citation['source'].tolist())
total_citations - sum(get_count_by_keyword(word) for word in ['天气', '报', '气象台', '广播'])

4518

In [430]:
def build_4gephi_city_level(city_gzh):
    """Export the outgoing citations of one account as Gephi CSV files.

    Keeps the rows of the global `edges` whose origin equals `city_gzh` and
    writes '<city_gzh>_edges.csv' (Source, Target, Weight) and
    '<city_gzh>_nodes.csv' (Id, size, Label) under './data/citation_graph/'.
    Node sizes come from the global `weighted_in_degree`, and every node is
    labelled with its own Id.

    Args:
        city_gzh: account name to filter on; also used as the file-name prefix.
    """
    prefix = "./data/citation_graph/" + city_gzh

    own_edges = edges.loc[
        edges["origin"] == city_gzh, ["origin", "target", "weight"]
    ].rename(columns={"origin": "Source", "target": "Target", "weight": "Weight"})
    own_edges.to_csv(prefix + "_edges.csv", index=False)

    node_ids = pd.concat([own_edges["Source"], own_edges["Target"]]).unique()
    nodes = pd.DataFrame(node_ids, columns=["Id"]).assign(
        # +1 so that nodes with an in-degree of 0 are still drawn.
        size=lambda frame: frame["Id"].map(weighted_in_degree) + 1,
        Label=lambda frame: frame["Id"],
    )
    nodes.to_csv(prefix + "_nodes.csv", index=False)

# Export the per-account edge and node files for '杭州发布'.
build_4gephi_city_level('杭州发布')

In [431]:
# Node count, graph density expressed as a percentage, and edge count.
G.number_of_nodes(),nx.density(G)*100,G.number_of_edges()

(1800, 0.08501636711753442, 2753)

In [432]:
# Read back the exported edge file for '杭州发布' and show its first five rows.
HZ = pd.read_csv('./data/citation_graph/杭州发布_edges.csv')
HZ[:5]

Unnamed: 0,Source,Target,Weight
0,杭州发布,市气象局(杭州),48
1,杭州发布,浙江天气,23
2,杭州发布,中央气象台,10
3,杭州发布,杭州日报,6
4,杭州发布,广州日报,1


In [433]:
# Total citation weight of the edges whose target name contains '铁路'.
railway = edges.loc[edges["target"].str.contains('铁路')]
railway['weight'].sum()

117