In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import functools
from collections import Counter
import math
import statistics

import seaborn as sns
import matplotlib.lines as mlines
import os
from IPython.display import display
import scipy.stats as st

import matplotlib.image as mpimg

from scipy.stats import norm

"""
采用 pagerank + citation
pagerank： 表示节点的重要程度
citation：表示节点的次数
"""

'\n采用 pagerank + citation\npagerank： 表示节点的重要程度\ncitation：表示节点的次数\n'

In [3]:
df = pd.read_csv('./study_data/cities_gzh_aja_martix.csv', index_col='city')

In [11]:
# Map to 0-100
def map_to_0_100(value, max_value, threshold, delta=1/340):
    """Piecewise-linearly rescale ``value`` onto a 0-100 score.

    ``value`` and ``max_value`` are first shifted by ``delta`` (``threshold``
    is not shifted). The shifted value is then mapped so that

    * ``threshold`` lands on 50,
    * the shifted ``max_value`` lands on 100,
    * values at or below ``threshold`` scale linearly inside 0-50,
    * values above ``threshold`` scale linearly inside 50-100.

    Parameters
    ----------
    value : number
        Raw value to rescale.
    max_value : number
        Largest raw value of the data set ``value`` belongs to.
    threshold : number
        Breakpoint that is mapped to a score of 50.
    delta : number, optional
        Offset added to ``value`` and ``max_value`` before mapping. Defaults
        to ``1/340``, the constant that used to be hard-coded here
        (presumably 1 / number of cities -- confirm before reusing the
        function on another data set).

    Returns
    -------
    number
        The rescaled score.
    """
    shifted_value = value + delta
    shifted_max = max_value + delta
    if shifted_value <= threshold:
        # Lower segment: (0, threshold] -> (0, 50]
        return (shifted_value / threshold) * 50
    # Upper segment: (threshold, shifted_max] -> (50, 100]
    return 50 + ((shifted_value - threshold) / (shifted_max - threshold)) * 50


def map_values(values, max_value, delta=1/340):
    """Rescale every entry of ``values`` with :func:`map_to_0_100`.

    Parameters
    ----------
    values : sequence of numbers
        Raw values to rescale.
    max_value : number
        Largest raw value, forwarded to :func:`map_to_0_100`.
    delta : number, optional
        Offset forwarded to :func:`map_to_0_100`; defaults to ``1/340``.

    Returns
    -------
    list
        One rescaled score per input value, in input order.
    """
    # Considering the actual data situation, most of the data is in the long
    # tail, so the average value is used as the threshold (the 50-point mark).
    threshold = np.mean(values)
    return [map_to_0_100(value, max_value, threshold, delta) for value in values]


In [12]:
# Largest entry of the adjacency matrix, taken from the per-column maxima.
MAX_VALUE = max(df.describe().loc['max'])
# Scale factor derived from the number of integer digits of MAX_VALUE.
# NOTE(review): `**` binds tighter than `+`, so this evaluates as
# (10 ** digits) + 1 -- confirm that 10 ** (digits + 1) was not intended.
MAX_10_LEN = 10 ** len(str(int(MAX_VALUE))) + 1

# Set the weights as the reciprocals and take the integers.
# Some centrality measures need to consider resistance.
# v2.0: dynamic damping -- the scale factor is derived from the maximum value.
# (v1.0 used a very large fixed damping: round(1 / j * 1000000).)
# A zero entry stays zero.
all_rows = [
    [round(1 / weight * MAX_10_LEN) if weight != 0 else 0 for weight in matrix_row]
    for matrix_row in df.values
]

# Inverse weight matrix, labelled like the original adjacency matrix
df_weight_normal = pd.DataFrame(all_rows, columns=df.columns, index=df.index)

# Directed graph built from the normal weight matrix
G = nx.from_pandas_adjacency(df, create_using=nx.DiGraph)
# Directed graph built from the inverse weight matrix
G_inverse_weight = nx.from_pandas_adjacency(df_weight_normal, create_using=nx.DiGraph)

print(G.size())
print(G_inverse_weight)

44891
DiGraph with 340 nodes and 44891 edges


In [13]:
# Sort key for (name, value) pairs
def take_degree(tup):
    """Return the element at index 1 of ``tup``."""
    return tup[1]


# Scale the values of a dict, sort them and attach a rank
def get_result_sort(dict_data):
    """Turn a ``{name: value}`` mapping into a ranked list.

    Every value is multiplied by 100, the pairs are sorted by that scaled
    value in descending order, and each pair receives a 1-based rank that
    follows the sorted order.

    Returns a list of ``[name, scaled_value, rank]`` lists.
    """
    scaled_pairs = [(name, dict_data.get(name) * 100) for name in dict_data]
    ordered_pairs = sorted(scaled_pairs, key=take_degree, reverse=True)
    return [
        [name, scaled_value, position]
        for position, (name, scaled_value) in enumerate(ordered_pairs, start=1)
    ]

In [14]:
# In- and out-degree views of the directed graph
in_degree = G.in_degree()
out_degree = G.out_degree()
# Keep only the degree number of each (node, degree) pair
y_in = [degree for _node, degree in in_degree]
x_out = [degree for _node, degree in out_degree]

# Printed label reads "network density"
print('网络密度：', nx.density(G))

网络密度： 0.3894759673781017


In [15]:
# Degree table: total degree per node plus the in-/out-degree lists
# computed in the previous cell.
degree_arr = list(nx.degree(G))
df_degree = pd.DataFrame(degree_arr, columns=['name', 'degree']).assign(
    **{'in-degree': y_in, 'out-degree': x_out}
)
df_degree.to_csv('./study_data/degree.csv', index=None)
df_degree

Unnamed: 0,name,degree,in-degree,out-degree
0,龙岩市,280,82,198
1,齐齐哈尔市,409,96,313
2,黔西南布依族苗族自治州,119,116,3
3,黔南布依族苗族自治州,194,77,117
4,黔东南苗族侗族自治州,316,149,167
...,...,...,...,...
335,三门峡市,251,65,186
336,三沙市,84,26,58
337,三明市,317,105,212
338,三亚市,500,242,258


## 1.Importance score

In [16]:
# PageRank of every node, scaled, sorted and ranked by get_result_sort
cen_pagerank = get_result_sort(nx.pagerank(G,  weight='weight'))
df_pagerank = pd.DataFrame(cen_pagerank, columns=['name', 'pagerank_value', 'pagerank_rank'])

# NOTE: plain assignment, not a copy -- df_importance and df_pagerank are the
# same object, so the score columns added below appear on both names.
df_importance = df_pagerank

# Map the PageRank values onto the 0-100 scale. Both score columns hold the
# same mapping, so it is computed once and stored twice.
vals = list(df_pagerank['pagerank_value'])
pagerank_scores = map_values(vals, max(vals))
df_importance['importance_score'] = pagerank_scores
df_importance['pagerank_score'] = pagerank_scores
df_importance.to_csv('./study_data/score/城市重要程度得分.csv', index=None)

In [17]:
# Preview the first 50 rows of the importance table
df_importance.head(50)

Unnamed: 0,name,pagerank_value,pagerank_rank,importance_score,pagerank_score
0,北京市,7.551218,1,100.0,100.0
1,上海市,4.805913,2,81.093047,81.093047
2,香港特别行政区,2.772559,3,67.089316,67.089316
3,深圳市,2.750922,4,66.940297,66.940297
4,广州市,2.693822,5,66.547053,66.547053
5,杭州市,2.635709,6,66.146828,66.146828
6,成都市,2.056624,7,62.158661,62.158661
7,天津市,1.969882,8,61.561266,61.561266
8,重庆市,1.869064,9,60.866929,60.866929
9,澳门特别行政区,1.683657,10,59.590032,59.590032


## 2.Citation score

In [18]:
# Citation score input.
# Citation counts can be 0, so an offset ε = 1/340 is used:
# x = x + ε
cites = pd.read_csv('./study_data/all_cities_in_count.csv')

# NOTE(review): `delta` is not read again in the cells visible here -- confirm
# whether it is still needed.
delta = 1/len(cites)

# One [city name, citation count] record per row of the input table
citation_score = [
    [city, count] for city, count in zip(cites['cityname'], cites['in-count'])
]

df_citation_score = pd.DataFrame(citation_score, columns=['name', 'in_citation'])

In [19]:
# Map the raw citation counts onto the 0-100 scale and persist the table
values = df_citation_score['in_citation'].tolist()
max_value = max(values)
mapped_values = map_values(values, max_value)
df_citation_score['citation_score'] = mapped_values
df_citation_score.to_csv('./study_data/score/被引用得分.csv', index=None)

In [20]:
# Preview the first 50 rows of the citation table
df_citation_score.head(50)

Unnamed: 0,name,in_citation,citation_score
0,北京市,36899,100.0
1,上海市,22841,80.272952
2,广州市,12751,66.114046
3,深圳市,11783,64.755689
4,成都市,11564,64.448375
5,杭州市,10219,62.560988
6,重庆市,9749,61.901455
7,香港特别行政区,9056,60.928995
8,武汉市,7633,58.932154
9,南京市,6481,57.315597


## 3.Merge info

In [21]:
# df_citation_score: citation score
# df_importance: importance score
df_sa = pd.read_csv('./data/sa340.csv')

# Join the city table with both score tables on the city name
df_rank_all = df_sa.merge(df_citation_score, on='name').merge(df_importance, on='name')

# Split the comma-separated `center` text into longitude / latitude columns
# (the pieces are kept as strings)
center_parts = [center.split(',') for center in df_rank_all['center']]
df_rank_all['Longitude'] = [parts[0] for parts in center_parts]
df_rank_all['Latitude'] = [parts[1] for parts in center_parts]
print('======信息融合：done====')
df_rank_all



Unnamed: 0,citycode,adcode,name,level,center,pro_name,pro_adcode,gzh,city_shortname,pro_shortname,en_name,in_citation,citation_score,pagerank_value,pagerank_rank,importance_score,pagerank_score,Longitude,Latitude
0,315,130200,唐山市,city,"118.180149,39.63068",河北省,130000,唐山发布,唐山,河北,Tangshan,667,26.307175,0.246035,82,42.325968,42.325968,118.180149,39.63068
1,335,130300,秦皇岛市,city,"119.52022,39.888243",河北省,130000,秦皇岛发布,秦皇岛,河北,Qinhuangdao,596,23.506873,0.199254,109,34.373180,34.373180,119.52022,39.888243
2,314,130800,承德市,city,"117.962749,40.952942",河北省,130000,承德发布,承德,河北,Chengde,483,19.050055,0.181898,134,31.422604,31.422604,117.962749,40.952942
3,311,130100,石家庄市,city,"114.514976,38.042007",河北省,130000,石家庄发布,石家庄,河北,Shijiazhuang,2122,51.198781,0.482737,32,51.319282,51.319282,114.514976,38.042007
4,316,131000,廊坊市,city,"116.683546,39.538304",河北省,130000,廊坊发布,廊坊,河北,Langfang,690,27.214315,0.299410,61,50.056707,50.056707,116.683546,39.538304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,10,110000,北京市,province,"116.407387,39.904179",北京市,110000,北京发布,北京,北京,Beijing,36899,100.000000,7.551218,1,100.000000,100.000000,116.407387,39.904179
336,22,120000,天津市,province,"117.201509,39.085318",天津市,120000,天津发布,天津,天津,Tianjin,6461,57.287532,1.969882,8,61.561266,61.561266,117.201509,39.085318
337,21,310000,上海市,province,"121.473667,31.230525",上海市,310000,上海发布,上海,上海,Shanghai,22841,80.272952,4.805913,2,81.093047,81.093047,121.473667,31.230525
338,23,500000,重庆市,province,"106.550483,29.563707",重庆市,500000,重庆发布,重庆,重庆,Chongqing,9749,61.901455,1.869064,9,60.866929,60.866929,106.550483,29.563707


## 4.Index fusion

In [22]:
# cite_score + importance_score, fused per city in three ways:
# 1. arithmetic mean
# 2. geometric mean
# 3. harmonic mean
score_pairs = [
    [importance, citation]
    for importance, citation in zip(
        df_rank_all['importance_score'], df_rank_all['citation_score']
    )
]

ARI_importance_cite = [statistics.mean(pair) for pair in score_pairs]
GEO_importance_cite = [statistics.geometric_mean(pair) for pair in score_pairs]
HAR_importance_cite = [statistics.harmonic_mean(pair) for pair in score_pairs]

df_rank_all['ARI'] = ARI_importance_cite
df_rank_all['GEO'] = GEO_importance_cite
df_rank_all['HAR'] = HAR_importance_cite

# Highest arithmetic-mean score first, then persist the combined table
df_rank_all = df_rank_all.sort_values(by='ARI', ascending=False)
df_rank_all.to_csv('./study_data/score/城市信息+排名得分综合表.csv', index=None)
print('===done===')
df_rank_all

===done===


Unnamed: 0,citycode,adcode,name,level,center,pro_name,pro_adcode,gzh,city_shortname,pro_shortname,...,citation_score,pagerank_value,pagerank_rank,importance_score,pagerank_score,Longitude,Latitude,ARI,GEO,HAR
335,10,110000,北京市,province,"116.407387,39.904179",北京市,110000,北京发布,北京,北京,...,100.000000,7.551218,1,100.000000,100.000000,116.407387,39.904179,100.000000,100.000000,100.000000
337,21,310000,上海市,province,"121.473667,31.230525",上海市,310000,上海发布,上海,上海,...,80.272952,4.805913,2,81.093047,81.093047,121.473667,31.230525,80.683000,80.681958,80.680916
207,20,440100,广州市,city,"113.264499,23.130061",广东省,440000,中国广州发布,广州,广东,...,66.114046,2.693822,5,66.547053,66.547053,113.264499,23.130061,66.330549,66.330196,66.329843
194,755,440300,深圳市,city,"114.057939,22.543527",广东省,440000,深圳发布,深圳,广东,...,64.755689,2.750922,4,66.940297,66.940297,114.057939,22.543527,65.847993,65.838933,65.829874
89,571,330100,杭州市,city,"120.210792,30.246026",浙江省,330000,杭州发布,杭州,浙江,...,62.560988,2.635709,6,66.146828,66.146828,120.210792,30.246026,64.353908,64.328928,64.303957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,2898,460300,三沙市,city,"112.338649,16.831004",海南省,460000,中国三沙,三沙,海南,...,2.327127,0.054463,336,9.758726,9.758726,112.338649,16.831004,6.042927,4.765480,3.758079
307,972,630200,海东市,city,"102.41064,36.473448",青海省,630000,海东市政府网,海东市,青海,...,0.000116,0.045041,337,8.156901,8.156901,102.41064,36.473448,4.078509,0.030761,0.000232
280,894,540400,林芝市,city,"94.361436,29.64875",西藏自治区,540000,林芝发布,林芝市,西藏,...,0.000116,0.045041,338,8.156901,8.156901,94.361436,29.64875,4.078509,0.030761,0.000232
262,886,533300,怒江傈僳族自治州,city,"98.8566,25.817555",云南省,530000,怒江发布,怒江市,云南,...,0.000116,0.045041,339,8.156901,8.156901,98.8566,25.817555,4.078509,0.030761,0.000232
