In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import re
import jieba

from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('../../clean_data/comments_study.csv')
user = pd.read_csv('../../clean_data/user_study.csv')
sa = pd.read_csv('../../data/sa340.csv')

  df = pd.read_csv('../../clean_data/comments_study.csv')


In [3]:
def get_cities_in_China():
    user_locs = user[['userid', 'location', 'province', 'city']]
    user_locs = user_locs.rename(columns={'userid': 'userId', 'province': 'user_province', 'city': 'user_city'})
    comms = df[['city', 'link', 'userId']]
    comms = comms.drop_duplicates(subset=['link', 'userId'])
    song_user_city = pd.merge(comms, user_locs, on='userId')
    song_user_city = song_user_city[song_user_city['user_city'].notna()]
    
    # 中国省市
    song_user_city.loc[song_user_city['user_city'] == '澳门', 'user_city'] = '澳门特别行政区'
    song_user_city.loc[song_user_city['user_province'] == '澳门', 'user_province'] = '澳门特别行政区'
    song_user_city.loc[song_user_city['user_city'] == '香港', 'user_city'] = '香港特别行政区'
    song_user_city.loc[song_user_city['user_province'] == '香港', 'user_province'] = '香港特别行政区'
    song_user_city.loc[song_user_city['user_province'] == '台湾省', 'user_city'] = '台湾省'

    
    song_user_city = song_user_city[song_user_city.user_city.isin(sa['name'])]
    
    return song_user_city

In [4]:
temp = get_cities_in_China()

## Construct Network

In [5]:
def build_whole():
    cities = sa[['name', 'center']].copy()
    whole = temp.groupby(['city', 'user_city']).size().reset_index(name='weight')
    whole = whole.rename(columns={'city': 'target', 'user_city': 'source'})
    # whole = whole[whole.weight > 10]
    
    cities[['longitude', 'latitude']] = cities['center'].str.split(',', expand=True)
    cities = cities.drop(columns=['center'])
    # 将 source 的经纬度匹配到 whole
    whole = whole.merge(cities, left_on='source', right_on='name', how='left')
    whole = whole.rename(columns={'longitude': 'source_longitude', 'latitude': 'source_latitude'})
    whole = whole.drop(columns=['name'])
    # 将 target 的经纬度匹配到 whole
    whole = whole.merge(cities, left_on='target', right_on='name', how='left')
    whole = whole.rename(columns={'longitude': 'target_longitude', 'latitude': 'target_latitude'})
    whole = whole.drop(columns=['name'])
    whole.to_csv('../QGIS/nationwide/edges.csv', index=None)
    
    whole_nodes = pd.unique(whole[['source', 'target']].values.ravel())
    node_list = sa[sa.name.isin(whole_nodes)][['name', 'en_name', 'center']]
    node_list[['longitude', 'latitude']] = node_list['center'].str.split(',', expand=True)
    node_list.to_csv('../QGIS/nationwide/nodes.csv', index=None)

    return whole

whole_edges = build_whole()

## 省会-省内城市

In [6]:
def get_topN_edges_domestic(n=3):
    edges = whole_edges
    # 不考虑self-loop
    edges = edges[edges['source'] != edges['target']]
    topN = edges.sort_values(by='weight', ascending=False).head(n)
    topN = topN.reset_index()

    topN_pairs_cities = pd.unique(topN[0:n][['source', 'target']].values.ravel())
    topN_pairs_labels = sa[sa.name.isin(topN_pairs_cities)][['name', 'en_name', 'center']]
    topN_pairs_labels[['longitude', 'latitude']] = topN_pairs_labels['center'].str.split(',', expand=True)

    return topN, topN_pairs_labels

get_topN_edges_domestic(3)[0]

Unnamed: 0,index,target,source,weight,source_longitude,source_latitude,target_longitude,target_latitude
0,8848,广元市,成都市,414,104.066301,30.572961,105.844004,32.435774
1,5843,商洛市,西安市,401,108.939645,34.343207,109.918646,33.873358
2,9420,张家界市,长沙市,339,112.938882,28.228304,110.478887,29.117343


In [7]:
top100, _ = get_topN_edges_domestic(n=100)
top100.to_csv('./results/top100_pairs.csv', index=None)

## 不同等级城市的网络

In [8]:
def delete_auto_pre(text):
    text = text.replace('Autonomous Prefecture', '')
    text = text.replace('Prefecture', '')
    return text.strip()
    
def build_city_network(df, city_en_name):
    cities = sa[['name', 'center']].copy()
    whole = df.groupby(['city', 'user_city']).size().reset_index(name='weight')
    whole = whole.rename(columns={'city': 'target', 'user_city': 'source'})

    cities[['longitude', 'latitude']] = cities['center'].str.split(',', expand=True)
    cities = cities.drop(columns=['center'])
    whole = whole.merge(cities, left_on='source', right_on='name', how='left')
    whole = whole.rename(columns={'longitude': 'source_longitude', 'latitude': 'source_latitude'})
    whole = whole.drop(columns=['name'])
    whole = whole.merge(cities, left_on='target', right_on='name', how='left')
    whole = whole.rename(columns={'longitude': 'target_longitude', 'latitude': 'target_latitude'})
    whole = whole.drop(columns=['name'])
    whole.to_csv('../QGIS/city_network/' + city_en_name + '/edges.csv', index=None)
    
    whole_nodes = pd.unique(whole[['source', 'target']].values.ravel())
    node_list = sa[sa.name.isin(whole_nodes)][['name', 'en_name', 'center']]
    node_list[['longitude', 'latitude']] = node_list['center'].str.split(',', expand=True)
    node_list['en_name'] = node_list['en_name'].apply(delete_auto_pre)
    
    node_list.to_csv('../QGIS/city_network/' + city_en_name + '/nodes.csv', index=None)

    # The number of linked cities
    print(len(node_list) -1)

    # return whole
    print('==done==')


def get_center_city(city_en_name):
    target = sa[sa.en_name==city_en_name].copy()
    target[['longitude', 'latitude']] = target['center'].str.split(',', expand=True)
    t = target[['name', 'en_name', 'longitude', 'latitude']]
    t.to_csv('../QGIS/city_network/' + city_en_name + '/map_node_label.csv', index=None)
    return target[['name', 'en_name', 'longitude', 'latitude']]

In [9]:
build_city_network(temp[temp.city=='北京市'], 'Beijing')
get_center_city('Beijing')

210
==done==


Unnamed: 0,name,en_name,longitude,latitude
335,北京市,Beijing,116.407387,39.904179


In [10]:
build_city_network(temp[temp.city=='杭州市'], 'Hangzhou')
get_center_city('Hangzhou')

212
==done==


Unnamed: 0,name,en_name,longitude,latitude
89,杭州市,Hangzhou,120.210792,30.246026


In [11]:
build_city_network(temp[temp.city=='珠海市'], 'Zhuhai')
get_center_city('Zhuhai')

53
==done==


Unnamed: 0,name,en_name,longitude,latitude
195,珠海市,Zhuhai,113.576892,22.271644


In [12]:
build_city_network(temp[temp.city=='安庆市'], 'Anqing')
get_center_city('Anqing')

186
==done==


Unnamed: 0,name,en_name,longitude,latitude
96,安庆市,Anqing,117.115349,30.531828


In [13]:
build_city_network(temp[temp.city=='景德镇市'], 'Jingdezhen')
get_center_city('Jingdezhen')

72
==done==


Unnamed: 0,name,en_name,longitude,latitude
124,景德镇市,Jingdezhen,117.184892,29.2744


In [14]:
build_city_network(temp[temp.city=='鹤岗市'], 'Hegang')
get_center_city('Hegang')

5
==done==


Unnamed: 0,name,en_name,longitude,latitude
59,鹤岗市,Hegang,130.297687,47.350659


## 导出T0-T5 城市地理位置

In [15]:
six_cities = sa[sa.name.isin(['北京市', '杭州市', '珠海市', '安庆市', '景德镇市', '鹤岗市'])]
six_cities[['longitude', 'latitude']] = six_cities['center'].str.split(',', expand=True)
six_cities['longitude'] = six_cities['longitude'].astype(float)
six_cities['latitude'] = six_cities['latitude'].astype(float)
six_cities.to_csv('../QGIS/nationwide/six-nodes.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  six_cities[['longitude', 'latitude']] = six_cities['center'].str.split(',', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  six_cities[['longitude', 'latitude']] = six_cities['center'].str.split(',', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  six_cities['longitude'] = s