In [1]:
from itertools import combinations

import pandas as pd
import networkx as nx
from pypinyin import pinyin, Style

In [2]:
def city_cn_to_pinyin(city_cn):
    """Romanise a Chinese city name as tone-less pinyin with a leading capital.

    The first syllable is capitalised and the remaining syllables are appended
    unchanged. Missing values are returned untouched and an empty name yields
    an empty string, instead of raising inside ``pinyin``.
    """
    if pd.isnull(city_cn):
        return city_cn
    syllables = [reading[0] for reading in pinyin(city_cn, style=Style.NORMAL)]
    if not syllables:
        return ''
    return syllables[0].capitalize() + ''.join(syllables[1:])


# Comment-level data: add a romanised 'city' column derived from 'city_cn'
# after removing every '市' character from the name.
data = pd.read_excel('data/comment_data.xlsx')
data['city'] = data['city_cn'].str.replace('市', '', regex=False).apply(city_cn_to_pinyin)

# City-level score table: same romanisation so both frames share the naming.
city_table_df = pd.read_excel("data/final_scores.xlsx")
city_table_df['city'] = city_table_df['city_cn'].str.replace('市', '', regex=False).apply(city_cn_to_pinyin)

# Attach the city 'id' to every comment. A left merge keeps comments whose
# city is absent from the score table (their 'id' becomes NaN).
# NOTE(review): if 'city_cn' is not unique in final_scores.xlsx this merge
# duplicates comment rows — confirm the key is unique.
data = pd.merge(
    data,
    city_table_df[['city_cn', 'id']],
    on=['city_cn'],
    how='left'
)

In [3]:
# Derive 'Date Created' from the 'time' column, which is parsed as
# '<month>.<day>-<month>.<day>'. Each row with a 'time' value gets the day
# after the range's start date (year fixed to 2022); rows without a 'time'
# value inherit the date of the nearest row *below* them that has one.
# Rows after the last dated row stay None.
time_values = data['time'].tolist()
created_dates = [None] * len(time_values)

carried_date = None  # date of the closest dated row seen so far (scanning upwards)
for position in range(len(time_values) - 1, -1, -1):
    time_range = time_values[position]
    if pd.notnull(time_range):
        # Exactly one '-' is expected; anything else raises, as before.
        start_date_str, _ = time_range.split('-')
        start_date_month, start_date_day = map(int, start_date_str.split('.'))
        # Add the extra day with a Timedelta so that month-end starts
        # (e.g. '1.31') roll over to the next month instead of raising
        # ValueError for an out-of-range day.
        carried_date = (
            pd.Timestamp(year=2022, month=start_date_month, day=start_date_day)
            + pd.Timedelta(days=1)
        )
    created_dates[position] = carried_date

# Keep object dtype so undated rows hold None rather than NaT.
data['Date Created'] = pd.Series(created_dates, index=data.index, dtype=object)

data.to_excel('data/comment_data_filter.xlsx', index=False)

# construct networks

In [4]:
# Initialise the per-city mention counter.
city_table_df['mention_count'] = 0

# Flat lists of the user and city id columns (same names as before).
users = data['user ID'].tolist()
cities = data['id'].tolist()

# Two cities are related when at least one user commented on both.
# Each relation is stored once as an ordered (smaller id, larger id) pair.
city_relations = set()

# Comments whose city has no id (NaN after the left merge) are skipped:
# NaN never compares equal to itself, so it would otherwise create bogus
# NaN pairs in the relation set.
comments_with_city = data.dropna(subset=['id'])

# Group once per user instead of rescanning every row for every user.
for _, user_city_ids in comments_with_city.groupby('user ID', sort=False)['id']:
    distinct_ids = sorted(set(user_city_ids.tolist()))
    # combinations() of a sorted list yields each unordered pair exactly once,
    # already in (smaller, larger) order.
    city_relations.update(combinations(distinct_ids, 2))

# Sort so the exported edge list is identical from run to run.
relation_table = sorted(city_relations)
relation_df = pd.DataFrame(relation_table, columns=['source', 'target'])

relation_df.to_excel("results/city_relations.xlsx", index=False)

In [5]:
# --- Mention counts -------------------------------------------------------
# Number of comments per city id; column names are set explicitly so the
# result does not depend on the pandas version's value_counts() naming.
city_counts = (
    data['id']
    .value_counts()
    .rename_axis('id')
    .reset_index(name='mention_count')
)
mention_count_map = dict(zip(city_counts['id'], city_counts['mention_count']))

city_table_df['mention_count'] = city_table_df['id'].map(mention_count_map).fillna(0)
# Keep only cities that were mentioned at least once. The explicit copy makes
# the column assignments below operate on an independent frame instead of a
# slice (avoids SettingWithCopyWarning).
city_table_df = city_table_df[city_table_df['mention_count'] != 0].copy()

# --- Network metrics --------------------------------------------------------
# Undirected graph whose edges are the city pairs in city_relations.
G = nx.Graph()
G.add_edges_from(city_relations)

degree_dict = dict(G.degree())
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Cities that are not nodes of the graph get 0 for every metric.
node_metrics = {
    'Degree': degree_dict,
    'Degree Centrality': degree_centrality,
    'Betweenness Centrality': betweenness_centrality,
    'Closeness Centrality': closeness_centrality,
}
for metric_name, metric_by_id in node_metrics.items():
    city_table_df[metric_name] = city_table_df['id'].map(metric_by_id).fillna(0)

# --- Ranks (1 = largest value; ties share the smallest rank) ----------------
ranked_columns = [
    'Degree',
    'Degree Centrality',
    'Betweenness Centrality',
    'Closeness Centrality',
    'mention_count',
    'GDP',
    'Population',
]
for column in ranked_columns:
    city_table_df[f'{column} Rank'] = city_table_df[column].rank(ascending=False, method='min')

# --- Column order, naming and dtypes for the export -------------------------
first_columns = ['id', 'city', 'mention_count', 'GDP', 'Population', 'Degree Centrality', 'Betweenness Centrality', 'Closeness Centrality']
other_columns = [col for col in city_table_df.columns if col not in first_columns]
city_table_df = city_table_df[first_columns + other_columns]
city_table_df = city_table_df.rename(columns={'city': 'ADM2'})
city_table_df = city_table_df.astype({'mention_count': float, 'GDP': float, 'Population': float})
city_table_df.to_excel("results/city_table_count.xlsx", index=False)

# Wrap every comment's city id in a one-element list before exporting.
# NOTE: this overwrites data['id'] in place, so re-running this cell without
# re-running the cells above would wrap the ids a second time.
data['id'] = data['id'].apply(lambda x: [x])
data.to_excel('data/processed_comments_with_ids.xlsx', index=False)