In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
from collections import Counter
import jieba
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Full data table for the analysis; later cells filter it on `city_name`
# and scan its `text` column for mentions of other cities.
df_all = pd.read_csv('./data/20221007_20231006_all.csv')

In [3]:
# City table; its `city_shortname` column drives the stopword generation below.
df = pd.read_csv('./data/sa340.csv')

# The surname file is read as one comma-separated string and only the first
# 100 entries are kept.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm the file's encoding before running this on another OS.
with open('./data/family_name.txt') as f:
    family_name = f.read()
family_names = family_name.split(',')[:100]

In [4]:
def build_stopwords(city_name, surnames=None):
    """Build the phrases in which ``city_name`` should not count as a city mention.

    The result combines, in this order:

    1. ``city_name`` followed by road / district / landmark suffixes
       (e.g. ``city_name + '路'``),
    2. ``'@' + city_name`` plus two more landmark suffixes,
    3. a fixed list of phrases that merely contain some city name
       (e.g. ``'兰州拉面'``, ``'北京时间'``),
    4. every surname followed by ``city_name`` (likely a person's name).

    Parameters
    ----------
    city_name : str
        City abbreviation to build stopwords for.
    surnames : iterable of str, optional
        Surnames used for step 4. When omitted, the notebook-level
        ``family_names`` list is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    list of str
        Stopwords in the order described above, without duplicates.
    """
    if surnames is None:
        # Backward-compatible default: the surname list loaded earlier in the notebook.
        surnames = family_names

    # Suffixes that turn the city name into a street, district or landmark name.
    city_suffixes = (
        '路', '东路', '西路', '南路', '北路', '中路', '街', '镇', '县', '区', '村',
        '公园', '先生', '大道',
        '一路', '二路', '三路', '四路', '五路', '六路', '七路', '八路', '九路', '十路',
        '环路', '的路',
        '东一路', '西一路', '南一路', '北一路',
        '东二路', '西二路', '南二路', '北二路',
        '东街', '西街', '南街', '北街',
        '支路', '桥', '外国语学校',
    )

    # City-independent phrases that contain a city name as a substring.
    fixed_phrases = (
        # special common sense
        '吉林省', '北京时间', '入海口', '海口岸', '延安精神', '重大理论', '亚运城',
        '日照时长', '东西部',
        '南海东', '海东西', '海东南', '海东北', '海南部',
        '南海西', '海西部', '海西北', '海西南', '海西区', '海西湖',
        '海北部', '渤海北', '南海北', '东海北', '出海口',
        # extra knowledge
        '三明治', '端午安康', '孙中山', '适中',
        '重庆小面', '重庆麻辣', '重庆面庄', '隆重庆祝',
        '长春百子图', '春分',
        '兰州拉面', '兰州牛肉面', '兰州牛肉拉面', '兰州传统牛肉面',
        '澳大利亚昆士兰州', '马里兰州', '宁德时代', '山地',
    )

    candidates = [city_name + suffix for suffix in city_suffixes]

    # Filters the hyperlinks to unrelated news at the bottom of a page.
    candidates.append('@' + city_name)
    candidates.append(city_name + '广场')
    candidates.append(city_name + '池')

    candidates.extend(fixed_phrases)

    # Surname + city name: add the given (by default the first hundred) family names.
    candidates.extend(surname + city_name for surname in surnames)

    # Drop repeats while keeping first-seen order, so the output is deterministic.
    return list(dict.fromkeys(candidates))

In [5]:
# Gather the stopwords of every city in the table.
data = []
for shortname in df['city_shortname']:
    # A shortname entry may hold several aliases joined by '+'; only the
    # first alias is used to generate stopwords.
    primary_alias = shortname.split('+')[0]
    data += build_stopwords(primary_alias)

# De-duplicate while keeping first-seen order. A plain set() would order the
# list by string hash, which changes between kernels, and the later
# replace loop is order-sensitive for overlapping phrases.
data = list(dict.fromkeys(data))

In [6]:
# Number of unique stopwords generated across all cities.
len(data)

49380

In [7]:
# Persist the stopwords as a single-column CSV without the row index.
# `index=False` is the documented way to omit the index column.
pd.DataFrame(data, columns=['word']).to_csv('./data/city_stopwords2.csv', index=False)

In [8]:
# City table used for the shortname <-> full-name lookups below.
df_mapping = pd.read_csv('./data/sa340.csv')

# Unique shortname entries in file order. A plain set() would give a
# hash-dependent order that differs between kernels.
pro_names = list(dict.fromkeys(df_mapping['city_shortname']))
len(pro_names)

340

In [9]:
# Reload the stopword list written earlier; the matching helpers below read
# this module-level `stopwords` list.
stopword_table = pd.read_csv('./data/city_stopwords2.csv')
stopwords = list(stopword_table['word'])

def replace_citys_stop_words(text, stopwords):
    """Trim footer sections from ``text`` and delete every stopword from it.

    Parameters
    ----------
    text : str
        Text to clean.
    stopwords : iterable of str
        Phrases removed from the text, applied one after another in the
        given order.

    Returns
    -------
    str
        ``''`` if the text contains ``'xxx'``; otherwise the part of the text
        before the first footer marker, with all stopwords removed.
    """
    if 'xxx' in text:
        return ''

    # Editors use different conventions for trailing links, so the text is
    # cut at each known marker in turn and only the leading part is kept.
    for marker in ('来源', '编辑', '往期阅读', '推荐阅读'):
        text = text.split(marker)[0]

    for phrase in stopwords:
        text = text.replace(phrase, '')
    return text




def get_city_loc_name_arr(text, names, exclude_city='0'):
    """Return the entries of ``names`` that ``text`` mentions as standalone words.

    An entry may carry several aliases joined by ``'+'``. It is reported when
    at least one alias

    * occurs in the raw text,
    * is not one of the aliases of ``exclude_city``, and
    * is a token of the stopword-scrubbed text as segmented by ``jieba``.

    Parameters
    ----------
    text : object
        Text to scan; converted with ``str()`` first.
    names : iterable of str
        Shortname entries, each possibly ``'alias1+alias2'``.
    exclude_city : str or collection of str, optional
        Shortname entry (aliases joined by ``'+'``) whose aliases are never
        reported. A non-string collection is used as the alias set directly.

    Returns
    -------
    list of str
        Matching entries of ``names`` in input order, without duplicates;
        ``[]`` if nothing matches.
    """
    raw_text = str(text)

    # Compare aliases exactly. A substring test against the joined string
    # would also suppress any alias that is merely contained in the excluded
    # name (e.g. '兴安' inside '大兴安岭').
    if isinstance(exclude_city, str):
        excluded_aliases = set(exclude_city.split('+'))
    else:
        excluded_aliases = set(exclude_city)

    # Scrubbing and segmentation are expensive, so they run at most once,
    # and only after some alias has been seen in the raw text.
    tokens = None

    matched = []
    for name in names:
        for alias in name.split('+'):
            if alias not in raw_text or alias in excluded_aliases:
                continue
            if tokens is None:
                # 1. remove the stopwords, 2. segment into words
                cleaned = replace_citys_stop_words(raw_text, stopwords)
                tokens = set(jieba.lcut(cleaned))
            if alias in tokens:
                matched.append(name)
                break  # one matching alias is enough for this entry

    # Order-preserving de-duplication keeps the result deterministic.
    return list(dict.fromkeys(matched))
    
"""
city abbreviation to city full name
"""
def shortname2name(shortname, mapping=None):
    """Translate a ``city_shortname`` value into the matching ``name`` value.

    Parameters
    ----------
    shortname : str
        Value to look up in the ``city_shortname`` column.
    mapping : pandas.DataFrame, optional
        Table with ``city_shortname`` and ``name`` columns. Defaults to the
        notebook-level ``df_mapping``.

    Returns
    -------
    The ``name`` of the first matching row, in table order.

    Raises
    ------
    IndexError
        If no row has the given shortname.
    """
    if mapping is None:
        mapping = df_mapping
    # Taking the first match in table order is deterministic even when
    # several rows share the shortname.
    return list(mapping[mapping['city_shortname'] == shortname]['name'])[0]

"""
city full name to city abbreviation
"""
def name2shortname(name, mapping=None):
    """Translate a ``name`` value into the matching ``city_shortname`` value.

    Parameters
    ----------
    name : str
        Value to look up in the ``name`` column.
    mapping : pandas.DataFrame, optional
        Table with ``name`` and ``city_shortname`` columns. Defaults to the
        notebook-level ``df_mapping``.

    Returns
    -------
    The ``city_shortname`` of the first matching row, in table order.

    Raises
    ------
    IndexError
        If no row has the given name.
    """
    if mapping is None:
        mapping = df_mapping
    # Taking the first match in table order is deterministic even when
    # several rows share the name.
    return list(mapping[mapping['name'] == name]['city_shortname'])[0]

"""
sort the aggregated cities
"""
def Counter_citylink(citylinks):
    """Count the items of ``citylinks`` and rank them by frequency.

    Returns a list of ``(item, count)`` tuples, highest count first; items
    with equal counts keep the order in which they were first seen.
    """
    return Counter(citylinks).most_common()

In [10]:
def get_city_links(query_name):
    """Get the number of times a city reports on other cities.

    Selects the rows of ``df_all`` whose ``city_name`` equals ``query_name``
    and scans each row's ``text`` for the entries of ``pro_names``, skipping
    the city's own shortname. Each text counts an entry at most once.

    Parameters
    ----------
    query_name : str
        Full city name, as found in ``df_all.city_name`` and in the ``name``
        column of the mapping table.

    Returns
    -------
    list of (str, int)
        ``(full city name, number of texts mentioning it)``, most frequent first.
    """
    own_shortname = name2shortname(query_name)

    # Only the `text` column is needed, so iterate over it directly instead
    # of building a full row object per record with iterrows().
    texts = df_all[df_all.city_name == query_name]['text']

    mentioned = []
    for text in texts:
        mentioned += get_city_loc_name_arr(text, pro_names, own_shortname)

    ranked = Counter_citylink(mentioned)
    return [(shortname2name(short), count) for short, count in ranked]

In [11]:
"""
source_city: 
target_citys: 
"""
def get_aja_list(source_city, target_citys):
    """Return the mention counts of ``source_city`` for each target city.

    source_city: full city name passed on to ``get_city_links``.
    target_citys: full city names; the result has one count per entry, in
        the same order, with 0 for cities that were never mentioned.
    """
    counts_by_city = dict(get_city_links(source_city))
    return [counts_by_city.get(target) or 0 for target in target_citys]

In [13]:
# The adjacency matrix must be built in the order of `citys`; otherwise the rows and columns fall out of alignment.
# citys = list(set(df_mapping['name']))
# citys.sort(reverse=True)
# data = []
# for i in tqdm(citys):
#     data.append(get_aja_list(i, citys))

In [14]:
# df = pd.DataFrame(data, columns=citys)
# df['city'] = citys
# df = df.set_index('city')
# df

In [15]:
# df.to_csv('./study_data/cities_gzh_aja_martix.csv')

In [17]:
# Sanity check: size of the stopword list loaded from city_stopwords2.csv.
len(stopwords)

49380