In [0]:
#### Calculate the distance between each dialect ####
# This code calculates the dialectal distance between each pair of sub-dialect groups
# Programmer: Dan Qin
# Date: 2018.07.08

# import libraries
import pandas as pd
import numpy as np

In [0]:
# Read the dialect dictionary (the taxonomy table whose columns are used
# below to build the dialect tree) and preview its first row
dialect_dict_csv = "data/Chinese_dialectdict_compl.csv"
dia_dict = pd.read_csv(dialect_dict_csv)
dia_dict.head(1)

Unnamed: 0,语系,语族,方言大区,方言区/语支,方言片/语种,Supergroup,Dialect group,Sub-dialect group
0,汉藏,汉语,官话,东北官话,吉沈片,Mandarin,Northeastern,Jishen


In [0]:
### Pairwise distance table for sub-dialect groups ###
# Start from an empty square frame whose row labels and column labels are
# both the sub-dialect group names; the cells are filled in afterwards.
subgroup_labels = dia_dict["方言片/语种"]
dia_pairs = pd.DataFrame(columns=subgroup_labels, index=subgroup_labels)
dia_pairs.head(1)

方言片/语种,吉沈片,哈阜片,黑松片,京承片,朝峰片,保唐片,石济片,沧惠片,登连片,青莱片,...,土族,锡伯,赫哲,佤,京,布赓,阿美,回辉,塔吉克,朝鲜语
方言片/语种,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
吉沈片,,,,,,,,,,,...,,,,,,,,,,


In [0]:
# Calculate the distance between every two sub-dialect groups.
#
# The dialect dictionary describes a tree. The distance between two
# sub-dialect groups is the first (finest) level at which they agree:
#   0 - same sub-dialect group                  (方言片/语种)
#   1 - different sub-groups, same dialect group (方言区/语支)
#   2 - different groups, same supergroup        (方言大区)
#   3 - different supergroups, same branch       (语族)
#   4 - none of the above
SUBGROUP_COL = "方言片/语种"
# Coarser tree levels ordered from finest to coarsest; a match on the k-th
# entry (1-based) gives distance k.
TREE_LEVEL_COLS = ["方言区/语支", "方言大区", "语族"]


def dialect_tree_distance(taxon_1, taxon_2):
    """Return the tree distance between two taxonomy tuples.

    Each tuple is (sub-dialect group, level 1, level 2, ...), ordered from
    the finest to the coarsest level. The result is 0 when the sub-dialect
    groups are equal, otherwise the position of the first level on which the
    two tuples agree, or ``len(taxon_1)`` when they agree on none.
    Missing values (NaN) never compare equal, so they never count as a match.
    """
    if taxon_1[0] == taxon_2[0]:
        return 0
    for level in range(1, len(taxon_1)):
        if taxon_1[level] == taxon_2[level]:
            return level
    return len(taxon_1)


# Look every sub-dialect group up once instead of once per pair. When a name
# occurs more than once in the dictionary the first row wins, which is what a
# "first match" lookup on the sub-dialect column returns as well.
taxon_by_subgroup = {}
for taxon in dia_dict[[SUBGROUP_COL] + TREE_LEVEL_COLS].itertuples(index=False, name=None):
    taxon_by_subgroup.setdefault(taxon[0], taxon)

row_taxa = [taxon_by_subgroup[name] for name in dia_pairs.index]
col_taxa = [taxon_by_subgroup[name] for name in dia_pairs.columns]

# Build the whole matrix in memory, then wrap it once; writing single cells
# through .iloc inside a double loop is very slow in pandas.
distance_matrix = np.array(
    [[dialect_tree_distance(taxon_a, taxon_b) for taxon_b in col_taxa]
     for taxon_a in row_taxa],
    dtype=int,
).reshape(len(row_taxa), len(col_taxa))  # keeps a 2-D shape even when empty

dia_pairs = pd.DataFrame(distance_matrix,
                         index=dia_pairs.index,
                         columns=dia_pairs.columns)

In [0]:
# Sanity check: display the first three rows of the sub-dialect distance table
dia_pairs.head(3)

方言片/语种,吉沈片,哈阜片,黑松片,京承片,朝峰片,保唐片,石济片,沧惠片,登连片,青莱片,...,土族,锡伯,赫哲,佤,京,布赓,阿美,回辉,塔吉克,朝鲜语
方言片/语种,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
吉沈片,0,1,1,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4
哈阜片,1,0,1,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4
黑松片,1,1,0,2,2,2,2,2,2,2,...,4,4,4,4,4,4,4,4,4,4


In [0]:
# Read the county-level dialect table and preview its first row
county_dialect_csv = "data/CH_dialect_county_compl.csv"
county_dia = pd.read_csv(county_dialect_csv)
county_dia.head(1)

Unnamed: 0,AdCode,Province,Prefecture,County,方言大区,方言区/语支,方言片/语种,SGroup,DiaGroup,SubDiaGroup
0,110101,北京市,北京市,东城区,官话,北京官话,京承片,Mandarin,Beijing,Jingcheng


In [0]:
# Read the 2011 census population table and preview its first two rows
census_csv = "data/2011_census_pop_age_edu.csv"
county_pop = pd.read_csv(census_csv)
county_pop.head(2)

Unnamed: 0,省级,地级,县市,总人口,年轻人口(20-39),高等教育人口,年轻人比例,高等教育人口比例,市总人口,县市人口比例
0,北京市,北京市,北京市,19612368.0,8556982.0,6177772.0,0.436305,0.314994,19612368,1.0
1,北京市,北京市,东城区,573180.0,213458.0,213156.0,0.37241,0.371883,19612368,0.029225


In [0]:
# Keep only the county rows of the census data; drop the province and prefecture rows (code omitted).
# Because the census data was collected in 2011, replace the county names that have changed since then (code omitted).

In [0]:
### Calculate dialectal distance between counties ###

# County names in the population data. Row/column k of county_array refers
# to census_clist_new[k].
census_clist_new = county_popcp["县市"].tolist()

# Sub-dialect group of every county in the dialect table.
county_dialist = county_dia["方言片/语种"].tolist()

# Map each county name to its first position in county_list once, instead of
# scanning the list with .index() for every pair of counties.
# NOTE(review): county_list is defined outside this cell; it is assumed to be
# aligned row-for-row with county_dia -- confirm.
first_position = {}
for position, name in enumerate(county_list):
    first_position.setdefault(name, position)

# list.index() raises ValueError for an unknown name; keep that exception
# type but report the offending names.
unknown_counties = [name for name in census_clist_new if name not in first_position]
if unknown_counties:
    raise ValueError("counties missing from county_list: %r" % unknown_counties[:10])

# Sub-dialect group of each census county, in census order.
census_dialects = [county_dialist[first_position[name]] for name in census_clist_new]

# Translate the dialect labels into row/column positions of dia_pairs.
# get_indexer returns -1 for labels that are absent.
row_pos = dia_pairs.index.get_indexer(census_dialects)
col_pos = dia_pairs.columns.get_indexer(census_dialects)
missing_mask = (row_pos < 0) | (col_pos < 0)
if missing_mask.any():
    missing = sorted({str(dialect)
                      for dialect, is_missing in zip(census_dialects, missing_mask)
                      if is_missing})
    raise KeyError("sub-dialect groups missing from dia_pairs: %s" % ", ".join(missing[:10]))

# One fancy-indexing step replaces the double loop of scalar .loc lookups:
# county_array[i, j] is the distance between the dialects of county i and j.
pair_values = dia_pairs.values.astype(float)
county_array = pair_values[np.ix_(row_pos, col_pos)]
 

In [0]:
# Wrap the county distance matrix in a frame labelled by county name on both axes
county_labels = census_clist_new
county_dist_2 = pd.DataFrame(county_array, index=county_labels, columns=county_labels)
county_dist_2.head(1)

Unnamed: 0,东城区,西城区,东城区.1,西城区.1,北京市朝阳区,丰台区,石景山区,海淀区,门头沟区,房山区,...,布尔津县,富蕴县,福海县,哈巴河县,青河县,吉木乃县,石河子市,阿拉尔市,图木舒克市,五家渠市
东城区,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0


In [0]:
### Calculate dialectal distance between prefectures ###
# Remove the names in pref_list that are not prefectures: the two
# "directly administered county-level division" placeholders and every
# province name. The list is filtered into a new list rather than edited
# with .remove(), so re-running this cell gives the same result instead of
# raising ValueError or adding the municipalities a second time.
non_prefecture_names = {"省直辖县级行政区划", "自治区直辖县级行政区划"}
non_prefecture_names.update(prov_list)

# the four municipalities are placed at the front of the list
municipalities = ["北京市", "上海市", "天津市", "重庆市"]

pref_list = municipalities + [
    name for name in pref_list
    if name not in non_prefecture_names and name not in municipalities
]

# every ordered pair of prefectures (including each prefecture with itself);
# the second name varies fastest
pref_1 = [first for first in pref_list for second in pref_list]
pref_2 = [second for first in pref_list for second in pref_list]

# create a dataframe to store dialectal distance of prefecture pairs;
# DiaDist stays empty until the distances are computed
index = np.arange(len(pref_1))
columns = ['Pref_1','Pref_2','DiaDist']
pref_dist = pd.DataFrame(index = index, columns = columns)

pref_dist["Pref_1"] = pref_1
pref_dist["Pref_2"] = pref_2
pref_dist.head()

Unnamed: 0,Pref_1,Pref_2,DiaDist
0,北京市,北京市,
1,北京市,上海市,
2,北京市,天津市,
3,北京市,重庆市,
4,北京市,保定市,


In [0]:
# Calculate the dialectal distance between prefectures
# calculation formula: d(A,B) = ∑i ∑j S_Ai * S_Bj * d_ij
    # d(A,B): dialectal distance between prefectures A and B
    # S_Ai: population proportion of county i in prefecture A
    # S_Bj: population proportion of county j in prefecture B
    # d_ij: dialectal distance between county i and j

# Position of each county name in county_array. The first occurrence wins,
# which is what census_clist_new.index(name) returns, but the lookup is O(1).
county_position = {}
for position, name in enumerate(census_clist_new):
    county_position.setdefault(name, position)


def prefecture_members(prefecture):
    """Return (positions, shares) for the counties of one prefecture.

    positions: integer array with the county_array row/column of each county.
    shares:    float array with each county's 县市人口比例 value.
    Both arrays are empty when no row of county_popcp has this prefecture.
    """
    subset = county_popcp.loc[county_popcp["地级"] == prefecture]
    positions = np.array([county_position[name] for name in subset["县市"]], dtype=int)
    shares = subset["县市人口比例"].values.astype(float)
    return positions, shares


# Each prefecture appears in many pairs; compute its members only once.
members_cache = {}

# list for dialectal distance, in the same order as the rows of pref_dist
DiaDist = []

for p1, p2 in zip(pref_dist["Pref_1"], pref_dist["Pref_2"]):
    for prefecture in (p1, p2):
        if prefecture not in members_cache:
            members_cache[prefecture] = prefecture_members(prefecture)

    positions_1, shares_1 = members_cache[p1]
    positions_2, shares_2 = members_cache[p2]

    # county-by-county distances between the two prefectures
    block = county_array[np.ix_(positions_1, positions_2)]

    # ∑i ∑j S_Ai * S_Bj * d_ij written as the product shares_1 · block · shares_2.
    # The summation order differs from a sequential loop, so values can
    # differ by floating-point rounding only. An empty prefecture gives 0.0.
    DiaDist.append(float(shares_1.dot(block).dot(shares_2)))


In [0]:
# Store the computed distances in the DiaDist column of pref_dist
# and display the first two rows as a check
pref_dist["DiaDist"] = DiaDist
pref_dist.head(2)

Unnamed: 0,Pref_1,Pref_2,DiaDist
0,北京市,北京市,0.083037
1,北京市,上海市,3.0
