In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, scale
from sklearn.feature_selection import r_regression, f_regression
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from collections import Counter
import statsmodels.api as sm

# Yunnan

In [10]:
yunnan_nodelist = pd.read_csv('../Data/Preprocessed/yunnan_nodelist.csv')
yunnan_edgelist = pd.read_csv('../Data/Preprocessed/yunnan_edgelist.csv')
yunnan_network = pd.read_csv('../Data/Tables/basic/yunnan_basic_analysis.csv')

In [11]:
yunnan_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,relatives
0,0,1.0,,1.0,1.0
1,1,2.0,53.0,1.0,
2,2,3.0,39.0,0.0,0.0
3,3,4.0,34.0,0.0,
4,4,5.0,49.0,1.0,
...,...,...,...,...,...
166,166,167.0,33.0,1.0,0.0
167,167,168.0,62.0,0.0,0.0
168,168,169.0,41.0,1.0,1.0
169,169,170.0,52.0,1.0,1.0


In [12]:
yunnan_network

Unnamed: 0.1,Unnamed: 0,Referee,Degree_Centrality,Betweenness_Centrality,Pagerank_Centrality,Component_Size,Avg_Shortest_Path_Length
0,0,1,0.005882,0.0,0.013495,2,0.5
1,1,10,0.005882,0.0,0.013495,2,0.5
2,2,16,0.005882,0.0,0.013495,2,0.5
3,3,15,0.005882,0.0,0.013495,2,0.5
4,4,90,0.005882,0.0,0.013495,2,0.5
...,...,...,...,...,...,...,...
166,166,143,0.000000,0.0,0.002024,1,0.0
167,167,142,0.000000,0.0,0.002024,1,0.0
168,168,153,0.000000,0.0,0.002024,1,0.0
169,169,162,0.000000,0.0,0.002024,1,0.0


In [13]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([ 1.,  0., nan]), array([ 1., nan,  0.]))

In [14]:
yunnan_nodelist.columns, yunnan_network.columns

(Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives'], dtype='object'),
 Index(['Unnamed: 0', 'Referee', 'Degree_Centrality', 'Betweenness_Centrality',
        'Pagerank_Centrality', 'Component_Size', 'Avg_Shortest_Path_Length'],
       dtype='object'))

In [15]:
yunnan_nodelist['age'] = yunnan_nodelist['age'].fillna(value=int(yunnan_nodelist['age'].mean()))
yunnan_nodelist['gender'] = yunnan_nodelist['gender'].mask(yunnan_nodelist['gender'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))
yunnan_nodelist['relatives'] = yunnan_nodelist['relatives'].mask(yunnan_nodelist['relatives'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))

In [16]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([1., 0.]), array([1., 0.]))

In [17]:
yunnan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                    'referral_avg_path_len', 'avg_path_len_diff'])

In [18]:
added_combinations = set()
for index, row in yunnan_edgelist[~yunnan_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = yunnan_nodelist.query('node_id == @referee_id')
    referral = yunnan_nodelist.query('node_id == @referral_id')
    referee_network_stats = yunnan_network.query('Referee == @referee_id')
    referral_network_stats = yunnan_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]
    referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
    referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                         'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
    yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [19]:
for ref_a in yunnan_nodelist['node_id']:
    for ref_b in yunnan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = yunnan_nodelist.query('node_id == @ref_a')
        referral = yunnan_nodelist.query('node_id == @ref_b')
        referee_network_stats = yunnan_network.query('Referee == @ref_a')
        referral_network_stats = yunnan_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]
        referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
        referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                             'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
        yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [20]:
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,1.0,10.0,1.0,41.0,63.0,22.0,1.0,1.0,0.0,1.0,...,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.00
1,16.0,15.0,1.0,68.0,71.0,3.0,0.0,1.0,1.0,1.0,...,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.00
2,90.0,75.0,1.0,34.0,76.0,42.0,0.0,1.0,1.0,1.0,...,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.00
3,86.0,85.0,1.0,79.0,67.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.013495,0.013495,0.000000,3.0,3.0,0.0,0.666667,0.666667,0.00
4,86.0,81.0,1.0,79.0,46.0,33.0,0.0,0.0,0.0,0.0,...,0.0,0.013495,0.013495,0.000000,3.0,3.0,0.0,0.666667,0.666667,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,62.0,41.0,21.0,0.0,1.0,1.0,0.0,...,0.0,0.010376,0.002024,0.008352,3.0,1.0,2.0,1.000000,0.000000,1.00
14531,168.0,170.0,0.0,62.0,52.0,10.0,0.0,1.0,1.0,0.0,...,0.0,0.010376,0.013495,0.003119,3.0,4.0,1.0,1.000000,0.750000,0.25
14532,168.0,171.0,0.0,62.0,24.0,38.0,0.0,0.0,0.0,0.0,...,0.0,0.010376,0.013495,0.003119,3.0,4.0,1.0,1.000000,0.750000,0.25
14533,169.0,170.0,0.0,41.0,52.0,11.0,1.0,1.0,0.0,1.0,...,0.0,0.002024,0.013495,0.011471,1.0,4.0,3.0,0.000000,0.750000,0.75


In [21]:
(yunnan_nodelist.shape[0] * (yunnan_nodelist.shape[0] - 1)) / 2

14535.0

In [22]:
yunnan_data.columns

Index(['referee', 'referral', 'contact', 'referee_age', 'referral_age',
       'age_diff', 'referee_gender', 'referral_gender', 'gender_diff',
       'referee_relatives', 'referral_relatives', 'relatives_diff',
       'referee_degree_centrality', 'referral_degree_centrality',
       'degree_centrality_diff', 'referee_betweenness_centrality',
       'referral_betweenness_centrality', 'betweenness_centrality_diff',
       'referee_pagerank_centrality', 'referral_pagerank_centrality',
       'pagerank_centrality_diff', 'referee_component_size',
       'referral_component_size', 'component_size_diff',
       'referee_avg_path_len', 'referral_avg_path_len', 'avg_path_len_diff'],
      dtype='object')

In [23]:
yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = normalize(yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                     'referral_avg_path_len', 'avg_path_len_diff']])

yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = scale(yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                 'referral_avg_path_len', 'avg_path_len_diff']])
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,1.0,10.0,1.0,-0.873428,0.969984,0.145853,1.0,1.0,0.0,1.0,...,-0.300416,1.390802,0.543677,-0.852743,0.057621,-0.415697,-0.567127,1.007831,0.073212,-0.847309
1,16.0,15.0,1.0,0.190044,0.529141,-1.188026,0.0,1.0,1.0,1.0,...,-0.300416,0.922140,0.194138,-0.852743,-0.139986,-0.487708,-0.567127,0.712674,-0.104450,-0.847309
2,90.0,75.0,1.0,-1.878515,1.027616,1.049256,0.0,1.0,1.0,1.0,...,-0.300416,1.022439,0.268943,-0.852743,-0.097695,-0.472297,-0.567127,0.775841,-0.066429,-0.847309
3,86.0,85.0,1.0,0.607199,0.102742,-0.737626,0.0,0.0,0.0,0.0,...,-0.300416,0.817234,0.115896,-0.852743,0.180352,-0.370971,-0.567127,1.009634,0.074297,-0.847309
4,86.0,81.0,1.0,0.956113,-0.803310,0.457684,0.0,0.0,0.0,0.0,...,-0.300416,0.943106,0.209774,-0.852743,0.259961,-0.341961,-0.567127,1.115331,0.137919,-0.847309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,0.890219,-0.496457,0.096869,0.0,1.0,1.0,0.0,...,-0.300416,0.882087,-0.912789,0.233808,0.562521,-0.590260,-0.191780,2.496797,-0.799767,1.019560
14531,168.0,170.0,0.0,0.619246,0.073207,-0.698094,0.0,1.0,1.0,0.0,...,-0.300416,0.786322,0.473668,-0.468558,0.483743,-0.090701,-0.389471,2.339908,0.456325,-0.405504
14532,168.0,171.0,0.0,0.929007,-1.661259,1.288605,0.0,0.0,0.0,0.0,...,-0.300416,0.895795,0.579863,-0.443794,0.573797,-0.046944,-0.378019,2.519255,0.537290,-0.377026
14533,169.0,170.0,0.0,-0.329918,0.801210,-0.479604,1.0,1.0,0.0,1.0,...,-0.300416,-0.509960,0.825447,0.861645,-0.348221,0.054245,0.079642,-0.442481,0.724527,0.761111


In [24]:
r_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

array([-0.01826755,  0.01610531, -0.02117468, -0.00252344,  0.02569006,
        0.00605719, -0.04726125, -0.03999255, -0.0672912 ,  0.46673806,
        0.1548551 , -0.01554568,  0.06450678,  0.17198082,  0.11798308,
        0.16829081,  0.11251789, -0.04322958,  0.47163359,  0.13479964,
       -0.04814475,  0.28100767,  0.10123703, -0.04805675])

In [25]:
f_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

(array([4.85133281e+00, 3.77056382e+00, 6.51903933e+00, 9.25428475e-02,
        9.59781321e+00, 5.33228651e-01, 3.25339467e+01, 2.32813797e+01,
        6.61063017e+01, 4.04770226e+03, 3.57065300e+02, 3.51301063e+00,
        6.07263142e+01, 4.42949672e+02, 2.05155243e+02, 4.23597741e+02,
        1.86351026e+02, 2.72100649e+01, 4.15747674e+03, 2.68965693e+02,
        3.37645515e+01, 1.24599305e+03, 1.50490144e+02, 3.36409410e+01]),
 array([2.76404206e-002, 5.21813466e-002, 1.06826858e-002, 7.60973360e-001,
        1.95180697e-003, 4.65264392e-001, 1.19409382e-008, 1.41369216e-006,
        4.61538236e-016, 0.00000000e+000, 1.07412804e-078, 6.09087593e-002,
        7.00154290e-015, 6.85002169e-097, 3.23190079e-046, 8.42620173e-093,
        3.62128937e-042, 1.85012998e-007, 0.00000000e+000, 6.57946137e-060,
        6.35047644e-009, 6.06961466e-262, 2.00489660e-034, 6.76610928e-009]))

In [26]:
print(sm.OLS(yunnan_data['contact'], yunnan_data[yunnan_data.columns[3:]]).fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                contact   R-squared (uncentered):                   0.591
Model:                            OLS   Adj. R-squared (uncentered):              0.591
Method:                 Least Squares   F-statistic:                              875.4
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        13:04:54   Log-Likelihood:                          21782.
No. Observations:               14535   AIC:                                 -4.352e+04
Df Residuals:                   14511   BIC:                                 -4.333e+04
Df Model:                          24                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

# Hainan

In [27]:
hainan_edgelist = pd.read_csv('../Data/Preprocessed/hainan_edgelist.csv')
hainan_nodelist = pd.read_csv('../Data/Preprocessed/hainan_nodelist.csv')
hainan_network = pd.read_csv('../Data/Tables/basic/hainan_basic_analysis.csv')

In [28]:
hainan_nodelist['gender'].unique(), hainan_nodelist['relatives'].unique()

(array([1., 0.]), array([0., 1.]))

Don't need to fill nans

In [29]:
hainan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff',
                                    'referee_avg_path_len', 'referral_avg_path_len', 'avg_path_len_diff'])

In [30]:
added_combinations = set()
for index, row in hainan_edgelist[~hainan_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = hainan_nodelist.query('node_id == @referee_id')
    referral = hainan_nodelist.query('node_id == @referral_id')
    referee_network_stats = hainan_network.query('Referee == @referee_id')
    referral_network_stats = hainan_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]
    referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
    referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                         'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
    hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [31]:
for ref_a in hainan_nodelist['node_id']:
    for ref_b in hainan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = hainan_nodelist.query('node_id == @ref_a')
        referral = hainan_nodelist.query('node_id == @ref_b')
        referee_network_stats = hainan_network.query('Referee == @ref_a')
        referral_network_stats = hainan_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]
        referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
        referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                             'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
        hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [32]:
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,5.0,3.0,1.0,27.0,27.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.000000
1,15.0,7.0,1.0,54.0,53.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.000000
2,14.0,13.0,1.0,44.0,47.0,3.0,0.0,1.0,1.0,1.0,...,0.0,0.008852,0.008852,0.000000,6.0,6.0,0.0,1.166667,1.166667,0.000000
3,36.0,35.0,1.0,17.0,8.0,9.0,0.0,1.0,1.0,1.0,...,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.000000
4,43.0,42.0,1.0,69.0,68.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0,0.500000,0.500000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,62.0,27.0,35.0,1.0,0.0,1.0,1.0,...,0.0,0.009838,0.009838,0.000000,7.0,4.0,3.0,0.857143,0.750000,0.107143
13037,158.0,162.0,0.0,62.0,73.0,11.0,1.0,1.0,0.0,1.0,...,0.0,0.009838,0.001476,0.008362,7.0,1.0,6.0,0.857143,0.000000,0.857143
13038,160.0,159.0,0.0,25.0,27.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.009838,0.009838,0.000000,7.0,4.0,3.0,0.857143,0.750000,0.107143
13039,160.0,162.0,0.0,25.0,73.0,48.0,0.0,1.0,1.0,1.0,...,0.0,0.009838,0.001476,0.008362,7.0,1.0,6.0,0.857143,0.000000,0.857143


In [33]:
(hainan_nodelist.shape[0] * (hainan_nodelist.shape[0] - 1)) / 2

13041.0

In [34]:
hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = normalize(hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                     'referral_avg_path_len', 'avg_path_len_diff']])

hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = scale(hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                 'referral_avg_path_len', 'avg_path_len_diff']])
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,5.0,3.0,1.0,0.344824,0.360335,-1.398943,0.0,1.0,1.0,0.0,...,-0.201886,2.444011,2.367555,-1.006181,0.506155,0.179543,-0.977939,1.000436,0.836707,-1.089657
1,15.0,7.0,1.0,0.388989,0.331585,-1.328331,0.0,1.0,1.0,1.0,...,-0.201886,0.656699,0.558689,-1.006181,-0.291145,-0.447296,-0.977939,0.090971,-0.037618,-1.089657
2,14.0,13.0,1.0,0.188618,0.458837,-1.152257,0.0,1.0,1.0,1.0,...,-0.201886,0.744085,0.647129,-1.006181,1.742799,1.151796,-0.977939,1.688716,1.498393,-1.089657
3,36.0,35.0,1.0,0.911283,-1.446505,0.888826,0.0,1.0,1.0,1.0,...,-0.201886,5.410718,5.370036,-1.006181,1.829568,1.220015,-0.977939,2.510032,2.287975,-1.089657
4,43.0,42.0,1.0,0.382375,0.340957,-1.343774,0.0,1.0,1.0,1.0,...,-0.201886,0.256776,0.153943,-1.006181,-0.469546,-0.587556,-0.977939,-0.112529,-0.233255,-1.089657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,0.915969,-1.602282,1.043098,1.0,0.0,1.0,1.0,...,-0.201886,0.634962,0.536690,-1.006181,1.714139,0.178754,0.159729,0.736611,0.393677,-0.874662
13037,158.0,162.0,0.0,-0.012001,0.630882,-0.791670,1.0,1.0,0.0,1.0,...,-0.201886,0.257767,-1.074870,0.361991,1.125221,-0.837900,0.822388,0.407581,-0.932110,0.271239
13038,160.0,159.0,0.0,0.099158,0.405492,-1.116464,0.0,0.0,0.0,1.0,...,-0.201886,2.485636,2.409681,-1.006181,4.603614,1.476875,1.325023,2.350965,1.751659,-0.654446
13039,160.0,162.0,0.0,-2.025460,0.883548,1.410642,0.0,1.0,1.0,1.0,...,-0.201886,0.343907,-1.061793,0.444430,1.259712,-0.822795,0.930866,0.482722,-0.932110,0.353239


In [35]:
r_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

array([-0.00616692, -0.01122213,  0.00740684,  0.0058602 ,  0.00772179,
        0.007496  ,  0.0871856 ,  0.08413633, -0.07812765,  0.16198783,
        0.07839425, -0.07209395,  0.0689273 ,  0.01106468,  0.03157248,
        0.09585184,  0.07694475, -0.07107369,  0.12996357,  0.08095444,
       -0.09503362,  0.09426274,  0.08291782, -0.08691061])

In [36]:
f_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

(array([  0.49590343,   1.64228849,   0.71537469,   0.44780005,
          0.77751038,   0.73270189,  99.8728961 ,  92.96012754,
         80.07793457, 351.36391993,  80.62874986,  68.12477048,
         62.24364453,   1.59652167,  13.01052834, 120.90763719,
         77.65710067,  66.20052386, 224.01943842,  86.01639407,
        118.8335103 , 116.89624958,  90.26852058,  99.2390933 ]),
 array([4.81318247e-01, 2.00034556e-01, 3.97680819e-01, 5.03393377e-01,
        3.77919892e-01, 3.92024531e-01, 1.97302275e-23, 6.31474686e-22,
        4.08053745e-19, 2.21714829e-77, 3.09312475e-19, 1.68135700e-16,
        3.27585504e-15, 2.06419429e-01, 3.10902161e-04, 5.31384830e-28,
        1.37943370e-18, 4.43997626e-16, 3.13705524e-50, 2.06194036e-20,
        1.49747717e-27, 3.94222623e-27, 2.43756633e-21, 2.71058630e-23]))

In [37]:
print(sm.OLS(hainan_data['contact'], hainan_data[hainan_data.columns[3:]]).fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                contact   R-squared (uncentered):                   0.125
Model:                            OLS   Adj. R-squared (uncentered):              0.124
Method:                 Least Squares   F-statistic:                              77.61
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        13:09:53   Log-Likelihood:                          12830.
No. Observations:               13041   AIC:                                 -2.561e+04
Df Residuals:                   13017   BIC:                                 -2.543e+04
Df Model:                          24                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

# Shanxi

In [38]:
shanxi_edgelist = pd.read_csv('../Data/Preprocessed/shanxi_edgelist.csv')
shanxi_nodelist = pd.read_csv('../Data/Preprocessed/shanxi_nodelist.csv')
shanxi_network = pd.read_csv('../Data/Tables/basic/shanxi_basic_analysis.csv')

In [39]:
shanxi_nodelist['gender'].unique(), shanxi_nodelist['relatives'].unique(), shanxi_nodelist['hukou'].unique()

(array([1, 0], dtype=int64),
 array([0, 1], dtype=int64),
 array(['xianyang', 'xian', 'ankang', 'yanan', 'wuhan', 'dazhi',
        'hanzhong', 'tongchuan', 'weinan', 'baoji', 'henan', 'shangluo',
        'xiaogan', 'hancheng', 'yulin', 'lantian', 'shanghai', 'lingbao',
        'tianmen', 'nanjing', 'yichang', 'suizhou', 'pingdingshan',
        'yingcheng', 'yanglin'], dtype=object))

No missing values to fill

In [40]:
shanxi_nodelist['hukou_ints'] = pd.factorize(shanxi_nodelist['hukou'])[0]
shanxi_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,hukou,relatives,hukou_ints
0,0,1,42,1,xianyang,0,0
1,1,2,32,0,xian,0,1
2,2,3,22,1,xian,0,1
3,3,4,49,1,ankang,0,2
4,4,5,23,1,yanan,0,3
...,...,...,...,...,...,...,...
232,232,233,67,0,weinan,0,8
233,233,234,46,1,hanzhong,0,6
234,234,235,71,1,hanzhong,1,6
235,235,236,66,0,hanzhong,1,6


In [41]:
shanxi_dummies = pd.get_dummies(shanxi_nodelist)

In [42]:
shanxi_dummies.columns

Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives', 'hukou_ints',
       'hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
       'hukou_yingcheng', 'hukou_yulin'],
      dtype='object')

Don't need to fill nans

In [43]:
shanxi_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_hukou', 'referral_hukou', 'hukou_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff',
                                    'referee_avg_path_len', 'referral_avg_path_len', 'avg_path_len_diff'])

referees_hukou = pd.DataFrame(columns=['hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
                                       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
                                       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
                                       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
                                       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
                                       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
                                       'hukou_yingcheng', 'hukou_yulin'])
referrals_hukou = referees_hukou.copy()

In [44]:
added_combinations = set()
for index, row in shanxi_edgelist[~shanxi_edgelist['Referral'].isna()].iterrows():
    """
    referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referee']]
    referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                         'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})

    referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referee']][shanxi_dummies.columns[6:]]
    referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referral']][shanxi_dummies.columns[6:]]

    shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
    shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

    referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
    referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))
    """
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = shanxi_nodelist.query('node_id == @referee_id')
    referral = shanxi_nodelist.query('node_id == @referral_id')
    referee_network_stats = shanxi_network.query('Referee == @referee_id')
    referral_network_stats = shanxi_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]
    referee_hukou = referee['hukou_ints'].iloc[0]
    referral_hukou = referral['hukou_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]
    referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
    referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_hukou': referee_hukou, 'referral_hukou': referral_hukou, 'hukou_diff': abs(referee_hukou - referral_hukou),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                         'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
    shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [45]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,...,0.000000,0.007450,0.007450,0.000000,8.0,8.0,0.0,1.375,1.375,0.000
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,...,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0,1.375,1.000,0.375
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,...,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0,1.375,1.000,0.375
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,...,0.000108,0.004612,0.012776,0.008164,4.0,4.0,0.0,1.250,0.750,0.500
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,...,0.000216,0.003142,0.010087,0.006946,8.0,8.0,0.0,2.000,1.250,0.750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,235.0,234.0,1.0,71.0,46.0,25.0,1.0,1.0,0.0,1.0,...,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0,0.750,0.750,0.000
121,236.0,234.0,1.0,66.0,46.0,20.0,0.0,1.0,1.0,1.0,...,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0,0.750,0.750,0.000
122,236.0,235.0,1.0,66.0,71.0,5.0,0.0,1.0,1.0,1.0,...,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0,0.750,0.750,0.000
123,237.0,234.0,1.0,65.0,46.0,19.0,0.0,1.0,1.0,1.0,...,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0,0.750,0.750,0.000


In [46]:
for ref_a in shanxi_nodelist['node_id']:
    for ref_b in shanxi_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        """
        referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_a]
        referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                             'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})
        referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_a][shanxi_dummies.columns[6:]]
        referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_b][shanxi_dummies.columns[6:]]

        shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
        shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

        referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
        referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)
        """
        referee = shanxi_nodelist.query('node_id == @ref_a')
        referral = shanxi_nodelist.query('node_id == @ref_b')
        referee_network_stats = shanxi_network.query('Referee == @ref_a')
        referral_network_stats = shanxi_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]
        referee_hukou = referee['hukou_ints'].iloc[0]
        referral_hukou = referral['hukou_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]
        referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
        referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_hukou': referee_hukou, 'referral_hukou': referral_hukou, 'hukou_diff': abs(referee_hukou - referral_hukou),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                             'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
        shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))


In [47]:
#shanxi_data = shanxi_data.join(referees_hukou)
#shanxi_data = shanxi_data.join(referrals_hukou, lsuffix='_referee', rsuffix='_referral')
shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

In [48]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,...,0.000000,0.007450,0.007450,0.000000,8.0,8.0,0.0,1.375,1.375,0.000
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,...,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0,1.375,1.000,0.375
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,...,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0,1.375,1.000,0.375
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,...,0.000108,0.004612,0.012776,0.008164,4.0,4.0,0.0,1.250,0.750,0.500
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,...,0.000216,0.003142,0.010087,0.006946,8.0,8.0,0.0,2.000,1.250,0.750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,46.0,65.0,19.0,0.0,0.0,0.0,0.0,...,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0,0.000,0.750,0.750
27968,233.0,234.0,0.0,67.0,46.0,21.0,0.0,1.0,1.0,0.0,...,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0,0.000,0.750,0.750
27969,233.0,235.0,0.0,67.0,71.0,4.0,0.0,1.0,1.0,0.0,...,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0,0.000,0.750,0.750
27970,233.0,236.0,0.0,67.0,66.0,1.0,0.0,0.0,0.0,0.0,...,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0,0.000,0.750,0.750


In [49]:
shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = normalize(shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                     'referral_avg_path_len', 'avg_path_len_diff']])
shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
             'referral_avg_path_len', 'avg_path_len_diff']] = scale(shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                 'referral_avg_path_len', 'avg_path_len_diff']])
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,11.0,10.0,1.0,-2.565122,0.456532,2.025931,0.0,0.0,0.0,1.0,...,-0.217599,1.096045,1.322771,-0.976088,1.915282,2.611252,-0.815722,1.725098,1.915777,-1.075193
1,11.0,9.0,1.0,-2.584266,0.453531,2.043080,0.0,1.0,1.0,1.0,...,3.856677,1.048526,4.024152,1.479961,1.850954,2.529321,-0.815722,1.667717,1.100768,-0.333923
2,10.0,9.0,1.0,0.476145,0.105467,-1.435083,0.0,1.0,1.0,1.0,...,3.572137,0.903212,3.653328,1.308435,1.654238,2.278777,-0.815722,1.492246,0.960713,-0.385692
3,22.0,2.0,1.0,0.609572,0.017766,-1.398281,1.0,0.0,1.0,0.0,...,0.851337,0.657692,4.451560,2.286666,0.881734,1.294883,-0.815722,2.151522,1.068398,0.221349
4,20.0,19.0,1.0,-0.142704,0.605468,-0.349392,0.0,0.0,0.0,0.0,...,0.886002,-0.438020,1.052678,0.456862,0.941647,1.371191,-0.815722,1.629989,0.792876,-0.071252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,-0.246748,0.689649,-0.166755,0.0,0.0,0.0,0.0,...,-0.217599,-0.825591,0.402512,0.300973,-0.704799,0.271385,-0.011150,-0.844823,0.210188,0.023696
27968,233.0,234.0,0.0,1.089188,-0.927143,-0.060760,0.0,1.0,1.0,0.0,...,-0.217599,-0.830678,0.360950,0.269531,-0.711224,0.238650,-0.030959,-0.844823,0.182740,-0.003360
27969,233.0,235.0,0.0,0.455583,0.250091,-1.285078,0.0,1.0,1.0,0.0,...,-0.217599,-0.858978,0.129715,0.094603,-0.746972,0.056530,-0.141167,-0.844823,0.030032,-0.153882
27970,233.0,236.0,0.0,0.605653,0.085399,-1.462533,0.0,0.0,0.0,0.0,...,-0.217599,-0.852275,0.184483,0.136034,-0.738505,0.099665,-0.115064,-0.844823,0.066201,-0.118231


In [50]:
r_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

array([ 0.01450159, -0.0175731 , -0.01056091, -0.01539246,  0.00552877,
        0.01150495,  0.05974816,  0.02571722, -0.00035044,  0.00788541,
        0.01891385, -0.09406561,  0.03385996,  0.14421896,  0.02751665,
       -0.00160706,  0.21000583,  0.08109907,  0.02975455,  0.11685592,
        0.01861931,  0.04097268,  0.06369963, -0.05465225,  0.05224747,
        0.03406456, -0.04175386])

In [51]:
f_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

(array([5.88321597e+00, 8.64019394e+00, 3.11992060e+00, 6.62843994e+00,
        8.54992530e-01, 3.70270591e+00, 1.00206212e+02, 1.85109144e+01,
        3.43501594e-03, 1.73927197e+00, 1.00093878e+01, 2.49697442e+02,
        3.21043161e+01, 5.94107967e+02, 2.11939786e+01, 7.22366126e-02,
        1.29045779e+03, 1.85178262e+02, 2.47847187e+01, 3.87226624e+02,
        9.69996400e+00, 4.70338997e+01, 1.13954654e+02, 8.37929933e+01,
        7.65614650e+01, 3.24939391e+01, 4.88476413e+01]),
 array([1.52921619e-002, 3.29095226e-003, 7.73514580e-002, 1.00414063e-002,
        3.55151778e-001, 5.43344351e-002, 1.50458858e-023, 1.69507094e-005,
        9.53263954e-001, 1.87241897e-001, 1.55910923e-003, 5.28348334e-056,
        1.47547129e-008, 7.26053267e-130, 4.16890605e-006, 7.88109666e-001,
        2.56300410e-276, 4.88468766e-042, 6.44845067e-007, 1.26289579e-085,
        1.84457727e-003, 7.12204047e-012, 1.49987365e-026, 5.85772469e-020,
        2.25231012e-018, 1.20766903e-008, 2.82838670e-

In [52]:
print(sm.OLS(shanxi_data['contact'], shanxi_data[shanxi_data.columns[3:]]).fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                contact   R-squared (uncentered):                   0.107
Model:                            OLS   Adj. R-squared (uncentered):              0.107
Method:                 Least Squares   F-statistic:                              124.6
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        13:16:00   Log-Likelihood:                          37573.
No. Observations:               27972   AIC:                                 -7.509e+04
Df Residuals:                   27945   BIC:                                 -7.487e+04
Df Model:                          27                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

# Bucharest

In [53]:
bucharest_nodelist = pd.read_csv('../Data/Preprocessed/bucharest_nodelist.csv')
bucharest_edgelist = pd.read_csv('../Data/Preprocessed/bucharest_edgelist.csv')
bucharest_network = pd.read_csv('../Data/Tables/basic/bucharest_basic_analysis.csv')

In [63]:
bucharest_network

Unnamed: 0.1,Unnamed: 0,Referee,Degree_Centrality,Betweenness_Centrality,Pagerank_Centrality,Component_Size,Avg_Shortest_Path_Length
0,0,53594,0.000035,5.979585e-10,0.000058,3,0.666667
1,1,10330S53594,0.000017,0.000000e+00,0.000029,3,1.000000
2,2,10331S53594,0.000017,0.000000e+00,0.000029,3,1.000000
3,3,58373,0.000017,0.000000e+00,0.000039,2,0.500000
4,4,11577S58373,0.000017,0.000000e+00,0.000039,2,0.500000
...,...,...,...,...,...,...,...
57830,57830,256534,0.000000,0.000000e+00,0.000006,1,0.000000
57831,57831,256533,0.000000,0.000000e+00,0.000006,1,0.000000
57832,57832,256514,0.000000,0.000000e+00,0.000006,1,0.000000
57833,57833,256531,0.000000,0.000000e+00,0.000006,1,0.000000


In [54]:
bucharest_nodelist['medical_ints'] = pd.factorize(bucharest_nodelist['medical'])[0]
bucharest_nodelist['isco08_code_ints'] = pd.factorize(bucharest_nodelist['isco08_code'])[0]

In [55]:
bucharest_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,medical,isco08_code,isco08_label,medical_ints,isco08_code_ints
0,0,179373,65.0,1,No,NAP,Not Active - pensioner,0,0
1,1,1S179373,63.0,,,,,-1,-1
2,2,146179,40.0,1,No,EMP,Employee - unknown group,0,1
3,3,235990,32.0,2.0,,,,-1,-1
4,4,3S146179,3.0,,No,NAC,Not Active - Child,0,2
...,...,...,...,...,...,...,...,...,...
57830,57830,19,32.0,male,,,,-1,-1
57831,57831,18,30.0,female,,,,-1,-1
57832,57832,16,60.0,male,,,,-1,-1
57833,57833,14,42.0,female,,,,-1,-1


In [56]:
bucharest_nodelist['gender'].unique()

array(['1', nan, '2.0', '2', '1.0', 'male', 'female'], dtype=object)

In [57]:
bucharest_nodelist['age'] = bucharest_nodelist['age'].fillna(value=int(bucharest_nodelist['age'].mean()))
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 'male', 1)
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 'female', 2)
bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()] = bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()].apply(lambda x: float(x))
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].astype('Int64', errors='ignore')
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 0, np.nan)
#bucharest_nodelist['gender'] = bucharest_nodelist['gender'].mask(bucharest_nodelist['gender'].isnull(), np.random.randint(0, 2, size=bucharest_nodelist.shape[0]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()] = bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()].apply(lambda x: float(x))


In [92]:
bucharest_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_medical', 'referral_medical', 'medical_diff',
                                    'referee_isco08_label', 'referral_isco08_label', 'isco08_label_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff',
                                       'referee_avg_path_len', 'referral_avg_path_len', 'avg_path_len_diff'])

In [93]:
added_combinations = set()
for index, row in bucharest_edgelist[~bucharest_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = bucharest_nodelist.query('node_id == @referee_id')
    referral = bucharest_nodelist.query('node_id == @referral_id')
    referee_network_stats = bucharest_network.query('Referee == @referee_id')
    referral_network_stats = bucharest_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_medical = referee['medical_ints'].iloc[0]
    referral_medical = referral['medical_ints'].iloc[0]
    referee_isco08_label = referee['isco08_code_ints'].iloc[0]
    referral_isco08_label = referral['isco08_code_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]
    referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
    referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_medical': referee_medical, 'referral_medical': referral_medical, 'medical_diff': abs(referee_medical - referral_medical),
                         'referee_isco08_label': referee_isco08_label, 'referral_isco08_label': referral_isco08_label, 'isco08_label_diff': abs(referee_isco08_label - referral_isco08_label),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                         'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
    bucharest_data = pd.concat([bucharest_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [90]:
sample = np.random.choice(bucharest_nodelist['node_id'], size=150, replace=False)
sample

array(['4394S232148', '120360', '128766', '61879', '5175S107889',
       '247362', '229981', '58707', '196047', '131303', '284618',
       '193467', '172994', '94987', '312541', '300385', '229690',
       '116862', '237183', '242621', '149951', '136030', '8070', '149196',
       '20210', '244529', '103735', '202196', '87200', '9077S170606',
       '93263', '179085', '72414', '50039', '297676', '161847',
       '5418S174311', '286339', '299813', '203037', '13263S153068',
       '14207S122960', '140907', '261852', '267495', '7829S138649',
       '12712', '66929', '101104', '176873', '30964', '8842S122821',
       '211822', '7853S71416', '116601', '112154', '68592', '297456',
       '133232', '268920', '233318', '63133', '166087', '209310',
       '5091S119819', '277774', '7420S192463', '12878S178597', '164759',
       '162709', '92809', '286489', '139642', '265332', '6478S147312',
       '263525', '166876', '123483', '4674S181934', '6889S136711',
       '3527S95319', '113105', '263408', 

In [94]:
for ref_a in tqdm(sample):
    for ref_b in sample:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = bucharest_nodelist.query('node_id == @ref_a')
        referral = bucharest_nodelist.query('node_id == @ref_b')
        referee_network_stats = bucharest_network.query('Referee == @ref_a')
        referral_network_stats = bucharest_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_medical = referee['medical_ints'].iloc[0]
        referral_medical = referral['medical_ints'].iloc[0]
        referee_isco08_label = referee['isco08_code_ints'].iloc[0]
        referral_isco08_label = referral['isco08_code_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]
        referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
        referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_medical': referee_medical, 'referral_medical': referral_medical, 'medical_diff': abs(referee_medical - referral_medical),
                             'referee_isco08_label': referee_isco08_label, 'referral_isco08_label': referral_isco08_label, 'isco08_label_diff': abs(referee_isco08_label - referral_isco08_label),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                             'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
        bucharest_data = pd.concat([bucharest_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

  0%|          | 0/150 [00:00<?, ?it/s]

In [95]:
bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                'referral_avg_path_len', 'avg_path_len_diff']] = normalize(bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                           'referral_avg_path_len', 'avg_path_len_diff']])
bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                'referral_avg_path_len', 'avg_path_len_diff']] = scale(bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                       'referral_avg_path_len', 'avg_path_len_diff']])

In [96]:
bucharest_data['medical_diff'].unique(), bucharest_data['gender_diff'].unique(), bucharest_data['isco08_label_diff'].unique()

(array([1, 0, 2], dtype=object),
 array([<NA>, 0, 1], dtype=object),
 array([1, 0, 3, 4, 2, 11, 7, 6, 5, 16, 8, 15, 12, 9, 10, 13, 17, 14],
       dtype=object))

In [97]:
#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].fillna(value=-999)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].fillna(value=-999)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].fillna(value=-999)

#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].where(bucharest_data['gender_diff'] <= 0, 1)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].where(bucharest_data['medical_diff'] <= 0, 1)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].where(bucharest_data['isco08_label_diff'] <= 0, 1)

#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].where(bucharest_data['gender_diff'] != -999, np.nan)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].where(bucharest_data['medical_diff'] != -999, np.nan)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].where(bucharest_data['isco08_label_diff'] != -999, np.nan)

bucharest_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_medical,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,53594,10330S53594,1,-0.051854,0.752419,-0.838612,2,,,-1,...,-0.259327,-0.206958,-0.326106,-0.319645,-0.382101,-0.433184,-0.403701,-0.201715,-0.193448,-0.423637
1,53594,10331S53594,1,0.913335,-1.025053,0.793839,2,,,-1,...,-0.240209,-0.004983,-0.081030,-0.197507,-0.207457,-0.258824,-0.403701,-0.001801,0.024905,-0.308756
2,58373,11577S58373,1,0.181020,0.640961,-1.148650,2,,,-1,...,-0.329478,-0.345273,0.232592,-0.767813,-0.502576,-0.553463,-0.403701,-0.265166,-0.506724,-0.845177
3,63203,5541S63203,1,0.908249,-0.283359,-0.033671,2,,,0,...,-0.268583,-0.304741,-0.444756,-0.378775,-0.466652,-0.517597,-0.403701,-0.298500,-0.299160,-0.479254
4,63203,5540S63203,1,0.936469,-0.429938,0.151639,2,,,0,...,-0.268201,-0.300708,-0.439862,-0.376337,-0.463164,-0.514115,-0.403701,-0.294508,-0.294800,-0.476960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24547,181467,97104,0,0.872844,-0.155319,-0.204146,1,2,1,0,...,-0.329478,-0.418513,-1.024880,-0.225769,-0.565799,-0.844778,0.019936,-0.346582,-0.994668,-0.168593
24548,181467,197955,0,0.910859,-0.284588,-0.030218,1,1,0,0,...,-0.329478,-0.414005,-1.023174,-0.221155,-0.561908,-0.842836,0.023541,-0.341572,-0.994668,-0.162834
24549,1475S156639,97104,0,0.918132,-0.314170,0.008442,,2,,0,...,-0.329478,-0.457721,-1.039725,-0.265901,-0.599644,-0.861674,-0.011430,-0.390168,-0.994668,-0.218686
24550,1475S156639,197955,0,0.940541,-0.440430,0.166545,,1,,0,...,-0.329478,-0.455286,-1.038803,-0.263408,-0.597542,-0.860624,-0.009481,-0.387461,-0.994668,-0.215574


In [98]:
bucharest_data = bucharest_data.fillna(value=-999)

In [99]:
bucharest_data['medical_diff'].unique(), bucharest_data['gender_diff'].unique(), bucharest_data['isco08_label_diff'].unique()

(array([1, 0], dtype=int64),
 array([-999,    0,    1], dtype=int64),
 array([1, 0], dtype=int64))

In [100]:
r_regression(bucharest_data[bucharest_data.columns[3:]], bucharest_data['contact'])

array([ 0.09996407, -0.16304809,  0.01308077,  0.32643396, -0.62382406,
       -0.50194852,  0.26334057,  0.24521254,  0.25243247,  0.14806399,
        0.18518453,  0.25481499,  0.55040941,  0.31713901,  0.30487823,
        0.27037807, -0.02838417,  0.24793928,  0.584051  ,  0.30140632,
        0.32361342,  0.41776502,  0.36051635, -0.44168793,  0.40831264,
        0.47662166, -0.08279867])

In [101]:
f_regression(bucharest_data[bucharest_data.columns[3:]], bucharest_data['contact'])

(array([2.47799825e+02, 6.70478346e+02, 4.20138216e+00, 2.92803562e+03,
        1.56403244e+04, 8.26876371e+03, 1.82936285e+03, 1.57061116e+03,
        1.67084886e+03, 5.50271911e+02, 8.71797543e+02, 1.70473804e+03,
        1.06698810e+04, 2.74528172e+03, 2.51578459e+03, 1.93625951e+03,
        1.97949339e+01, 1.60803615e+03, 1.27099487e+04, 2.45311912e+03,
        2.87176160e+03, 5.19054648e+03, 3.66748396e+03, 5.95023727e+03,
        4.91185770e+03, 7.21629134e+03, 1.69467287e+02]),
 array([1.46566407e-055, 7.14615561e-146, 4.04016463e-002, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 2.31799218e-120, 2.59500794e-188, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        8.65896413e-006, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 1.29848664e-

In [102]:
print(sm.OLS(bucharest_data['contact'], bucharest_data[bucharest_data.columns[3:]]).fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                contact   R-squared (uncentered):                   0.888
Model:                            OLS   Adj. R-squared (uncentered):              0.888
Method:                 Least Squares   F-statistic:                              7188.
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        13:41:58   Log-Likelihood:                         -528.95
No. Observations:               24552   AIC:                                      1112.
Df Residuals:                   24525   BIC:                                      1331.
Df Model:                          27                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

# China

In [103]:
china_nodelist = pd.read_csv('../Data/Preprocessed/china_nodelist.csv')
china_edgelist = pd.read_csv('../Data/Preprocessed/china_edgelist.csv')
china_network = pd.read_csv('../Data/Tables/basic/china_basic_analysis.csv')

In [104]:
china_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,residency,place_event,possible_source,symptom,symptom_severity,place_admission
0,0,anqing-1,49,Male,,Wuhan Travel,,,,Anhui_Anqing
1,1,anqing-10,54,Male,,Hainan Work,,Somatosensory Related-Sign Description,Stable,Anhui_Anqing
2,2,anqing-11,50,Female,Hubei_Wuhan,Wuhan Travel,,Respiratory System Related-Symptom Description,Stable,Anhui_Anqing
3,3,anqing-12,46,Male,,Yellowstone Work,,Somatosensory Related-Sign Description,Stable,Anhui_Anqing
4,4,anqing-13,58,Male,Hubei_Wuhan,Work In Wuhan,,,Stable,Anhui_Anqing
...,...,...,...,...,...,...,...,...,...,...
25872,25872,ningbo-255,,male,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25873,25873,ningbo-256,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25874,25874,ningbo-257,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25875,25875,ningbo-258,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo


In [105]:
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: str(x).lower())
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: x.strip())
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: 'male' if x in ('male', 'man') else x)
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: np.nan if x not in ('male', 'female') else x)
china_nodelist['gender_ints'] = LabelEncoder().fit_transform(china_nodelist['gender'])

china_nodelist['residency'] = china_nodelist['residency'].apply(lambda x: str(x).lower())
china_nodelist['residency'] = china_nodelist['residency'].apply(lambda x: x.strip())
china_nodelist['residency_ints'] = LabelEncoder().fit_transform(china_nodelist['residency'])

china_nodelist['place_event'] = china_nodelist['place_event'].apply(lambda x: str(x).lower())
china_nodelist['place_event'] = china_nodelist['place_event'].apply(lambda x: x.strip())
china_nodelist['place_event_ints'] = LabelEncoder().fit_transform(china_nodelist['place_event'])

china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: str(x).lower())
china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: x.strip())
most_common_symptoms = Counter(china_nodelist['symptom']).most_common()
symptom_keys = set([x[0] for x in most_common_symptoms])
china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: x if x in symptom_keys else np.nan)
china_nodelist['symptom_ints'] = LabelEncoder().fit_transform(china_nodelist['symptom'])

china_nodelist['place_admission'] = china_nodelist['place_admission'].apply(lambda x: str(x).lower())
china_nodelist['place_admission'] = china_nodelist['place_admission'].apply(lambda x: x.strip())
china_nodelist['place_admission_ints'] = LabelEncoder().fit_transform(china_nodelist['place_admission'])

china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: str(x).lower())
china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: x.strip())
china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: x if x in ('stable', 'mild', 'light', 'severe') else np.nan)
china_nodelist['symptom_severity_ints'] = LabelEncoder().fit_transform(china_nodelist['symptom_severity'])

In [106]:
china_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,residency,place_event,possible_source,symptom,symptom_severity,place_admission,gender_ints,residency_ints,place_event_ints,symptom_ints,place_admission_ints,symptom_severity_ints
0,0,anqing-1,49,male,,wuhan travel,,,,anhui_anqing,1,480,1538,55,0,4
1,1,anqing-10,54,male,,hainan work,,somatosensory related-sign description,stable,anhui_anqing,1,480,449,186,0,3
2,2,anqing-11,50,female,hubei_wuhan,wuhan travel,,respiratory system related-symptom description,stable,anhui_anqing,0,318,1538,103,0,3
3,3,anqing-12,46,male,,yellowstone work,,somatosensory related-sign description,stable,anhui_anqing,1,480,1624,186,0,3
4,4,anqing-13,58,male,hubei_wuhan,work in wuhan,,,stable,anhui_anqing,1,318,1481,55,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25872,25872,ningbo-255,,male,zhejiang-ningbo,,,,,zhejiang-ningbo,1,749,748,55,461,4
25873,25873,ningbo-256,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4
25874,25874,ningbo-257,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4
25875,25875,ningbo-258,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4


In [107]:
china_nodelist['age'] = pd.to_numeric(china_nodelist['age'], errors='coerce')
china_nodelist['age'].unique()

array([ 49. ,  54. ,  50. ,  46. ,  58. ,  51. ,  41. ,  37. ,  42. ,
        52. ,  25. ,  53. ,  45. ,  39. ,  65. ,  31. ,  27. ,  57. ,
        21. ,  47. ,  36. ,  64. ,  28. ,  40. ,  34. ,  29. ,  62. ,
        56. ,  75. ,  33. ,  43. ,  30. ,  55. ,  35. ,  38. ,  20. ,
        71. ,  24. ,  17. ,  86. ,  44. ,  63. ,  70. ,  48. ,  60. ,
        67. ,  59. ,  74. ,  68. ,  83. ,  80. ,  19. ,  78. ,  66. ,
        82. ,  69. ,   2. ,  22. ,  13. ,  12. ,  85. ,  18. ,  89. ,
        14. ,  72. ,  10. ,  32. ,   5. ,  16. ,   8. ,  26. ,   7. ,
        61. ,  73. ,   6. ,  76. ,   nan,  15. ,   1. ,  23. ,  91. ,
         3. ,   4. ,   9. ,  87. ,  79. ,  77. ,  11. ,  84. ,  81. ,
        94. ,  90. ,   0. ,  88. , 644. ,  95. ,  97. ,  96. ,  93. ,
        92. ,  99. ,   2.5])

In [108]:
china_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_residency', 'referral_residency', 'residency_diff',
                                       'referee_place_event', 'referral_place_event', 'place_event_diff', 'referee_symptom', 'referral_symptom', 'symptom_diff', 'referee_symptom_severity', 'referral_symptom_severity', 'symptom_severity_diff', 'referee_place_admission', 'referral_place_admission', 'place_admission_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                       'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff',
                                   'referee_avg_path_len', 'referral_avg_path_len', 'avg_path_len_diff'])

In [109]:
added_combinations = set()
for index, row in china_edgelist[~china_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = china_nodelist.query('node_id == @referee_id')
    referral = china_nodelist.query('node_id == @referral_id')
    referee_network_stats = china_network.query('Referee == @referee_id')
    referral_network_stats = china_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender_ints'].iloc[0]
    referral_gender = referral['gender_ints'].iloc[0]
    referee_residency = referee['residency_ints'].iloc[0]
    referral_residency = referral['residency_ints'].iloc[0]
    referee_place_event = referee['place_event_ints'].iloc[0]
    referral_place_event = referral['place_event_ints'].iloc[0]
    referee_symptom = referee['symptom_ints'].iloc[0]
    referral_symptom = referral['symptom_ints'].iloc[0]
    referee_symptom_severity = referee['symptom_severity_ints'].iloc[0]
    referral_symptom_severity = referral['symptom_severity_ints'].iloc[0]
    referee_place_admission = referee['place_admission_ints'].iloc[0]
    referral_place_admission = referral['place_admission_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]
    referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
    referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_residency': referee_residency, 'referral_residency': referral_residency, 'residency_diff': abs(referee_residency - referral_residency),
                         'referee_place_event': referee_place_event, 'referral_place_event': referral_place_event, 'place_event_diff': abs(referee_place_event - referral_place_event),
                         'referee_symptom': referee_symptom, 'referral_symptom': referral_symptom, 'symptom_diff': abs(referee_symptom - referral_symptom),
                         'referee_symptom_severity': referee_symptom_severity, 'referral_symptom_severity': referral_symptom_severity, 'symptom_severity_diff': abs(referee_symptom_severity - referral_symptom_severity),
                         'referee_place_admission': referee_place_admission, 'referral_place_admission': referral_place_admission, 'place_admission_diff': abs(referee_place_admission - referral_place_admission),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                         'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
    china_data = pd.concat([china_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [110]:
china_data.shape

(11436, 39)

In [111]:
sample = np.random.choice(china_nodelist['node_id'], size=150, replace=False)
sample

array(['hengshui-2', 'bozhou-78', 'liangjiangnewdistrict-12', 'an-1795',
       'hebeibaoding-30', 'shangqiu-5', 'shenzhen-370', 'xinxiang-38',
       'nanjing-10', 'tianjin-107', 'taizhou-135', 'an-1249',
       'fengjie-13', 'shenzhen-179', 'panjin-17', 'shanghai-991',
       'shenzhen-11', 'guangzhou-1389', 'guangdonghuizhou-36',
       'guangzhou-322', 'garz-15', 'changchun-52', 'chengdu-344',
       'shenzhen-247', 'shanghai-194', 'shijiazhuang-107', 'jiaxing-6',
       'shanghai-1584', 'yangzhou-33', 'putian-198', 'an-726',
       'hulunbeier-510', 'shanghai-1523', 'jiaozuo-3', 'kunming-353',
       'changjiang-7', 'guangzhou-1457', 'taizhou-31', 'tianjin-319',
       'jiangsuwuxi-15', 'an-2078', 'guangzhou-1563', 'tonghua-199',
       'changchun-49', 'shanghai-1552', 'fuyang-122', 'zhongshan-108',
       'tonghua-155', 'wenzhou-387', 'nanyang-95', 'chengdu-162',
       'wanzhou-13', 'chengdu-656', 'zhenjiang-13', 'wenzhou-241',
       'shanghai-1213', 'zhengzhou-64', 'harbin-2',

In [112]:
for ref_a in tqdm(sample):
    for ref_b in sample:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = china_nodelist.query('node_id == @ref_a')
        referral = china_nodelist.query('node_id == @ref_b')
        referee_network_stats = china_network.query('Referee == @ref_a')
        referral_network_stats = china_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender_ints'].iloc[0]
        referral_gender = referral['gender_ints'].iloc[0]
        referee_residency = referee['residency_ints'].iloc[0]
        referral_residency = referral['residency_ints'].iloc[0]
        referee_place_event = referee['place_event_ints'].iloc[0]
        referral_place_event = referral['place_event_ints'].iloc[0]
        referee_symptom = referee['symptom_ints'].iloc[0]
        referral_symptom = referral['symptom_ints'].iloc[0]
        referee_symptom_severity = referee['symptom_severity_ints'].iloc[0]
        referral_symptom_severity = referral['symptom_severity_ints'].iloc[0]
        referee_place_admission = referee['place_admission_ints'].iloc[0]
        referral_place_admission = referral['place_admission_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]
        referee_avg_path_len = referee_network_stats['Avg_Shortest_Path_Length'].iloc[0]
        referral_avg_path_len = referral_network_stats['Avg_Shortest_Path_Length'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_residency': referee_residency, 'referral_residency': referral_residency, 'residency_diff': abs(referee_residency - referral_residency),
                             'referee_place_event': referee_place_event, 'referral_place_event': referral_place_event, 'place_event_diff': abs(referee_place_event - referral_place_event),
                             'referee_symptom': referee_symptom, 'referral_symptom': referral_symptom, 'symptom_diff': abs(referee_symptom - referral_symptom),
                             'referee_symptom_severity': referee_symptom_severity, 'referral_symptom_severity': referral_symptom_severity, 'symptom_severity_diff': abs(referee_symptom_severity - referral_symptom_severity),
                             'referee_place_admission': referee_place_admission, 'referral_place_admission': referral_place_admission, 'place_admission_diff': abs(referee_place_admission - referral_place_admission),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size),
                             'referee_avg_path_len': referee_avg_path_len, 'referral_avg_path_len': referral_avg_path_len, 'avg_path_len_diff': abs(referee_avg_path_len - referral_avg_path_len)})
        china_data = pd.concat([china_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

  0%|          | 0/150 [00:00<?, ?it/s]

In [113]:
china_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_residency,...,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff,referee_avg_path_len,referral_avg_path_len,avg_path_len_diff
0,shijiazhuang-155,shijiazhuang-176,1,29.0,61.0,32.0,0,1,1,244,...,0.0,0.000116,0.000116,0.0,4,4,0,1.0,1.0,0.0
1,shijiazhuang-155,shijiazhuang-156,1,29.0,35.0,6.0,0,0,0,244,...,0.0,0.000116,0.000116,0.0,4,4,0,1.0,1.0,0.0
2,shijiazhuang-150,shijiazhuang-38,1,3.0,55.0,52.0,0,0,0,244,...,0.0,0.000166,0.000091,0.000075,3,3,0,0.666667,1.0,0.333333
3,shijiazhuang-150,shijiazhuang-151,1,3.0,33.0,30.0,0,0,0,244,...,0.0,0.000166,0.000091,0.000075,3,3,0,0.666667,1.0,0.333333
4,shijiazhuang-145,shijiazhuang-146,1,15.0,52.0,37.0,0,0,0,244,...,0.0,0.0002,0.000061,0.000139,7,7,0,1.142857,1.857143,0.714286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22606,tonghua-163,an-450,0,54.0,35.0,19.0,1,1,0,416,...,0.0,0.000083,0.000019,0.000064,142,1,141,2.014085,0.0,2.014085
22607,tonghua-163,yingtan-8,0,54.0,29.0,25.0,1,1,0,416,...,0.0,0.000083,0.000019,0.000064,142,1,141,2.014085,0.0,2.014085
22608,guangzhou-1718,an-450,0,24.0,35.0,11.0,1,1,0,480,...,0.0,0.000019,0.000019,0.0,1,1,0,0.0,0.0,0.0
22609,guangzhou-1718,yingtan-8,0,24.0,29.0,5.0,1,1,0,480,...,0.0,0.000019,0.000019,0.0,1,1,0,0.0,0.0,0.0


In [114]:
china_data = china_data.fillna(value=-999)

china_data['gender_diff'] = china_data['gender_diff'].where(china_data['gender_diff'] <= 0, 1)
china_data['residency_diff'] = china_data['residency_diff'].where(china_data['residency_diff'] <= 0, 1)
china_data['place_event_diff'] = china_data['place_event_diff'].where(china_data['place_event_diff'] <= 0, 1)
china_data['symptom_diff'] = china_data['symptom_diff'].where(china_data['symptom_diff'] <= 0, 1)
china_data['symptom_severity_diff'] = china_data['symptom_severity_diff'].where(china_data['symptom_severity_diff'] <= 0, 1)
china_data['place_admission_diff'] = china_data['place_admission_diff'].where(china_data['place_admission_diff'] <= 0, 1)

In [115]:
china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                'degree_centrality_diff', 'referee_betweenness_centrality',
                'referral_betweenness_centrality', 'betweenness_centrality_diff',
                'referee_pagerank_centrality', 'referral_pagerank_centrality',
                'pagerank_centrality_diff', 'referee_component_size',
                'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
            'referral_avg_path_len', 'avg_path_len_diff']] = normalize(china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                               'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                               'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                               'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                               'pagerank_centrality_diff', 'referee_component_size',
                                                                                               'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                                   'referral_avg_path_len', 'avg_path_len_diff']])
china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                'degree_centrality_diff', 'referee_betweenness_centrality',
                'referral_betweenness_centrality', 'betweenness_centrality_diff',
                'referee_pagerank_centrality', 'referral_pagerank_centrality',
                'pagerank_centrality_diff', 'referee_component_size',
                'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
            'referral_avg_path_len', 'avg_path_len_diff']] = scale(china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                           'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                           'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                           'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                           'pagerank_centrality_diff', 'referee_component_size',
                                                                                           'referral_component_size', 'component_size_diff', 'referee_avg_path_len',
                                                                               'referral_avg_path_len', 'avg_path_len_diff']])

In [116]:
r_regression(china_data[china_data.columns[3:]], china_data['contact'])

array([ 0.27338409,  0.33910138,  0.34160139, -0.29405836, -0.27846425,
       -0.06479354, -0.011047  , -0.02769734, -0.65401936, -0.05050589,
       -0.01114833, -0.13039696,  0.06887405,  0.19934472, -0.00374581,
       -0.02996944, -0.01795474, -0.33296848, -0.22844268, -0.2082084 ,
       -0.93102273,  0.5375845 ,  0.52273315,  0.0882801 ,  0.0551724 ,
        0.13400891,  0.13509557,  0.54293197,  0.39075424,  0.11148472,
        0.41826937,  0.39754313, -0.24640582,  0.5098621 ,  0.57255551,
       -0.22590777])

In [117]:
f_regression(china_data[china_data.columns[3:]], china_data['contact'])

(array([1.82626385e+03, 2.93759674e+03, 2.98681424e+03, 2.14005904e+03,
        1.90052597e+03, 9.53173320e+01, 2.75945455e+00, 1.73576377e+01,
        1.68993571e+04, 5.78195365e+01, 2.81031354e+00, 3.91078765e+02,
        1.07760025e+02, 9.35623700e+02, 3.17233267e-01, 2.03249159e+01,
        7.29087687e+00, 2.81917072e+03, 1.24483771e+03, 1.02453101e+03,
        1.47132463e+05, 9.18974502e+03, 8.50073297e+03, 1.77584439e+02,
        6.90317980e+01, 4.13446031e+02, 4.20303541e+02, 9.45027756e+03,
        4.07423284e+03, 2.84540230e+02, 4.79416341e+03, 4.24383693e+03,
        1.46145778e+03, 7.94202512e+03, 1.10263262e+04, 1.21588695e+03]),
 array([0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 1.79652363e-022, 9.66950606e-002, 3.10801053e-005,
        0.00000000e+000, 2.98480642e-014, 9.36738940e-002, 2.58733874e-086,
        3.45616478e-025, 2.22823645e-201, 5.73280699e-001, 6.56720796e-006,
        6.93572190e-003, 0.00000000e+000, 1.70

In [118]:
model = sm.OLS(china_data['contact'], china_data[china_data.columns[3:]])

In [119]:
print(model.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                contact   R-squared (uncentered):                   0.927
Model:                            OLS   Adj. R-squared (uncentered):              0.927
Method:                 Least Squares   F-statistic:                              8010.
Date:                Fri, 30 Jun 2023   Prob (F-statistic):                        0.00
Time:                        13:51:27   Log-Likelihood:                          5274.1
No. Observations:               22611   AIC:                                 -1.048e+04
Df Residuals:                   22575   BIC:                                 -1.019e+04
Df Model:                          36                                                  
Covariance Type:            nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

In [137]:
print(sm.OLS(yunnan_data['contact'], yunnan_data[yunnan_data.columns[3:]]).fit().summary().tables[1].as_csv().replace(' ','').replace('_', ' ').replace('referee', 'nominator').replace('referral', 'nominee'))

,coef,stderr,t,P>|t|,[0.025,0.975]
nominator age,-0.0015,0.001,-1.096,0.273,-0.004,0.001
nominee age,-0.0005,0.002,-0.292,0.770,-0.004,0.003
age diff,-0.0013,0.001,-1.035,0.301,-0.004,0.001
nominator gender,0.0036,0.001,4.304,0.000,0.002,0.005
nominee gender,0.0013,0.001,1.467,0.142,-0.000,0.003
gender diff,0.0033,0.001,3.952,0.000,0.002,0.005
nominator relatives,0.0036,0.001,4.134,0.000,0.002,0.005
nominee relatives,0.0011,0.001,1.226,0.220,-0.001,0.003
relatives diff,-0.0012,0.001,-1.457,0.145,-0.003,0.000
nominator degree centrality,0.0874,0.002,37.002,0.000,0.083,0.092
nominee degree centrality,0.0901,0.006,14.157,0.000,0.078,0.103
degree centrality diff,-0.0880,0.005,-16.376,0.000,-0.099,-0.077
nominator betweenness centrality,-0.0103,0.001,-8.399,0.000,-0.013,-0.008
nominee betweenness centrality,-0.0114,0.002,-4.669,0.000,-0.016,-0.007
betweenness centrality diff,0.0182,0.003,6.913,0.000,0.013,0.023
nominator pagerank centrality,-0.0182,0.001,-12.612,0.000,-0.021,-0.015
nominee 

In [138]:
print(sm.OLS(hainan_data['contact'], hainan_data[hainan_data.columns[3:]]).fit().summary().tables[1].as_csv().replace(' ','').replace('_', ' ').replace('referee', 'nominator').replace('referral', 'nominee'))

,coef,stderr,t,P>|t|,[0.025,0.975]
nominator age,0.0019,0.002,0.789,0.430,-0.003,0.007
nominee age,0.0005,0.002,0.203,0.839,-0.004,0.005
age diff,-0.0001,0.002,-0.054,0.957,-0.004,0.004
nominator gender,-0.0008,0.002,-0.497,0.619,-0.004,0.002
nominee gender,0.0021,0.002,1.367,0.172,-0.001,0.005
gender diff,0.0006,0.002,0.395,0.693,-0.002,0.004
nominator relatives,0.0025,0.002,1.057,0.290,-0.002,0.007
nominee relatives,0.0132,0.002,5.758,0.000,0.009,0.018
relatives diff,0.0026,0.002,1.291,0.197,-0.001,0.006
nominator degree centrality,0.0385,0.004,10.872,0.000,0.032,0.045
nominee degree centrality,0.0038,0.005,0.781,0.435,-0.006,0.013
degree centrality diff,-0.0125,0.003,-4.720,0.000,-0.018,-0.007
nominator betweenness centrality,0.0431,0.004,10.509,0.000,0.035,0.051
nominee betweenness centrality,0.0733,0.008,9.766,0.000,0.059,0.088
betweenness centrality diff,-0.0820,0.008,-9.696,0.000,-0.099,-0.065
nominator pagerank centrality,-0.0114,0.002,-4.890,0.000,-0.016,-0.007
nominee pageran

In [139]:
print(sm.OLS(shanxi_data['contact'], shanxi_data[shanxi_data.columns[3:]]).fit().summary().tables[1].as_csv().replace(' ','').replace('_', ' ').replace('referee', 'nominator').replace('referral', 'nominee'))

,coef,stderr,t,P>|t|,[0.025,0.975]
nominator age,-0.0033,0.001,-2.403,0.016,-0.006,-0.001
nominee age,-0.0032,0.001,-2.659,0.008,-0.005,-0.001
age diff,-0.0028,0.001,-2.689,0.007,-0.005,-0.001
nominator gender,0.0025,0.001,3.321,0.001,0.001,0.004
nominee gender,0.0028,0.001,3.965,0.000,0.001,0.004
gender diff,0.0040,0.001,5.633,0.000,0.003,0.005
nominator relatives,0.0107,0.001,10.532,0.000,0.009,0.013
nominee relatives,0.0008,0.001,0.857,0.391,-0.001,0.003
relatives diff,0.0017,0.001,2.001,0.045,3.52e-05,0.003
nominator hukou,0.0007,8.51e-05,7.936,0.000,0.001,0.001
nominee hukou,0.0006,8.6e-05,7.008,0.000,0.000,0.001
hukou diff,-0.0145,0.001,-15.937,0.000,-0.016,-0.013
nominator degree centrality,0.0171,0.002,9.103,0.000,0.013,0.021
nominee degree centrality,0.0101,0.001,8.757,0.000,0.008,0.012
degree centrality diff,-0.0110,0.001,-8.503,0.000,-0.013,-0.008
nominator betweenness centrality,-0.0163,0.004,-4.632,0.000,-0.023,-0.009
nominee betweenness centrality,0.0034,0.002,2.139,0.032

In [140]:
print(sm.OLS(bucharest_data['contact'], bucharest_data[bucharest_data.columns[3:]]).fit().summary().tables[1].as_csv().replace(' ','').replace('_', ' ').replace('referee', 'nominator').replace('referral', 'nominee'))

,coef,stderr,t,P>|t|,[0.025,0.975]
nominator age,0.0009,0.005,0.194,0.846,-0.008,0.010
nominee age,-0.0007,0.006,-0.113,0.910,-0.012,0.011
age diff,-0.0290,0.005,-5.925,0.000,-0.039,-0.019
nominator gender,0.0008,1.31e-05,62.061,0.000,0.001,0.001
nominee gender,0.0002,1.51e-05,11.915,0.000,0.000,0.000
gender diff,-0.0007,1.5e-05,-43.386,0.000,-0.001,-0.001
nominator medical,-0.0827,0.004,-21.338,0.000,-0.090,-0.075
nominee medical,-0.1616,0.005,-33.611,0.000,-0.171,-0.152
medical diff,0.0588,0.006,9.062,0.000,0.046,0.071
nominator isco08 label,-0.0084,0.001,-8.247,0.000,-0.010,-0.006
nominee isco08 label,0.0086,0.002,4.723,0.000,0.005,0.012
isco08 label diff,0.1852,0.007,25.077,0.000,0.171,0.200
nominator degree centrality,0.3564,0.035,10.231,0.000,0.288,0.425
nominee degree centrality,0.1728,0.014,12.540,0.000,0.146,0.200
degree centrality diff,0.1853,0.025,7.372,0.000,0.136,0.235
nominator betweenness centrality,-0.0820,0.017,-4.847,0.000,-0.115,-0.049
nominee betweenness centrality,

In [141]:
print(sm.OLS(china_data['contact'], china_data[china_data.columns[3:]]).fit().summary().tables[1].as_csv().replace(' ','').replace('_', ' ').replace('referee', 'nominator').replace('referral', 'nominee'))

,coef,stderr,t,P>|t|,[0.025,0.975]
nominator age,-0.0046,0.002,-2.145,0.032,-0.009,-0.000
nominee age,0.0130,0.002,5.902,0.000,0.009,0.017
age diff,0.0087,0.002,3.626,0.000,0.004,0.013
nominator gender,0.0144,0.002,6.070,0.000,0.010,0.019
nominee gender,0.0161,0.002,6.685,0.000,0.011,0.021
gender diff,0.0409,0.003,15.290,0.000,0.036,0.046
nominator residency,0.0001,1.09e-05,11.830,0.000,0.000,0.000
nominee residency,0.0002,1.11e-05,13.775,0.000,0.000,0.000
residency diff,0.0136,0.004,3.742,0.000,0.006,0.021
nominator place event,3.962e-05,4.09e-06,9.699,0.000,3.16e-05,4.76e-05
nominee place event,6.302e-05,4.17e-06,15.097,0.000,5.48e-05,7.12e-05
place event diff,-0.0148,0.003,-5.357,0.000,-0.020,-0.009
nominator symptom,0.0005,2.83e-05,17.301,0.000,0.000,0.001
nominee symptom,0.0006,2.99e-05,20.591,0.000,0.001,0.001
symptom diff,-0.0405,0.004,-11.342,0.000,-0.047,-0.033
nominator symptom severity,0.0689,0.002,45.680,0.000,0.066,0.072
nominee symptom severity,0.0788,0.002,52.136,0.000,0