In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, scale
from sklearn.feature_selection import r_regression, f_regression
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from collections import Counter

# Yunnan

In [13]:
yunnan_nodelist = pd.read_csv('../Data/Preprocessed/yunnan_nodelist.csv')
yunnan_edgelist = pd.read_csv('../Data/Preprocessed/yunnan_edgelist.csv')
yunnan_network = pd.read_csv('../Data/Tables/basic/yunnan_basic_analysis.csv')

In [14]:
yunnan_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,relatives
0,0,1.0,,1.0,1.0
1,1,2.0,53.0,1.0,
2,2,3.0,39.0,0.0,0.0
3,3,4.0,34.0,0.0,
4,4,5.0,49.0,1.0,
...,...,...,...,...,...
166,166,167.0,33.0,1.0,0.0
167,167,168.0,62.0,0.0,0.0
168,168,169.0,41.0,1.0,1.0
169,169,170.0,52.0,1.0,1.0


In [15]:
yunnan_network

Unnamed: 0.1,Unnamed: 0,Referee,Degree_Centrality,Betweenness_Centrality,Pagerank_Centrality,Component_Size
0,0,1,0.005882,0.0,0.013495,2
1,1,10,0.005882,0.0,0.013495,2
2,2,16,0.005882,0.0,0.013495,2
3,3,15,0.005882,0.0,0.013495,2
4,4,90,0.005882,0.0,0.013495,2
...,...,...,...,...,...,...
166,166,143,0.000000,0.0,0.002024,1
167,167,142,0.000000,0.0,0.002024,1
168,168,153,0.000000,0.0,0.002024,1
169,169,162,0.000000,0.0,0.002024,1


In [16]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([ 1.,  0., nan]), array([ 1., nan,  0.]))

In [17]:
yunnan_nodelist.columns, yunnan_network.columns

(Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives'], dtype='object'),
 Index(['Unnamed: 0', 'Referee', 'Degree_Centrality', 'Betweenness_Centrality',
        'Pagerank_Centrality', 'Component_Size'],
       dtype='object'))

In [18]:
yunnan_nodelist['age'] = yunnan_nodelist['age'].fillna(value=int(yunnan_nodelist['age'].mean()))
yunnan_nodelist['gender'] = yunnan_nodelist['gender'].mask(yunnan_nodelist['gender'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))
yunnan_nodelist['relatives'] = yunnan_nodelist['relatives'].mask(yunnan_nodelist['relatives'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))

In [19]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([1., 0.]), array([1., 0.]))

In [20]:
yunnan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff'])

In [26]:
added_combinations = set()
for index, row in yunnan_edgelist[~yunnan_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = yunnan_nodelist.query('node_id == @referee_id')
    referral = yunnan_nodelist.query('node_id == @referral_id')
    referee_network_stats = yunnan_network.query('Referee == @referee_id')
    referral_network_stats = yunnan_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
    yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [27]:
for ref_a in yunnan_nodelist['node_id']:
    for ref_b in yunnan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = yunnan_nodelist.query('node_id == @ref_a')
        referral = yunnan_nodelist.query('node_id == @ref_b')
        referee_network_stats = yunnan_network.query('Referee == @ref_a')
        referral_network_stats = yunnan_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
        yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [28]:
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,1.0,10.0,1.0,41.0,63.0,22.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.0,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0
1,16.0,15.0,1.0,68.0,71.0,3.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0
2,90.0,75.0,1.0,34.0,76.0,42.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.013495,0.013495,0.000000,2.0,2.0,0.0
3,86.0,85.0,1.0,79.0,67.0,12.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.013495,0.013495,0.000000,3.0,3.0,0.0
4,86.0,81.0,1.0,79.0,46.0,33.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.013495,0.013495,0.000000,3.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,62.0,41.0,21.0,0.0,1.0,1.0,0.0,...,0.005882,0.0,0.0,0.0,0.010376,0.002024,0.008352,3.0,1.0,2.0
14531,168.0,170.0,0.0,62.0,52.0,10.0,0.0,1.0,1.0,0.0,...,0.011765,0.0,0.0,0.0,0.010376,0.013495,0.003119,3.0,4.0,1.0
14532,168.0,171.0,0.0,62.0,24.0,38.0,0.0,0.0,0.0,0.0,...,0.011765,0.0,0.0,0.0,0.010376,0.013495,0.003119,3.0,4.0,1.0
14533,169.0,170.0,0.0,41.0,52.0,11.0,1.0,1.0,0.0,1.0,...,0.017647,0.0,0.0,0.0,0.002024,0.013495,0.011471,1.0,4.0,3.0


In [29]:
(yunnan_nodelist.shape[0] * (yunnan_nodelist.shape[0] - 1)) / 2

14535.0

In [31]:
yunnan_data.columns

Index(['referee', 'referral', 'contact', 'referee_age', 'referral_age',
       'age_diff', 'referee_gender', 'referral_gender', 'gender_diff',
       'referee_relatives', 'referral_relatives', 'relatives_diff',
       'referee_degree_centrality', 'referral_degree_centrality',
       'degree_centrality_diff', 'referee_betweenness_centrality',
       'referral_betweenness_centrality', 'betweenness_centrality_diff',
       'referee_pagerank_centrality', 'referral_pagerank_centrality',
       'pagerank_centrality_diff', 'referee_component_size',
       'referral_component_size', 'component_size_diff'],
      dtype='object')

In [32]:
yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = normalize(yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff']])

yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = scale(yunnan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff']])
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,1.0,10.0,1.0,-0.873643,0.969757,0.145773,1.0,1.0,0.0,1.0,...,-0.555292,-0.120199,-0.277396,-0.300419,1.390432,0.543375,-0.852693,0.057503,-0.415690,-0.567064
1,16.0,15.0,1.0,0.189764,0.528873,-1.188070,0.0,1.0,1.0,1.0,...,-0.555292,-0.120199,-0.277396,-0.300419,0.921838,0.193908,-0.852693,-0.140043,-0.487675,-0.567064
2,90.0,75.0,1.0,-1.878759,1.027335,1.049123,0.0,1.0,1.0,1.0,...,-0.555292,-0.120199,-0.277396,-0.300419,1.022121,0.268696,-0.852693,-0.097767,-0.472270,-0.567064
3,86.0,85.0,1.0,0.606984,0.102548,-0.737680,0.0,0.0,0.0,0.0,...,-0.555292,-0.120199,-0.277396,-0.300419,0.816982,0.115708,-0.852693,0.180189,-0.370983,-0.567064
4,86.0,81.0,1.0,0.955931,-0.803455,0.457608,0.0,0.0,0.0,0.0,...,-0.555292,-0.120199,-0.277396,-0.300419,0.942839,0.209569,-0.852693,0.259775,-0.341982,-0.567064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,0.890647,-0.496269,0.096975,0.0,1.0,1.0,0.0,...,-0.341824,-0.120199,-0.277396,-0.300419,0.882048,-0.912674,0.233762,0.562403,-0.590158,-0.191814
14531,168.0,170.0,0.0,0.619421,0.073290,-0.698097,0.0,1.0,1.0,0.0,...,-0.151162,-0.120199,-0.277396,-0.300419,0.786213,0.473518,-0.468560,0.483581,-0.090774,-0.389462
14532,168.0,171.0,0.0,0.929290,-1.661225,1.288750,0.0,0.0,0.0,0.0,...,-0.125105,-0.120199,-0.277396,-0.300419,0.895702,0.579722,-0.443793,0.573634,-0.047021,-0.378011
14533,169.0,170.0,0.0,-0.329812,0.801335,-0.479592,1.0,1.0,0.0,1.0,...,0.180341,-0.120199,-0.277396,-0.300419,-0.509908,0.825238,0.861469,-0.348144,0.054125,0.079509


In [33]:
r_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

array([-0.01820585,  0.01615919, -0.02116207, -0.00440439,  0.0236299 ,
        0.00617396, -0.03715761, -0.03858047, -0.07069708,  0.4667575 ,
        0.1548591 , -0.01553951,  0.06451917,  0.17200157,  0.11800226,
        0.16831431,  0.11253321, -0.04322172,  0.47163262,  0.13479507,
       -0.04813939])

In [34]:
f_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

(array([4.81860476e+00, 3.79584120e+00, 6.51127598e+00, 2.81926377e-01,
        8.11935330e+00, 5.53986908e-01, 2.00932799e+01, 2.16639262e+01,
        7.30019291e+01, 4.04813345e+03, 3.57084195e+02, 3.51022250e+00,
        6.07497361e+01, 4.43059818e+02, 2.05222880e+02, 4.23719463e+02,
        1.86402451e+02, 2.72001611e+01, 4.15745492e+03, 2.68947107e+02,
        3.37570204e+01]),
 array([2.81698836e-02, 5.13992718e-02, 1.07294013e-02, 5.95449486e-01,
        4.38567396e-03, 4.56705240e-01, 7.43193460e-06, 3.27713514e-06,
        1.42268957e-17, 0.00000000e+00, 1.06424184e-78, 6.10113160e-02,
        6.91907561e-15, 6.49276118e-97, 3.12539937e-46, 7.94124714e-93,
        3.53003730e-42, 1.85961374e-07, 0.00000000e+00, 6.63999346e-60,
        6.37505155e-09]))

# Hainan

In [35]:
hainan_edgelist = pd.read_csv('../Data/Preprocessed/hainan_edgelist.csv')
hainan_nodelist = pd.read_csv('../Data/Preprocessed/hainan_nodelist.csv')
hainan_network = pd.read_csv('../Data/Tables/basic/hainan_basic_analysis.csv')

In [36]:
hainan_nodelist['gender'].unique(), hainan_nodelist['relatives'].unique()

(array([1., 0.]), array([0., 1.]))

Don't need to fill nans

In [37]:
hainan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff'])

In [38]:
added_combinations = set()
for index, row in hainan_edgelist[~hainan_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = hainan_nodelist.query('node_id == @referee_id')
    referral = hainan_nodelist.query('node_id == @referral_id')
    referee_network_stats = hainan_network.query('Referee == @referee_id')
    referral_network_stats = hainan_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
    hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [39]:
for ref_a in hainan_nodelist['node_id']:
    for ref_b in hainan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = hainan_nodelist.query('node_id == @ref_a')
        referral = hainan_nodelist.query('node_id == @ref_b')
        referee_network_stats = hainan_network.query('Referee == @ref_a')
        referral_network_stats = hainan_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
        hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [40]:
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,5.0,3.0,1.0,27.0,27.0,0.0,0.0,1.0,1.0,0.0,...,0.000000,0.0,0.0,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0
1,15.0,7.0,1.0,54.0,53.0,1.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0
2,14.0,13.0,1.0,44.0,47.0,3.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.008852,0.008852,0.000000,6.0,6.0,0.0
3,36.0,35.0,1.0,17.0,8.0,9.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0
4,43.0,42.0,1.0,69.0,68.0,1.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.0,0.0,0.009838,0.009838,0.000000,2.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,62.0,27.0,35.0,1.0,0.0,1.0,1.0,...,0.018634,0.0,0.0,0.0,0.009838,0.009838,0.000000,7.0,4.0,3.0
13037,158.0,162.0,0.0,62.0,73.0,11.0,1.0,1.0,0.0,1.0,...,0.037267,0.0,0.0,0.0,0.009838,0.001476,0.008362,7.0,1.0,6.0
13038,160.0,159.0,0.0,25.0,27.0,2.0,0.0,0.0,0.0,1.0,...,0.018634,0.0,0.0,0.0,0.009838,0.009838,0.000000,7.0,4.0,3.0
13039,160.0,162.0,0.0,25.0,73.0,48.0,0.0,1.0,1.0,1.0,...,0.037267,0.0,0.0,0.0,0.009838,0.001476,0.008362,7.0,1.0,6.0


In [41]:
(hainan_nodelist.shape[0] * (hainan_nodelist.shape[0] - 1)) / 2

13041.0

In [42]:
hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = normalize(hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff']])

hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = scale(hainan_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff']])
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,5.0,3.0,1.0,0.344977,0.360479,-1.398907,0.0,1.0,1.0,0.0,...,-0.948469,-0.165307,-0.140069,-0.201868,2.441132,2.365345,-1.005913,0.505432,0.179204,-0.977502
1,15.0,7.0,1.0,0.388648,0.331234,-1.328305,0.0,1.0,1.0,1.0,...,-0.948469,-0.165307,-0.140069,-0.201868,0.655624,0.557864,-1.005913,-0.291031,-0.446993,-0.977502
2,14.0,13.0,1.0,0.189325,0.459596,-1.152186,0.0,1.0,1.0,1.0,...,-0.948469,-0.165307,-0.140069,-0.201868,0.743432,0.646752,-1.005913,1.741051,1.150673,-0.977502
3,36.0,35.0,1.0,0.913226,-1.445623,0.889752,0.0,1.0,1.0,1.0,...,-0.948469,-0.165307,-0.140069,-0.201868,5.407056,5.367767,-1.005913,1.828445,1.219384,-0.977502
4,43.0,42.0,1.0,0.381969,0.340539,-1.343747,0.0,1.0,1.0,1.0,...,-0.948469,-0.165307,-0.140069,-0.201868,0.256183,0.153507,-1.005913,-0.469211,-0.587081,-0.977502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,0.915896,-1.602360,1.042976,1.0,0.0,1.0,1.0,...,0.429174,-0.165307,-0.140069,-0.201868,0.634037,0.536011,-1.005913,1.711827,0.178340,0.159450
13037,158.0,162.0,0.0,-0.012194,0.630654,-0.791694,1.0,1.0,0.0,1.0,...,1.231540,-0.165307,-0.140069,-0.201868,0.257247,-1.074019,0.361705,1.123564,-0.837088,0.821633
13038,160.0,159.0,0.0,0.100335,0.406775,-1.116349,0.0,0.0,0.0,1.0,...,1.841233,-0.165307,-0.140069,-0.201868,2.483755,2.408492,-1.005913,4.599697,1.475771,1.324806
13039,160.0,162.0,0.0,-2.025581,0.883355,1.410430,0.0,1.0,1.0,1.0,...,1.362917,-0.165307,-0.140069,-0.201868,0.343295,-1.060953,0.444124,1.257907,-0.821999,0.930057


In [43]:
r_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

array([-0.00614373, -0.01120263,  0.00741489,  0.0058602 ,  0.00772179,
        0.007496  ,  0.0871856 ,  0.08413633, -0.07812765,  0.16186753,
        0.07832695, -0.07204061,  0.06893415,  0.01106648,  0.03157675,
        0.09574911,  0.07687999, -0.07105161,  0.12982275,  0.0808674 ,
       -0.09499119])

In [44]:
f_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

(array([  0.4921806 ,   1.63658725,   0.71693189,   0.44780005,
          0.77751038,   0.73270189,  99.8728961 ,  92.96012754,
         80.07793457, 350.82821091,  80.4895305 ,  68.02348356,
         62.25608058,   1.59704358,  13.01404911, 120.64620456,
         77.52565188,  66.15918386, 223.52588695,  85.83030683,
        118.72646017]),
 array([4.82968721e-01, 2.00817120e-01, 3.97167692e-01, 5.03393377e-01,
        3.77919892e-01, 3.92024531e-01, 1.97302275e-23, 6.31474686e-22,
        4.08053745e-19, 2.88000389e-77, 3.31747061e-19, 1.76950761e-16,
        3.25533112e-15, 2.06345275e-01, 3.10318752e-04, 6.05502239e-28,
        1.47377945e-18, 4.53360128e-16, 4.00270813e-50, 2.26400167e-20,
        1.57974423e-27]))

# Shanxi

In [2]:
shanxi_edgelist = pd.read_csv('../Data/Preprocessed/shanxi_edgelist.csv')
shanxi_nodelist = pd.read_csv('../Data/Preprocessed/shanxi_nodelist.csv')
shanxi_network = pd.read_csv('../Data/Tables/basic/shanxi_basic_analysis.csv')

In [3]:
shanxi_nodelist['gender'].unique(), shanxi_nodelist['relatives'].unique(), shanxi_nodelist['hukou'].unique()

(array([1, 0], dtype=int64),
 array([0, 1], dtype=int64),
 array(['xianyang', 'xian', 'ankang', 'yanan', 'wuhan', 'dazhi',
        'hanzhong', 'tongchuan', 'weinan', 'baoji', 'henan', 'shangluo',
        'xiaogan', 'hancheng', 'yulin', 'lantian', 'shanghai', 'lingbao',
        'tianmen', 'nanjing', 'yichang', 'suizhou', 'pingdingshan',
        'yingcheng', 'yanglin'], dtype=object))

No missing values to fill

In [4]:
shanxi_nodelist['hukou_ints'] = pd.factorize(shanxi_nodelist['hukou'])[0]
shanxi_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,hukou,relatives,hukou_ints
0,0,1,42,1,xianyang,0,0
1,1,2,32,0,xian,0,1
2,2,3,22,1,xian,0,1
3,3,4,49,1,ankang,0,2
4,4,5,23,1,yanan,0,3
...,...,...,...,...,...,...,...
232,232,233,67,0,weinan,0,8
233,233,234,46,1,hanzhong,0,6
234,234,235,71,1,hanzhong,1,6
235,235,236,66,0,hanzhong,1,6


In [48]:
shanxi_dummies = pd.get_dummies(shanxi_nodelist)

In [49]:
shanxi_dummies.columns

Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives', 'hukou_ints',
       'hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
       'hukou_yingcheng', 'hukou_yulin'],
      dtype='object')

Don't need to fill nans

In [5]:
shanxi_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'referee_hukou', 'referral_hukou', 'hukou_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff'])

referees_hukou = pd.DataFrame(columns=['hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
                                       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
                                       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
                                       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
                                       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
                                       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
                                       'hukou_yingcheng', 'hukou_yulin'])
referrals_hukou = referees_hukou.copy()

In [7]:
added_combinations = set()
for index, row in shanxi_edgelist[~shanxi_edgelist['Referral'].isna()].iterrows():
    """
    referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referee']]
    referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                         'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})

    referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referee']][shanxi_dummies.columns[6:]]
    referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referral']][shanxi_dummies.columns[6:]]

    shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
    shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

    referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
    referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))
    """
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = shanxi_nodelist.query('node_id == @referee_id')
    referral = shanxi_nodelist.query('node_id == @referral_id')
    referee_network_stats = shanxi_network.query('Referee == @referee_id')
    referral_network_stats = shanxi_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_relatives = referee['relatives'].iloc[0]
    referral_relatives = referral['relatives'].iloc[0]
    referee_hukou = referee['hukou_ints'].iloc[0]
    referral_hukou = referral['hukou_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                         'referee_hukou': referee_hukou, 'referral_hukou': referral_hukou, 'hukou_diff': abs(referee_hukou - referral_hukou),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
    shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [8]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.007450,0.007450,0.000000,8.0,8.0,0.0
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,...,0.012712,0.0,0.000541,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,...,0.012712,0.0,0.000541,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,...,0.008475,0.0,0.000108,0.000108,0.004612,0.012776,0.008164,4.0,4.0,0.0
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,...,0.012712,0.0,0.000216,0.000216,0.003142,0.010087,0.006946,8.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,235.0,234.0,1.0,71.0,46.0,25.0,1.0,1.0,0.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0
121,236.0,234.0,1.0,66.0,46.0,20.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0
122,236.0,235.0,1.0,66.0,71.0,5.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0
123,237.0,234.0,1.0,65.0,46.0,19.0,0.0,1.0,1.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.006653,0.006653,0.000000,4.0,4.0,0.0


In [9]:
for ref_a in shanxi_nodelist['node_id']:
    for ref_b in shanxi_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        """
        referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_a]
        referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                             'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})
        referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_a][shanxi_dummies.columns[6:]]
        referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_b][shanxi_dummies.columns[6:]]

        shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
        shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

        referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
        referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)
        """
        referee = shanxi_nodelist.query('node_id == @ref_a')
        referral = shanxi_nodelist.query('node_id == @ref_b')
        referee_network_stats = shanxi_network.query('Referee == @ref_a')
        referral_network_stats = shanxi_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_relatives = referee['relatives'].iloc[0]
        referral_relatives = referral['relatives'].iloc[0]
        referee_hukou = referee['hukou_ints'].iloc[0]
        referral_hukou = referral['hukou_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_relatives': referee_relatives, 'referral_relatives': referral_relatives, 'relatives_diff': abs(referee_relatives - referral_relatives),
                             'referee_hukou': referee_hukou, 'referral_hukou': referral_hukou, 'hukou_diff': abs(referee_hukou - referral_hukou),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
        shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))


In [10]:
#shanxi_data = shanxi_data.join(referees_hukou)
#shanxi_data = shanxi_data.join(referrals_hukou, lsuffix='_referee', rsuffix='_referral')
shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

In [11]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,...,0.000000,0.0,0.000000,0.000000,0.007450,0.007450,0.000000,8.0,8.0,0.0
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,...,0.012712,0.0,0.000541,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,...,0.012712,0.0,0.000541,0.000541,0.007450,0.015512,0.008062,8.0,8.0,0.0
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,...,0.008475,0.0,0.000108,0.000108,0.004612,0.012776,0.008164,4.0,4.0,0.0
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,...,0.012712,0.0,0.000216,0.000216,0.003142,0.010087,0.006946,8.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,46.0,65.0,19.0,0.0,0.0,0.0,0.0,...,0.012712,0.0,0.000000,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0
27968,233.0,234.0,0.0,67.0,46.0,21.0,0.0,1.0,1.0,0.0,...,0.012712,0.0,0.000000,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0
27969,233.0,235.0,0.0,67.0,71.0,4.0,0.0,1.0,1.0,0.0,...,0.012712,0.0,0.000000,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0
27970,233.0,236.0,0.0,67.0,66.0,1.0,0.0,0.0,0.0,0.0,...,0.012712,0.0,0.000000,0.000000,0.000998,0.006653,0.005655,1.0,4.0,3.0


In [13]:
shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = normalize(shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff']])
shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = scale(shanxi_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff']])
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,11.0,10.0,1.0,-2.564772,0.458266,2.027137,0.0,0.0,0.0,1.0,...,-0.803756,-0.1903,-0.114410,-0.217595,1.096269,1.323232,-0.975878,1.914651,2.610474,-0.815370
1,11.0,9.0,1.0,-2.584027,0.454640,2.043845,0.0,1.0,1.0,1.0,...,1.362364,-0.1903,9.589142,3.857281,1.048491,4.024155,1.479956,1.849997,2.528125,-0.815370
2,10.0,9.0,1.0,0.476704,0.106205,-1.435078,0.0,1.0,1.0,1.0,...,1.210975,-0.1903,8.910965,3.572490,0.903095,3.653076,1.308319,1.653242,2.277520,-0.815370
3,22.0,2.0,1.0,0.610947,0.019382,-1.398245,1.0,0.0,1.0,0.0,...,1.090876,-0.1903,2.431801,0.851652,0.657964,4.452361,2.287056,0.881558,1.294634,-0.815370
4,20.0,19.0,1.0,-0.142136,0.606472,-0.349182,0.0,0.0,0.0,0.0,...,0.663045,-0.1903,2.513918,0.886136,-0.437826,1.052755,0.456904,0.941118,1.370495,-0.815370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,-0.247137,0.689131,-0.166914,0.0,0.0,0.0,0.0,...,0.801292,-0.1903,-0.114410,-0.217595,-0.825382,0.402140,0.300654,-0.704477,0.270872,-0.011454
27968,233.0,234.0,0.0,1.088579,-0.927678,-0.060936,0.0,1.0,1.0,0.0,...,0.761769,-0.1903,-0.114410,-0.217595,-0.830467,0.360586,0.269220,-0.710898,0.238159,-0.031250
27969,233.0,235.0,0.0,0.454992,0.249454,-1.285120,0.0,1.0,1.0,0.0,...,0.541887,-0.1903,-0.114410,-0.217595,-0.858757,0.129410,0.094343,-0.746620,0.056166,-0.141382
27970,233.0,236.0,0.0,0.605054,0.084787,-1.462551,0.0,0.0,0.0,0.0,...,0.593964,-0.1903,-0.114410,-0.217595,-0.852057,0.184163,0.135762,-0.738159,0.099270,-0.115298


In [14]:
r_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

array([ 0.01450487, -0.01756589, -0.01055913, -0.01539246,  0.00552877,
        0.01150495,  0.05974816,  0.02571722, -0.00035044,  0.00788541,
        0.01891385, -0.09406561,  0.03384481,  0.14420642,  0.02752034,
       -0.00160701,  0.21003348,  0.08112133,  0.02974129,  0.11684756,
        0.01862876,  0.04094681,  0.0636641 , -0.05462866])

In [15]:
f_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

(array([5.88588400e+00, 8.63309898e+00, 3.11887093e+00, 6.62843994e+00,
        8.54992530e-01, 3.70270591e+00, 1.00206212e+02, 1.85109144e+01,
        3.43501594e-03, 1.73927197e+00, 1.00093878e+01, 2.49697442e+02,
        3.20755617e+01, 5.94002489e+02, 2.11996723e+01, 7.22321195e-02,
        1.29081331e+03, 1.85280628e+02, 2.47626058e+01, 3.87170464e+02,
        9.70981631e+00, 4.69744061e+01, 1.13827072e+02, 8.37204591e+01]),
 array([1.52690138e-002, 3.30378981e-003, 7.74012973e-002, 1.00414063e-002,
        3.55151778e-001, 5.43344351e-002, 1.50458858e-023, 1.69507094e-005,
        9.53263954e-001, 1.87241897e-001, 1.55910923e-003, 5.28348334e-056,
        1.49744656e-008, 7.64599317e-130, 4.15654911e-006, 7.88116098e-001,
        2.16220984e-276, 4.64126800e-042, 6.52278945e-007, 1.29845554e-085,
        1.83471789e-003, 7.34117189e-012, 1.59913336e-026, 6.07597237e-020]))

# Bucharest

In [2]:
bucharest_nodelist = pd.read_csv('../Data/Preprocessed/bucharest_nodelist.csv')
bucharest_edgelist = pd.read_csv('../Data/Preprocessed/bucharest_edgelist.csv')
bucharest_network = pd.read_csv('../Data/Tables/basic/bucharest_basic_analysis.csv')

In [3]:
bucharest_nodelist['medical_ints'] = pd.factorize(bucharest_nodelist['medical'])[0]
bucharest_nodelist['isco08_code_ints'] = pd.factorize(bucharest_nodelist['isco08_code'])[0]

In [4]:
bucharest_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,medical,isco08_code,isco08_label,medical_ints,isco08_code_ints
0,0,179373,65.0,1,NAP,NAP,Not Active - pensioner,0,0
1,1,1S179373,63.0,,,,,-1,-1
2,2,146179,40.0,1,No,EMP,Employee - unknown group,1,1
3,3,235990,32.0,2.0,,,,-1,-1
4,4,3S146179,3.0,,NAC,NAC,Not Active - Child,2,2
...,...,...,...,...,...,...,...,...,...
57830,57830,19,32.0,male,,,,-1,-1
57831,57831,18,30.0,female,,,,-1,-1
57832,57832,16,60.0,male,,,,-1,-1
57833,57833,14,42.0,female,,,,-1,-1


In [5]:
bucharest_nodelist['gender'].unique()

array(['1', nan, '2.0', '2', '1.0', 'male', 'female'], dtype=object)

In [6]:
bucharest_nodelist['age'] = bucharest_nodelist['age'].fillna(value=int(bucharest_nodelist['age'].mean()))
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 'male', 1)
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 'female', 2)
bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()] = bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()].apply(lambda x: float(x))
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].astype('Int64', errors='ignore')
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].where(bucharest_nodelist['gender'] != 0, np.nan)
#bucharest_nodelist['gender'] = bucharest_nodelist['gender'].mask(bucharest_nodelist['gender'].isnull(), np.random.randint(0, 2, size=bucharest_nodelist.shape[0]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()] = bucharest_nodelist['gender'][~bucharest_nodelist['gender'].isnull()].apply(lambda x: float(x))


In [31]:
bucharest_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_medical', 'referral_medical', 'medical_diff',
                                    'referee_isco08_label', 'referral_isco08_label', 'isco08_label_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                    'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff'])

In [32]:
added_combinations = set()
for index, row in bucharest_edgelist[~bucharest_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = bucharest_nodelist.query('node_id == @referee_id')
    referral = bucharest_nodelist.query('node_id == @referral_id')
    referee_network_stats = bucharest_network.query('Referee == @referee_id')
    referral_network_stats = bucharest_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender'].iloc[0]
    referral_gender = referral['gender'].iloc[0]
    referee_medical = referee['medical_ints'].iloc[0]
    referral_medical = referral['medical_ints'].iloc[0]
    referee_isco08_label = referee['isco08_code_ints'].iloc[0]
    referral_isco08_label = referral['isco08_code_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_medical': referee_medical, 'referral_medical': referral_medical, 'medical_diff': abs(referee_medical - referral_medical),
                         'referee_isco08_label': referee_isco08_label, 'referral_isco08_label': referral_isco08_label, 'isco08_label_diff': abs(referee_isco08_label - referral_isco08_label),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
    bucharest_data = pd.concat([bucharest_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [19]:
sample = np.random.choice(bucharest_nodelist['node_id'], size=150, replace=False)
sample

array(['10678', '1240S135972', '64904', '37094', '192597', '309273',
       '35502', '166657', '235478', '26785', '121325', '302894', '113898',
       '176239', '125757', '142503', '230357', '251894', '32876',
       '152220', '244868', '191845', '154113', '123997', '262911',
       '164958', '61172', '157907', '170039', '8309S126602', '64336',
       '275271', '307748', '76161', '214746', '214933', '270799',
       '164630', '98427', '133183', '133136', '93340', '305', '247259',
       '3389S126108', '35344', '296708', '33935', '149377', '62162',
       '224606', '93035', '266495', '8659S179417', '266586', '124673',
       '265304', '247186', '11381', '271698', '312489', '191289',
       '232846', '10458S216629', '295747', '83762', '164417', '214937',
       '162552', '307039', '49179', '222865', '7511S213625', '251256',
       '264405', '93993', '168920', '303025', '9404S111064', '128523',
       '112263', '1203S150706', '9068S112099', '1642S76388', '90404',
       '316768', '1687S74

In [33]:
for ref_a in tqdm(sample):
    for ref_b in sample:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = bucharest_nodelist.query('node_id == @ref_a')
        referral = bucharest_nodelist.query('node_id == @ref_b')
        referee_network_stats = bucharest_network.query('Referee == @ref_a')
        referral_network_stats = bucharest_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender'].iloc[0]
        referral_gender = referral['gender'].iloc[0]
        referee_medical = referee['medical_ints'].iloc[0]
        referral_medical = referral['medical_ints'].iloc[0]
        referee_isco08_label = referee['isco08_code_ints'].iloc[0]
        referral_isco08_label = referral['isco08_code_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_medical': referee_medical, 'referral_medical': referral_medical, 'medical_diff': abs(referee_medical - referral_medical),
                             'referee_isco08_label': referee_isco08_label, 'referral_isco08_label': referral_isco08_label, 'isco08_label_diff': abs(referee_isco08_label - referral_isco08_label),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
        bucharest_data = pd.concat([bucharest_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

  0%|          | 0/150 [00:00<?, ?it/s]

In [34]:
bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = normalize(bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                         'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                         'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                         'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                         'pagerank_centrality_diff', 'referee_component_size',
                                                                                         'referral_component_size', 'component_size_diff']])
bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
             'degree_centrality_diff', 'referee_betweenness_centrality',
             'referral_betweenness_centrality', 'betweenness_centrality_diff',
             'referee_pagerank_centrality', 'referral_pagerank_centrality',
             'pagerank_centrality_diff', 'referee_component_size',
             'referral_component_size', 'component_size_diff']] = scale(bucharest_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                     'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                     'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                     'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                     'pagerank_centrality_diff', 'referee_component_size',
                                                                                     'referral_component_size', 'component_size_diff']])

In [35]:
bucharest_data['medical_diff'].unique(), bucharest_data['gender_diff'].unique(), bucharest_data['isco08_label_diff'].unique()

(array([1, 0, 3, 4, 2, 7, 5, 6], dtype=object),
 array([<NA>, 0, 1], dtype=object),
 array([1, 0, 3, 4, 2, 11, 7, 6, 5, 16, 8, 15, 12, 9, 10, 13, 17, 14],
       dtype=object))

In [36]:
#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].fillna(value=-999)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].fillna(value=-999)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].fillna(value=-999)

#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].where(bucharest_data['gender_diff'] <= 0, 1)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].where(bucharest_data['medical_diff'] <= 0, 1)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].where(bucharest_data['isco08_label_diff'] <= 0, 1)

#bucharest_data['gender_diff'] = bucharest_data['gender_diff'].where(bucharest_data['gender_diff'] != -999, np.nan)
bucharest_data['medical_diff'] = bucharest_data['medical_diff'].where(bucharest_data['medical_diff'] != -999, np.nan)
bucharest_data['isco08_label_diff'] = bucharest_data['isco08_label_diff'].where(bucharest_data['isco08_label_diff'] != -999, np.nan)

bucharest_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_medical,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,53594,10330S53594,1,-0.041165,0.761358,-0.825894,2,,,-1,...,-0.340700,-0.244387,-0.130587,-0.248910,-0.190461,-0.285841,-0.296191,-0.364853,-0.401506,-0.350583
1,53594,10331S53594,1,0.884368,-1.007123,0.778263,2,,,-1,...,-0.247164,-0.225550,-0.130587,-0.229749,0.008784,-0.061280,-0.176198,-0.196799,-0.236993,-0.350583
2,58373,11577S58373,1,0.181887,0.650306,-1.130545,2,,,-1,...,-0.683815,-0.313487,-0.130587,-0.319197,-0.326897,0.225870,-0.736364,-0.480774,-0.514985,-0.350583
3,63203,5541S63203,1,0.879118,-0.269276,-0.035003,2,,,0,...,-0.385977,-0.253505,-0.130587,-0.258185,-0.286909,-0.394544,-0.354276,-0.446202,-0.481141,-0.350583
4,63203,5540S63203,1,0.906174,-0.415114,0.147077,2,,,0,...,-0.384110,-0.253129,-0.130587,-0.257802,-0.282932,-0.390061,-0.351881,-0.442847,-0.477857,-0.350583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24547,203441,430S181880,0,0.282401,0.584044,-1.285613,2,,,-1,...,-0.082510,-0.313487,-0.130587,-0.319197,-0.787455,0.811998,0.135562,-0.621206,-0.299926,0.274379
24548,203441,135465,0,-1.124992,0.983058,0.305877,2,1,1,-1,...,-0.683815,-0.313487,-0.130587,-0.319197,-0.838131,-0.922637,-0.736364,-0.757498,-0.785877,-0.350583
24549,55S241953,430S181880,0,-2.763561,0.742235,1.385419,,,,3,...,-0.683815,-0.313487,-0.130587,-0.319197,-0.355481,0.918261,-0.329337,0.918844,-0.260936,1.627981
24550,55S241953,135465,0,-3.078706,0.665300,1.559477,,1,,3,...,-0.346514,-0.313487,-0.130587,-0.319197,-0.620562,-0.940862,-0.463616,0.028716,-0.807237,1.051701


In [39]:
bucharest_data = bucharest_data.fillna(value=-999)

In [40]:
bucharest_data['medical_diff'].unique(), bucharest_data['gender_diff'].unique(), bucharest_data['isco08_label_diff'].unique()

(array([1, 0], dtype=int64),
 array([-999,    0,    1], dtype=int64),
 array([1, 0], dtype=int64))

In [41]:
r_regression(bucharest_data[bucharest_data.columns[3:]], bucharest_data['contact'])

array([ 0.10644435, -0.14761348,  0.01134502,  0.26053551, -0.66511142,
       -0.58003904,  0.27985634,  0.29603158,  0.39838073,  0.16094323,
        0.08227855,  0.25537315,  0.56034267,  0.35189251,  0.32530651,
        0.27427833,  0.01638329,  0.2611126 ,  0.59235566,  0.29120703,
        0.33847135,  0.40635151,  0.34927571, -0.38357129])

In [43]:
f_regression(bucharest_data[bucharest_data.columns[3:]], bucharest_data['contact'])

(array([2.81349099e+02, 5.46853892e+02, 3.16022563e+00, 1.78777535e+03,
        1.94758607e+04, 1.24477027e+04, 2.08613024e+03, 2.35808151e+03,
        4.63127944e+03, 6.52821708e+02, 1.67330382e+02, 1.71273599e+03,
        1.12363326e+04, 3.46962345e+03, 2.90545504e+03, 1.99710128e+03,
        6.59129208e+00, 1.79628412e+03, 1.32707383e+04, 2.27478278e+03,
        3.17641763e+03, 4.85547800e+03, 3.41106939e+03, 4.23505712e+03]),
 array([8.54086834e-063, 1.23714864e-119, 7.54651300e-002, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 3.91892798e-142, 3.77575768e-038, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        1.02537435e-002, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000]))

# China

In [68]:
china_nodelist = pd.read_csv('../Data/Preprocessed/china_nodelist.csv')
china_edgelist = pd.read_csv('../Data/Preprocessed/china_edgelist.csv')
china_network = pd.read_csv('../Data/Tables/basic/china_basic_analysis.csv')

In [69]:
china_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,residency,place_event,possible_source,symptom,symptom_severity,place_admission
0,0,anqing-1,49,Male,,Wuhan Travel,,,,Anhui_Anqing
1,1,anqing-10,54,Male,,Hainan Work,,Somatosensory Related-Sign Description,Stable,Anhui_Anqing
2,2,anqing-11,50,Female,Hubei_Wuhan,Wuhan Travel,,Respiratory System Related-Symptom Description,Stable,Anhui_Anqing
3,3,anqing-12,46,Male,,Yellowstone Work,,Somatosensory Related-Sign Description,Stable,Anhui_Anqing
4,4,anqing-13,58,Male,Hubei_Wuhan,Work In Wuhan,,,Stable,Anhui_Anqing
...,...,...,...,...,...,...,...,...,...,...
25872,25872,ningbo-255,,male,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25873,25873,ningbo-256,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25874,25874,ningbo-257,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo
25875,25875,ningbo-258,,Female,Zhejiang-Ningbo,,,,,Zhejiang-Ningbo


In [70]:
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: str(x).lower())
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: x.strip())
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: 'male' if x in ('male', 'man') else x)
china_nodelist['gender'] = china_nodelist['gender'].apply(lambda x: np.nan if x not in ('male', 'female') else x)
china_nodelist['gender_ints'] = LabelEncoder().fit_transform(china_nodelist['gender'])

china_nodelist['residency'] = china_nodelist['residency'].apply(lambda x: str(x).lower())
china_nodelist['residency'] = china_nodelist['residency'].apply(lambda x: x.strip())
china_nodelist['residency_ints'] = LabelEncoder().fit_transform(china_nodelist['residency'])

china_nodelist['place_event'] = china_nodelist['place_event'].apply(lambda x: str(x).lower())
china_nodelist['place_event'] = china_nodelist['place_event'].apply(lambda x: x.strip())
china_nodelist['place_event_ints'] = LabelEncoder().fit_transform(china_nodelist['place_event'])

china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: str(x).lower())
china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: x.strip())
most_common_symptoms = Counter(china_nodelist['symptom']).most_common()
symptom_keys = set([x[0] for x in most_common_symptoms])
china_nodelist['symptom'] = china_nodelist['symptom'].apply(lambda x: x if x in symptom_keys else np.nan)
china_nodelist['symptom_ints'] = LabelEncoder().fit_transform(china_nodelist['symptom'])

china_nodelist['place_admission'] = china_nodelist['place_admission'].apply(lambda x: str(x).lower())
china_nodelist['place_admission'] = china_nodelist['place_admission'].apply(lambda x: x.strip())
china_nodelist['place_admission_ints'] = LabelEncoder().fit_transform(china_nodelist['place_admission'])

china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: str(x).lower())
china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: x.strip())
china_nodelist['symptom_severity'] = china_nodelist['symptom_severity'].apply(lambda x: x if x in ('stable', 'mild', 'light', 'severe') else np.nan)
china_nodelist['symptom_severity_ints'] = LabelEncoder().fit_transform(china_nodelist['symptom_severity'])

In [71]:
china_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,residency,place_event,possible_source,symptom,symptom_severity,place_admission,gender_ints,residency_ints,place_event_ints,symptom_ints,place_admission_ints,symptom_severity_ints
0,0,anqing-1,49,male,,wuhan travel,,,,anhui_anqing,1,480,1538,55,0,4
1,1,anqing-10,54,male,,hainan work,,somatosensory related-sign description,stable,anhui_anqing,1,480,449,186,0,3
2,2,anqing-11,50,female,hubei_wuhan,wuhan travel,,respiratory system related-symptom description,stable,anhui_anqing,0,318,1538,103,0,3
3,3,anqing-12,46,male,,yellowstone work,,somatosensory related-sign description,stable,anhui_anqing,1,480,1624,186,0,3
4,4,anqing-13,58,male,hubei_wuhan,work in wuhan,,,stable,anhui_anqing,1,318,1481,55,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25872,25872,ningbo-255,,male,zhejiang-ningbo,,,,,zhejiang-ningbo,1,749,748,55,461,4
25873,25873,ningbo-256,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4
25874,25874,ningbo-257,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4
25875,25875,ningbo-258,,female,zhejiang-ningbo,,,,,zhejiang-ningbo,0,749,748,55,461,4


In [72]:
china_nodelist['age'] = pd.to_numeric(china_nodelist['age'], errors='coerce')
china_nodelist['age'].unique()

array([ 49. ,  54. ,  50. ,  46. ,  58. ,  51. ,  41. ,  37. ,  42. ,
        52. ,  25. ,  53. ,  45. ,  39. ,  65. ,  31. ,  27. ,  57. ,
        21. ,  47. ,  36. ,  64. ,  28. ,  40. ,  34. ,  29. ,  62. ,
        56. ,  75. ,  33. ,  43. ,  30. ,  55. ,  35. ,  38. ,  20. ,
        71. ,  24. ,  17. ,  86. ,  44. ,  63. ,  70. ,  48. ,  60. ,
        67. ,  59. ,  74. ,  68. ,  83. ,  80. ,  19. ,  78. ,  66. ,
        82. ,  69. ,   2. ,  22. ,  13. ,  12. ,  85. ,  18. ,  89. ,
        14. ,  72. ,  10. ,  32. ,   5. ,  16. ,   8. ,  26. ,   7. ,
        61. ,  73. ,   6. ,  76. ,   nan,  15. ,   1. ,  23. ,  91. ,
         3. ,   4. ,   9. ,  87. ,  79. ,  77. ,  11. ,  84. ,  81. ,
        94. ,  90. ,   0. ,  88. , 644. ,  95. ,  97. ,  96. ,  93. ,
        92. ,  99. ,   2.5])

In [85]:
china_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_residency', 'referral_residency', 'residency_diff',
                                       'referee_place_event', 'referral_place_event', 'place_event_diff', 'referee_symptom', 'referral_symptom', 'symptom_diff', 'referee_symptom_severity', 'referral_symptom_severity', 'symptom_severity_diff', 'referee_place_admission', 'referral_place_admission', 'place_admission_diff', 'referee_degree_centrality', 'referral_degree_centrality', 'degree_centrality_diff', 'referee_betweenness_centrality', 'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                       'referee_pagerank_centrality', 'referral_pagerank_centrality', 'pagerank_centrality_diff', 'referee_component_size', 'referral_component_size', 'component_size_diff'])

In [86]:
added_combinations = set()
for index, row in china_edgelist[~china_edgelist['Referral'].isna()].iterrows():
    referee_id = row['Referee']
    referral_id = row['Referral']
    referee = china_nodelist.query('node_id == @referee_id')
    referral = china_nodelist.query('node_id == @referral_id')
    referee_network_stats = china_network.query('Referee == @referee_id')
    referral_network_stats = china_network.query('Referee == @referral_id')

    referee_age = referee['age'].iloc[0]
    referral_age = referral['age'].iloc[0]
    referee_gender = referee['gender_ints'].iloc[0]
    referral_gender = referral['gender_ints'].iloc[0]
    referee_residency = referee['residency_ints'].iloc[0]
    referral_residency = referral['residency_ints'].iloc[0]
    referee_place_event = referee['place_event_ints'].iloc[0]
    referral_place_event = referral['place_event_ints'].iloc[0]
    referee_symptom = referee['symptom_ints'].iloc[0]
    referral_symptom = referral['symptom_ints'].iloc[0]
    referee_symptom_severity = referee['symptom_severity_ints'].iloc[0]
    referral_symptom_severity = referral['symptom_severity_ints'].iloc[0]
    referee_place_admission = referee['place_admission_ints'].iloc[0]
    referral_place_admission = referral['place_admission_ints'].iloc[0]

    referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
    referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
    referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
    referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
    referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
    referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
    referee_component_size = referee_network_stats['Component_Size'].iloc[0]
    referral_component_size = referral_network_stats['Component_Size'].iloc[0]

    new_row = pd.Series({'referee': referee_id, 'referral': referral_id, 'contact': 1, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                         'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                         'referee_residency': referee_residency, 'referral_residency': referral_residency, 'residency_diff': abs(referee_residency - referral_residency),
                         'referee_place_event': referee_place_event, 'referral_place_event': referral_place_event, 'place_event_diff': abs(referee_place_event - referral_place_event),
                         'referee_symptom': referee_symptom, 'referral_symptom': referral_symptom, 'symptom_diff': abs(referee_symptom - referral_symptom),
                         'referee_symptom_severity': referee_symptom_severity, 'referral_symptom_severity': referral_symptom_severity, 'symptom_severity_diff': abs(referee_symptom_severity - referral_symptom_severity),
                         'referee_place_admission': referee_place_admission, 'referral_place_admission': referral_place_admission, 'place_admission_diff': abs(referee_place_admission - referral_place_admission),
                         'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                         'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                         'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                         'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
    china_data = pd.concat([china_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee_id, referral_id))
    added_combinations.add((referral_id, referee_id))

In [87]:
china_data.shape

(11436, 36)

In [88]:
sample = np.random.choice(china_nodelist['node_id'], size=150, replace=False)
sample

array(['shenzhen-185', 'nanjing-247', 'shijiazhuang-1022', 'shanghai-388',
       'shenzhen-428', 'an-98', 'an-2555', 'wenzhou-128', 'anshun-3',
       'xiamen-383', 'chengdu-554', 'yuzhong-15', 'shijiazhuang-4',
       'yueyang-36', 'quanzhou-21', 'qingdao-54', 'zhengzhou-224',
       'alxa-77', 'shanghai-1307', 'ganzhou-3', 'shijiazhuang-396',
       'gannan-3', 'wenchang-3', 'an-2438', 'anyang-12',
       'shijiazhuang-804', 'xiangtan-9', 'an-148', 'wenzhou-420',
       'changjiang-3', 'an-1324', 'yinchuan-35', 'an-486', 'xinyu-46',
       'yangzhou-74', 'putian-73', 'shijiazhuang-823', 'fujianputian-1',
       'hulunbeier-290', 'nanchong-18', 'jiaxing-32', 'fuyang-72',
       'shanghai-1842', 'tianjin-692', 'shanghai-556', 'an-351',
       'guangzhou-819', 'yangzhou-531', 'guangzhou-14', 'rizhao-25',
       'hulunbeier-520', 'shijiazhuang-844', 'an-1570', 'yongzhou-23',
       'shangqiu-36', 'lanzhou-103', 'jining-23', 'xinyu-80',
       'guangzhou-1064', 'yangzhou-38', 'guangzhou-

In [89]:
for ref_a in tqdm(sample):
    for ref_b in sample:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = china_nodelist.query('node_id == @ref_a')
        referral = china_nodelist.query('node_id == @ref_b')
        referee_network_stats = china_network.query('Referee == @ref_a')
        referral_network_stats = china_network.query('Referee == @ref_b')

        referee_age = referee['age'].iloc[0]
        referral_age = referral['age'].iloc[0]
        referee_gender = referee['gender_ints'].iloc[0]
        referral_gender = referral['gender_ints'].iloc[0]
        referee_residency = referee['residency_ints'].iloc[0]
        referral_residency = referral['residency_ints'].iloc[0]
        referee_place_event = referee['place_event_ints'].iloc[0]
        referral_place_event = referral['place_event_ints'].iloc[0]
        referee_symptom = referee['symptom_ints'].iloc[0]
        referral_symptom = referral['symptom_ints'].iloc[0]
        referee_symptom_severity = referee['symptom_severity_ints'].iloc[0]
        referral_symptom_severity = referral['symptom_severity_ints'].iloc[0]
        referee_place_admission = referee['place_admission_ints'].iloc[0]
        referral_place_admission = referral['place_admission_ints'].iloc[0]

        referee_degree_centrality = referee_network_stats['Degree_Centrality'].iloc[0]
        referral_degree_centrality = referral_network_stats['Degree_Centrality'].iloc[0]
        referee_betweenness_centrality = referee_network_stats['Betweenness_Centrality'].iloc[0]
        referral_betweenness_centrality = referral_network_stats['Betweenness_Centrality'].iloc[0]
        referee_pagerank_centrality = referee_network_stats['Pagerank_Centrality'].iloc[0]
        referral_pagerank_centrality = referral_network_stats['Pagerank_Centrality'].iloc[0]
        referee_component_size = referee_network_stats['Component_Size'].iloc[0]
        referral_component_size = referral_network_stats['Component_Size'].iloc[0]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee_age, 'referral_age': referral_age, 'age_diff': abs(referee_age - referral_age),
                             'referee_gender': referee_gender, 'referral_gender': referral_gender, 'gender_diff': abs(referee_gender - referral_gender),
                             'referee_residency': referee_residency, 'referral_residency': referral_residency, 'residency_diff': abs(referee_residency - referral_residency),
                             'referee_place_event': referee_place_event, 'referral_place_event': referral_place_event, 'place_event_diff': abs(referee_place_event - referral_place_event),
                             'referee_symptom': referee_symptom, 'referral_symptom': referral_symptom, 'symptom_diff': abs(referee_symptom - referral_symptom),
                             'referee_symptom_severity': referee_symptom_severity, 'referral_symptom_severity': referral_symptom_severity, 'symptom_severity_diff': abs(referee_symptom_severity - referral_symptom_severity),
                             'referee_place_admission': referee_place_admission, 'referral_place_admission': referral_place_admission, 'place_admission_diff': abs(referee_place_admission - referral_place_admission),
                             'referee_degree_centrality': referee_degree_centrality, 'referral_degree_centrality': referral_degree_centrality, 'degree_centrality_diff': abs(referee_degree_centrality - referral_degree_centrality),
                             'referee_betweenness_centrality': referee_betweenness_centrality, 'referral_betweenness_centrality': referral_betweenness_centrality, 'betweenness_centrality_diff': abs(referee_betweenness_centrality - referral_betweenness_centrality),
                             'referee_pagerank_centrality': referee_pagerank_centrality, 'referral_pagerank_centrality': referral_pagerank_centrality, 'pagerank_centrality_diff': abs(referee_pagerank_centrality - referral_pagerank_centrality),
                             'referee_component_size': referee_component_size, 'referral_component_size': referral_component_size, 'component_size_diff': abs(referee_component_size - referral_component_size)})
        china_data = pd.concat([china_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

  0%|          | 0/150 [00:00<?, ?it/s]

In [90]:
china_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_residency,...,degree_centrality_diff,referee_betweenness_centrality,referral_betweenness_centrality,betweenness_centrality_diff,referee_pagerank_centrality,referral_pagerank_centrality,pagerank_centrality_diff,referee_component_size,referral_component_size,component_size_diff
0,shijiazhuang-155,shijiazhuang-176,1,29.0,61.0,32.0,0,1,1,244,...,0.0,0.0,0.0,0.0,0.000116,0.000116,0.0,4,4,0
1,shijiazhuang-155,shijiazhuang-156,1,29.0,35.0,6.0,0,0,0,244,...,0.0,0.0,0.0,0.0,0.000116,0.000116,0.0,4,4,0
2,shijiazhuang-150,shijiazhuang-38,1,3.0,55.0,52.0,0,0,0,244,...,0.000039,0.0,0.0,0.0,0.000166,0.000091,0.000075,3,3,0
3,shijiazhuang-150,shijiazhuang-151,1,3.0,33.0,30.0,0,0,0,244,...,0.000039,0.0,0.0,0.0,0.000166,0.000091,0.000075,3,3,0
4,shijiazhuang-145,shijiazhuang-146,1,15.0,52.0,37.0,0,0,0,244,...,0.000116,0.0,0.0,0.0,0.0002,0.000061,0.000139,7,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22606,an-408,yichun-24,0,49.0,38.0,11.0,1,2,1,480,...,0.0,0.0,0.0,0.0,0.000019,0.000019,0.0,1,1,0
22607,an-408,tianjin-139,0,49.0,31.0,18.0,1,0,1,480,...,0.000039,0.0,0.0,0.0,0.000019,0.000116,0.000097,1,2,1
22608,an-1057,yichun-24,0,61.0,38.0,23.0,0,2,2,539,...,0.0,0.0,0.0,0.0,0.000019,0.000019,0.0,1,1,0
22609,an-1057,tianjin-139,0,61.0,31.0,30.0,0,0,0,539,...,0.000039,0.0,0.0,0.0,0.000019,0.000116,0.000097,1,2,1


In [91]:
china_data = china_data.fillna(value=-999)

china_data['gender_diff'] = china_data['gender_diff'].where(china_data['gender_diff'] <= 0, 1)
china_data['residency_diff'] = china_data['residency_diff'].where(china_data['residency_diff'] <= 0, 1)
china_data['place_event_diff'] = china_data['place_event_diff'].where(china_data['place_event_diff'] <= 0, 1)
china_data['symptom_diff'] = china_data['symptom_diff'].where(china_data['symptom_diff'] <= 0, 1)
china_data['symptom_severity_diff'] = china_data['symptom_severity_diff'].where(china_data['symptom_severity_diff'] <= 0, 1)
china_data['place_admission_diff'] = china_data['place_admission_diff'].where(china_data['place_admission_diff'] <= 0, 1)

In [92]:
china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                'degree_centrality_diff', 'referee_betweenness_centrality',
                'referral_betweenness_centrality', 'betweenness_centrality_diff',
                'referee_pagerank_centrality', 'referral_pagerank_centrality',
                'pagerank_centrality_diff', 'referee_component_size',
                'referral_component_size', 'component_size_diff']] = normalize(china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                               'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                               'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                               'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                               'pagerank_centrality_diff', 'referee_component_size',
                                                                                               'referral_component_size', 'component_size_diff']])
china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                'degree_centrality_diff', 'referee_betweenness_centrality',
                'referral_betweenness_centrality', 'betweenness_centrality_diff',
                'referee_pagerank_centrality', 'referral_pagerank_centrality',
                'pagerank_centrality_diff', 'referee_component_size',
                'referral_component_size', 'component_size_diff']] = scale(china_data[['referee_age', 'referral_age', 'age_diff', 'referee_degree_centrality', 'referral_degree_centrality',
                                                                                           'degree_centrality_diff', 'referee_betweenness_centrality',
                                                                                           'referral_betweenness_centrality', 'betweenness_centrality_diff',
                                                                                           'referee_pagerank_centrality', 'referral_pagerank_centrality',
                                                                                           'pagerank_centrality_diff', 'referee_component_size',
                                                                                           'referral_component_size', 'component_size_diff']])

In [93]:
r_regression(china_data[china_data.columns[3:]], china_data['contact'])

array([ 0.3333702 ,  0.41207751,  0.38931445, -0.30083026, -0.23272957,
       -0.09646081, -0.03643705, -0.00890749, -0.63299648, -0.02298823,
        0.01322318, -0.07837373,  0.13398849,  0.11490282,  0.0336319 ,
       -0.04831851, -0.02206979, -0.32785323, -0.23998541, -0.20738531,
       -0.93147155,  0.56505346,  0.52427747,  0.11257618,  0.0308492 ,
        0.13327541,  0.13213098,  0.57097862,  0.37805741,  0.11430363,
        0.39692128,  0.44650685, -0.22236369])

In [94]:
f_regression(china_data[china_data.columns[3:]], china_data['contact'])

(array([2.82682831e+03, 4.62445518e+03, 4.03891023e+03, 2.24968206e+03,
        1.29469725e+03, 2.12345492e+02, 3.00569421e+01, 1.79401772e+00,
        1.51157067e+04, 1.19542388e+01, 3.95393321e+00, 1.39732749e+02,
        4.13317730e+02, 3.02492622e+02, 2.56021057e+01, 5.29082709e+01,
        1.10176610e+01, 2.72286489e+03, 1.38169612e+03, 1.01608333e+03,
        1.48204468e+05, 1.06046265e+04, 8.57010224e+03, 2.90210751e+02,
        2.15368705e+01, 4.08850755e+02, 4.01735108e+02, 1.09363358e+04,
        3.77032917e+03, 2.99304299e+02, 4.22808963e+03, 5.62995433e+03,
        1.17606712e+03]),
 array([0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        9.39031988e-276, 6.97226186e-048, 4.24030133e-008, 1.80450873e-001,
        0.00000000e+000, 5.46234741e-004, 4.67739850e-002, 3.78764493e-032,
        4.53301663e-091, 2.58901136e-067, 4.22866761e-007, 3.60882594e-013,
        9.03905062e-004, 0.00000000e+000, 1.32881667e-293, 3.82058982e-218,
        0.00000000