In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, scale
from sklearn.feature_selection import r_regression, f_regression

# Yunnan

In [2]:
yunnan_nodelist = pd.read_csv('../Data/Preprocessed/yunnan_nodelist.csv')
yunnan_edgelist = pd.read_csv('../Data/Preprocessed/yunnan_edgelist.csv')

In [3]:
yunnan_edgelist

Unnamed: 0.1,Unnamed: 0,Referee,Referral,Date
0,0,1,,200117
1,1,1,10.0,200117
2,2,2,,200122
3,3,3,,200124
4,4,4,,200124
...,...,...,...,...
270,273,171,156.0,200216
271,269,170,155.0,200216
272,268,170,,200216
273,270,170,156.0,200216


In [4]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([ 1.,  0., nan]), array([ 1., nan,  0.]))

In [5]:
yunnan_nodelist.columns

Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives'], dtype='object')

In [6]:
yunnan_nodelist['age'] = yunnan_nodelist['age'].fillna(value=int(yunnan_nodelist['age'].mean()))
yunnan_nodelist['gender'] = yunnan_nodelist['gender'].mask(yunnan_nodelist['gender'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))
yunnan_nodelist['relatives'] = yunnan_nodelist['relatives'].mask(yunnan_nodelist['relatives'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))

In [7]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([1., 0.]), array([1., 0.]))

In [8]:
yunnan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff'])

In [9]:
added_combinations = set()
for index, row in yunnan_edgelist[~yunnan_edgelist['Referral'].isna()].iterrows():
    referee = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == row['Referee']]
    referral = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
    yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))

In [10]:
for ref_a in yunnan_nodelist['node_id']:
    for ref_b in yunnan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == ref_a]
        referral = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
        yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [11]:
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,1.0,10.0,1.0,41.0,63.0,22.0,1.0,1.0,0.0,1.0,0.0,1.0
1,16.0,15.0,1.0,68.0,71.0,3.0,0.0,1.0,1.0,1.0,1.0,0.0
2,90.0,75.0,1.0,34.0,76.0,42.0,0.0,1.0,1.0,1.0,0.0,1.0
3,86.0,85.0,1.0,79.0,67.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
4,86.0,81.0,1.0,79.0,46.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,62.0,41.0,21.0,0.0,1.0,1.0,0.0,1.0,1.0
14531,168.0,170.0,0.0,62.0,52.0,10.0,0.0,1.0,1.0,0.0,1.0,1.0
14532,168.0,171.0,0.0,62.0,24.0,38.0,0.0,0.0,0.0,0.0,1.0,1.0
14533,169.0,170.0,0.0,41.0,52.0,11.0,1.0,1.0,0.0,1.0,1.0,0.0


In [91]:
(yunnan_nodelist.shape[0] * (yunnan_nodelist.shape[0] - 1)) / 2

14535.0

In [92]:
yunnan_data[['referee_age', 'referral_age', 'age_diff']] = normalize(yunnan_data[['referee_age', 'referral_age', 'age_diff']])
yunnan_data[['referee_age', 'referral_age', 'age_diff']] = scale(yunnan_data[['referee_age', 'referral_age', 'age_diff']])
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,1.0,10.0,1.0,-0.900396,0.938726,0.135464,1.0,1.0,0.0,1.0,1.0,0.0
1,16.0,15.0,1.0,0.156549,0.499161,-1.193047,0.0,1.0,1.0,1.0,1.0,0.0
2,90.0,75.0,1.0,-1.900825,0.995179,1.034717,0.0,1.0,1.0,1.0,1.0,0.0
3,86.0,85.0,1.0,0.573515,0.076514,-0.744325,0.0,0.0,0.0,0.0,0.0,0.0
4,86.0,81.0,1.0,0.921381,-0.824737,0.446583,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,0.857522,-0.518396,0.087612,0.0,1.0,1.0,0.0,1.0,1.0
14531,168.0,170.0,0.0,0.591272,0.051218,-0.704163,0.0,1.0,1.0,0.0,1.0,1.0
14532,168.0,171.0,0.0,0.901288,-1.676312,1.277942,0.0,0.0,0.0,0.0,1.0,1.0
14533,169.0,170.0,0.0,-0.350791,0.780328,-0.485451,1.0,1.0,0.0,1.0,1.0,0.0


In [95]:
r_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

array([-0.00678707,  0.02617738, -0.01872828, -0.00044435,  0.02559778,
        0.00603384, -0.03436659, -0.04232978, -0.07552437])

In [96]:
f_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

(array([6.69482834e-01, 9.96564130e+00, 5.09921708e+00, 2.86951615e-03,
        9.52893597e+00, 5.29124778e-01, 1.71846820e+01, 2.60871260e+01,
        8.33707713e+01]),
 array([4.13245388e-01, 1.59814490e-03, 2.39514127e-02, 9.57280196e-01,
        2.02637953e-03, 4.66986771e-01, 3.41083645e-05, 3.30483031e-07,
        7.68428865e-20]))

# Hainan

In [2]:
hainan_edgelist = pd.read_csv('../Data/Preprocessed/hainan_edgelist.csv')
hainan_nodelist = pd.read_csv('../Data/Preprocessed/hainan_nodelist.csv')

In [3]:
hainan_nodelist['gender'].unique(), hainan_nodelist['relatives'].unique()

(array([1., 0.]), array([0., 1.]))

Don't need to fill nans

In [6]:
hainan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff'])

In [7]:
added_combinations = set()
for index, row in hainan_edgelist[~hainan_edgelist['Referral'].isna()].iterrows():
    referee = hainan_nodelist.loc[hainan_nodelist['node_id'] == row['Referee']]
    referral = hainan_nodelist.loc[hainan_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
    hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))

In [8]:
for ref_a in hainan_nodelist['node_id']:
    for ref_b in hainan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = hainan_nodelist.loc[hainan_nodelist['node_id'] == ref_a]
        referral = hainan_nodelist.loc[hainan_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
        hainan_data = pd.concat([hainan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [9]:
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,5.0,3.0,1.0,27.0,27.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,15.0,7.0,1.0,54.0,53.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
2,14.0,13.0,1.0,44.0,47.0,3.0,0.0,1.0,1.0,1.0,1.0,0.0
3,36.0,35.0,1.0,17.0,8.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
4,43.0,42.0,1.0,69.0,68.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,62.0,27.0,35.0,1.0,0.0,1.0,1.0,1.0,0.0
13037,158.0,162.0,0.0,62.0,73.0,11.0,1.0,1.0,0.0,1.0,0.0,1.0
13038,160.0,159.0,0.0,25.0,27.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0
13039,160.0,162.0,0.0,25.0,73.0,48.0,0.0,1.0,1.0,1.0,0.0,1.0


In [10]:
(hainan_nodelist.shape[0] * (hainan_nodelist.shape[0] - 1)) / 2

13041.0

In [11]:
hainan_data[['referee_age', 'referral_age', 'age_diff']] = normalize(hainan_data[['referee_age', 'referral_age', 'age_diff']])
hainan_data[['referee_age', 'referral_age', 'age_diff']] = scale(hainan_data[['referee_age', 'referral_age', 'age_diff']])
hainan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,5.0,3.0,1.0,0.339552,0.355452,-1.397497,0.0,1.0,1.0,0.0,0.0,0.0
1,15.0,7.0,1.0,0.375219,0.318290,-1.327254,0.0,1.0,1.0,1.0,1.0,0.0
2,14.0,13.0,1.0,0.205842,0.478156,-1.150086,0.0,1.0,1.0,1.0,1.0,0.0
3,36.0,35.0,1.0,0.935850,-1.436547,0.898807,0.0,1.0,1.0,1.0,1.0,0.0
4,43.0,42.0,1.0,0.367497,0.326514,-1.342633,0.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13036,158.0,159.0,0.0,0.926073,-1.599742,1.045712,1.0,0.0,1.0,1.0,1.0,0.0
13037,158.0,162.0,0.0,-0.010768,0.633392,-0.791008,1.0,1.0,0.0,1.0,0.0,1.0
13038,160.0,159.0,0.0,0.182428,0.497286,-1.109020,0.0,0.0,0.0,1.0,1.0,0.0
13039,160.0,162.0,0.0,-2.025503,0.889053,1.410104,0.0,1.0,1.0,1.0,0.0,1.0


In [12]:
r_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

array([-0.00488427, -0.01012421,  0.00804319,  0.0058602 ,  0.00772179,
        0.007496  ,  0.0871856 ,  0.08413633, -0.07812765])

In [13]:
f_regression(hainan_data[hainan_data.columns[3:]], hainan_data['contact'])

(array([ 0.31106709,  1.33662901,  0.84358488,  0.44780005,  0.77751038,
         0.73270189, 99.8728961 , 92.96012754, 80.07793457]),
 array([5.77035687e-01, 2.47650508e-01, 3.58390491e-01, 5.03393377e-01,
        3.77919892e-01, 3.92024531e-01, 1.97302275e-23, 6.31474686e-22,
        4.08053745e-19]))

# Shanxi

In [67]:
shanxi_edgelist = pd.read_csv('../Data/Preprocessed/shanxi_edgelist.csv')
shanxi_nodelist = pd.read_csv('../Data/Preprocessed/shanxi_nodelist.csv')

In [68]:
shanxi_nodelist['gender'].unique(), shanxi_nodelist['relatives'].unique(), shanxi_nodelist['hukou'].unique()

(array([1, 0], dtype=int64),
 array([0, 1], dtype=int64),
 array(['xianyang', 'xian', 'ankang', 'yanan', 'wuhan', 'dazhi',
        'hanzhong', 'tongchuan', 'weinan', 'baoji', 'henan', 'shangluo',
        'xiaogan', 'hancheng', 'yulin', 'lantian', 'shanghai', 'lingbao',
        'tianmen', 'nanjing', 'yichang', 'suizhou', 'pingdingshan',
        'yingcheng', 'yanglin'], dtype=object))

In [69]:
shanxi_nodelist['hukou_ints'] = pd.factorize(shanxi_nodelist['hukou'])[0]
shanxi_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,hukou,relatives,hukou_ints
0,0,1,42,1,xianyang,0,0
1,1,2,32,0,xian,0,1
2,2,3,22,1,xian,0,1
3,3,4,49,1,ankang,0,2
4,4,5,23,1,yanan,0,3
...,...,...,...,...,...,...,...
232,232,233,67,0,weinan,0,8
233,233,234,46,1,hanzhong,0,6
234,234,235,71,1,hanzhong,1,6
235,235,236,66,0,hanzhong,1,6


In [70]:
shanxi_dummies = pd.get_dummies(shanxi_nodelist)

In [71]:
shanxi_dummies.columns

Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives', 'hukou_ints',
       'hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
       'hukou_yingcheng', 'hukou_yulin'],
      dtype='object')

Don't need to fill nans

In [100]:
shanxi_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff',
                                    'hukou_diff'])
referees_hukou = pd.DataFrame(columns=['hukou_ankang', 'hukou_baoji', 'hukou_dazhi', 'hukou_hancheng',
                                       'hukou_hanzhong', 'hukou_henan', 'hukou_lantian', 'hukou_lingbao',
                                       'hukou_nanjing', 'hukou_pingdingshan', 'hukou_shanghai',
                                       'hukou_shangluo', 'hukou_suizhou', 'hukou_tianmen', 'hukou_tongchuan',
                                       'hukou_weinan', 'hukou_wuhan', 'hukou_xian', 'hukou_xianyang',
                                       'hukou_xiaogan', 'hukou_yanan', 'hukou_yanglin', 'hukou_yichang',
                                       'hukou_yingcheng', 'hukou_yulin'])
referrals_hukou = referees_hukou.copy()

In [101]:
added_combinations = set()
for index, row in shanxi_edgelist[~shanxi_edgelist['Referral'].isna()].iterrows():
    referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referee']]
    referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                         'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})

    referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referee']][shanxi_dummies.columns[6:]]
    referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == row['Referral']][shanxi_dummies.columns[6:]]

    shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
    shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

    referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
    referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))

In [102]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff,hukou_diff
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,235.0,234.0,1.0,71.0,46.0,25.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
121,236.0,234.0,1.0,66.0,46.0,20.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
122,236.0,235.0,1.0,66.0,71.0,5.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
123,237.0,234.0,1.0,65.0,46.0,19.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0


In [103]:
for ref_a in shanxi_nodelist['node_id']:
    for ref_b in shanxi_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_a]
        referral = shanxi_nodelist.loc[shanxi_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0]),
                             'hukou_diff': abs(referee['hukou_ints'].iloc[0] - referral['hukou_ints'].iloc[0])})
        referee_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_a][shanxi_dummies.columns[6:]]
        referral_dummies = shanxi_dummies.loc[shanxi_dummies['node_id'] == ref_b][shanxi_dummies.columns[6:]]

        shanxi_data = pd.concat([shanxi_data, new_row.to_frame().T], axis='index', ignore_index=True)
        shanxi_data['hukou_diff'] = shanxi_data['hukou_diff'].where(shanxi_data['hukou_diff'] == 0, 1)

        referees_hukou = pd.concat([referees_hukou, referee_dummies], axis='index', ignore_index=True)
        referrals_hukou = pd.concat([referrals_hukou, referral_dummies], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [104]:
shanxi_data = shanxi_data.join(referees_hukou)
shanxi_data = shanxi_data.join(referrals_hukou, lsuffix='_referee', rsuffix='_referral')

In [105]:
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,hukou_weinan_referral,hukou_wuhan_referral,hukou_xian_referral,hukou_xianyang_referral,hukou_xiaogan_referral,hukou_yanan_referral,hukou_yanglin_referral,hukou_yichang_referral,hukou_yingcheng_referral,hukou_yulin_referral
0,11.0,10.0,1.0,9.0,45.0,36.0,0.0,0.0,0.0,1.0,...,0,1,0,0,0,0,0,0,0,0
1,11.0,9.0,1.0,9.0,46.0,37.0,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,0,0,0,0
2,10.0,9.0,1.0,45.0,46.0,1.0,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,0,0,0,0
3,22.0,2.0,1.0,33.0,32.0,1.0,1.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
4,20.0,19.0,1.0,52.0,70.0,18.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,46.0,65.0,19.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27968,233.0,234.0,0.0,67.0,46.0,21.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27969,233.0,235.0,0.0,67.0,71.0,4.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27970,233.0,236.0,0.0,67.0,66.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
shanxi_data[['referee_age', 'referral_age', 'age_diff']] = normalize(shanxi_data[['referee_age', 'referral_age', 'age_diff']])
shanxi_data[['referee_age', 'referral_age', 'age_diff']] = scale(shanxi_data[['referee_age', 'referral_age', 'age_diff']])
shanxi_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,...,hukou_weinan_referral,hukou_wuhan_referral,hukou_xian_referral,hukou_xianyang_referral,hukou_xiaogan_referral,hukou_yanan_referral,hukou_yanglin_referral,hukou_yichang_referral,hukou_yingcheng_referral,hukou_yulin_referral
0,11.0,10.0,1.0,-2.550020,0.526766,2.075856,0.0,0.0,0.0,1.0,...,0,1,0,0,0,0,0,0,0,0
1,11.0,9.0,1.0,-2.570205,0.518849,2.089778,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,0,0,0,0
2,10.0,9.0,1.0,0.513760,0.153427,-1.434770,0.0,1.0,1.0,1.0,...,0,1,0,0,0,0,0,0,0,0
3,22.0,2.0,1.0,0.617895,0.030009,-1.398528,1.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
4,20.0,19.0,1.0,-0.133950,0.623130,-0.345889,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27967,232.0,237.0,0.0,-0.258300,0.674514,-0.171259,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27968,233.0,234.0,0.0,1.070256,-0.943135,-0.065708,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27969,233.0,235.0,0.0,0.438066,0.231947,-1.286554,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
27970,233.0,236.0,0.0,0.587715,0.067820,-1.463393,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
r_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

array([ 0.01518204, -0.01658974, -0.01035794, -0.01539246,  0.00552877,
        0.01150495,  0.05974816,  0.02571722, -0.00035044, -0.09406561,
       -0.01563898, -0.00247156, -0.00599254, -0.0053919 ,  0.01984591,
       -0.00567128, -0.00528536,  0.00872287, -0.00476882, -0.00439773,
       -0.00522352,  0.00775906, -0.00083405,  0.00258529,  0.00653225,
        0.01552357,  0.0050572 , -0.00205082, -0.01283002, -0.0053919 ,
        0.00130601, -0.00411259, -0.00466575,  0.01263303, -0.00141748,
       -0.00901522,  0.00086992, -0.00149926, -0.00300077,  0.02854928,
       -0.00243833,  0.01943025, -0.00335581,  0.01451457, -0.0043235 ,
        0.01864667,  0.00164337,  0.00024069,  0.00565797, -0.00386471,
       -0.00488967,  0.05070957, -0.03270524, -0.00487585,  0.02098141,
       -0.00316615, -0.00459578, -0.0040332 , -0.0044526 , -0.00394016])

In [109]:
f_regression(shanxi_data[shanxi_data.columns[3:]], shanxi_data['contact'])

(array([6.44840887e+00, 7.70000462e+00, 3.00113997e+00, 6.62843994e+00,
        8.54992530e-01, 3.70270591e+00, 1.00206212e+02, 1.85109144e+01,
        3.43501594e-03, 2.49697442e+02, 6.84251119e+00, 1.70858304e-01,
        1.00445306e+00, 8.13185067e-01, 1.10206131e+01, 8.99639085e-01,
        7.81363511e-01, 2.12835701e+00, 6.36098311e-01, 5.40950247e-01,
        7.63186992e-01, 1.68397859e+00, 1.94571934e-02, 1.86945294e-01,
        1.19353722e+00, 6.74186572e+00, 7.15359676e-01, 1.17638405e-01,
        4.60488020e+00, 8.13185067e-01, 4.77074253e-02, 4.73075538e-01,
        6.08898435e-01, 4.46453955e+00, 5.61987561e-02, 2.27342367e+00,
        2.11668080e-02, 6.28705089e-02, 2.51862096e-01, 2.28158684e+01,
        1.66295193e-01, 1.05636348e+01, 3.14986297e-01, 5.89375791e+00,
        5.22843145e-01, 9.72850894e+00, 7.55379519e-02, 1.62034067e-03,
        8.95422386e-01, 4.17765315e-01, 6.68746238e-01, 7.21091688e+01,
        2.99496690e+01, 6.64973036e-01, 1.23183622e+01, 2.803888

# Bucharest

In [110]:
bucharest_nodelist = pd.read_csv('../Data/Preprocessed/bucharest_nodelist.csv')
bucharest_edgelist = pd.read_csv('../Data/Preprocessed/bucharest_edgelist.csv')

In [112]:
bucharest_nodelist['medical_ints'] = pd.factorize(bucharest_nodelist['medical'])[0]
bucharest_nodelist['isco08_code_ints'] = pd.factorize(bucharest_nodelist['isco08_code'])[0]

In [113]:
bucharest_nodelist

Unnamed: 0.1,Unnamed: 0,node_id,age,gender,medical,isco08_code,isco08_label,medical_ints,isco08_code_ints
0,0,179373,65.0,1,NAP,NAP,Not Active - pensioner,0,0
1,1,1S179373,63.0,,,,,-1,-1
2,2,146179,40.0,1,No,EMP,Employee - unknown group,1,1
3,3,235990,32.0,2.0,,,,-1,-1
4,4,3S146179,3.0,,NAC,NAC,Not Active - Child,2,2
...,...,...,...,...,...,...,...,...,...
57830,57830,19,32.0,male,,,,-1,-1
57831,57831,18,30.0,female,,,,-1,-1
57832,57832,16,60.0,male,,,,-1,-1
57833,57833,14,42.0,female,,,,-1,-1


In [None]:
bucharest_nodelist['age'] = bucharest_nodelist['age'].fillna(value=int(bucharest_nodelist['age'].mean()))
bucharest_nodelist['gender'] = bucharest_nodelist['gender'].mask(bucharest_nodelist['gender'].isnull(), np.random.randint(0, 2, size=bucharest_nodelist.shape[0]))