In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize, scale
from sklearn.feature_selection import r_regression, f_regression

In [20]:
yunnan_nodelist = pd.read_csv('../Data/Preprocessed/yunnan_nodelist.csv')
yunnan_edgelist = pd.read_csv('../Data/Preprocessed/yunnan_edgelist.csv')

In [21]:
yunnan_edgelist

Unnamed: 0.1,Unnamed: 0,Referee,Referral,Date
0,0,1,,200117
1,1,1,10.0,200117
2,2,2,,200122
3,3,3,,200124
4,4,4,,200124
...,...,...,...,...
270,273,171,156.0,200216
271,269,170,155.0,200216
272,268,170,,200216
273,270,170,156.0,200216


In [22]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([ 1.,  0., nan]), array([ 1., nan,  0.]))

In [23]:
yunnan_nodelist.columns

Index(['Unnamed: 0', 'node_id', 'age', 'gender', 'relatives'], dtype='object')

In [24]:
yunnan_nodelist['age'] = yunnan_nodelist['age'].fillna(value=int(yunnan_nodelist['age'].mean()))
yunnan_nodelist['gender'] = yunnan_nodelist['gender'].mask(yunnan_nodelist['gender'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))
yunnan_nodelist['relatives'] = yunnan_nodelist['relatives'].mask(yunnan_nodelist['relatives'].isnull(), np.random.randint(0, 2, size=yunnan_nodelist.shape[0]))

In [25]:
yunnan_nodelist['gender'].unique(), yunnan_nodelist['relatives'].unique()

(array([1., 0.]), array([1., 0.]))

In [87]:
yunnan_data = pd.DataFrame(columns=['referee', 'referral', 'contact', 'referee_age', 'referral_age', 'age_diff', 'referee_gender', 'referral_gender', 'gender_diff', 'referee_relatives', 'referral_relatives', 'relatives_diff'])

In [88]:
added_combinations = set()
for index, row in yunnan_edgelist[~yunnan_edgelist['Referral'].isna()].iterrows():
    referee = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == row['Referee']]
    referral = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == row['Referral']]
    new_row = pd.Series({'referee': row['Referee'], 'referral': row['Referral'], 'contact': 1, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                         'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                         'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
    yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

    added_combinations.add((referee['node_id'].iloc[0], referral['node_id'].iloc[0]))
    added_combinations.add((referral['node_id'].iloc[0], referee['node_id'].iloc[0]))

In [89]:
for ref_a in yunnan_nodelist['node_id']:
    for ref_b in yunnan_nodelist['node_id']:
        if ref_a == ref_b:
            continue
        if (ref_a, ref_b) in added_combinations:
            continue
        if (ref_b, ref_a) in added_combinations:
            continue

        referee = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == ref_a]
        referral = yunnan_nodelist.loc[yunnan_nodelist['node_id'] == ref_b]

        new_row = pd.Series({'referee': ref_a, 'referral': ref_b, 'contact': 0, 'referee_age': referee['age'].iloc[0], 'referral_age': referral['age'].iloc[0], 'age_diff': abs(referee['age'].iloc[0] - referral['age'].iloc[0]),
                             'referee_gender': referee['gender'].iloc[0], 'referral_gender': referral['gender'].iloc[0], 'gender_diff': abs(referee['gender'].iloc[0] - referral['gender'].iloc[0]),
                             'referee_relatives': referee['relatives'].iloc[0], 'referral_relatives': referral['relatives'].iloc[0], 'relatives_diff': abs(referee['relatives'].iloc[0] - referral['relatives'].iloc[0])})
        yunnan_data = pd.concat([yunnan_data, new_row.to_frame().T], axis='index', ignore_index=True)

        added_combinations.add((ref_a, ref_b))
        added_combinations.add((ref_b, ref_a))

In [90]:
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,1.0,10.0,1.0,41.0,63.0,22.0,1.0,1.0,0.0,1.0,1.0,0.0
1,16.0,15.0,1.0,68.0,71.0,3.0,0.0,1.0,1.0,1.0,1.0,0.0
2,90.0,75.0,1.0,34.0,76.0,42.0,0.0,1.0,1.0,1.0,1.0,0.0
3,86.0,85.0,1.0,79.0,67.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
4,86.0,81.0,1.0,79.0,46.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,62.0,41.0,21.0,0.0,1.0,1.0,0.0,1.0,1.0
14531,168.0,170.0,0.0,62.0,52.0,10.0,0.0,1.0,1.0,0.0,1.0,1.0
14532,168.0,171.0,0.0,62.0,24.0,38.0,0.0,0.0,0.0,0.0,1.0,1.0
14533,169.0,170.0,0.0,41.0,52.0,11.0,1.0,1.0,0.0,1.0,1.0,0.0


In [91]:
(yunnan_nodelist.shape[0] * (yunnan_nodelist.shape[0] - 1)) / 2

14535.0

In [92]:
yunnan_data[['referee_age', 'referral_age', 'age_diff']] = normalize(yunnan_data[['referee_age', 'referral_age', 'age_diff']])
yunnan_data[['referee_age', 'referral_age', 'age_diff']] = scale(yunnan_data[['referee_age', 'referral_age', 'age_diff']])
yunnan_data

Unnamed: 0,referee,referral,contact,referee_age,referral_age,age_diff,referee_gender,referral_gender,gender_diff,referee_relatives,referral_relatives,relatives_diff
0,1.0,10.0,1.0,-0.900396,0.938726,0.135464,1.0,1.0,0.0,1.0,1.0,0.0
1,16.0,15.0,1.0,0.156549,0.499161,-1.193047,0.0,1.0,1.0,1.0,1.0,0.0
2,90.0,75.0,1.0,-1.900825,0.995179,1.034717,0.0,1.0,1.0,1.0,1.0,0.0
3,86.0,85.0,1.0,0.573515,0.076514,-0.744325,0.0,0.0,0.0,0.0,0.0,0.0
4,86.0,81.0,1.0,0.921381,-0.824737,0.446583,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
14530,168.0,169.0,0.0,0.857522,-0.518396,0.087612,0.0,1.0,1.0,0.0,1.0,1.0
14531,168.0,170.0,0.0,0.591272,0.051218,-0.704163,0.0,1.0,1.0,0.0,1.0,1.0
14532,168.0,171.0,0.0,0.901288,-1.676312,1.277942,0.0,0.0,0.0,0.0,1.0,1.0
14533,169.0,170.0,0.0,-0.350791,0.780328,-0.485451,1.0,1.0,0.0,1.0,1.0,0.0


In [95]:
r_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

array([-0.00678707,  0.02617738, -0.01872828, -0.00044435,  0.02559778,
        0.00603384, -0.03436659, -0.04232978, -0.07552437])

In [96]:
f_regression(yunnan_data[yunnan_data.columns[3:]], yunnan_data['contact'])

(array([6.69482834e-01, 9.96564130e+00, 5.09921708e+00, 2.86951615e-03,
        9.52893597e+00, 5.29124778e-01, 1.71846820e+01, 2.60871260e+01,
        8.33707713e+01]),
 array([4.13245388e-01, 1.59814490e-03, 2.39514127e-02, 9.57280196e-01,
        2.02637953e-03, 4.66986771e-01, 3.41083645e-05, 3.30483031e-07,
        7.68428865e-20]))