In [1]:
%matplotlib inline
import pylab as pl
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas as pd

  from pandas.core import datetools


In [2]:
u = "/home/yu/PycharmProjects/A-fast-method/auto_data/"

In [24]:
def calcu_distance(data, num_train=50):
    '''
    data: [[p, q, P, Q, M, r2]]
    '''
    df = pd.DataFrame(data, columns=['p', 'q', 'P', 'Q', 'M', 'r2'])
    
    # 随机生成训练集和测试集
    num_tol = len(data)
    idx_cont = np.arange(num_tol)
    idx_train = np.random.choice(idx_cont, size=num_train, replace=False)  # for the training set
    idx_test = np.array([i for i in idx_cont if i not in idx_train])  # the remaining is the test set
    train_set = df.loc[idx_train, ['p', 'q', 'P', 'Q']]
    test_set = df.loc[idx_test, ['p', 'q', 'P', 'Q']]
    
    # 预测 p
    result_p = smf.ols('p ~ P-1', data=train_set).fit()
    k_p = result_p.params['P']
    r2_p = result_p.rsquared
    train_set['pred_p'] = k_p * train_set['P']
    test_set['pred_p'] = k_p * test_set['P']
    
    # 预测 q
    result_q = smf.ols('q ~ Q-1', data=train_set).fit()
    k_q = result_q.params['Q']
    r2_q = result_q.rsquared
    train_set['pred_q'] = k_q * train_set['Q']
    test_set['pred_q'] = k_q * test_set['Q']
    
    # 测试集中预测点和实际点之前的距离
    dis_train = np.sqrt((train_set['p'] - train_set['pred_p']) ** 2 
                                      + (train_set['q'] - train_set['pred_q']) ** 2)
    
    dis_test = np.sqrt((test_set['p'] - test_set['pred_p']) ** 2 
                                      + (test_set['q'] - test_set['pred_q']) ** 2)
    
    return dis_train, dis_test

$p=k_1\cdot P$

$q=k_2\cdot Q$

$D(\mathrm{fit, predict}) = \sqrt{(\hat p - p)^2 + (\hat q - q)^2}$

$\mathrm{Indicator} = \frac{\mathrm{D_{test} - D_{train}}}{\mathrm{D_{train}}}$

### 1. topology

In [5]:
coeff_cont_gnm = np.load(u +"estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_ba =  np.load(u +"estimate_barabasi_albert_graph(10000,3).npy")
coeff_cont_exp = np.load(u +"estimate_exponential_graph(10000,3).npy")
coeff_cont_gua = np.load(u +"estimate_gaussian_graph(10000,3).npy")
coeff_cont_log = np.load(u +"estimate_lognormal_graph(10000,3).npy")
coeff_cont_ws0 =  np.load(u +"estimate_watts_strogatz_graph(10000,6,0).npy")
coeff_cont_ws01 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.1).npy")
coeff_cont_ws03 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.3).npy")
coeff_cont_ws05 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.5).npy")
coeff_cont_ws07 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.7).npy")
coeff_cont_ws09 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.9).npy")
coeff_cont_ws10 = np.load(u +"estimate_watts_strogatz_graph(10000,6,1.0).npy")

In [6]:
d_cont = [coeff_cont_log,coeff_cont_ba,coeff_cont_exp,coeff_cont_gua,
          coeff_cont_gnm,coeff_cont_ws0,coeff_cont_ws01,coeff_cont_ws03,
          coeff_cont_ws05,coeff_cont_ws07,coeff_cont_ws09,coeff_cont_ws10]

title_cont = ['LOG','BA','EXP','GAU','ER','WS-0','WS-0.1','WS-0.3','WS-0.5','WS-0.7','WS-0.9','WS-1']

#### (1) P and p

In [8]:
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 2]
    Y_data = d_cont[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

LOG 
 R2: 0.9952, beta: 0.56
BA 
 R2: 0.9908, beta: 0.61
EXP 
 R2: 0.9989, beta: 0.73
GAU 
 R2: 0.9985, beta: 0.80
ER 
 R2: 0.9986, beta: 0.81
WS-0 
 R2: 0.9624, beta: 0.43
WS-0.1 
 R2: 0.9919, beta: 0.61
WS-0.3 
 R2: 0.9969, beta: 0.73
WS-0.5 
 R2: 0.9964, beta: 0.77
WS-0.7 
 R2: 0.9966, beta: 0.78
WS-0.9 
 R2: 0.9966, beta: 0.79
WS-1 
 R2: 0.9970, beta: 0.79


#### (2) Q and q

In [9]:
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 3]
    Y_data = d_cont[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

LOG 
 R2: 0.9883, beta: 0.08
BA 
 R2: 0.9916, beta: 0.13
EXP 
 R2: 0.9958, beta: 0.13
GAU 
 R2: 0.9996, beta: 0.17
ER 
 R2: 0.9991, beta: 0.20
WS-0 
 R2: 0.8970, beta: 0.36
WS-0.1 
 R2: 0.9769, beta: 0.30
WS-0.3 
 R2: 0.9917, beta: 0.25
WS-0.5 
 R2: 0.9975, beta: 0.23
WS-0.7 
 R2: 0.9984, beta: 0.22
WS-0.9 
 R2: 0.9984, beta: 0.22
WS-1 
 R2: 0.9985, beta: 0.22


#### (3) Cross-validation

In [32]:
res_dict = {}
for i, title in enumerate(title_cont):
    res_cont = []
    data = d_cont[i]
    for j in range(100):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict[title] = res_cont

In [33]:
for i, title in enumerate(title_cont):
    d = res_dict[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

LOG	  Train: 0.00181 (0.000044)	  Test: 0.00071 (0.000261)
BA	  Train: 0.00175 (0.000620)	  Test: 0.00350 (0.000510)
EXP	  Train: 0.00310 (0.000364)	  Test: 0.00371 (0.000353)
GAU	  Train: 0.00130 (0.000222)	  Test: 0.00127 (0.000218)
ER	  Train: 0.00175 (0.000451)	  Test: 0.00154 (0.000311)
WS-0	  Train: 0.01950 (0.000878)	  Test: 0.03314 (0.007656)
WS-0.1	  Train: 0.00934 (0.000624)	  Test: 0.00930 (0.000638)
WS-0.3	  Train: 0.01009 (0.001632)	  Test: 0.00690 (0.001141)
WS-0.5	  Train: 0.00358 (0.001755)	  Test: 0.00326 (0.000731)
WS-0.7	  Train: 0.00337 (0.000385)	  Test: 0.00261 (0.000020)
WS-0.9	  Train: 0.00188 (0.000608)	  Test: 0.00206 (0.000817)
WS-1	  Train: 0.00219 (0.000542)	  Test: 0.00243 (0.000324)


### 2.  average degree

In [12]:
coeff_cont_gnm_3 = np.load(u + "estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_gnm_4 = np.load(u + "estimate_gnm_random_graph(10000,40000).npy")
coeff_cont_gnm_5 = np.load(u + "estimate_gnm_random_graph(10000,50000).npy")
coeff_cont_gnm_6 = np.load(u + "estimate_gnm_random_graph(10000,60000).npy")
coeff_cont_gnm_7 = np.load(u + "estimate_gnm_random_graph(10000,70000).npy")
coeff_cont_gnm_8 = np.load(u + "estimate_gnm_random_graph(10000,80000).npy")
coeff_cont_gnm_9 = np.load(u + "estimate_gnm_random_graph(10000,90000).npy")
coeff_cont_gnm_10 = np.load(u + "estimate_gnm_random_graph(10000,100000).npy")

In [13]:
d_cont2 = [coeff_cont_gnm_3,coeff_cont_gnm_4,coeff_cont_gnm_5,coeff_cont_gnm_6,
                     coeff_cont_gnm_7,coeff_cont_gnm_8,coeff_cont_gnm_9,coeff_cont_gnm_10]
title_cont2 = ['gnm3', 'gnm4', 'gnm5', 'gnm6', 'gnm7', 'gnm8', 'gnm9', 'gnm10']

#### (1) P and p

In [14]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 2]
    Y_data = d_cont2[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

gnm3 
 R2: 0.9986, beta: 0.81
gnm4 
 R2: 0.9986, beta: 0.83
gnm5 
 R2: 0.9987, beta: 0.85
gnm6 
 R2: 0.9989, beta: 0.86
gnm7 
 R2: 0.9989, beta: 0.87
gnm8 
 R2: 0.9987, beta: 0.88
gnm9 
 R2: 0.9990, beta: 0.88
gnm10 
 R2: 0.9988, beta: 0.88


#### (2) Q and q

In [15]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 3]
    Y_data = d_cont2[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

gnm3 
 R2: 0.9991, beta: 0.20
gnm4 
 R2: 0.9993, beta: 0.15
gnm5 
 R2: 0.9995, beta: 0.11
gnm6 
 R2: 0.9995, beta: 0.09
gnm7 
 R2: 0.9994, beta: 0.08
gnm8 
 R2: 0.9994, beta: 0.07
gnm9 
 R2: 0.9995, beta: 0.06
gnm10 
 R2: 0.9995, beta: 0.06


### (3) Cross-validation

In [34]:
res_dict2 = {}
for i, title in enumerate(title_cont2):
    res_cont = []
    data = d_cont2[i]
    for j in range(100):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict2[title] = res_cont

In [45]:
for i, title in enumerate(title_cont2):
    d = res_dict[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

gnm3, mean:0.0248, std:0.1390
gnm4, mean:0.0636, std:0.1550
gnm5, mean:0.0461, std:0.1104
gnm6, mean:0.0490, std:0.1253
gnm7, mean:0.0540, std:0.1285
gnm8, mean:0.0250, std:0.0976
gnm9, mean:0.0306, std:0.1249
gnm10, mean:0.0487, std:0.1055


### 3. decision rule

In [16]:
coeff_cont_gnm01 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.1.npy")
coeff_cont_gnm03 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.3.npy")
coeff_cont_gnm05 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.5.npy")
coeff_cont_gnm07 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.7.npy")
coeff_cont_gnm09 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.9.npy")
coeff_cont_gnm10 = np.load(u +"estimate_gnm_random_graph(10000,30000),1.0.npy")

In [17]:
d_cont3 = [coeff_cont_gnm01,coeff_cont_gnm03,coeff_cont_gnm05,coeff_cont_gnm07,coeff_cont_gnm09,coeff_cont_gnm10]
title_cont3 = ['alpha0.1', 'alpha0.3', 'alpha0.5', 'alpha0.7', 'alpha0.9', 'alpha1.0']

#### (1) P and p

In [18]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 2]
    Y_data = d_cont3[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

alpha0.1 
 R2: 0.9986, beta: 0.81
alpha0.3 
 R2: 0.9986, beta: 0.81
alpha0.5 
 R2: 0.9985, beta: 0.81
alpha0.7 
 R2: 0.9987, beta: 0.82
alpha0.9 
 R2: 0.9984, beta: 0.80
alpha1.0 
 R2: 0.9985, beta: 0.80


#### (2) Q and q

In [19]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 3]
    Y_data = d_cont3[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

alpha0.1 
 R2: 0.9993, beta: 0.24
alpha0.3 
 R2: 0.9992, beta: 0.36
alpha0.5 
 R2: 0.9954, beta: 0.54
alpha0.7 
 R2: 0.9990, beta: 0.79
alpha0.9 
 R2: 0.9951, beta: 1.17
alpha1.0 
 R2: 0.9989, beta: 1.41


In [35]:
res_dict3 = {}
for i, title in enumerate(title_cont3):
    res_cont = []
    data = d_cont3[i]
    for j in range(100):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict3[title] = res_cont

In [36]:
for i, title in enumerate(title_cont3):
    d = res_dict3[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

alpha0.1	  Train: 0.00178 (0.000367)	  Test: 0.00179 (0.000480)
alpha0.3	  Train: 0.00311 (0.000984)	  Test: 0.00332 (0.001081)
alpha0.5	  Train: 0.01469 (0.003008)	  Test: 0.01700 (0.003477)
alpha0.7	  Train: 0.00653 (0.000166)	  Test: 0.00834 (0.000746)
alpha0.9	  Train: 0.02125 (0.002651)	  Test: 0.03647 (0.000758)
alpha1.0	  Train: 0.01437 (0.002607)	  Test: 0.01387 (0.001840)


### 4. individual heterogeneity

In [20]:
coeff_cont_sgima01 = np.load(u +"estimate_sigma-0.1.npy")
coeff_cont_sgima02 = np.load(u +"estimate_sigma-0.2.npy")
coeff_cont_sgima04 = np.load(u +"estimate_sigma-0.4.npy")
coeff_cont_sgima06 = np.load(u +"estimate_sigma-0.6.npy")
coeff_cont_sgima08 = np.load(u +"estimate_sigma-0.8.npy")
coeff_cont_sgima10 = np.load(u +"estimate_sigma-1.0.npy")

In [21]:
d_cont4 = [coeff_cont_sgima01,coeff_cont_sgima02,coeff_cont_sgima04,
           coeff_cont_sgima06,coeff_cont_sgima08,coeff_cont_sgima10]
title_cont4 = ['sigma0.1', 'sigma0.2', 'sigma0.4', 'sigma0.6', 'sigma0.8', 'sigma1.0']

#### (1) P and p

In [22]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 2]
    Y_data = d_cont4[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

sigma0.1 
 R2: 0.9982, beta: 0.80
sigma0.2 
 R2: 0.9980, beta: 0.79
sigma0.4 
 R2: 0.9978, beta: 0.75
sigma0.6 
 R2: 0.9976, beta: 0.68
sigma0.8 
 R2: 0.9977, beta: 0.61
sigma1.0 
 R2: 0.9973, beta: 0.55


#### (2) Q and q

In [23]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 3]
    Y_data = d_cont4[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

sigma0.1 
 R2: 0.9991, beta: 0.20
sigma0.2 
 R2: 0.9990, beta: 0.20
sigma0.4 
 R2: 0.9988, beta: 0.20
sigma0.6 
 R2: 0.9960, beta: 0.20
sigma0.8 
 R2: 0.9961, beta: 0.20
sigma1.0 
 R2: 0.9954, beta: 0.19


### (3) Cross-validation

In [37]:
res_dict4 = {}
for i, title in enumerate(title_cont4):
    res_cont = []
    data = d_cont4[i]
    for j in range(100):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict4[title] = res_cont

In [38]:
for i, title in enumerate(title_cont4):
    d = res_dict4[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

sigma0.1	  Train: 0.00191 (0.000558)	  Test: 0.00194 (0.000746)
sigma0.2	  Train: 0.00263 (0.000213)	  Test: 0.00229 (0.000154)
sigma0.4	  Train: 0.00200 (0.000595)	  Test: 0.00228 (0.000313)
sigma0.6	  Train: 0.00426 (0.000041)	  Test: 0.00466 (0.000039)
sigma0.8	  Train: 0.00175 (0.000251)	  Test: 0.00136 (0.000474)
sigma1.0	  Train: 0.00179 (0.000218)	  Test: 0.00170 (0.000372)
