In [1]:
%matplotlib inline
import pylab as pl
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas as pd
import networkx as nx
import random

  from pandas.core import datetools


In [2]:
from __future__ import print_function
import time
from scipy import stats

In [3]:
u = "C:/Users/XIAOYU/PycharmProjects/A-fast-method/auto_data/"

In [4]:
def calcu_distance(data, num_train=50):
    '''
    data: [[p, q, P, Q, M, r2]]
    '''
    df = pd.DataFrame(data, columns=['p', 'q', 'P', 'Q', 'M', 'r2'])
    
    # 随机生成训练集和测试集
    num_tol = len(data)
    idx_cont = np.arange(num_tol)
    idx_train = np.random.choice(idx_cont, size=num_train, replace=False)  # the index for the training set
    idx_test = np.array([i for i in idx_cont if i not in idx_train])  # the remaining is the test set
    train_set = df.loc[idx_train, ['p', 'q', 'P', 'Q']]
    test_set = df.loc[idx_test, ['p', 'q', 'P', 'Q']]
    
    # 预测 p
    result_p = smf.ols('p ~ P + Q -1', data=train_set).fit()
    k_p_p = result_p.params['P']
    k_p_q = result_p.params['Q']
    train_set['pred_p'] = k_p_p * train_set['P'] + k_p_q * train_set['Q']
    test_set['pred_p'] = k_p_p * test_set['P'] + k_p_q * test_set['Q']
    
    # 预测 q
    result_q = smf.ols('q ~ P + Q -1', data=train_set).fit()
    k_q_p = result_q.params['P']
    k_q_q = result_q.params['Q']
    
    train_set['pred_q'] = k_q_p * train_set['P'] + k_q_q * train_set['Q']
    test_set['pred_q'] = k_q_p * test_set['P'] + k_q_q * test_set['Q']
    
    # 预测点和实际点之间的距离    
    dis_train = np.sqrt(np.square(train_set.p - train_set.pred_p) + np.square(train_set.q - train_set.pred_q))
    dis_test = np.sqrt(np.square(test_set.p - test_set.pred_p) + np.square(test_set.q - test_set.pred_q))
    
    return dis_train, dis_test

In [5]:
def grid_distance(data, num_samp=10000):
    df = pd.DataFrame(data, columns=['p', 'q', 'P', 'Q', 'M', 'r2'])
    min_p, max_p = df.p.min(), df.p.max()
    min_q, max_q = df.q.min(), df.q.max()
    samp_p = (max_p - min_p) * np.random.random(num_samp) + min_p
    samp_q = (max_q - min_q) * np.random.random(num_samp) + min_q
    d_cont = []
    for i in df.index:
        d = np.mean(np.sqrt(np.square(samp_p - df.p[i]) + np.square(samp_q - df.q[i])))
        d_cont.append(d)
    
    return np.mean(d_cont)

$p=k_1\cdot P$

$q=k_2\cdot Q$

$D(\mathrm{fit, predict}) = \sqrt{(\hat p - p)^2 + (\hat q - q)^2}$

### 1. topology

In [6]:
coeff_cont_gnm = np.load(u +"estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_ba =  np.load(u +"estimate_barabasi_albert_graph(10000,3).npy")
coeff_cont_exp = np.load(u +"estimate_exponential_graph(10000,3).npy")
coeff_cont_gua = np.load(u +"estimate_gaussian_graph(10000,3).npy")
coeff_cont_log = np.load(u +"estimate_lognormal_graph(10000,3).npy")
coeff_cont_full = np.load(u+"estimate_complete_graph(10000).npy")
coeff_cont_ws0 =  np.load(u +"estimate_watts_strogatz_graph(10000,6,0).npy")
coeff_cont_ws01 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.1).npy")
coeff_cont_ws03 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.3).npy")
coeff_cont_ws05 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.5).npy")
coeff_cont_ws07 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.7).npy")
coeff_cont_ws09 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.9).npy")
coeff_cont_ws10 = np.load(u +"estimate_watts_strogatz_graph(10000,6,1.0).npy")

In [7]:
d_cont = [coeff_cont_log, coeff_cont_ba, coeff_cont_exp, coeff_cont_gua,
          coeff_cont_gnm, coeff_cont_full,
          coeff_cont_ws0, coeff_cont_ws01, coeff_cont_ws03,
          coeff_cont_ws05, coeff_cont_ws07, coeff_cont_ws09, coeff_cont_ws10]

title_cont = ['LOG','BA','EXP','GAU','ER', 'Full', 'WS-0','WS-0.1','WS-0.3','WS-0.5','WS-0.7','WS-0.9','WS-1']

In [8]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont]
std_r2 = [np.std(x[:, 5]) for x in d_cont]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont[i], '\t' ,'%6.4f, %.4f' % tuple(x))

LOG 	 0.9959, 0.0042
BA 	 0.9920, 0.0037
EXP 	 0.9988, 0.0016
GAU 	 0.9993, 0.0004
ER 	 0.9992, 0.0005
Full 	 0.9988, 0.0011
WS-0 	 0.9746, 0.0149
WS-0.1 	 0.9947, 0.0025
WS-0.3 	 0.9975, 0.0026
WS-0.5 	 0.9987, 0.0007
WS-0.7 	 0.9988, 0.0007
WS-0.9 	 0.9989, 0.0007
WS-1 	 0.9988, 0.0007


#### (1) P and p

In [None]:
res_p = []
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 2]
    Y_data = d_cont[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s || R2: %.4f, beta: %.2E' %(title, r2, a))
    res_p.append([r2, a])

In [None]:
res_p

#### (2) Q and q

In [None]:
res_q = []
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 3]
    Y_data = d_cont[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s || R2: %.4f, beta: %.2E' %(title, r2, a))
    res_q.append([r2, a])

#### (3) Cross-validation

In [9]:
res_dict = {}
for i, title in enumerate(title_cont):
    res_cont = []
    data = d_cont[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict[title] = res_cont

In [10]:
mean_dict1 = {}
for i, title in enumerate(title_cont):
    d = res_dict[title]
    
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict1[title] = [mean_train, mean_test]
    
    print ('%s' % title, end='\t')
    print ('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print ('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

LOG	  Train: 0.00084 (0.000269)	  Test: 0.00267 (0.000117)
BA	  Train: 0.00230 (0.000317)	  Test: 0.00102 (0.000257)
EXP	  Train: 0.00077 (0.000285)	  Test: 0.00086 (0.000221)
GAU	  Train: 0.00089 (0.000172)	  Test: 0.00066 (0.000179)
ER	  Train: 0.00071 (0.000300)	  Test: 0.00094 (0.000195)
Full	  Train: 0.00110 (0.000197)	  Test: 0.00094 (0.000161)
WS-0	  Train: 0.01280 (0.001728)	  Test: 0.02730 (0.004131)
WS-0.1	  Train: 0.00376 (0.001026)	  Test: 0.00249 (0.000754)
WS-0.3	  Train: 0.00367 (0.000107)	  Test: 0.00139 (0.000477)
WS-0.5	  Train: 0.00220 (0.000329)	  Test: 0.00194 (0.000506)
WS-0.7	  Train: 0.00199 (0.000220)	  Test: 0.00242 (0.000382)
WS-0.9	  Train: 0.00214 (0.000667)	  Test: 0.00139 (0.000536)
WS-1	  Train: 0.00192 (0.000745)	  Test: 0.00139 (0.000123)


#### (4) 和随机抽取点进行对比

In [11]:
for i, title in enumerate(title_cont):
    data = d_cont[i]
    samp = mean_dict1[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f\t pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

LOG	0.01419	t-statistic = -98.405	 pvalue = 0.0065
BA	0.01816	t-statistic = -66.791	 pvalue = 0.0095
EXP	0.01596	t-statistic = -68.213	 pvalue = 0.0093
GAU	0.01621	t-statistic = -87.005	 pvalue = 0.0073
ER	0.01769	t-statistic = -86.037	 pvalue = 0.0074
Full	0.01656	t-statistic = -96.758	 pvalue = 0.0066
WS-0	0.01971	t-statistic =  1.836	 pvalue = 0.3175
WS-0.1	0.01648	t-statistic = -18.548	 pvalue = 0.0343
WS-0.3	0.01615	t-statistic = -30.911	 pvalue = 0.0206
WS-0.5	0.02996	t-statistic = -55.399	 pvalue = 0.0115
WS-0.7	0.03002	t-statistic = -72.306	 pvalue = 0.0088
WS-0.9	0.03010	t-statistic = -53.515	 pvalue = 0.0119
WS-1	0.02996	t-statistic = -231.367	 pvalue = 0.0028


### 2.  average degree

In [12]:
coeff_cont_gnm_3 = np.load(u + "estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_gnm_4 = np.load(u + "estimate_gnm_random_graph(10000,40000).npy")
coeff_cont_gnm_5 = np.load(u + "estimate_gnm_random_graph(10000,50000).npy")
coeff_cont_gnm_6 = np.load(u + "estimate_gnm_random_graph(10000,60000).npy")
coeff_cont_gnm_7 = np.load(u + "estimate_gnm_random_graph(10000,70000).npy")
coeff_cont_gnm_8 = np.load(u + "estimate_gnm_random_graph(10000,80000).npy")
coeff_cont_gnm_9 = np.load(u + "estimate_gnm_random_graph(10000,90000).npy")
coeff_cont_gnm_10 = np.load(u + "estimate_gnm_random_graph(10000,100000).npy")

In [13]:
d_cont2 = [coeff_cont_gnm_3,coeff_cont_gnm_4,coeff_cont_gnm_5,coeff_cont_gnm_6,
                     coeff_cont_gnm_7,coeff_cont_gnm_8,coeff_cont_gnm_9,coeff_cont_gnm_10]
title_cont2 = ['gnm3', 'gnm4', 'gnm5', 'gnm6', 'gnm7', 'gnm8', 'gnm9', 'gnm10']

In [14]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont2]
std_r2 = [np.std(x[:, 5]) for x in d_cont2]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont2[i], '\t' ,'%6.4f, %.4f' % tuple(x))

gnm3 	 0.9992, 0.0005
gnm4 	 0.9992, 0.0005
gnm5 	 0.9993, 0.0004
gnm6 	 0.9994, 0.0004
gnm7 	 0.9994, 0.0004
gnm8 	 0.9995, 0.0004
gnm9 	 0.9994, 0.0004
gnm10 	 0.9995, 0.0004


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 2]
    Y_data = d_cont2[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 3]
    Y_data = d_cont2[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [16]:
res_dict2 = {}
t1= time.clock()
for i, title in enumerate(title_cont2):
    res_cont = []
    data = d_cont2[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict2[title] = res_cont
print('Time elapsed: %.2fs ' % (time.clock() - t1))

Time elapsed: 98.44s 


In [17]:
mean_dict2 = {}
for i, title in enumerate(title_cont2):
    d = res_dict2[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict2[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

gnm3	  Train: 0.00047 (0.000117)	  Test: 0.00097 (0.000250)
gnm4	  Train: 0.00158 (0.000229)	  Test: 0.00033 (0.000125)
gnm5	  Train: 0.00058 (0.000190)	  Test: 0.00041 (0.000080)
gnm6	  Train: 0.00069 (0.000280)	  Test: 0.00045 (0.000139)
gnm7	  Train: 0.00049 (0.000160)	  Test: 0.00050 (0.000163)
gnm8	  Train: 0.00063 (0.000137)	  Test: 0.00056 (0.000153)
gnm9	  Train: 0.00042 (0.000164)	  Test: 0.00028 (0.000025)
gnm10	  Train: 0.00031 (0.000123)	  Test: 0.00036 (0.000075)


#### (4) 和随机抽取点对比

In [18]:
for i, title in enumerate(title_cont2):
    data = d_cont2[i]    
    samp = mean_dict2[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f\t pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

gnm3	0.01776	t-statistic = -67.225	 pvalue = 0.0095
gnm4	0.01489	t-statistic = -116.576	 pvalue = 0.0055
gnm5	0.01334	t-statistic = -161.440	 pvalue = 0.0039
gnm6	0.01248	t-statistic = -86.341	 pvalue = 0.0074
gnm7	0.01280	t-statistic = -75.508	 pvalue = 0.0084
gnm8	0.01233	t-statistic = -76.733	 pvalue = 0.0083
gnm9	0.01090	t-statistic = -427.449	 pvalue = 0.0015
gnm10	0.01126	t-statistic = -144.842	 pvalue = 0.0044


### 3. decision rule

In [19]:
coeff_cont_gnm01 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.1.npy")
coeff_cont_gnm03 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.3.npy")
coeff_cont_gnm05 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.5.npy")
coeff_cont_gnm07 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.7.npy")
coeff_cont_gnm09 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.9.npy")
coeff_cont_gnm10 = np.load(u +"estimate_gnm_random_graph(10000,30000),1.0.npy")

In [20]:
d_cont3 = [coeff_cont_gnm01,coeff_cont_gnm03,coeff_cont_gnm05,coeff_cont_gnm07,coeff_cont_gnm09,coeff_cont_gnm10]
title_cont3 = ['alpha0.1', 'alpha0.3', 'alpha0.5', 'alpha0.7', 'alpha0.9', 'alpha1.0']

In [21]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont3]
std_r2 = [np.std(x[:, 5]) for x in d_cont3]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont3[i], '\t' ,'%6.4f, %.4f' % tuple(x))

alpha0.1 	 0.9991, 0.0005
alpha0.3 	 0.9991, 0.0005
alpha0.5 	 0.9987, 0.0013
alpha0.7 	 0.9989, 0.0007
alpha0.9 	 0.9987, 0.0012
alpha1.0 	 0.9989, 0.0007


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 2]
    Y_data = d_cont3[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 3]
    Y_data = d_cont3[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [22]:
res_dict3 = {}
t1= time.clock()
for i, title in enumerate(title_cont3):
    res_cont = []
    data = d_cont3[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict3[title] = res_cont
print('Time elapsed: %.2fs ' % (time.clock() - t1))

Time elapsed: 75.92s 


In [23]:
mean_dict3 = {}
for i, title in enumerate(title_cont3):
    d = res_dict3[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict3[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

alpha0.1	  Train: 0.00146 (0.000262)	  Test: 0.00135 (0.000448)
alpha0.3	  Train: 0.00181 (0.000662)	  Test: 0.00262 (0.000272)
alpha0.5	  Train: 0.01008 (0.003014)	  Test: 0.00284 (0.000171)
alpha0.7	  Train: 0.00553 (0.001758)	  Test: 0.00328 (0.000827)
alpha0.9	  Train: 0.00719 (0.001705)	  Test: 0.00935 (0.000711)
alpha1.0	  Train: 0.00445 (0.000480)	  Test: 0.00448 (0.000904)


#### (4) 和随机取点的对比

In [24]:
for i, title in enumerate(title_cont3):
    data = d_cont3[i]
    samp = mean_dict3[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

alpha0.1	0.01825	t-statistic = -37.705 pvalue = 0.0169
alpha0.3	0.02385	t-statistic = -78.009 pvalue = 0.0082
alpha0.5	0.03223	t-statistic = -171.614 pvalue = 0.0037
alpha0.7	0.04457	t-statistic = -49.937 pvalue = 0.0127
alpha0.9	0.06227	t-statistic = -74.458 pvalue = 0.0085
alpha1.0	0.07358	t-statistic = -76.425 pvalue = 0.0083


### 4. individual heterogeneity

In [25]:
coeff_cont_sgima01 = np.load(u +"estimate_sigma-0.1.npy")
coeff_cont_sgima02 = np.load(u +"estimate_sigma-0.2.npy")
coeff_cont_sgima04 = np.load(u +"estimate_sigma-0.4.npy")
coeff_cont_sgima06 = np.load(u +"estimate_sigma-0.6.npy")
coeff_cont_sgima08 = np.load(u +"estimate_sigma-0.8.npy")
coeff_cont_sgima10 = np.load(u +"estimate_sigma-1.0.npy")

In [26]:
d_cont4 = [coeff_cont_sgima01,coeff_cont_sgima02,coeff_cont_sgima04,
           coeff_cont_sgima06,coeff_cont_sgima08,coeff_cont_sgima10]
title_cont4 = ['sigma0.1', 'sigma0.2', 'sigma0.4', 'sigma0.6', 'sigma0.8', 'sigma1.0']

In [27]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont4]
std_r2 = [np.std(x[:, 5]) for x in d_cont4]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont4[i], '\t' ,'%6.4f, %.4f' % tuple(x))

sigma0.1 	 0.9991, 0.0005
sigma0.2 	 0.9992, 0.0005
sigma0.4 	 0.9989, 0.0006
sigma0.6 	 0.9987, 0.0007
sigma0.8 	 0.9985, 0.0009
sigma1.0 	 0.9985, 0.0010


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 2]
    Y_data = d_cont4[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 3]
    Y_data = d_cont4[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [28]:
res_dict4 = {}
t1 = time.clock()
for i, title in enumerate(title_cont4):
    res_cont = []
    data = d_cont4[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict4[title] = res_cont
print('Time elapsed: %.2fs ' % (time.clock() - t1))

Time elapsed: 73.35s 


In [29]:
mean_dict4 = {}
for i, title in enumerate(title_cont4):
    d = res_dict4[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict4[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

sigma0.1	  Train: 0.00151 (0.000041)	  Test: 0.00093 (0.000234)
sigma0.2	  Train: 0.00144 (0.000344)	  Test: 0.00102 (0.000135)
sigma0.4	  Train: 0.00157 (0.000495)	  Test: 0.00107 (0.000249)
sigma0.6	  Train: 0.00113 (0.000169)	  Test: 0.00290 (0.000250)
sigma0.8	  Train: 0.00301 (0.000295)	  Test: 0.00094 (0.000296)
sigma1.0	  Train: 0.00241 (0.000459)	  Test: 0.00227 (0.000483)


#### (4) 和随机取点的对比

In [30]:
for i, title in enumerate(title_cont4):
    data = d_cont4[i]
    samp = mean_dict4[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

sigma0.1	0.02131	t-statistic = -87.197 pvalue = 0.0073
sigma0.2	0.02127	t-statistic = -149.552 pvalue = 0.0043
sigma0.4	0.02084	t-statistic = -79.453 pvalue = 0.0080
sigma0.6	0.02076	t-statistic = -71.494 pvalue = 0.0089
sigma0.8	0.01818	t-statistic = -58.256 pvalue = 0.0109
sigma1.0	0.01818	t-statistic = -32.934 pvalue = 0.0193


#### 5. GMM decision rule (for online supplementary document)

In [7]:
coeff_gmm_3 = np.load(u +"estimate_gnm_random_graph(10000,30000)-gmm.npy")
coeff_gmm_4 = np.load(u +"estimate_gnm_random_graph(10000,40000)-gmm.npy")
coeff_gmm_5 = np.load(u +"estimate_gnm_random_graph(10000,50000)-gmm.npy")
coeff_gmm_6 = np.load(u +"estimate_gnm_random_graph(10000,60000)-gmm.npy")
coeff_gmm_7 = np.load(u +"estimate_gnm_random_graph(10000,70000)-gmm.npy")
coeff_gmm_8 = np.load(u +"estimate_gnm_random_graph(10000,80000)-gmm.npy")
coeff_gmm_9 = np.load(u +"estimate_gnm_random_graph(10000,90000)-gmm.npy")
coeff_gmm_10 = np.load(u +"estimate_gnm_random_graph(10000,100000)-gmm.npy")

In [8]:
d_cont5 = [coeff_gmm_3, coeff_gmm_4, coeff_gmm_5, coeff_gmm_6, coeff_gmm_7, coeff_gmm_8, coeff_gmm_9, coeff_gmm_10, ]
title_cont5 = ['gmm6', 'gmm8', 'gmm10', 'gmm12', 'gmm14', 'gmm16', 'gmm18', 'gmm20']

In [10]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont5]
std_r2 = [np.std(x[:, 5]) for x in d_cont5]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont5[i], '\t' ,'%6.4f, %.4f' % tuple(x))

gmm6 	 0.9984, 0.0009
gmm8 	 0.9986, 0.0008
gmm10 	 0.9986, 0.0009
gmm12 	 0.9987, 0.0007
gmm14 	 0.9988, 0.0007
gmm16 	 0.9988, 0.0007
gmm18 	 0.9988, 0.0007
gmm20 	 0.9988, 0.0006


#### (1) P and p

In [11]:
for i, title in enumerate(title_cont5):
    X_data = d_cont5[i][:, 2]
    Y_data = d_cont5[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

gmm6 
 R2: 0.9969, beta: 0.73
gmm8 
 R2: 0.9972, beta: 0.75
gmm10 
 R2: 0.9975, beta: 0.76
gmm12 
 R2: 0.9973, beta: 0.78
gmm14 
 R2: 0.9972, beta: 0.78
gmm16 
 R2: 0.9972, beta: 0.78
gmm18 
 R2: 0.9975, beta: 0.79
gmm20 
 R2: 0.9977, beta: 0.79


#### (2) q and Q

In [12]:
for i, title in enumerate(title_cont5):
    X_data = d_cont5[i][:, 3]
    Y_data = d_cont5[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

gmm6 
 R2: 0.9992, beta: 0.21
gmm8 
 R2: 0.9992, beta: 0.15
gmm10 
 R2: 0.9988, beta: 0.12
gmm12 
 R2: 0.9989, beta: 0.10
gmm14 
 R2: 0.9990, beta: 0.09
gmm16 
 R2: 0.9992, beta: 0.07
gmm18 
 R2: 0.9992, beta: 0.07
gmm20 
 R2: 0.9993, beta: 0.06


#### (3) Cross-validation

In [13]:
res_dict5 = {}
t1 = time.clock()
for i, title in enumerate(title_cont5):
    res_cont = []
    data = d_cont5[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict5[title] = res_cont
print('Time elapsed: %.2fs ' % (time.clock() - t1))

Time elapsed: 109.27s 


In [14]:
mean_dict5 = {}
for i, title in enumerate(title_cont5):
    d = res_dict5[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict5[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

gmm6	  Train: 0.00181 (0.000293)	  Test: 0.00182 (0.000703)
gmm8	  Train: 0.00150 (0.000028)	  Test: 0.00114 (0.000362)
gmm10	  Train: 0.00112 (0.000096)	  Test: 0.00101 (0.000144)
gmm12	  Train: 0.00058 (0.000233)	  Test: 0.00096 (0.000351)
gmm14	  Train: 0.00106 (0.000289)	  Test: 0.00110 (0.000177)
gmm16	  Train: 0.00065 (0.000238)	  Test: 0.00043 (0.000179)
gmm18	  Train: 0.00049 (0.000198)	  Test: 0.00041 (0.000235)
gmm20	  Train: 0.00032 (0.000183)	  Test: 0.00056 (0.000157)


#### (4) 和随机取点的对比

In [15]:
for i, title in enumerate(title_cont5):
    data = d_cont5[i]
    samp = mean_dict5[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

gmm6	0.01827	t-statistic = -23.405 pvalue = 0.0272
gmm8	0.01503	t-statistic = -38.391 pvalue = 0.0166
gmm10	0.01321	t-statistic = -84.792 pvalue = 0.0075
gmm12	0.01322	t-statistic = -34.953 pvalue = 0.0182
gmm14	0.01236	t-statistic = -63.776 pvalue = 0.0100
gmm16	0.01148	t-statistic = -61.604 pvalue = 0.0103
gmm18	0.01099	t-statistic = -45.117 pvalue = 0.0141
gmm20	0.01012	t-statistic = -60.829 pvalue = 0.0105


### 6. different points of data (for online supplementary document)

In [16]:
coeff_peak_1 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak-1.npy")
coeff_peak0 =  np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak0.npy")
coeff_peak1 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak1.npy")
coeff_peak2 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak2.npy")
coeff_peak3 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak3.npy")
coeff_peak4 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak4.npy")
coeff_peak5 = np.load(u +"estimate_gnm_random_graph(10000,30000)_Peak5.npy")

In [17]:
d_cont6 = [coeff_peak_1, coeff_peak0, coeff_peak1, coeff_peak2, coeff_peak3, coeff_peak4, coeff_peak5]
title_cont6 = ['P-1', 'P', 'P+1', 'P+2', 'P+3', 'P+4', 'P+5']

In [18]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont6]
std_r2 = [np.std(x[:, 5]) for x in d_cont6]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont6[i], '\t' ,'%6.4f, %.4f' % tuple(x))

P-1 	 0.9996, 0.0004
P 	 0.9995, 0.0003
P+1 	 0.9992, 0.0005
P+2 	 0.9984, 0.0009
P+3 	 0.9972, 0.0014
P+4 	 0.9958, 0.0019
P+5 	 0.9945, 0.0023


####  (2) p and P

In [19]:
for i, title in enumerate(title_cont6):
    X_data = d_cont6[i][:, 2]
    Y_data = d_cont6[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s, R2: %.4f, beta: %.2f' %(title, r2, a))

P-1, R2: 0.9984, beta: 0.78
P, R2: 0.9987, beta: 0.80
P+1, R2: 0.9986, beta: 0.81
P+2, R2: 0.9981, beta: 0.80
P+3, R2: 0.9975, beta: 0.79
P+4, R2: 0.9970, beta: 0.78
P+5, R2: 0.9965, beta: 0.77


#### (3) q and Q

In [20]:
for i, title in enumerate(title_cont6):
    X_data = d_cont6[i][:, 3]
    Y_data = d_cont6[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s, R2: %.4f, beta: %.2f' %(title, r2, a))

P-1, R2: 0.9984, beta: 0.19
P, R2: 0.9990, beta: 0.19
P+1, R2: 0.9991, beta: 0.20
P+2, R2: 0.9993, beta: 0.20
P+3, R2: 0.9992, beta: 0.21
P+4, R2: 0.9992, beta: 0.21
P+5, R2: 0.9993, beta: 0.22


#### (4) Distance

In [21]:
res_dict6 = {}
for i, title in enumerate(title_cont6):
    res_cont = []
    data = d_cont6[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict6[title] = res_cont

In [22]:
mean_dict6 = {}
for i, title in enumerate(title_cont6):
    d = res_dict6[title]
    
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict6[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print ('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

P-1	  Train: 0.00217 (0.000474)	  Test: 0.00140 (0.000403)
P	  Train: 0.00057 (0.000105)	  Test: 0.00108 (0.000277)
P+1	  Train: 0.00097 (0.000198)	  Test: 0.00086 (0.000332)
P+2	  Train: 0.00095 (0.000295)	  Test: 0.00079 (0.000150)
P+3	  Train: 0.00162 (0.000387)	  Test: 0.00062 (0.000206)
P+4	  Train: 0.00129 (0.000477)	  Test: 0.00099 (0.000245)
P+5	  Train: 0.00156 (0.000484)	  Test: 0.00088 (0.000201)


#### (4) 和随机取点的对比

In [23]:
for i, title in enumerate(title_cont6):
    data = d_cont6[i]
    samp = mean_dict6[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x, end='\t')
    print('t-statistic = %6.3f\t pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

P-1	0.01775	t-statistic = -40.572	 pvalue = 0.0157
P	0.01769	t-statistic = -59.975	 pvalue = 0.0106
P+1	0.01769	t-statistic = -50.770	 pvalue = 0.0125
P+2	0.01776	t-statistic = -113.437	 pvalue = 0.0056
P+3	0.01772	t-statistic = -82.929	 pvalue = 0.0077
P+4	0.01772	t-statistic = -68.193	 pvalue = 0.0093
P+5	0.01770	t-statistic = -83.693	 pvalue = 0.0076
