In [1]:
%matplotlib inline
import pylab as pl
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas as pd
import networkx as nx
import random

  from pandas.core import datetools


In [2]:
from __future__ import print_function
import time
from scipy import stats

In [3]:
u = "C:/Users/XIAOYU/PycharmProjects/A-fast-method/auto_data/"

In [4]:
def calcu_distance(data, num_train=50):
    '''
    data: [[p, q, P, Q, M, r2]]
    '''
    df = pd.DataFrame(data, columns=['p', 'q', 'P', 'Q', 'M', 'r2'])
    
    # 随机生成训练集和测试集
    num_tol = len(data)
    idx_cont = np.arange(num_tol)
    idx_train = np.random.choice(idx_cont, size=num_train, replace=False)  # for the training set
    idx_test = np.array([i for i in idx_cont if i not in idx_train])  # the remaining is the test set
    train_set = df.loc[idx_train, ['p', 'q', 'P', 'Q']]
    test_set = df.loc[idx_test, ['p', 'q', 'P', 'Q']]
    
    # 预测 p
    result_p = smf.ols('p ~ P-1', data=train_set).fit()
    k_p = result_p.params['P']
    r2_p = result_p.rsquared
    train_set['pred_p'] = k_p * train_set['P']
    test_set['pred_p'] = k_p * test_set['P']
    
    # 预测 q
    result_q = smf.ols('q ~ Q-1', data=train_set).fit()
    k_q = result_q.params['Q']
    r2_q = result_q.rsquared
    train_set['pred_q'] = k_q * train_set['Q']
    test_set['pred_q'] = k_q * test_set['Q']
    
    # 测试集中预测点和实际点之前的距离
    #dis_train = np.sqrt((train_set['p'] - train_set['pred_p']) ** 2  + (train_set['q'] - train_set['pred_q']) ** 2)
    #dis_test = np.sqrt((test_set['p'] - test_set['pred_p']) ** 2 + (test_set['q'] - test_set['pred_q']) ** 2)
    
    dis_train = np.sqrt(np.square(train_set.p - train_set.pred_p) + np.square(train_set.q - train_set.pred_q))
    dis_test = np.sqrt(np.square(test_set.p - test_set.pred_p) + np.square(test_set.q - test_set.pred_q))
    
    return dis_train, dis_test

In [5]:
def grid_distance(data, num_samp=10000):
    df = pd.DataFrame(data, columns=['p', 'q', 'P', 'Q', 'M', 'r2'])
    min_p, max_p = df.p.min(), df.p.max()
    min_q, max_q = df.q.min(), df.q.max()
    samp_p = (max_p - min_p) * np.random.random(num_samp) + min_p
    samp_q = (max_q - min_q) * np.random.random(num_samp) + min_q
    d_cont = []
    for i in df.index:
        d = np.mean(np.sqrt(np.square(samp_p - df.p[i]) + np.square(samp_q - df.q[i])))
        d_cont.append(d)
    
    return np.mean(d_cont)

$p=k_1\cdot P$

$q=k_2\cdot Q$

$D(\mathrm{fit, predict}) = \sqrt{(\hat p - p)^2 + (\hat q - q)^2}$

### 1. topology

In [6]:
coeff_cont_gnm = np.load(u +"estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_ba =  np.load(u +"estimate_barabasi_albert_graph(10000,3).npy")
coeff_cont_exp = np.load(u +"estimate_exponential_graph(10000,3).npy")
coeff_cont_gua = np.load(u +"estimate_gaussian_graph(10000,3).npy")
coeff_cont_log = np.load(u +"estimate_lognormal_graph(10000,3).npy")
coeff_cont_full = np.load(u+"estimate_complete_graph(10000).npy")
coeff_cont_ws0 =  np.load(u +"estimate_watts_strogatz_graph(10000,6,0).npy")
coeff_cont_ws01 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.1).npy")
coeff_cont_ws03 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.3).npy")
coeff_cont_ws05 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.5).npy")
coeff_cont_ws07 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.7).npy")
coeff_cont_ws09 = np.load(u +"estimate_watts_strogatz_graph(10000,6,0.9).npy")
coeff_cont_ws10 = np.load(u +"estimate_watts_strogatz_graph(10000,6,1.0).npy")

In [7]:
d_cont = [coeff_cont_log, coeff_cont_ba, coeff_cont_exp, coeff_cont_gua,
          coeff_cont_gnm, coeff_cont_full,
          coeff_cont_ws0, coeff_cont_ws01, coeff_cont_ws03,
          coeff_cont_ws05, coeff_cont_ws07, coeff_cont_ws09, coeff_cont_ws10]

title_cont = ['LOG','BA','EXP','GAU','ER', 'Full', 'WS-0','WS-0.1','WS-0.3','WS-0.5','WS-0.7','WS-0.9','WS-1']

In [8]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont]
std_r2 = [np.std(x[:, 5]) for x in d_cont]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont[i], '\t' ,'%6.4f, %.4f' % tuple(x))

LOG 	 0.9959, 0.0042
BA 	 0.9920, 0.0037
EXP 	 0.9988, 0.0016
GAU 	 0.9993, 0.0004
ER 	 0.9992, 0.0005
Full 	 0.9988, 0.0011
WS-0 	 0.9746, 0.0149
WS-0.1 	 0.9947, 0.0025
WS-0.3 	 0.9975, 0.0026
WS-0.5 	 0.9987, 0.0007
WS-0.7 	 0.9988, 0.0007
WS-0.9 	 0.9989, 0.0007
WS-1 	 0.9988, 0.0007


#### (1) P and p

In [None]:
res_p = []
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 2]
    Y_data = d_cont[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s || R2: %.4f, beta: %.2E' %(title, r2, a))
    res_p.append([r2, a])

In [None]:
res_p

#### (2) Q and q

In [None]:
res_q = []
for i, title in enumerate(title_cont):
    X_data = d_cont[i][:, 3]
    Y_data = d_cont[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s || R2: %.4f, beta: %.2E' %(title, r2, a))
    res_q.append([r2, a])

#### (3) Cross-validation

In [14]:
res_dict = {}
for i, title in enumerate(title_cont):
    res_cont = []
    data = d_cont[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict[title] = res_cont

In [15]:
mean_dict1 = {}
for i, title in enumerate(title_cont):
    d = res_dict[title]
    
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict1[title] = [mean_train, mean_test]
    
    print ('%s' % title, end='\t')
    print ('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print ('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

LOG	  Train: 0.00202 (0.000243)	  Test: 0.00234 (0.000001)
BA	  Train: 0.00669 (0.000329)	  Test: 0.00589 (0.000619)
EXP	  Train: 0.00203 (0.000114)	  Test: 0.00368 (0.000402)
GAU	  Train: 0.00098 (0.000228)	  Test: 0.00095 (0.000404)
ER	  Train: 0.00096 (0.000195)	  Test: 0.00209 (0.000374)
Full	  Train: 0.00160 (0.000006)	  Test: 0.00070 (0.000168)
WS-0	  Train: 0.01617 (0.004418)	  Test: 0.05181 (0.006433)
WS-0.1	  Train: 0.00599 (0.001937)	  Test: 0.01252 (0.002486)
WS-0.3	  Train: 0.00366 (0.001248)	  Test: 0.00557 (0.001159)
WS-0.5	  Train: 0.00235 (0.000652)	  Test: 0.00235 (0.000375)
WS-0.7	  Train: 0.00284 (0.000608)	  Test: 0.00287 (0.000998)
WS-0.9	  Train: 0.00195 (0.000429)	  Test: 0.00162 (0.000270)
WS-1	  Train: 0.00287 (0.000024)	  Test: 0.00236 (0.000800)


#### (4) 和随机抽取点进行对比

In [16]:
for i, title in enumerate(title_cont):
    data = d_cont[i]
    samp = mean_dict1[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

LOG	0.01416	t-statistic = -8324.093 pvalue = 0.0001
BA	0.01816	t-statistic = -19.814 pvalue = 0.0321
EXP	0.01593	t-statistic = -30.434 pvalue = 0.0209
GAU	0.01633	t-statistic = -38.084 pvalue = 0.0167
ER	0.01772	t-statistic = -41.787 pvalue = 0.0152
Full	0.01654	t-statistic = -94.441 pvalue = 0.0067
WS-0	0.01968	t-statistic =  4.993 pvalue = 0.1258
WS-0.1	0.01653	t-statistic = -1.617 pvalue = 0.3526
WS-0.3	0.01609	t-statistic = -9.074 pvalue = 0.0699
WS-0.5	0.03001	t-statistic = -73.714 pvalue = 0.0086
WS-0.7	0.03001	t-statistic = -27.194 pvalue = 0.0234
WS-0.9	0.03001	t-statistic = -105.240 pvalue = 0.0060
WS-1	0.02999	t-statistic = -34.527 pvalue = 0.0184


#### (5) 绘图

In [None]:
def voilin_plot(ax, d_cont, positions, title=False, xlabel=False, ylabel=False):
    if xlabel:
        ax.set_xlabel(xlabel, fontsize=15)
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=15)
    if title:
        ax.set_title(title, fontsize=15)
        
    #ax.axhline(1, color='gray', ls='--', lw=1)
    ax.set_ylim([0.92, 1.02])
    #ax.yaxis.grid(True, linestyle='--', which='major', color='lightgrey',alpha=0.5)
    ax.violinplot(d_cont, showmedians=False, showmeans=True, showextrema=False, widths=0.5, positions=positions)
    for i, d in enumerate(d_cont):
        textstr = '%.4f\n(%.4f)' % (np.mean(d), np.std(d))
        ax.text(i + 0.6, np.mean(d), textstr, fontsize=10, verticalalignment='center', color='k', alpha=0.5)

In [None]:
def line_plot(ax, d_cont, positions, title=False, xlabel=False, ylabel=False):
    if xlabel:
        ax.set_xlabel(xlabel, fontsize=15)
    if ylabel:
        ax.set_ylabel(ylabel, fontsize=15)
    if title:
        ax.set_title(title, fontsize=15)
    
    r_cont = [d[0] for d in d_cont]
    #ax.axhline(1, color='gray', ls='--', lw=1)
    ax.plot(positions, r_cont, 'k--', lw=1, alpha=0.4)
    ax.scatter(positions, r_cont, marker='o', s=60, alpha=0.8, lw=1)
        
    for i, d in enumerate(d_cont):
        text = '%.4f\n{%.1e}'% tuple(d)
        ax.text(i + 0.6, d[0], text, fontsize=10, verticalalignment='center', color='k', alpha=0.5)

In [None]:
positions = np.arange(1, len(title_cont) + 1)
r_cont1 = [v[:,-1] for v in d_cont]

In [None]:
fig = pl.figure(figsize=(12, 10))

ax1 = fig.add_subplot(3, 1, 1)
voilin_plot(ax1, r_cont1, positions, title='(a) Results for the DE', ylabel='$R^2$')
ax1.set_xticklabels([])

ax2 = fig.add_subplot(3, 1, 2)
line_plot(ax2, res_p, positions, title='(b) Results for Linear model 1', ylabel=r'$R^2$ and $\beta$')
ax3.set_ylim([0.88, 1.02])
ax2.set_xticklabels([])

ax3 = fig.add_subplot(3, 1, 3)
line_plot(ax3, res_q, positions, title='(c) Results for Linear model 2', ylabel=r'$R^2$ and $\beta$')
ax3.set_ylim([0.88, 1.02])

pl.setp(ax1, xticks=positions, xticklabels=title_cont)
pl.setp(ax2, xticks=positions, xticklabels=title_cont)
pl.setp(ax3, xticks=positions, xticklabels=title_cont)
pl.tight_layout()

### 2.  average degree

In [17]:
coeff_cont_gnm_3 = np.load(u + "estimate_gnm_random_graph(10000,30000).npy")
coeff_cont_gnm_4 = np.load(u + "estimate_gnm_random_graph(10000,40000).npy")
coeff_cont_gnm_5 = np.load(u + "estimate_gnm_random_graph(10000,50000).npy")
coeff_cont_gnm_6 = np.load(u + "estimate_gnm_random_graph(10000,60000).npy")
coeff_cont_gnm_7 = np.load(u + "estimate_gnm_random_graph(10000,70000).npy")
coeff_cont_gnm_8 = np.load(u + "estimate_gnm_random_graph(10000,80000).npy")
coeff_cont_gnm_9 = np.load(u + "estimate_gnm_random_graph(10000,90000).npy")
coeff_cont_gnm_10 = np.load(u + "estimate_gnm_random_graph(10000,100000).npy")

In [18]:
d_cont2 = [coeff_cont_gnm_3,coeff_cont_gnm_4,coeff_cont_gnm_5,coeff_cont_gnm_6,
                     coeff_cont_gnm_7,coeff_cont_gnm_8,coeff_cont_gnm_9,coeff_cont_gnm_10]
title_cont2 = ['gnm3', 'gnm4', 'gnm5', 'gnm6', 'gnm7', 'gnm8', 'gnm9', 'gnm10']

In [19]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont2]
std_r2 = [np.std(x[:, 5]) for x in d_cont2]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont2[i], '\t' ,'%6.4f, %.4f' % tuple(x))

gnm3 	 0.9992, 0.0005
gnm4 	 0.9992, 0.0005
gnm5 	 0.9993, 0.0004
gnm6 	 0.9994, 0.0004
gnm7 	 0.9994, 0.0004
gnm8 	 0.9995, 0.0004
gnm9 	 0.9994, 0.0004
gnm10 	 0.9995, 0.0004


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 2]
    Y_data = d_cont2[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont2):
    X_data = d_cont2[i][:, 3]
    Y_data = d_cont2[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [20]:
res_dict2 = {}
t1= time.clock()
for i, title in enumerate(title_cont2):
    res_cont = []
    data = d_cont2[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict2[title] = res_cont
print('Time elapsed: %.2fs ' % (time.clock() - t1))

Time elapsed: 82.37s 


In [21]:
mean_dict2 = {}
for i, title in enumerate(title_cont2):
    d = res_dict2[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict2[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

gnm3 	
  Train: 0.00134 (0.000283)	  Test: 0.00081 (0.000180)
gnm4 	
  Train: 0.00082 (0.000198)	  Test: 0.00085 (0.000326)
gnm5 	
  Train: 0.00064 (0.000164)	  Test: 0.00099 (0.000318)
gnm6 	
  Train: 0.00046 (0.000147)	  Test: 0.00052 (0.000060)
gnm7 	
  Train: 0.00114 (0.000254)	  Test: 0.00071 (0.000172)
gnm8 	
  Train: 0.00068 (0.000148)	  Test: 0.00060 (0.000298)
gnm9 	
  Train: 0.00070 (0.000167)	  Test: 0.00049 (0.000246)
gnm10 	
  Train: 0.00038 (0.000128)	  Test: 0.00045 (0.000069)


#### (4) 和随机抽取点对比

In [22]:
for i, title in enumerate(title_cont2):
    data = d_cont2[i]    
    samp = mean_dict2[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

gnm3	0.01763	t-statistic = -93.403 pvalue = 0.0068
gnm4	0.01487	t-statistic = -43.041 pvalue = 0.0148
gnm5	0.01331	t-statistic = -38.698 pvalue = 0.0164
gnm6	0.01246	t-statistic = -199.657 pvalue = 0.0032
gnm7	0.01284	t-statistic = -70.543 pvalue = 0.0090
gnm8	0.01231	t-statistic = -39.288 pvalue = 0.0162
gnm9	0.01089	t-statistic = -42.367 pvalue = 0.0150
gnm10	0.01130	t-statistic = -157.190 pvalue = 0.0040


### 3. decision rule

In [23]:
coeff_cont_gnm01 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.1.npy")
coeff_cont_gnm03 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.3.npy")
coeff_cont_gnm05 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.5.npy")
coeff_cont_gnm07 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.7.npy")
coeff_cont_gnm09 = np.load(u +"estimate_gnm_random_graph(10000,30000),0.9.npy")
coeff_cont_gnm10 = np.load(u +"estimate_gnm_random_graph(10000,30000),1.0.npy")

In [24]:
d_cont3 = [coeff_cont_gnm01,coeff_cont_gnm03,coeff_cont_gnm05,coeff_cont_gnm07,coeff_cont_gnm09,coeff_cont_gnm10]
title_cont3 = ['alpha0.1', 'alpha0.3', 'alpha0.5', 'alpha0.7', 'alpha0.9', 'alpha1.0']

In [25]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont3]
std_r2 = [np.std(x[:, 5]) for x in d_cont3]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont3[i], '\t' ,'%6.4f, %.4f' % tuple(x))

alpha0.1 	 0.9991, 0.0005
alpha0.3 	 0.9991, 0.0005
alpha0.5 	 0.9987, 0.0013
alpha0.7 	 0.9989, 0.0007
alpha0.9 	 0.9987, 0.0012
alpha1.0 	 0.9989, 0.0007


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 2]
    Y_data = d_cont3[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont3):
    X_data = d_cont3[i][:, 3]
    Y_data = d_cont3[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [26]:
res_dict3 = {}
for i, title in enumerate(title_cont3):
    res_cont = []
    data = d_cont3[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
    res_dict3[title] = res_cont

In [27]:
mean_dict3 = {}
for i, title in enumerate(title_cont3):
    d = res_dict3[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict3[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

alpha0.1	  Train: 0.00286 (0.000218)	  Test: 0.00282 (0.001016)
alpha0.3	  Train: 0.00215 (0.000361)	  Test: 0.00272 (0.000733)
alpha0.5	  Train: 0.00363 (0.001033)	  Test: 0.00879 (0.000993)
alpha0.7	  Train: 0.01010 (0.001621)	  Test: 0.00541 (0.000340)
alpha0.9	  Train: 0.01027 (0.001233)	  Test: 0.03744 (0.005074)
alpha1.0	  Train: 0.01286 (0.002809)	  Test: 0.01669 (0.007420)


In [28]:
for i, title in enumerate(title_cont3):
    data = d_cont3[i]
    samp = mean_dict3[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

alpha0.1	0.01821	t-statistic = -15.134 pvalue = 0.0420
alpha0.3	0.02385	t-statistic = -28.827 pvalue = 0.0221
alpha0.5	0.03225	t-statistic = -23.640 pvalue = 0.0269
alpha0.7	0.04455	t-statistic = -115.115 pvalue = 0.0055
alpha0.9	0.06232	t-statistic = -4.904 pvalue = 0.1281
alpha1.0	0.07416	t-statistic = -7.747 pvalue = 0.0817


### 4. individual heterogeneity

In [29]:
coeff_cont_sgima01 = np.load(u +"estimate_sigma-0.1.npy")
coeff_cont_sgima02 = np.load(u +"estimate_sigma-0.2.npy")
coeff_cont_sgima04 = np.load(u +"estimate_sigma-0.4.npy")
coeff_cont_sgima06 = np.load(u +"estimate_sigma-0.6.npy")
coeff_cont_sgima08 = np.load(u +"estimate_sigma-0.8.npy")
coeff_cont_sgima10 = np.load(u +"estimate_sigma-1.0.npy")

In [30]:
d_cont4 = [coeff_cont_sgima01,coeff_cont_sgima02,coeff_cont_sgima04,
           coeff_cont_sgima06,coeff_cont_sgima08,coeff_cont_sgima10]
title_cont4 = ['sigma0.1', 'sigma0.2', 'sigma0.4', 'sigma0.6', 'sigma0.8', 'sigma1.0']

In [31]:
mean_r2 = [np.mean(x[:, 5]) for x in d_cont4]
std_r2 = [np.std(x[:, 5]) for x in d_cont4]

for i, x in enumerate(zip(mean_r2, std_r2)):
    print(title_cont4[i], '\t' ,'%6.4f, %.4f' % tuple(x))

sigma0.1 	 0.9991, 0.0005
sigma0.2 	 0.9992, 0.0005
sigma0.4 	 0.9989, 0.0006
sigma0.6 	 0.9987, 0.0007
sigma0.8 	 0.9985, 0.0009
sigma1.0 	 0.9985, 0.0010


#### (1) P and p

In [None]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 2]
    Y_data = d_cont4[i][:, 0]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (2) Q and q

In [None]:
for i, title in enumerate(title_cont4):
    X_data = d_cont4[i][:, 3]
    Y_data = d_cont4[i][:, 1]
    to_fit = pd.DataFrame({'X':X_data,'Y':Y_data})
    results = smf.ols('Y ~ X-1', data=to_fit).fit()
    a = results.params['X']
    r2 = results.rsquared
    print('%s \n R2: %.4f, beta: %.2f' %(title, r2, a))

#### (3) Cross-validation

In [32]:
res_dict4 = {}
for i, title in enumerate(title_cont4):
    res_cont = []
    data = d_cont4[i]
    for j in range(1000):
        dis_train, dis_test = calcu_distance(data, num_train=9)      
        res_cont.append([(np.mean(dis_train), np.std(dis_train)),
                         (np.mean(dis_test), np.std(dis_test))])
        
    res_dict4[title] = res_cont

In [34]:
mean_dict4 = {}
for i, title in enumerate(title_cont4):
    d = res_dict4[title]
    mean_train = d[:][0][0]
    std_train = d[:][0][1]
    
    mean_test = d[:][1][0]
    std_test = d[:][1][1]
    
    mean_dict4[title] = [mean_train, mean_test]
    
    print('%s' % title, end='\t')
    print('  Train: %.5f (%.6f)' % (np.mean(mean_train), np.std(mean_train)), end='\t')
    print('  Test: %.5f (%.6f)' % (np.mean(mean_test), np.std(mean_test)))

sigma0.1	  Train: 0.00143 (0.000286)	  Test: 0.00143 (0.000183)
sigma0.2	  Train: 0.00181 (0.000430)	  Test: 0.00162 (0.000258)
sigma0.4	  Train: 0.00178 (0.000598)	  Test: 0.00230 (0.000510)
sigma0.6	  Train: 0.00170 (0.000652)	  Test: 0.00521 (0.000292)
sigma0.8	  Train: 0.00106 (0.000324)	  Test: 0.00410 (0.000767)
sigma1.0	  Train: 0.00185 (0.000210)	  Test: 0.00171 (0.000284)


In [35]:
for i, title in enumerate(title_cont4):
    data = d_cont4[i]
    samp = mean_dict4[title][1]
    x = grid_distance(data)
    print('%s' % title, end='\t')
    print('%.5f' % x,  end='\t')
    print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(samp, x))

sigma0.1	0.02131	t-statistic = -108.576 pvalue = 0.0059
sigma0.2	0.02130	t-statistic = -76.204 pvalue = 0.0084
sigma0.4	0.02087	t-statistic = -36.404 pvalue = 0.0175
sigma0.6	0.02080	t-statistic = -53.304 pvalue = 0.0119
sigma0.8	0.01815	t-statistic = -18.318 pvalue = 0.0347
sigma1.0	0.01817	t-statistic = -57.847 pvalue = 0.0110
