In [3]:
%load_ext autoreload
%autoreload 1
%matplotlib inline
%run ./code/package_loader.py
%aimport data_generator, model, baseline, utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. n = 500, p = 5, decay = 2e-3 (Main Result)

#### 1.1 Generate Data

In [9]:
# Generate 100 independent datasets so that stability across replications can be tested.
repeat_num = 100
x_train_list = [
    generate_function_data(sample_size=500, decay=2e-3, predictor_size=4, outcome_size=1)
    for _ in range(repeat_num)
]

In [11]:
# Ground-truth coefficients as a 5x1 column: ones for the first four entries, zero for the last.
beta_true = np.asarray([1, 1, 1, 1, 0]).reshape(5, 1)

#### 1.2 Tuning parameters using CV

In [17]:
# The first generated dataset serves as the cross-validation set.
x_cv = x_train_list[0]
y_cv = linear_model_generation(x_cv, beta_true)

# Baseline diagnostics: every sample gets weight 1 (an all-ones column).
_, correlation, collinearity = weighted_corrcoef(x_cv, np.ones((len(x_cv), 1)))
for label, value in (
    ('Original Correlation: ', correlation),
    ('Original Collinearity(Log): ', np.log(collinearity)),
):
    print(label, value)

Original Correlation:  0.3818914153852731
Original Collinearity(Log):  15.290406936125784


In [None]:
### Lasso Tuning ###
# 5-fold cross-validation over 100 log-spaced penalties in [1e-4, 1e1].
# Each candidate's held-out 'mse' score is summed over the folds; the
# candidate with the smallest total is selected.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
lasso = Lasso()
lambda_list = list(np.logspace(-4, 1, 100))
mse_lasso_cv = np.zeros((len(lambda_list), 1))

for fold_train, fold_test in kf.split(x_cv):
    for ind, lambdau in enumerate(lambda_list):
        _ = lasso.fit(x_cv[fold_train], y_cv[fold_train], validation=False, lambdau=lambdau)
        fold_score = cal_prediction_error(y_cv[fold_test], lasso.predict(x_cv[fold_test]), 'mse')
        mse_lasso_cv[ind, 0] += fold_score

param_index = np.argmin(mse_lasso_cv)
lambda_lasso = lambda_list[param_index]
print('Optimal Lambda for Lasso: %.4f' % (lambda_lasso))

In [None]:
### IILasso Tuning ###
# 5-fold cross-validation over the full (l1_norm, corr_norm) grid.
# For every grid point the held-out prediction error is summed over the folds
# and the pair with the smallest total is selected.
# Note: despite the `mse_` prefix, the accumulated score uses the 'rmse'
# option of cal_prediction_error.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
iilasso = IIlasso()
l1_norm_list = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
corr_norm_list = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
mse_iilasso_cv = np.zeros((len(l1_norm_list), len(corr_norm_list)))

# kf.split() is a generator without a length, so the number of folds is passed
# explicitly; otherwise the progress bar cannot show how far the loop has got.
for train_index, test_index in tqdm_notebook(kf.split(x_cv), total=kf.get_n_splits(), desc='val_loop'):
    for indx, l1_norm in enumerate(l1_norm_list):
        for indy, corr_norm in enumerate(corr_norm_list):
            _ = iilasso.fit(x_cv[train_index], y_cv[train_index],
                            l1_norm=l1_norm, corr_norm=corr_norm, model_path='synthetic_data_cv')
            mse_iilasso_cv[indx, indy] += cal_prediction_error(y_cv[test_index],
                                                    iilasso.predict(x_cv[test_index]), 'rmse')

# Flat position of the best score, then its (row, column) in the grid.
index = np.argmin(mse_iilasso_cv)
indx, indy = np.unravel_index(index, mse_iilasso_cv.shape)
l1_norm_iilasso, corr_norm_iilasso = l1_norm_list[indx], corr_norm_list[indy]

HBox(children=(IntProgress(value=1, bar_style='info', description='val_loop', max=1, style=ProgressStyle(descr…

In [None]:
### Lasso + Our Tuning ###
# 5-fold cross-validation over the `weight_l2` setting of variable_decorrelation.
# Per fold: standardise with a scaler fitted on the training part only, record
# the unweighted OLS error as a baseline, then for each weight_l2 learn sample
# weights and score the sample-weighted OLS fit on the held-out part.
#
# Changes relative to the previous version of this cell:
#   * mse_l2_list is allocated as (n_folds, n_candidates); it used to be 1-D
#     while being indexed with two subscripts, which raises IndexError.
#   * The cell now reads x_cv / y_cv, the CV data defined in this section,
#     instead of x_train / y_train, which are only assigned further down.
#   * `ols` and `learning_rate` are created here for the same reason.
#     NOTE(review): 1e-5 mirrors the value assigned later in this file - confirm.
#   * The unused `lasso_our` instance was dropped.
#   * The log name prefix was 'prostate_cv'; it now names this experiment.
#     NOTE(review): confirm no existing logs are expected under the old name.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
ols = OLS()
learning_rate = 1e-5
weight_l2_list = [1e-1, 1, 10, 30, 50, 100, 300, 500]
mse_l2_list = np.zeros((kf.get_n_splits(), len(weight_l2_list)))
mse_ols = []
val_count = 0
for train_index, test_index in kf.split(x_cv):
    val_count += 1
    scaler = preprocessing.StandardScaler().fit(x_cv[train_index])
    x_cvtrain_scaled = scaler.transform(x_cv[train_index])
    x_cvtest_scaled = scaler.transform(x_cv[test_index])

    # Baseline: unweighted OLS on this fold.
    _ = ols.fit(x_cvtrain_scaled, y_cv[train_index])
    mse_ols.append(cal_prediction_error(y_cv[test_index], ols.predict(x_cvtest_scaled), 'mse'))

    for ind, weight_l2 in enumerate(weight_l2_list):
        log_name = 'synthetic_data_cv_%d_l2_%.0e' % (val_count, weight_l2)
        # The first returned value is used as the per-sample weights below.
        w_opt, _, _ = variable_decorrelation(x = x_cvtrain_scaled, y = y_cv[train_index],
                    log_name = log_name, learning_rate = learning_rate, weight_l2 = weight_l2,
                    max_iter = 15000, max_to_keep = 5)
        _ = ols.fit(x_cvtrain_scaled, y_cv[train_index], sample_weight = np.squeeze(w_opt))
        # Row = fold (val_count is 1-based), column = weight_l2 candidate.
        mse_l2_list[val_count-1, ind] = cal_prediction_error(y_cv[test_index], ols.predict(x_cvtest_scaled), 'mse')
   

### 2. Test Additional Outcome Variables

In [29]:
def generate_function(decay, sample_size=1000, predictor_size=4):
    """Simulate predictors plus one extra column that is their noisy sum.

    Parameters
    ----------
    decay : float
        Scale (standard deviation) of the Gaussian noise added to the row sums.
    sample_size : int, optional
        Number of rows to draw. Defaults to 1000, the previously hard-coded value.
    predictor_size : int, optional
        Number of standard-normal predictor columns. Defaults to 4, the
        previously hard-coded value.

    Returns
    -------
    The result of ``preprocessing.scale`` applied to the
    ``(sample_size, predictor_size + 1)`` matrix ``[X, f]``, where ``f`` is the
    row-wise sum of ``X`` plus noise.
    """
    X = np.random.normal(0, 1, (sample_size, predictor_size))
    # keepdims=True keeps the sum as a column so it can be stacked next to X.
    f = np.sum(X, axis=1, keepdims=True) + np.random.normal(0, decay, (sample_size, 1))
    return preprocessing.scale(np.hstack((X, f)))

In [42]:
# Data for this section comes from generate_function with decay 2e-3.
# Alternative source kept for reference:
#   generate_function_data(sample_size=500, decay=2e-3, predictor_size=4, outcome_size=1)
x_train = generate_function(2e-3)
beta_true = np.asarray([1, 1, 1, 1, 0]).reshape(5, 1)

data_description = 'synthetic data'
log_name = 'synthetic_data_add'

# Settings forwarded to variable_decorrelation below.
learning_rate = 1e-5
weight_l2 = 500

# No outcome is supplied (y=None); the first returned value is kept as w_opt
# and is used by later cells as per-sample weights.
w_opt, correlation, collinearity = variable_decorrelation(
    x=x_train,
    y=None,
    weight_normalize=1,
    log_name=log_name,
    learning_rate=learning_rate,
    weight_l2=weight_l2,
    max_iter=15000,
    display_iter=300,
    max_to_keep=10,
)

In [50]:
# Build the response for x_train from beta_true, with noise_level set to 0.1.
y_train = linear_model_generation(
    x_train,
    beta_true,
    noise_level=0.1,
)

In [51]:
ols = OLS()

# Unweighted fit: coefficient error first, then the coefficients themselves.
coef_ols = ols.fit(x_train, y_train)
print('MAE of OLS: ', cal_estimation_error(beta_true, ols.beta, 'mae'))
print(ols.beta)

# Refit the same estimator with w_opt as sample weights: coefficients first,
# then the coefficient error.
# NOTE(review): the label below is identical to the unweighted one, so the two
# printed errors can only be told apart by their order in the output.
_ = ols.fit(x_train, y_train, sample_weight=w_opt.squeeze())
print(ols.beta)
print('MAE of OLS: ', cal_estimation_error(beta_true, ols.beta, 'mae'))

MAE of OLS:  1.634260100265489
[[-0.39141563]
 [-0.28819313]
 [-0.36528746]
 [-0.3617946 ]
 [ 2.76460968]]
[[-0.65845759]
 [-0.53028702]
 [-0.6172973 ]
 [-0.61759793]
 [ 3.28610851]]
MAE of OLS:  1.9419496692406721


In [52]:
lasso = Lasso()
# Print what fit() returns, first without and then with w_opt as sample weights.
for fit_kwargs in ({}, {'weights': w_opt}):
    print(lasso.fit(x_train, y_train, **fit_kwargs))

[[-0.00380304]
 [ 0.        ]
 [ 0.05367042]
 [ 0.        ]
 [ 0.        ]
 [ 2.00599417]]
[[-0.00042851]
 [-0.0074856 ]
 [ 0.05814685]
 [ 0.00661391]
 [ 0.00397245]
 [ 2.009243  ]]


In [54]:
# Tune the penalty for the plain and the sample-weighted Lasso side by side:
# 5-fold CV over 100 log-spaced penalties, summing the held-out 'mse' score of
# each candidate over the folds.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
lasso, lasso_our = Lasso(), Lasso()
lambda_list = list(np.logspace(-4, 1, 100))
mse_lasso_cv = np.zeros((len(lambda_list), 1))
mse_lasso_our_cv = np.zeros((len(lambda_list), 1))

for fold_train, fold_test in kf.split(x_train):
    for ind, lambdau in enumerate(lambda_list):
        # Plain Lasso.
        _ = lasso.fit(x_train[fold_train], y_train[fold_train], validation=False, lambdau=lambdau)
        mse_lasso_cv[ind, 0] += cal_prediction_error(
            y_train[fold_test], lasso.predict(x_train[fold_test]), 'mse')
        # Same fit, weighted by the rows of w_opt that belong to this fold.
        _ = lasso_our.fit(x_train[fold_train], y_train[fold_train], validation=False,
                          lambdau=lambdau, weights=w_opt[fold_train])
        mse_lasso_our_cv[ind, 0] += cal_prediction_error(
            y_train[fold_test], lasso_our.predict(x_train[fold_test]), 'mse')

# Pick the penalty with the smallest summed score for each variant.
lambda_lasso = lambda_list[np.argmin(mse_lasso_cv)]
param_index = np.argmin(mse_lasso_our_cv)
lambda_lasso_our = lambda_list[param_index]

In [55]:
# Show the selected penalties: plain Lasso first, weighted Lasso second.
(lambda_lasso, lambda_lasso_our)

(0.0007220809018385464, 0.002595024211399737)

In [56]:
# Refit the plain Lasso on all of x_train with its selected penalty; the value
# returned by fit() is displayed as the cell output.
lasso.fit(
    x_train,
    y_train,
    validation=False,
    lambdau=lambda_lasso,
)

array([[-0.00380304],
       [ 0.        ],
       [ 0.05367042],
       [ 0.        ],
       [ 0.        ],
       [ 2.00599417]])

In [58]:
# Refit the weighted Lasso on all of x_train with its selected penalty and
# w_opt as sample weights; the value returned by fit() is displayed.
lasso_our.fit(
    x_train,
    y_train,
    validation=False,
    lambdau=lambda_lasso_our,
    weights=w_opt,
)

array([[-0.00119208],
       [ 0.        ],
       [ 0.07460739],
       [ 0.02387393],
       [ 0.02074003],
       [ 1.98397402]])

In [18]:
# Same diagnostics as the unweighted baseline, but with w_opt as sample weights.
# NOTE(review): this cell pairs x_cv with w_opt, while the visible cells learn
# w_opt from x_train - confirm both refer to the same samples before rerunning.
_, correlation, collinearity = weighted_corrcoef(x_cv, w_opt)
for label, value in (
    ('Weighted Correlation: ', correlation),
    ('Weighted Collinearity(Log): ', np.log(collinearity)),
):
    print(label, value)

Weighted Correlation:  0.39569526456497756
Weighted Collinearity(Log):  14.034349497409663


In [230]:
# Weighted diagnostics restricted to the columns of x_train from index 5 onward.
# NOTE(review): the x_train built in the visible cells has only five columns, so
# this slice presumably targets a wider design matrix from another run - confirm.
_, correlation, collinearity = weighted_corrcoef(x_train[:, 5:], w_opt)
for label, value in (
    ('Weighted Correlation: ', correlation),
    ('Weighted Collinearity: ', collinearity),
):
    print(label, value)

Weighted Correlation:  0.37161934250968803
Weighted Collinearity:  1265357.7303086324


In [251]:
# Separate OLS instance fitted with w_opt (squeezed) as sample weights, so that
# the unweighted `ols` estimator is left untouched.
ols_our = OLS()
coef_ols_our = ols_our.fit(
    x_train,
    y_train,
    sample_weight=np.squeeze(w_opt),
)

In [252]:
# Display the value returned by the sample-weighted OLS fit.
coef_ols_our

array([[ 0.00669347],
       [-2.63909133],
       [-2.65674492],
       [-2.8596922 ],
       [-2.818126  ],
       [ 7.67300117],
       [-0.90273771],
       [-1.02731056],
       [-0.91340014],
       [-0.80080951],
       [ 3.71229704]])

In [246]:
# Re-tune the weighted Lasso: 10-fold CV over 100 log-spaced penalties in
# [1e-5, 1e1], summing the held-out 'rmse' score of each candidate over folds.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
lasso_our = Lasso()
lambda_list = list(np.logspace(-5, 1, 100))
mse_lasso_our_cv = np.zeros((len(lambda_list), 1))

for fold_train, fold_test in kf.split(x_train):
    for ind, lambdau in enumerate(lambda_list):
        _ = lasso_our.fit(x_train[fold_train], y_train[fold_train], validation=False,
                          lambdau=lambdau, weights=w_opt[fold_train])
        mse_lasso_our_cv[ind, 0] += cal_prediction_error(
            y_train[fold_test], lasso_our.predict(x_train[fold_test]), 'rmse')

param_index = np.argmin(mse_lasso_our_cv)
lambda_lasso_our = lambda_list[param_index]

# Final weighted fit on all of x_train with the selected penalty.
coef_lasso_our = lasso_our.fit(x_train, y_train, validation=False,
                               lambdau=lambda_lasso_our, weights=w_opt)
# NOTE(review): the first label says "MAE" but the value is computed with the
# 'rmse' option - confirm which of the two is intended.
print('MAE of Lasso+Our: ', cal_estimation_error(beta_true, lasso_our.beta, 'rmse'))
# NOTE(review): x_test / y_test are only assigned in the decay-sweep cell
# further down in the visible file, so this line relies on that cell having run.
print('RMSE of Lasso+Our: ',cal_prediction_error(y_test, lasso_our.predict(x_test), 'rmse'))

MAE of Lasso+Our:  1.211567212466482
RMSE of Lasso+Our:  0.15908381832398932


In [247]:
# Show the plain and the weighted Lasso coefficients side by side.
# NOTE(review): coef_lasso is not assigned anywhere in the visible cells -
# confirm where it is defined.
(coef_lasso, coef_lasso_our)

(array([[0.00407003],
        [0.06052972],
        [0.0577682 ],
        [0.        ],
        [0.00923926],
        [1.97931722],
        [0.0565467 ],
        [0.        ],
        [0.05255958],
        [0.10683379],
        [1.83278458]]), array([[0.00582436],
        [0.05188871],
        [0.05806107],
        [0.        ],
        [0.00971911],
        [1.98550685],
        [0.06100854],
        [0.        ],
        [0.05132991],
        [0.1020508 ],
        [1.8389945 ]]))

In [253]:
# Score the four fitted models on fresh test data generated at several decay
# values, collecting one 'rmse' score per model and decay.
decay_list = [0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
mse_ols, mse_ols_our, mse_lasso, mse_lasso_our = [], [], [], []

# Each result list paired with the model that fills it, in evaluation order.
score_targets = (
    (mse_ols, ols),
    (mse_ols_our, ols_our),
    (mse_lasso, lasso),
    (mse_lasso_our, lasso_our),
)

for decay in decay_list:
    x_test = generate_function_data(sample_size=500, decay=decay)
    y_test = linear_model_generation(x_test, beta_true, noise_level=0.1)
    for scores, fitted_model in score_targets:
        scores.append(cal_prediction_error(y_test, fitted_model.predict(x_test), 'rmse'))

In [254]:
# Mean score over the decay sweep, in the order: OLS, Lasso, OLS+Our, Lasso+Our.
tuple(map(np.mean, (mse_ols, mse_lasso, mse_ols_our, mse_lasso_our)))

(0.3740530337728788,
 0.28151601924995606,
 0.8141470772828949,
 0.2782116110986788)

In [255]:
# Standard deviation of the scores over the decay sweep, in the order:
# OLS, Lasso, OLS+Our, Lasso+Our.
tuple(map(np.std, (mse_ols, mse_lasso, mse_ols_our, mse_lasso_our)))

(0.2347544940738816,
 0.16809393680853812,
 0.5425046767746702,
 0.16883344576305237)