In [1]:
%load_ext autoreload
%autoreload 1
%matplotlib inline
%run ./code/package_loader.py
%aimport data_generator, model, baseline, utils

### 1.Load and preprocess data

In [2]:
p_x, p_y = 6, 6
file_name_list = ['AQ-01-073-0023','AQ-09-009-0027','AQ-11-001-0043',
                  'AQ-13-089-0002','AQ-18-097-0078','AQ-22-033-0009',
                  'AQ-35-001-0023','AQ-37-119-0041','AQ-39-061-0040','AQ-56-021-0100']
state_X_list, state_Y_list = [], []
for file_name in file_name_list:
    fd = open('AQ_data/'+file_name+'.csv', 'r')
    data = fd.readlines()
    x, y = np.zeros((len(data), p_x)), np.zeros((len(data), p_y))
    for ind, line in enumerate(data):
        temp = line.strip().split(',')
        x[ind, :] = [i for i in map(float, temp[:p_x])]
        y[ind, :] = [i for i in map(float, temp[p_x:])]
    state_X_list.append(x); state_Y_list.append(y)

In [None]:
for i in range(10):
    x_train, y_train = state_X_list[i], state_Y_list[i]
    _, correlation, collinearity = weighted_corrcoef(x_train, np.ones((x_train.shape[0], 1)))
    print('Original Correlation: ', correlation)
    print('Original Collinearity: ', collinearity)
    features = ['temp', 'pressure', 'humidity', 'dir_sin', 'dir_cos', 'speed']
    corrmat = np.corrcoef(x_train, rowvar = False)
    f, ax = plt.subplots(figsize=(10, 7))
    plt.xticks(rotation='45')
    sns.heatmap(corrmat, annot=True, square=True, linewidths=.5,
                xticklabels=features, yticklabels=features, cmap='YlGnBu')
    plt.show()

In [131]:
train_ind = 1
outcome_ind = 3
x_train_whole, y_train_whole = state_X_list[train_ind], state_Y_list[train_ind][:, outcome_ind]
# Sample and Scale the data
np.random.seed(0)
sample_index = np.random.choice(x_train_whole.shape[0], 500, replace=False)
unsample_index = [x for x in range(len(x_train_whole)) if not (x in sample_index)]

x_train, y_train = x_train_whole[sample_index], y_train_whole[sample_index]
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_iid_test, y_iid_test = scaler.transform(x_train_whole[unsample_index]), y_train_whole[unsample_index]

In [132]:
_, correlation, collinearity = weighted_corrcoef(x_train_scaled, np.ones((x_train_scaled.shape[0], 1)))
print('Original Correlation: ', correlation)
print('Original Collinearity: ', collinearity)

Original Correlation:  0.34681692111845663
Original Collinearity:  5.34104867812108


### 2. Lasso & OLS & IILasso

In [133]:
ols = OLS()
coef_ols = ols.fit(x_train_scaled, y_train)

In [134]:
coef_ols

array([[10.836     ],
       [-1.97330217],
       [ 0.42385567],
       [-0.85789338],
       [-0.32341809],
       [-2.91460411],
       [-2.23240732]])

In [135]:
print(cal_prediction_error(y_iid_test, ols.predict(x_iid_test), 'rmse'))

9.72360712673791


In [138]:
kf = KFold(n_splits=5, shuffle=False, random_state=0)
lasso = Lasso()
lambda_list = [x for x in np.logspace(-4, 1, 100)]
mse_lasso_cv = np.zeros((len(lambda_list) , 1))

for train_index, test_index in kf.split(x_train):
    scaler_cv = preprocessing.StandardScaler().fit(x_train[train_index])
    x_cvtrain_scaled = scaler_cv.transform(x_train[train_index])
    x_cvtest_scaled = scaler_cv.transform(x_train[test_index])
    for ind, lambdau in enumerate(lambda_list):
        _ = lasso.fit(x_cvtrain_scaled, y_train[train_index], validation=False, lambdau=lambdau, standardize=False)
        mse_lasso_cv[ind, 0] += cal_prediction_error(y_train[test_index], lasso.predict(x_cvtest_scaled), 'rmse')
        
param_index = np.argmin(mse_lasso_cv)
lambda_lasso = lambda_list[param_index]

In [139]:
print('Optimal Lambda for Lasso: %.4f' % (lambda_lasso))
coef_lasso = lasso.fit(x_train_scaled, y_train, validation=False, lambdau=lambda_lasso)
print('MSE of Lasso: ', cal_prediction_error(y_iid_test, lasso.predict(x_iid_test), 'rmse'))
print(coef_lasso)

Optimal Lambda for Lasso: 3.1257
MSE of Lasso:  8.811123665657538
[[10.836]
 [ 0.   ]
 [ 0.   ]
 [ 0.   ]
 [ 0.   ]
 [ 0.   ]
 [ 0.   ]]


In [147]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
iilasso = IIlasso()
lasso = Lasso()

#l1_norm_list = [1e-3, 1e-2, 1e-1, 1, 10]
l1_norm_list  = [x for x in np.logspace(-5, -2, 100)]
corr_norm_list = [1e-3, 1e-2, 1e-1, 1, 10]
mse_iilasso_cv = np.zeros((len(l1_norm_list) , len(corr_norm_list)))
mse_lasso_cv = np.zeros((len(l1_norm_list) , 1))
val_count = 0

for indx, l1_norm in enumerate(l1_norm_list):
        _ = lasso.fit(x_train_scaled, y_train, validation=False, lambdau=l1_norm, weights=w_opt)
        mse_lasso_cv[indx, 0] += (cal_prediction_error(y_iid_test,  lasso.predict(x_iid_test), 'mse'))

# for train_index, test_index in kf.split(x_train_scaled):
#     val_count += 1
#     #scaler = preprocessing.StandardScaler().fit(x_train[train_index])
#     #x_cvtrain_scaled = scaler.transform(x_train[train_index])
#     #x_cvtest_scaled = scaler.transform(x_train[test_index])
#     for indx, l1_norm in enumerate(l1_norm_list):
#         _ = lasso.fit(x_train_scaled[train_index], y_train[train_index], validation=False, lambdau=l1_norm)
#         #lasso.beta, lasso.intercept = coef_ols[1:], coef_ols[0]
#         mse_lasso_cv[indx, 0] += (cal_prediction_error(y_train[test_index],  lasso.predict(x_train_scaled[test_index]), 'mse'))
#     for indx, l1_norm in enumerate(l1_norm_list):
#         for indy, corr_norm in enumerate(corr_norm_list):
#             _ = iilasso.fit(x_cvtrain_scaled, y_train[train_index].reshape([400,1]), learning_rate=1e-2,
#                             l1_norm=l1_norm, corr_norm=corr_norm, model_path='AQ_data')
#             mse_iilasso_cv[indx, indy] += cal_prediction_error(y_train[test_index], 
#                                                     iilasso.predict(x_cvtest_scaled), 'mse')
# index = np.argmin(mse_iilasso_cv)
# indx, indy = index//len(corr_norm_list), index%len(corr_norm_list)
# l1_norm, corr_norm = l1_norm_list[indx], corr_norm_list[indy]

In [148]:
mse_lasso_cv

array([[0.000357  ],
       [0.00035698],
       [0.00035695],
       [0.00035693],
       [0.0003569 ],
       [0.00035687],
       [0.00035684],
       [0.00035681],
       [0.00035677],
       [0.00035673],
       [0.00035669],
       [0.00035665],
       [0.0003566 ],
       [0.00035655],
       [0.0003565 ],
       [0.00035644],
       [0.00035638],
       [0.00035631],
       [0.00035624],
       [0.00035616],
       [0.00035608],
       [0.00035599],
       [0.0003559 ],
       [0.00035579],
       [0.00035569],
       [0.00035557],
       [0.00035545],
       [0.00035531],
       [0.00035517],
       [0.00035502],
       [0.00035485],
       [0.00035468],
       [0.00035449],
       [0.00035429],
       [0.00035408],
       [0.00035385],
       [0.0003536 ],
       [0.00035334],
       [0.00035306],
       [0.00035275],
       [0.00035243],
       [0.00035209],
       [0.00035172],
       [0.00035133],
       [0.00035091],
       [0.00035046],
       [0.00034998],
       [0.000

In [42]:
l1_norm, corr_norm

(0.01, 0.001)

In [43]:
coef_iilasso = iilasso.fit(x_train_scaled, y_train.reshape([500,1]), learning_rate=1e-2,
                            l1_norm=l1_norm, corr_norm=corr_norm, model_path='AQ_data')

In [44]:
coef_iilasso

array([[ 0.039976  ],
       [-0.0000986 ],
       [ 0.00301105],
       [ 0.00043764],
       [ 0.00185843],
       [ 0.00486853],
       [-0.00212519]], dtype=float32)

In [92]:
mse_ols, mse_lasso, mse_iilasso = [], [], []
for i in range(10):
    if i == train_ind:
        x_test, y_test = x_iid_test, y_iid_test
    else:
        x_test, y_test = scaler.transform(state_X_list[i]), state_Y_list[i][:, outcome_ind]
    mse_ols.append(cal_prediction_error(y_test, ols.predict(x_test), 'rmse'))
    mse_lasso.append(cal_prediction_error(y_test, lasso.predict(x_test), 'rmse'))
    mse_iilasso.append(cal_prediction_error(y_test, lasso.intercept, 'rmse'))

In [93]:
print('ave_MSE(std) of OLS: %.4f(%.4f)'%(np.mean(mse_ols), np.std(mse_ols)))
print('ave_MSE(std) of Lasso: %.4f(%.4f)'%(np.mean(mse_lasso), np.std(mse_lasso)))
print('ave_MSE(std) of IILasso: %.4f(%.4f)'%(np.mean(mse_iilasso), np.std(mse_iilasso)))

ave_MSE(std) of OLS: 49.0445(17.4258)
ave_MSE(std) of Lasso: 12.3232(2.3422)
ave_MSE(std) of IILasso: 12.3272(2.3438)


### 3.Our Method

In [7]:
learning_rate_list = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
data_description = 'investigate convergence property with different lr'
corr_list, coll_list = [], []
for learning_rate in learning_rate_list:
    log_name = 'AQ_lr_%.0e' % (learning_rate)
    w_opt, corr, coll = variable_decorrelation(x = x_train_scaled, y = y_train, 
                           log_name = log_name, data_description = data_description,
                           learning_rate = learning_rate, max_iter = 15000, display_iter=300)
    corr_list.append(corr); coll_list.append(coll)

In [None]:
coll_list[-1]

In [75]:
learning_rate = 2e-5

#### 3.2 CV on x_train_scaled for tuning parameter weight_l2 using OLS

In [30]:
kf = KFold(n_splits=10, shuffle=False, random_state=0)
ols, lasso =  OLS(), Lasso()
weight_l2_list = [1e-1, 1, 10, 30, 50, 100, 300, 500]
mse_l2_list = np.zeros((5, len(weight_l2_list)))
mse_ols, mse_lasso = [], []
val_count = 0
for train_index, test_index in kf.split(x_train):
    val_count += 1
    x_cvtrain, y_cvtrain = x_train[train_index], y_train[train_index]
    x_cvtest, y_cvtest = x_train[test_index], y_train[test_index]
    
    scaler_cv = preprocessing.StandardScaler().fit(x_cvtrain)
    x_cvtrain_scaled = scaler_cv.transform(x_cvtrain)
    x_cvtest_scaled = scaler_cv.transform(x_cvtest)
    
    _ = ols.fit(x_cvtrain_scaled, y_cvtrain)
    mse_ols.append(cal_prediction_error(y_cvtest, ols.predict(x_cvtest_scaled), 'mse'))
    _ = lasso.fit(x_cvtrain_scaled, y_cvtrain, validation=False, lambdau = 0.23130281)
    mse_lasso.append(cal_prediction_error(y_cvtest, lasso.predict(x_cvtest_scaled), 'mse'))
#     for ind, weight_l2 in enumerate(weight_l2_list):
#         log_name = 'AQ_cv_%d_l2_%.0e' % (val_count, weight_l2)
#         w_opt = load_weight(x_cvtrain_scaled, 'model/'+log_name+'/model_iters15000.ckpt')
#         _, corr, coll = weighted_corrcoef(x_cvtrain_scaled, w_opt)
#         print(corr, coll)
# #         w_opt, _, _ = variable_decorrelation(x = x_cvtrain_scaled, y = y_cvtrain, 
# #                     log_name = log_name, learning_rate = learning_rate, weight_l2 = weight_l2, 
# #                     max_iter = 15000, max_to_keep = 5)
#         _ = ols.fit(x_cvtrain_scaled, y_cvtrain, sample_weight = np.squeeze(w_opt))
#         mse_l2_list[val_count-1, ind] = cal_prediction_error(y_cvtest, ols.predict(x_cvtest_scaled), 'rmse')

In [31]:
np.mean(mse_lasso)

448.71122242001684

In [32]:
lasso.coef

array([[17.38      ],
       [ 2.05839841],
       [ 0.71298598],
       [ 0.        ],
       [ 0.72562565],
       [-1.18739996],
       [ 4.69488758]])

In [28]:
for ind, weight_l2 in enumerate(weight_l2_list):
    print('l2 %.0e: %.4f(%.4f)' % (weight_l2, np.mean(mse_l2_list[:, ind]), np.std(mse_l2_list[:, ind])))
print(np.mean(mse_ols), np.std(mse_ols))

l2 1e-01: 17.0907(10.3262)
l2 1e+00: 17.0909(10.3260)
l2 1e+01: 17.0929(10.3234)
l2 3e+01: 17.0974(10.3180)
l2 5e+01: 17.1016(10.3133)
l2 1e+02: 17.1098(10.3044)
l2 3e+02: 17.0966(10.3083)
l2 5e+02: 17.0872(10.3167)
16.99451977856486 10.5326259538568


In [76]:
weight_l2 = 100

#### 3.3 Evaluate the performance over different states

In [77]:
data_description = 'air quality data'
log_name = 'AQ_data'
w_opt, correlation, collinearity = variable_decorrelation(x = x_train_scaled, y = y_train, 
                                    log_name = log_name, learning_rate = learning_rate, weight_l2 = weight_l2,
                                    max_iter = 30000, display_iter = 300, max_to_keep = 10)

In [142]:
w_opt = load_weight(x_train_scaled, 'model/AQ_data/model_iters30000.ckpt')

INFO:tensorflow:Restoring parameters from model/AQ_data/model_iters30000.ckpt


In [89]:
kf = KFold(n_splits=20, shuffle=False, random_state=0)
lasso_our = Lasso()
lambda_list = [x for x in np.logspace(-4, 1, 100)]
mse_lasso_our_cv = np.zeros((len(lambda_list) , 1))

for train_index, test_index in kf.split(x_train):
    scaler_cv = preprocessing.StandardScaler().fit(x_train[train_index])
    x_cvtrain_scaled = scaler_cv.transform(x_train[train_index])
    x_cvtest_scaled = scaler_cv.transform(x_train[test_index])
    for ind, lambdau in enumerate(lambda_list):
        _ = lasso_our.fit(x_cvtrain_scaled, y_train[train_index], validation=False, lambdau=lambdau, weights=w_opt[train_index])
        mse_lasso_our_cv[ind, 0] += cal_prediction_error(y_train[test_index], lasso_our.predict(x_cvtest_scaled), 'mse')
        
param_index = np.argmin(mse_lasso_our_cv)
lambda_lasso_our = lambda_list[param_index]

In [90]:
print('Optimal Lambda for Lasso+Our: %.4f' % (lambda_lasso_our))
coef_lasso_our = lasso_our.fit(x_train_scaled, y_train, validation=False, lambdau=lambda_lasso_our, weights=w_opt)
print('MSE of Lasso: ', cal_prediction_error(y_iid_test, lasso_our.predict(x_iid_test), 'rmse'))
print(coef_lasso_our)


Optimal Lambda for Lasso+Our: 7.9248
MSE of Lasso:  16.9839880191341
[[19.40479739]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.76548085]]


In [87]:
ols_our = OLS()
mse_lasso_our, mse_ols_our = [], []
coef_ols_our = ols_our.fit(x_train_scaled, y_train, sample_weight=np.squeeze(w_opt))
#coef_lasso_our = lasso_our.fit(x_train_scaled, y_train, weights = w_opt, validation=False, lambdau=1e-2)
for i in range(10):
    if i == train_ind:
        x_test, y_test = x_iid_test, y_iid_test
    else:
        x_test, y_test = scaler.transform(state_X_list[i]), state_Y_list[i][:, outcome_ind]
    mse_ols_our.append(cal_prediction_error(y_test, ols_our.predict(x_test), 'rmse'))
    mse_lasso_our.append(cal_prediction_error(y_test, lasso_our.predict(x_test), 'rmse'))

In [88]:
print('ave_MSE(std) of OLS+Our: %.4f(%.4f)'%(np.mean(mse_ols_our), np.std(mse_ols_our)))
print('ave_MSE(std) of Lasso+OUr: %.4f(%.4f)'%(np.mean(mse_lasso_our), np.std(mse_lasso_our)))

ave_MSE(std) of OLS+Our: 45.3853(15.2347)
ave_MSE(std) of Lasso+OUr: 12.5967(2.3305)


In [26]:
print(coef_lasso_our, coef_ols_our)

[[ 0.31885339]
 [-0.01548832]
 [-0.00682469]
 [ 0.00110986]
 [-0.0018158 ]
 [-0.04458581]
 [-0.03589299]] [[ 0.31907504]
 [-0.0174633 ]
 [-0.00879334]
 [ 0.0014017 ]
 [-0.00396649]
 [-0.04693366]
 [-0.03807552]]
