In [1]:
from utils import *
from sklearn.linear_model import LinearRegression

In [2]:
class OLS(object):
    """Least-squares linear regression with optional per-sample weights.

    Wraps a ``LinearRegression`` estimator and exposes the fitted slope and
    intercept as 2-D column arrays.
    """

    def __init__(self, beta_true = None):
        # ``beta_true`` is accepted for interface compatibility; it is not
        # used anywhere in this class.
        self.model = LinearRegression()

    def fit(self, x, y, **options):
        """Fit the wrapped estimator on ``x`` (n rows, p columns) and ``y``.

        Keyword options:
            weights: per-sample weights, reshaped to length n. When omitted,
                every sample gets weight 1.

        After the call the instance holds ``weights`` (n,), ``beta`` (p, 1),
        ``intercept`` (1, 1) and ``coef`` (p + 1, 1), the intercept stacked
        on top of ``beta``.
        """
        n_samples, n_features = x.shape

        # Resolve the sample weights first; the estimator always receives them.
        if 'weights' not in options:
            self.weights = np.ones([n_samples,])
        else:
            self.weights = np.reshape(options['weights'], [n_samples,])

        self.model.fit(x, y, sample_weight = self.weights)

        # Store the parameters as column vectors so they can be stacked.
        self.beta = np.reshape(self.model.coef_, [n_features, 1])
        self.intercept = np.reshape(self.model.intercept_, [1, 1])
        self.coef = np.concatenate((self.intercept, self.beta), axis = 0)

    def predict(self, x_test):
        """Return ``x_test @ beta + intercept`` for the fitted parameters."""
        linear_term = np.matmul(x_test, self.beta)
        return linear_term + self.intercept

In [3]:
# Load the feature matrix and target array of each of the six environments.
X_list, Y_list = [], []
for env in range(6):
    X_list.append(np.load('House_Data/X' + str(env) + '.npy'))
    Y_list.append(np.load('House_Data/Y' + str(env) + '.npy'))

In [4]:
for env in range(6):
    # All-ones weights, i.e. statistics of the unweighted sample.
    uniform_weights = np.ones(Y_list[env].shape)
    w_stat = weighted_stat(X_list[env], uniform_weights)
    print('======Envrionment %4d============' % (env))
    print('Original Correlation: ', w_stat['mean_corr'])
    print('Original Collinearity: ', w_stat['CN'])

Original Correlation:  0.19463977288355847
Original Collinearity:  41.28751156758869
Original Correlation:  0.20517168815940565
Original Collinearity:  42.333709959295646
Original Correlation:  0.19539134384534926
Original Collinearity:  34.80208300624458
Original Correlation:  0.1844833299748109
Original Collinearity:  33.44420625408224
Original Correlation:  0.1786383434270527
Original Collinearity:  31.203442363271016
Original Correlation:  0.16375951007089282
Original Collinearity:  31.941753947565097


In [5]:
# Draw a reproducible 500-row training sample from one environment and keep
# the remaining rows of that environment as the i.i.d. test set.
np.random.seed(0)
train_ind = 0
x_train_whole, y_train_whole = X_list[train_ind], Y_list[train_ind]
sample_index = np.random.choice(x_train_whole.shape[0], 500, replace=False)
# Complement of the sample, in ascending row order. setdiff1d avoids scanning
# ``sample_index`` once per row as a Python-level membership test would.
unsample_index = np.setdiff1d(np.arange(len(x_train_whole)), sample_index)

x_train, y_train = x_train_whole[sample_index], y_train_whole[sample_index]
# The scaler is fitted on the training sample only and then reused for every
# test set.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_iid_test, y_iid_test = scaler.transform(x_train_whole[unsample_index]), y_train_whole[unsample_index]

In [6]:
# Statistics of the scaled training sample under uniform (all-ones) weights,
# used as the baseline before any reweighting.
# NOTE(review): 'CN' presumably stands for condition number - confirm in utils.
w_stat = weighted_stat(x_train_scaled, np.ones(y_train.shape))
print('Original Correlation: ', w_stat['mean_corr'])
print('Original Collinearity: ', w_stat['CN'])

Original Correlation:  0.2170941630425216
Original Collinearity:  51.51486885287824


In [7]:
# Baseline: fit on the scaled training sample without passing weights, so
# OLS.fit falls back to a weight of 1 for every sample.
ols = OLS()
ols.fit(x_train_scaled, y_train)

In [8]:
# Score the baseline model on every environment. For the training environment
# only the held-out rows are used; the others are scaled with the training
# scaler and used in full.
rmse_ols = []
for env in range(6):
    if env != train_ind:
        x_test, y_test = scaler.transform(X_list[env]), Y_list[env]
    else:
        x_test, y_test = x_iid_test, y_iid_test
    y_pred = ols.predict(x_test)
    rmse_ols.append(cal_prediction_error(y_test, y_pred, 'rmse'))

In [9]:
# Mean and standard deviation (np.std default, ddof=0) of the per-environment errors.
print('ave_RMSE(std) of OLS: %.4f(%.4f)'%(np.mean(rmse_ols), np.std(rmse_ols)))

ave_RMSE(std) of OLS: 365078.1571(137313.6264)


In [10]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

def column_wise_resampling(x, replacement = False, random_state = 0, **options):
    """
    Resample the columns of ``x`` independently of each other to break the
    joint distribution p(x) while keeping each column's marginal.

    Parameters
    ----------
    x : 2-D array of shape (n, p).
    replacement : if True, each selected column is redrawn with replacement;
        otherwise it is permuted, which retains every observed value of that
        feature.
    random_state : seed for the private ``RandomState`` used for all draws.
    sensitive_variables (keyword option) : iterable of column indices to
        resample. Defaults to all p columns.

    Returns
    -------
    A new float array of shape (n, p). Columns listed in
    ``sensitive_variables`` are resampled; all other columns are copied from
    ``x`` unchanged. ``x`` itself is not modified.
    """
    rng = np.random.RandomState(random_state)
    n, p = x.shape
    sensitive_variables = options.get('sensitive_variables', range(p))
    # Start from a float copy of the input so that columns which are NOT
    # resampled keep their original values instead of being zero-filled.
    x_decorrelation = np.array(x, dtype=float)
    for i in sensitive_variables:
        var = x[:, i]
        if replacement: # sampling with replacement, one scalar draw per row
            x_decorrelation[:, i] = np.array([var[rng.randint(0, n)] for _ in range(n)])
        else: # permutation
            x_decorrelation[:, i] = var[rng.permutation(n)]
    return x_decorrelation

def decorrelation(x, solver = 'adam', hidden_layer_sizes = (2,), max_iter = 500, random_state = 0, clip_range = 0.9):
    """
    Calculate new sample weights by density ratio estimation
           q(x)   P(x belongs to q(x) | x)
    w(x) = ---- = ------------------------
           p(x)   P(x belongs to p(x) | x)

    p(x) is the observed sample ``x`` and q(x) is its column-wise permuted
    copy. An ``MLPClassifier`` is trained to tell the two apart and its
    predicted probabilities are turned into the ratio above.

    Parameters
    ----------
    x : 2-D array of shape (n, p).
    solver, hidden_layer_sizes, max_iter : forwarded to ``MLPClassifier``.
    random_state : seed used both for the column-wise resampling and for the
        classifier.
    clip_range : upper bound applied to the predicted probability; the lower
        bound is ``1 - clip_range``. Must lie in [0.5, 1].

    Returns
    -------
    Array of shape (n,) with one weight per row of ``x``, scaled to mean 1.

    Raises
    ------
    ValueError : if ``clip_range`` is outside [0.5, 1]. Below 0.5 the clip
        bounds would be inverted, above 1 the lower bound would be negative.
    """
    if not 0.5 <= clip_range <= 1.:
        raise ValueError('clip_range must lie in [0.5, 1], got %r' % (clip_range,))

    n, p = x.shape
    x_decorrelation = column_wise_resampling(x, random_state = random_state)

    # Stack the original rows (label 1, source distribution) on top of the
    # resampled rows (label 0, target distribution).
    Z = np.concatenate([np.asarray(x, dtype=float), x_decorrelation], axis=0)
    labels = np.concatenate([np.ones(n, dtype=int), np.zeros(n, dtype=int)])

    # Train a binary classifier to classify the source and target distribution
    clf = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes, max_iter=max_iter, random_state=random_state)
    clf.fit(Z, labels)
    # Column 1 is P(label == 1 | x); only the first n rows (the original
    # samples) receive a weight.
    proba = np.clip(clf.predict_proba(Z)[:n, 1], 1-clip_range, clip_range)
    weights = (1./proba) - 1. # (1 - P(p|x)) / P(p|x), the density ratio
    weights /= np.mean(weights) # normalize the weights to get average 1
    weights = np.reshape(weights, [n,])
    return weights

In [11]:
# Reweight the training sample with the decorrelation weights, report the
# weighted statistics, then refit OLS with those weights.
ols_our = OLS()
w = decorrelation(x_train_scaled, max_iter=1000, hidden_layer_sizes=(3,))
w_stat = weighted_stat(x_train_scaled, w)
print('Decorrelated Correlation: ', w_stat['mean_corr'])
print('Decorrelated Collinearity: ', w_stat['CN'])
print('Start fitting')

ols_our.fit(x_train_scaled, y_train, weights=w)
# Same evaluation protocol as the baseline: held-out rows for the training
# environment, the full (scaled) data for every other environment.
rmse_ols_our = []
for i in range(6):
    if i == train_ind:
        x_test, y_test = x_iid_test, y_iid_test
    else:
        x_test, y_test = scaler.transform(X_list[i]), Y_list[i]
    # Request 'rmse' explicitly, as the baseline evaluation does, so the
    # number printed as RMSE does not depend on the helper's default metric.
    rmse_ols_our.append(cal_prediction_error(y_test, ols_our.predict(x_test), 'rmse'))

print('ave_RMSE(std) of OLS+Our: %.4f(%.4f)'%(np.mean(rmse_ols_our), np.std(rmse_ols_our)))

Decorrelated Correlation:  0.15337498374949854
Decorrelated Collinearity:  17.197719725581518
Start fitting
ave_RMSE(std) of OLS+Our: 249206.1521(47224.5792)
