In [1]:
# Make sure the local `helpers` package can be imported: put the parent of the
# current working directory (assumed to be the project root) on sys.path.
import os
import sys

# os.path.dirname() yields the same head as os.path.split()[0], but without
# binding the throwaway name `_`, which IPython uses for the last cell output.
project_path = os.path.dirname(os.getcwd())
if project_path not in sys.path:
    # Prepend so this directory is searched before the other sys.path entries.
    sys.path.insert(0, project_path)

In [2]:
# Dependencies (install them into the kernel's environment if missing):
#   pip install numpy
#   pip install pandas
#   pip install scikit-learn   # PyPI name of the package imported as `sklearn`

# Ignore all warnings (note: this also hides deprecation notices)
import warnings

warnings.simplefilter("ignore")

# Import libraries
import numpy as np
import pandas as pd

## Load an example dataset

In [3]:
# Example data: the Boston house-prices dataset shipped with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn -- confirm the pinned version.
from sklearn.datasets import load_boston

dataset = load_boston()

# Feature matrix, target vector and column names used by the sections below
X, y = dataset.data, dataset.target
feature_names = dataset.feature_names

# Print the description text bundled with the dataset
print(dataset.get('DESCR'))

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

## Remove the effect of covariates

### 1. Use a standalone function

In [4]:
from helpers.utils.transformers import remove_effect_of_covariates

# Show the original features (first 8 rows and first 8 columns)
print("\nOriginal features\n")
print(X[0:8, 0:8])

# Create some artificial data to play the role of covariates: one row per
# sample and 3 standard-normal columns. A seeded generator is used so that
# "Restart & Run All" reproduces the same covariates (and outputs) every time.
rng = np.random.default_rng(42)
covariates = rng.standard_normal((X.shape[0], 3))

# Remove the effect of covariates on a copy of the feature matrix, so that
# the original `X` stays available for the sections below.
# NOTE(review): in the previously saved output the printed slice was identical
# to the original features -- verify that the helper returns the adjusted matrix.
X_copy = remove_effect_of_covariates(X.copy(), covariates)

# Show the edited features (same slice as above, for comparison)
print("\nFeatures without the effect of covariates\n")
print(X_copy[0:8, 0:8])


Original features

[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 7.1850e+00
  6.1100e+01 4.9671e+00]
 [3.2370e-02 0.0000e+00 2.1800e+00 0.0000e+00 4.5800e-01 6.9980e+00
  4.5800e+01 6.0622e+00]
 [6.9050e-02 0.0000e+00 2.1800e+00 0.0000e+00 4.5800e-01 7.1470e+00
  5.4200e+01 6.0622e+00]
 [2.9850e-02 0.0000e+00 2.1800e+00 0.0000e+00 4.5800e-01 6.4300e+00
  5.8700e+01 6.0622e+00]
 [8.8290e-02 1.2500e+01 7.8700e+00 0.0000e+00 5.2400e-01 6.0120e+00
  6.6600e+01 5.5605e+00]
 [1.4455e-01 1.2500e+01 7.8700e+00 0.0000e+00 5.2400e-01 6.1720e+00
  9.6100e+01 5.9505e+00]]

Features without the effect of covariates

[[6.3200e-03 1.8000e+01 2.3100e+00 0.0000e+00 5.3800e-01 6.5750e+00
  6.5200e+01 4.0900e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
  7.8900e+01 4.9671e+00]
 [2.7

### 2. Use a scikit-learn-style transformer

In [5]:
from helpers.utils.transformers import CovariateController

# Create some artificial data to play the role of covariates: one row per
# sample and 3 standard-normal columns. A seeded generator is used so that
# "Restart & Run All" reproduces the same covariates (and outputs) every time.
rng = np.random.default_rng(42)
covariates = rng.standard_normal((X.shape[0], 3))

# Wrap the covariates in a DataFrame whose columns are named "0", "1", "2"
df_covariates = pd.DataFrame(
    covariates,
    columns=[str(i) for i in range(covariates.shape[1])],
)

# Create the copy of the feature matrix and label its columns with the
# dataset's feature names
features = X.copy()
df_features = pd.DataFrame(features, columns=feature_names)

# Remove the effect of covariates; the transformer is given a copy so that
# `df_features` keeps the original values for the comparison below.
df_features_copy = df_features.copy()
transformer = CovariateController()
df_features_transformed = transformer.fit_transform(df_features_copy, df_covariates)

In [6]:
# Preview the untouched feature table (first five rows)
print("\nOriginal features\n")
df_features.head(n=5)


Original features



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
# Preview the feature table after covariate removal (first five rows)
print("\nFeatures without the effect of covariates\n")
df_features_transformed.head(n=5)


Features without the effect of covariates



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-2.636703,6.816682,-8.468124,-0.055086,-0.003951,0.280146,0.7392,0.036082,-8.103266,-96.373821,-3.213398,33.017913,-6.795601
1,-3.56582,-9.176408,-4.290458,-0.050436,-0.082614,0.15204,11.721988,1.151685,-8.022078,-167.939196,-0.71437,38.495615,-3.423843
2,-5.186981,-8.476843,-4.296916,-0.037329,-0.08814,0.90717,-9.96276,1.371355,-8.617808,-182.049357,-0.769502,40.961678,-9.48842
3,-3.098847,-13.754221,-8.176323,-0.069418,-0.086522,0.674089,-21.058687,2.055377,-5.600063,-169.566628,0.22022,33.843532,-9.124711
4,-4.470775,-9.986239,-8.988914,-0.050396,-0.097037,0.860841,-15.751293,2.363857,-7.06583,-193.779196,0.173689,42.739804,-7.785354
