# Creating a feature engineering pipeline

In the previous notebook, we refactored our feature engineering steps into Scikit-learn-like classes or replaced them with transformers from Feature-engine. These new classes can be incorporated into a Scikit-learn pipeline to make the feature creation process easier.

In this notebook, we will line up all the feature transformation steps into a pipeline and execute the feature extraction in fewer lines of code.

## Data

We will work with the Air Quality Dataset from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality).

For instructions on how to download, prepare, and store the dataset, refer to notebook number 3, in the folder "01-Datasets" from this repo.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import (
    BaseEstimator,
    TransformerMixin,
    RegressorMixin,
)

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from feature_engine.creation import CyclicalTransformer, MathematicalCombination
from feature_engine.datetime import DatetimeFeatures
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures

# Load Data

In [2]:
# We pack all the data preparation steps in one function


def load_data():
    
    # Data lives here.
    filename = '../datasets/AirQualityUCI_ready.csv'

    # Load data.
    data = pd.read_csv(filename)
    
    # We'll only use sensor data, temperature
    # and relative humidity. Thus, we drop all 
    # other variables.

    drop_vars  = [var for var in data.columns if '_true' in var]
    drop_vars.append('AH')

    # Remove variables.
    data.drop(labels=drop_vars, axis=1, inplace=True)
    
    # Cast date variable in datetime format.

    data['Date_Time'] = pd.to_datetime(data['Date_Time'])

    # Set the index to the timestamp.

    data.index = data['Date_Time']

    # Sanity: sort index.

    data.sort_index(inplace=True)
    
    # Remove data with a lot of missing information.

    # Check our notebook in section 2 to understand
    # why we perform this step

    data = data[(
        data['Date_Time'] >= '2004-04-01') &
        (data['Date_Time'] <= '2005-04-30')
    ]
    
    variables = [var for var in data.columns if var != 'Date_Time']
    
    
    data = data.loc[(data[variables]>0).all(axis=1)]
    
    return data

In [3]:
data = load_data()

data.head()

Unnamed: 0_level_0,Date_Time,CO_sensor,NMHC_sensor,NOX_sensor,NO2_sensor,O3_sensor,T,RH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-04-04 00:00:00,2004-04-04 00:00:00,1224.0,892.0,884.0,1580.0,923.0,16.7,56.5
2004-04-04 01:00:00,2004-04-04 01:00:00,1215.0,843.0,929.0,1551.0,862.0,15.9,59.2
2004-04-04 02:00:00,2004-04-04 02:00:00,1115.0,782.0,980.0,1500.0,752.0,15.2,62.4
2004-04-04 03:00:00,2004-04-04 03:00:00,1124.0,793.0,965.0,1521.0,791.0,14.7,65.0
2004-04-04 04:00:00,2004-04-04 04:00:00,1028.0,682.0,1090.0,1448.0,697.0,14.3,65.3


# Our Feature engineering classes

## Lag Features

In [4]:
class LagFeatures(BaseEstimator, TransformerMixin):

    def __init__(self, features, frequency, label):

        # In the init we specify the parameters that
        # the user needs to pass to start the transformer.

        # The user needs to indicate which features to lag,
        # how much we should lag the variables, 
        # and the name for the new variables.
        
        self.features = features
        self.frequency = frequency
        self.label = label

    def fit(self, X, y=None):

        # We do not need to learn parameters

        return self

    def transform(self, X):

        # We lag the features

        # We make a copy not to over-write the original data
        X = X.copy()

        # Shift the data forward.
        tmp = X[self.features].shift(freq=self.frequency)

        # Name the new variables.
        tmp.columns = [v + self.label for v in self.features]

        # Add the variables to the original data.
        X = X.merge(tmp, left_index=True, right_index=True, how='left')

        return X

## Window features

In [5]:
class WindowFeatures(BaseEstimator, TransformerMixin):

    def __init__(self, features, window, frequency):

        # In the init we specify the parameters that
        # the user needs to pass to start the transformer.

        # The user needs to indicate the features to use for the computation
        # the size of the window,
        # and the frequency to shift forward.
        
        self.features = features
        self.window = window
        self.frequency = frequency

    def fit(self, X, y=None):

        # We do not need to learn parameters

        return self

    def transform(self, X):

        # First we calculate the average of the feature in
        # the indicated window, then we shift the value forward
        # based on the indicated frequency.

        X = X.copy()

        tmp = (X[self.features]
               .rolling(window=self.window).mean()
               .shift(freq=self.frequency)
               )

        # Rename the columns
        tmp.columns = [v + '_window' for v in self.features]

        # Add the variables to the original data.
        X = X.merge(tmp, left_index=True, right_index=True, how='left')

        return X

## Seasonality Features

Note that this is the only class that learns parameters from the data!

In [6]:
class SeasonalTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, season_var, variables):

        # In the init we specify the parameters that
        # the user needs to pass to start the transformer.

        # The user needs to indicate the seasonal variable
        # and the variables that should be aggregated.

        self.season_var = season_var
        self.variables = variables

    def fit(self, X, y=None):

        # We want to estimate the mean value of the
        # time series in the seasonal term.

        # In our demo, that is the mean pollutant's 
        # concentration per hour.

        # We make a copy of the dataframe 
        # not to over-write the user's data.
        X = X.copy()

        # Calcualte mean pollutant per hr.
        # The learned values will be stored in this attribute.
        self.seasonal_ = X.groupby(self.season_var)[self.variables].mean()

        # Rename the new variables.
        self.seasonal_.columns = [v + '_season' for v in self.variables]

        return self

    def transform(self, X):

        # We want to add the seasonal component to the
        # dataset to transform.

        X = X.copy()

        X = X.merge(self.seasonal_, on=self.season_var, how='left')

        return X

# Feature Engineering Pipeline

In [7]:
# Some hard coded values

# Original variables
variables = [var for var in data.columns if var !='Date_Time']

# Pollutants (our targets)
pollutants = variables[:-2]

lag_1 = ['CO_sensor_lag_1',
         'NMHC_sensor_lag_1',
         'NOX_sensor_lag_1',
         'NO2_sensor_lag_1',
         'O3_sensor_lag_1',
         'T_lag_1',
         'RH_lag_1']

lag_24 = ['CO_sensor_lag_24',
          'NMHC_sensor_lag_24',
          'NOX_sensor_lag_24',
          'NO2_sensor_lag_24',
          'O3_sensor_lag_24',
          'T_lag_24',
          'RH_lag_24']

In [8]:
engineering_pipe = Pipeline([

    # Extract datetime features
    ('datetime_features', DatetimeFeatures(
        features_to_extract=["month",
                             "week_of_the_year",
                             "day_of_the_week",
                             "day_of_the_month",
                             "hour",
                             "weekend",
                            ],
        drop_original=False)),

    # Lag Features
    ('lag_1', LagFeatures(variables, '1H', '_lag_1')),
    ('lag_24', LagFeatures(variables, '24H', '_lag_24')),

    # Window Features
    ('window_features', WindowFeatures(variables, '3H', '1H')),

    # Combine pollutants
    ('Combine_lag_1', MathematicalCombination(
        variables_to_combine=lag_1,
        math_operations=['sum'],
        new_variables_names=['total_poll_lag_1'],
        missing_values='ignore')),

    ('Combine_lag_24', MathematicalCombination(
        variables_to_combine=lag_24,
        math_operations=['sum'],
        new_variables_names=['total_poll_lag_1'],
        missing_values='ignore')),

    # Periodic features
    ('Periodic', CyclicalTransformer(
        variables=['Date_Time_month', 'Date_Time_hour'],
        drop_original=False)),

    # Drop missing data
    ('dropna', DropMissingData(missing_only=True)),

])

In [9]:
print(data.shape)

data = engineering_pipe.fit_transform(data)

print(data.shape)

(7379, 8)
(6892, 40)


# Naive Forecast

We will predict the pollutant's concentration at time t with its value at time t-1. We will use this model as a benchmark. We will compare the results of Linear Regression against this benchmark.

In [10]:
class NaiveForecast(BaseEstimator, RegressorMixin):

    def __init__(self, predictor_var):

        # In the init we specify the parameters that
        # the user needs to pass to start the transformer.

        # The user needs to indicate the seasonal variable
        # and the variables that should be aggregated.

        self.predictor_var = predictor_var

    def fit(self, X, y):

        # We want to estimate the mean value of the
        # time series in the seasonal term.

        return self

    def predict(self, X):

        # We want to add the seasonal component to the
        # dataset to transform.

        return X[self.predictor_var]

## Test the Naive Forecaster

In [11]:
# Test our predictor class

naive = NaiveForecast(predictor_var='CO_sensor_lag_1')

naive.fit(data, data['CO_sensor'])

preds = naive.predict(data)

preds

Date_Time
2004-04-05 00:00:00    1188.0
2004-04-05 01:00:00    1065.0
2004-04-05 02:00:00     999.0
2004-04-05 03:00:00     911.0
2004-04-05 04:00:00     873.0
                        ...  
2005-04-04 10:00:00    1297.0
2005-04-04 11:00:00    1314.0
2005-04-04 12:00:00    1163.0
2005-04-04 13:00:00    1142.0
2005-04-04 14:00:00    1003.0
Name: CO_sensor_lag_1, Length: 6892, dtype: float64

## Forecast

In [12]:
# Load Data

data = load_data()

In [13]:
# Create features

data = engineering_pipe.transform(data)

In [14]:
# Split the data

X_train = data[data.index<='2005-03-04']
X_test = data[data.index>'2005-03-04']

X_train.shape, X_test.shape

((6398, 40), (494, 40))

In [15]:
# Forecast

naive = NaiveForecast(predictor_var='CO_sensor_lag_1')

naive.fit(X_train, data['CO_sensor'])

NaiveForecast(predictor_var='CO_sensor_lag_1')

In [16]:
# Evaluate the forecast: train set

print('X_train rmse: ',
      mean_squared_error(
          X_train['CO_sensor'], naive.predict(X_train), squared=False,
      ))

X_train rmse:  103.14617454328304


In [17]:
# Evaluate the forecast: test set

print('X_test rmse: ',
      mean_squared_error(
          X_test['CO_sensor'], naive.predict(X_test), squared=False,
      ))

X_test rmse:  111.12015235369915


# Linear Regression

Random Forest regression models are not very good at extrapolating values beyond those observed in the data. So let's now train a machine learning pipeline with a Lasso regression for comparison.

In [18]:
# these are not predictor variables
# yet, they are in the dataset, so we
# need to be careful to drop them, before
# training the lasso

vars_to_drop = ['Date_Time',
                'CO_sensor',
                'NMHC_sensor',
                'NOX_sensor',
                'NO2_sensor',
                'O3_sensor',
                'T',
                'RH']

In [19]:
regression_pipe = Pipeline([
    
    # create seasonal features
    ('seasonal', SeasonalTransformer(
        season_var='Date_Time_hour', variables=pollutants)
    ),
    
    # drop original (raw) variables. These are only 
    # needed to capture the seasonality.
    ('drop_vars', DropFeatures(features_to_drop=vars_to_drop)),
    
    # scaler
    ('scaler', MinMaxScaler()),
    
    # random forests
    ('lasso', Lasso(random_state=0)),    
])

In [20]:
regression_pipe.fit(X_train, X_train['CO_sensor'])

Pipeline(steps=[('seasonal',
                 SeasonalTransformer(season_var='Date_Time_hour',
                                     variables=['CO_sensor', 'NMHC_sensor',
                                                'NOX_sensor', 'NO2_sensor',
                                                'O3_sensor'])),
                ('drop_vars',
                 DropFeatures(features_to_drop=['Date_Time', 'CO_sensor',
                                                'NMHC_sensor', 'NOX_sensor',
                                                'NO2_sensor', 'O3_sensor', 'T',
                                                'RH'])),
                ('scaler', MinMaxScaler()), ('lasso', Lasso(random_state=0))])

In [21]:
print('X_train rmse: ',
      mean_squared_error(
          X_train['CO_sensor'], regression_pipe.predict(X_train), squared=False,
      ))

X_train rmse:  92.11615316930721


In [22]:
print('X_test rmse: ',
      mean_squared_error(
          X_test['CO_sensor'], regression_pipe.predict(X_test), squared=False,
      ))

X_test rmse:  93.39112295426037


We can see an uplift with respect to the Naive forecast.