Custom transformers
---

In [None]:
import pandas as pd

# Load the 'messy-bikes.csv' file (path is relative to the working directory)
data_df = pd.read_csv('messy-bikes.csv')
# Preview the first rows
data_df.head()

In [None]:
# Fraction of missing values in each column
data_df.isnull().mean()

In [None]:
import numpy as np

def preprocess_f(df):
    """Impute missing values and one-hot encode a DataFrame.

    The input is left untouched; all work is done on a copy.

    Steps:
      1. Missing values in 'temp', 'hum' and 'windspeed' are replaced by the
         mean of that column *in the DataFrame passed in*.
      2. 'yr' and 'weekday' are converted from numbers to integer-like
         strings (e.g. 1.0 -> '1') so that they are treated as categories.
      3. Missing values in the categorical columns become the explicit
         category 'missing'.
      4. Every string/categorical column is one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'temp', 'hum', 'windspeed', 'yr',
        'workingday', 'holiday', 'weekday', 'season' and 'weathersit'.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy. The set of dummy columns depends on the
        categories present in ``df``.
    """
    # Work on a copy
    df = df.copy()

    # Missing values in continuous features
    cont_vars = ['temp', 'hum', 'windspeed']
    for c in cont_vars:
        df[c] = df[c].fillna(df[c].mean())  # replace by mean

    # Explicitly convert to string values
    def as_int_str(value):
        return str(int(value))

    to_convert = ['yr', 'weekday']
    for c in to_convert:
        # Series.map replaces the deprecated DataFrame.applymap;
        # na_action='ignore' leaves missing entries as they are so that they
        # can be turned into the 'missing' category below
        df[c] = df[c].map(as_int_str, na_action='ignore')

    # .. in categorical ones: create 'missing' category
    cat_vars = ['yr', 'workingday', 'holiday', 'weekday', 'season', 'weathersit']
    df[cat_vars] = df[cat_vars].fillna('missing')

    # One-hot encoding
    df = pd.get_dummies(df)

    return df
    
# Apply the preprocessing function to the full dataset and preview the result
preprocessed = preprocess_f(data_df)
preprocessed.head()

In [None]:
# Column names after imputation and one-hot encoding
preprocessed.columns

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the function in a FunctionTransformer; validate=False so that the
# DataFrame is handed to preprocess_f as-is
preprocessor = FunctionTransformer(preprocess_f, validate=False)
preprocessed = preprocessor.fit_transform(data_df)
preprocessed.head()

In [None]:
# Transform a single row: preprocess_f is simply re-run on that row, so the
# means and dummy columns come from this row alone, not from the full dataset
preprocessor.transform(data_df.iloc[:1])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class PandasPreprocessor(BaseEstimator, TransformerMixin):
    """Turn a DataFrame preprocessing function into a scikit-learn transformer.

    ``fit`` runs the function once and remembers the resulting column
    names/order. ``transform`` runs it again and reindexes the result to
    those columns: columns missing from the new data are added and filled
    with 0, columns not seen during ``fit`` are dropped.

    Parameters
    ----------
    preprocess_f : callable
        Called as ``preprocess_f(X_df)``; must return a DataFrame.

    Attributes
    ----------
    columns_ : pandas.Index
        Columns of the preprocessed training data, set by ``fit``.
    """

    def __init__(self, preprocess_f):
        self.preprocess_f = preprocess_f

    @staticmethod
    def _check_is_dataframe(X_df):
        """Raise ``TypeError`` unless ``X_df`` is a pandas DataFrame."""
        # An explicit raise (rather than ``assert``) still runs under
        # ``python -O``; isinstance also accepts DataFrame subclasses
        if not isinstance(X_df, pd.DataFrame):
            raise TypeError(
                'Expected a pandas DataFrame, got {}'.format(
                    type(X_df).__name__))

    def fit(self, X_df, y=None):
        """Record the columns produced by preprocessing ``X_df``.

        ``y`` is ignored; it is accepted for pipeline compatibility.
        Returns ``self``.
        """
        # Check that we get a DataFrame
        self._check_is_dataframe(X_df)

        # Preprocess data
        X_preprocessed = self.preprocess_f(X_df)

        # Save columns names/order for inference time
        self.columns_ = X_preprocessed.columns

        return self

    def transform(self, X_df):
        """Preprocess ``X_df`` and align it with the columns seen in ``fit``.

        Raises ``TypeError`` if ``X_df`` is not a DataFrame and
        ``NotFittedError`` if ``fit`` has not been called.
        """
        # Local import: fail with a clear message when called before fit
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, 'columns_')

        # Check that we get a DataFrame
        self._check_is_dataframe(X_df)

        # Preprocess data
        X_preprocessed = self.preprocess_f(X_df)

        # Make sure to have the same features
        X_reindexed = X_preprocessed.reindex(columns=self.columns_, fill_value=0)

        return X_reindexed
    
# Fit on the full dataset, then transform a single row: the output is
# reindexed to the columns recorded during fit
preprocessor = PandasPreprocessor(preprocess_f)
preprocessor.fit(data_df)
preprocessor.transform(data_df.iloc[:1])

In [None]:
class PandasPreprocessor(BaseEstimator, TransformerMixin):
    """Stateful DataFrame preprocessor: imputation and one-hot encoding.

    ``fit`` learns the means of the continuous columns and the column layout
    of the preprocessed training data. ``transform`` reuses both, so new data
    is imputed with the training means and always comes out with the same
    columns in the same order.

    Attributes
    ----------
    cat_vars_, cont_vars_, to_convert_ : list of str
        Column names of the categorical features, the continuous features
        and the numeric features that are converted to strings.
    train_mean_ : pandas.Series
        Mean of each continuous column, set by ``fit``.
    columns_ : pandas.Index
        Columns of the preprocessed training data, set by ``fit``.
    """

    def __init__(self):
        # NOTE: in scikit-learn a trailing underscore conventionally marks
        # attributes learned in fit; these fixed column lists keep their
        # names so that existing code reading them continues to work
        self.cat_vars_ = ['yr', 'workingday', 'holiday', 'weekday', 'season', 'weathersit']
        self.cont_vars_ = ['temp', 'hum', 'windspeed']
        self.to_convert_ = ['yr', 'weekday']

    @staticmethod
    def _check_is_dataframe(X_df):
        """Raise ``TypeError`` unless ``X_df`` is a pandas DataFrame."""
        # An explicit raise (rather than ``assert``) still runs under
        # ``python -O``; isinstance also accepts DataFrame subclasses
        if not isinstance(X_df, pd.DataFrame):
            raise TypeError(
                'Expected a pandas DataFrame, got {}'.format(
                    type(X_df).__name__))

    @staticmethod
    def _as_int_str(value):
        """Convert a number to an integer-like string, e.g. 1.0 -> '1'."""
        return str(int(value))

    def preprocess_f(self, X_df, train_mean):
        """Return an imputed, one-hot encoded copy of ``X_df``.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Data to preprocess; it is not modified.
        train_mean : mapping or pandas.Series
            Fill value for each continuous column, indexed by column name.
        """
        # Work on a copy
        X_df = X_df.copy()

        # Missing values in continuous features
        for c in self.cont_vars_:
            X_df[c] = X_df[c].fillna(train_mean[c])

        # Explicitly convert to string values
        for c in self.to_convert_:
            # Series.map replaces the deprecated DataFrame.applymap;
            # na_action='ignore' leaves missing entries as they are so that
            # they can be turned into the 'missing' category below
            X_df[c] = X_df[c].map(self._as_int_str, na_action='ignore')

        # .. in categorical ones: create 'missing' category
        X_df[self.cat_vars_] = X_df[self.cat_vars_].fillna('missing')

        # One-hot encoding
        X_df = pd.get_dummies(X_df)

        return X_df

    def fit(self, X_df, y=None):
        """Learn the training means and the preprocessed column layout.

        ``y`` is ignored; it is accepted for pipeline compatibility.
        Returns ``self``.
        """
        # Check that we get a DataFrame
        self._check_is_dataframe(X_df)

        # Save train mean for continuous variables
        self.train_mean_ = X_df[self.cont_vars_].mean()

        # Preprocess data
        X_preprocessed = self.preprocess_f(X_df, self.train_mean_)

        # Save columns names/order for inference time
        self.columns_ = X_preprocessed.columns

        return self

    def transform(self, X_df):
        """Preprocess ``X_df`` using the statistics and columns from ``fit``.

        Columns missing from the new data are added and filled with 0;
        columns not seen during ``fit`` are dropped. Raises ``TypeError`` if
        ``X_df`` is not a DataFrame and ``NotFittedError`` if ``fit`` has not
        been called.
        """
        # Local import: fail with a clear message when called before fit
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, ['train_mean_', 'columns_'])

        # Check that we get a DataFrame
        self._check_is_dataframe(X_df)

        # Preprocess data
        X_preprocessed = self.preprocess_f(X_df, self.train_mean_)

        # Make sure to have the same features
        X_reindexed = X_preprocessed.reindex(columns=self.columns_, fill_value=0)

        return X_reindexed
    
# Fit on the full dataset, then transform a single row: missing values are
# filled with the means saved during fit and the columns match the fitted ones
preprocessor = PandasPreprocessor()
preprocessor.fit(data_df)
preprocessor.transform(data_df.iloc[:1])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Use our custom transformer in a pipeline: the preprocessor output is fed
# to a linear regression
pipe = Pipeline([
    ('preprocessor', PandasPreprocessor()),
    ('estimator', LinearRegression())
])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

# Separate the target ('casual') from the input features, then hold out
# 30% of the rows for testing (fixed seed for reproducibility)
y = data_df['casual']
X = data_df.drop(columns='casual')
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

# Fit on the training part and report the error on the held-out part
pipe.fit(X_tr, y_tr)
print(
    'MAE: {:.0f}'.format(
        MAE(y_te, pipe.predict(X_te))
    )
)