In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
# Source of the train_2016 table (a file hosted on Google Drive).
TRAIN_2016_URL = 'https://drive.google.com/uc?id=15GlcdLJ79bc5_WhVNViepQaXvsE1vYb8'
# The properties table is read from local disk. Set the PROPERTIES_2016_CSV
# environment variable to point at your own copy; the fallback is the original
# machine-specific location, which will not exist on other machines.
PROPERTIES_2016_PATH = os.environ.get(
    'PROPERTIES_2016_CSV', '/Users/yang/Downloads/properties_2016.csv'
)

train_2016 = pd.read_csv(TRAIN_2016_URL)
properties_2016 = pd.read_csv(PROPERTIES_2016_PATH)

In [None]:
# Attach the property attributes to every training row. The inner join keeps
# only the parcel ids that appear in both tables.
training_data = train_2016.merge(properties_2016, how='inner', on='parcelid')
training_data.shape

In [None]:
# Let's create a transformer
class BinaryNullTransformer(BaseEstimator, TransformerMixin):
    """Replace each configured column with a boolean "value is present" flag.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to convert. The names are passed to
        ``DataFrame.assign`` as keyword arguments, so they must be strings.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame in which every configured column holds ``True``
        where the original value was non-null and ``False`` otherwise.

        ``X`` itself is not modified; all other columns are carried over as-is.
        """
        presence_flags = {}
        for name in self.columns:
            presence_flags[name] = X[name].notna()
        return X.assign(**presence_flags)

In [None]:
# Run the transformer on a 20-row sample and show each presence flag next to
# the raw value it was derived from.
exist_or_not_variables = [
    'garagecarcnt',
    'yardbuildingsqft26',
    'basementsqft',
    'fireplacecnt',
    'yardbuildingsqft17',
]
transformer = BinaryNullTransformer(exist_or_not_variables)
transformed_data = transformer.fit_transform(training_data.head(20))
transformed_data[exist_or_not_variables].add_suffix('_transformed').join(
    training_data.head(20)[exist_or_not_variables]
)


In [None]:
# Let's create another transformer
class IntervalCategorizer(BaseEstimator, TransformerMixin):
    """Replace a numeric column with a flag telling whether each value lies
    inside a closed interval.

    Parameters
    ----------
    column : str
        Name of the column to convert (used as a keyword for
        ``DataFrame.assign``, so it must be a string).
    rng : sequence of two numbers, default ``(2, 4)``
        Lower and upper bound of the interval; both ends are inclusive.
    """

    def __init__(self, column, rng=(2,4)):
        self.column = column
        self.rng = rng

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame where ``column`` is ``True`` for values within
        ``[rng[0], rng[1]]`` and ``False`` everywhere else.

        ``X`` itself is not modified.
        """
        values = X[self.column]
        lower, upper = self.rng[0], self.rng[1]
        inside = (values >= lower) & (values <= upper)
        return X.assign(**{self.column: np.where(inside, True, False)})

In [None]:
# Same check for the interval transformer: the in-range flag is shown next to
# the raw bedroom count for a 20-row sample.
transformer = IntervalCategorizer('bedroomcnt')
transformed_data = transformer.fit_transform(training_data.head(20))
transformed_data['bedroomcnt'].to_frame('transformed').join(
    training_data.head(20)[['bedroomcnt']]
)

In [None]:
# Last transformer
class Normalizer(BaseEstimator, TransformerMixin):
    """Standardize the configured columns to zero mean and unit variance.

    The statistics are learned in ``fit`` and re-used in ``transform``, so a
    transformer fitted on one frame can be applied to another.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize (used as keywords for
        ``DataFrame.assign``, so they must be strings).

    Attributes
    ----------
    means : pandas.Series
        Per-column mean computed during ``fit``.
    std : pandas.Series
        Per-column standard deviation computed during ``fit`` (pandas'
        default ``DataFrame.std``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation from ``X``."""
        self.means = X[self.columns].mean()
        self.std = X[self.columns].std()

        return self

    def transform(self, X):
        """Return a new frame with each configured column replaced by
        ``(value - mean) / std``; ``X`` itself is not modified.

        A column that was constant during ``fit`` has a standard deviation of
        zero. Dividing by it would turn every value into NaN or +/-inf, so a
        scale of 1 is used for such columns and they are only centered.
        """
        # Guard against division by zero for zero-variance columns.
        scale = self.std.mask(self.std == 0, 1.0)
        return X.assign(**{col: (X[col] - self.means[col]) / scale[col] for col in self.columns})

In [None]:
# Let's try it one last time and compare with original data.
# Show the columns the Normalizer actually changes next to their raw values
# (the earlier version displayed 'bedroomcnt', which this transformer never touches).
normalized_variables = ['finishedsquarefeet12', 'structuretaxvaluedollarcnt']
transformer = Normalizer(normalized_variables)
transformed_data = transformer.fit_transform(training_data.head(20))
transformed_data[normalized_variables].rename(
    columns={col: f'{col}_transformed' for col in normalized_variables}
).join(training_data.head(20)[normalized_variables])

In [None]:
# Transformer to select our variables
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only a chosen set of columns.

    Parameters
    ----------
    columns : list of str
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        wanted = self.columns
        return X[wanted]

In [None]:
# issue - we now have 4 transformers, and it's a pain to run them manually every time
# solution - pipelines.  A Pipeline chains a sequence of transforms and an (optional) final estimator.  Let's see how this works
# Column groups, one per kind of treatment applied below.
exist_or_not_variables = [
    'garagecarcnt',
    'yardbuildingsqft26',
    'basementsqft',
    'fireplacecnt',
    'yardbuildingsqft17',
]
bedrooms = ['bedroomcnt']
bathrooms = ['fullbathcnt']
normalized_variables = ['finishedsquarefeet12', 'structuretaxvaluedollarcnt']
other_variables = ['yearbuilt']

# The steps run in the listed order; the last one drops every column that is
# not part of one of the groups above.
transformer_pipeline = Pipeline(
    steps=[
        ('binary_null', BinaryNullTransformer(columns=exist_or_not_variables)),
        ('bedrooms', IntervalCategorizer(column=bedrooms[0], rng=(2, 4))),
        ('bathrooms', IntervalCategorizer(column=bathrooms[0], rng=(2, 4))),
        ('normalize', Normalizer(columns=normalized_variables)),
        (
            'select_features',
            FeatureSelector(
                columns=(
                    exist_or_not_variables
                    + bedrooms
                    + bathrooms
                    + normalized_variables
                    + other_variables
                )
            ),
        ),
    ]
)

In [None]:
# Fit the whole pipeline on a 20-row sample, apply it to that same sample and
# display the resulting feature frame.
transformed_data = transformer_pipeline.fit_transform(training_data.iloc[:20])
transformed_data