In [None]:
import pickle

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Load the two 2016 CSV inputs used by the rest of the notebook.
TRAIN_2016_SOURCE = 'https://drive.google.com/uc?id=15GlcdLJ79bc5_WhVNViepQaXvsE1vYb8'
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
PROPERTIES_2016_SOURCE = '/Users/yang/Downloads/properties_2016.csv'

train_2016 = pd.read_csv(TRAIN_2016_SOURCE)
properties_2016 = pd.read_csv(PROPERTIES_2016_SOURCE)

In [None]:
# Data: inner-join the two inputs on `parcelid`, keeping only parcels present in both.
training_data = pd.merge(
    left=train_2016,
    right=properties_2016,
    how='inner',
    on='parcelid',
)
training_data.shape

In [None]:
# Transformers
class BinaryNullTransformer(BaseEstimator, TransformerMixin):
    """Replace each listed column with a boolean "value is present" flag.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to convert. All other columns are left as they are.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame in which every configured column holds
        ``True`` where the original entry was non-null and ``False`` otherwise.
        """
        presence_flags = {}
        for name in self.columns:
            presence_flags[name] = X[name].notnull()
        return X.assign(**presence_flags)
    
    
class IntervalCategorizer(BaseEstimator, TransformerMixin):
    """Replace one column with a boolean "value lies inside the interval" flag.

    Parameters
    ----------
    column : str
        Name of the column to convert.
    rng : sequence of two numbers, default (2, 4)
        Lower and upper bound of the interval; both ends are inclusive.
    """

    def __init__(self, column, rng=(2,4)):
        self.column = column
        self.rng = rng

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame whose ``column`` is ``True`` where
        ``rng[0] <= value <= rng[1]`` and ``False`` elsewhere.
        """
        values = X[self.column]
        lower, upper = self.rng[0], self.rng[1]
        inside = (values >= lower) & (values <= upper)
        return X.assign(**{self.column: np.where(inside, True, False)})
    
    
class Normalizer(BaseEstimator, TransformerMixin):
    """Standardise selected columns using the mean and standard deviation
    observed during ``fit``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardise. All other columns are left as they are.

    Attributes set by ``fit``
    -------------------------
    means : pandas.Series
        Per-column mean of the fitting data.
    std : pandas.Series
        Per-column standard deviation of the fitting data, with zero replaced by 1.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Record the per-column mean and standard deviation of ``X`` and return ``self``."""
        selected = X[self.columns]
        self.means = selected.mean()

        scale = selected.std()
        # A constant column has zero spread; dividing by it in transform() would
        # produce NaN/inf. Use a scale of 1 instead so such a column maps to 0.
        scale[scale == 0] = 1.0
        self.std = scale

        return self

    def transform(self, X):
        """Return a new DataFrame in which every configured column is
        ``(value - mean) / std`` using the statistics stored by ``fit``.
        """
        standardised = {
            col: (X[col] - self.means[col]) / self.std[col]
            for col in self.columns
        }
        return X.assign(**standardised)
    

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the configured columns of the incoming DataFrame.

    Parameters
    ----------
    columns : list of str
        Names of the columns to keep, in the order they should appear.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured columns."""
        kept = X[self.columns]
        return kept
    
# New transformer that applies simple imputation to every column and keeps the DataFrame form
class DataframeImputer(SimpleImputer):
    """``SimpleImputer`` whose ``transform`` returns a DataFrame instead of a bare array."""

    def transform(self, X):
        """Impute ``X`` and return the result as a DataFrame carrying ``X``'s
        column labels and row index.
        """
        imputed = super().transform(X)
        # Carry over the index as well as the columns; without it the result gets a
        # fresh 0..n-1 index and no longer lines up with the rows of the input frame.
        # NOTE(review): if the imputer ever drops a column (e.g. one that was entirely
        # missing at fit time), `columns=X.columns` will not match the array width --
        # confirm inputs never contain such columns.
        return pd.DataFrame(imputed, columns=X.columns, index=X.index)

In [None]:
# Features, grouped by the preprocessing each group receives.

# Turned into "value is present" flags.
exist_or_not_variables = [
    'garagecarcnt',
    'yardbuildingsqft26',
    'basementsqft',
    'fireplacecnt',
    'yardbuildingsqft17',
]

# Turned into "count lies inside an interval" flags (one column per list).
bedrooms = ['bedroomcnt']
bathrooms = ['fullbathcnt']

# Standardised with the mean / standard deviation of the fitting data.
normalized_variables = [
    'finishedsquarefeet12',
    'structuretaxvaluedollarcnt',
]

# Used as-is.
other_variables = ['yearbuilt']


In [None]:
# Preprocessing chain: every step receives the DataFrame produced by the step before it.
selected_features = (
    exist_or_not_variables
    + bedrooms
    + bathrooms
    + normalized_variables
    + other_variables
)

preprocessing_steps = [
    ('binary_null', BinaryNullTransformer(exist_or_not_variables)),
    ('bedrooms', IntervalCategorizer(*bedrooms, rng=(2, 4))),
    ('bathrooms', IntervalCategorizer(*bathrooms, rng=(2, 4))),
    ('normalize', Normalizer(normalized_variables)),
    ('select_features', FeatureSelector(selected_features)),
    ('impute_nulls', DataframeImputer(missing_values=np.nan, strategy='mean')),
]

transformer_pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# Put a random forest regressor behind the preprocessing pipeline.
# A Pipeline can itself be used as a step of another Pipeline.
forest = RandomForestRegressor(n_estimators=100, random_state=10)

model_pipeline = Pipeline(
    steps=[
        ('transformer', transformer_pipeline),
        ('model', forest),
    ]
)

In [None]:
# Fit the full pipeline on the training data; `logerror` is the regression target.
logerror_target = training_data['logerror']
fitted_model = model_pipeline.fit(training_data, logerror_target)

In [None]:
# Smoke test: predict for one randomly drawn training row and show it next to the row's data.
sample_row = training_data.sample(1)
sample_row.assign(prediction=fitted_model.predict(sample_row))

In [None]:
# The check above is in-sample. To estimate out-of-sample error, run 3-fold
# cross-validation and print the mean absolute error of each held-out fold.
kf = KFold(n_splits=3)

for train_idx, test_idx in kf.split(training_data):
    X_train, X_test = training_data.iloc[train_idx], training_data.iloc[test_idx]

    # Fit an unfitted copy for every fold. Pipeline.fit() fits the object in place
    # and returns that same object, so fitting `model_pipeline` directly here would
    # leave it (and `fitted_model`) trained on the last fold only.
    fold_model = clone(model_pipeline).fit(X_train, X_train['logerror'])

    y_pred = fold_model.predict(X_test)
    metric = metrics.mean_absolute_error(X_test['logerror'], y_pred)
    print(metric)


In [None]:
# Cross-validation gave us the general performance; now use it to tune the
# hyperparameters. Keys are '<pipeline step>__<parameter>', so 'model__...'
# targets the 'model' step of model_pipeline.
param_grid = {
    'model__bootstrap': [True],
    'model__max_depth': [80, 110],
    'model__max_features': [2, 3],
    'model__min_samples_leaf': [3, 5],
    'model__min_samples_split': [8, 12],
    'model__n_estimators': [100, 300]
}

# Rank candidates by mean absolute error, the metric every other evaluation in
# this notebook reports. Without `scoring`, GridSearchCV falls back to the
# estimator's own score() instead. The value is negated because GridSearchCV
# maximises its score.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(training_data, training_data['logerror'])

In [None]:
# Take the winning configuration and its refitted estimator from the search,
# then report that estimator's mean absolute error on the training data (in-sample).
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

in_sample_predictions = best_model.predict(training_data)
metrics.mean_absolute_error(training_data['logerror'], in_sample_predictions)

In [None]:
# Great, we have found our model.  Let's save it so it can be used later.
# The context manager closes the file, so the pickle is fully written to disk
# before anything tries to read it back.
with open('/Users/yang/Downloads/mfe_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Verify that the saved model works: reload it from disk and score the reloaded
# object (not the in-memory `best_model`), so the check exercises the pickle file.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('/Users/yang/Downloads/mfe_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

metrics.mean_absolute_error(training_data['logerror'], loaded_model.predict(training_data))