In [98]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer, FunctionTransformer, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn import set_config
set_config(transform_output = "pandas")

from modules.TargetAverager import TargetAverager


pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

df = pd.read_csv("../house_sales.csv")
df.head(10)

df['baths'].dtype

dtype('float64')

In [99]:
y = df["sold_price"]
X = df.drop(["sold_price", "list_date", "line", "sold_date", "property_id"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train['baths'].dtype


dtype('float64')

In [100]:
def numFeat(data):
    filter = data.dtypes[[pd.api.types.is_numeric_dtype(dtype) for dtype in data.dtypes]].index.tolist()
    print(filter)
    return data[filter]

def catFeat(data):
    filter = data.dtypes[data.dtypes == 'object'].index.tolist()
    print(filter)
    return data[filter]


keep_num = FunctionTransformer(numFeat)
keep_cat = FunctionTransformer(catFeat)

In [101]:
# mlb = MultiLabelBinarizer()
minmax_scaler = MinMaxScaler()
imputer = SimpleImputer()
class_encoder = LabelEncoder()
target_averager = TargetAverager("city")

In [102]:
# Potential models
base_model = Ridge()
rf_model = RandomForestRegressor(n_estimators=200)
gb_model = GradientBoostingRegressor()

In [103]:
num_pipe = Pipeline([
    ('NumFilter', keep_num),
    ('MMScaler', minmax_scaler),
    ('Imputer', imputer)
    
])

cat_pipe = Pipeline([
    ('CatFilter', keep_cat),
    ('LabelEncode', class_encoder)
    # ('Imputer', imputer)
])

# list_pipe = Pipeline([
#     ('ListFilter', keep_list),
#     ('MultiLabelBinarizer', mlb),
#     ('Imputer', imputer)
# ])

pre_processing_pipeline = FeatureUnion([("num", num_pipe), ("cat", cat_pipe)])

In [104]:
pipeline = Pipeline([("TargetAverager", target_averager), ("preprocessing", pre_processing_pipeline), ("model", rf_model)])

In [105]:
pipeline.fit(X_train, y_train)

['baths', 'baths_full', 'baths_half', 'beds', 'garage', 'lot_sqft', 'sqft', 'stories', 'year_built', 'is_price_reduced', 'list_price', 'listing_id', 'lat', 'lon', 'postal_code', 'price_reduced_amount', 'basement', 'big_lot', 'big_yard', 'carport', 'central_air', 'central_heat', 'city_view', 'community_outdoor_space', 'community_security_features', 'community_swimming_pool', 'corner_lot', 'dining_room', 'disability_features', 'dishwasher', 'energy_efficient', 'ensuite', 'family_room', 'farm', 'fenced_yard', 'fireplace', 'floor_plan', 'forced_air', 'front_porch', 'garage_1_or_more', 'garage_2_or_more', 'garage_3_or_more', 'groundscare', 'hardwood_floors', 'high_ceiling', 'laundry_room', 'master_bedroom', 'modern_kitchen', 'new_roof', 'open_floor_plan', 'park', 'ranch', 'recreation_facilities', 'rental_property', 'shopping', 'single_story', 'swimming_pool', 'trails', 'two_or_more_stories', 'updated_kitchen', 'view', 'views', 'washer_dryer', 'mean']
['type', 'state', 'state_code']


TypeError: LabelEncoder.fit_transform() takes 2 positional arguments but 3 were given

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred, squared=False))

0.37659217525201494
283453.5591592913


In [None]:
# params = {
#     "model": [base_model, rf_model, gb_model]
# }

# grid_search = GridSearchCV(pipeline, params, verbose=10, refit=True)

# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)

We want to make sure that we save our models.  In the old days, one just simply pickled (serialized) the model.  Now, however, certain model types have their own save format.  If the model is from sklearn, it can be pickled, if it's xgboost, for example, the newest format to save it in is JSON, but it can also be pickled.  It's a good idea to stay with the most current methods. 
- you may want to create a new `models/` subdirectory in your repo to stay organized

In [None]:
# save your best model here

Once you've identified which model works the best, implement a prediction pipeline to make sure that you haven't leaked any data, and that the model could be easily deployed if desired.
- Your pipeline should load the data, process it, load your saved tuned model, and output a set of predictions
- Assume that the new data is in the same JSON format as your original data - you can use your original data to check that the pipeline works correctly
- Beware that a pipeline can only handle functions with fit and transform methods.
- Classes can be used to get around this, but now sklearn has a wrapper for user defined functions.
- You can develop your functions or classes in the notebook here, but once they are working, you should import them from `functions_variables.py` 

In [None]:
# Build pipeline here

Pipelines come from sklearn.  When a pipeline is pickled, all of the information in the pipeline is stored with it.  For example, if we were deploying a model, and we had fit a scaler on the training data, we would want the same, already fitted scaling object to transform the new data with.  This is all stored when the pipeline is pickled.
- save your final pipeline in your `models/` folder

In [None]:
# save your pipeline here