In [253]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

from modules.TargetAverager import TargetAverager
from modules.Probe import Probe


# sklearn.set_config(transform_output='pandas')


pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

df = pd.read_csv("../house_sales.csv")

print(df['property_id'].nunique())
print(df.shape)
df.info()

1584
(1584, 74)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584 entries, 0 to 1583
Data columns (total 74 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   baths                        1584 non-null   int64  
 1   baths_full                   1584 non-null   int64  
 2   baths_half                   1584 non-null   int64  
 3   beds                         1584 non-null   int64  
 4   garage                       1584 non-null   int64  
 5   lot_sqft                     1584 non-null   float64
 6   sold_price                   1584 non-null   float64
 7   sqft                         1584 non-null   float64
 8   stories                      1584 non-null   int64  
 9   type                         1584 non-null   object 
 10  year_built                   1584 non-null   int64  
 11  is_price_reduced             1584 non-null   bool   
 12  city                         1584 non-null   object 
 13  la

In [254]:
y = df["sold_price"]
X = df.drop(["sold_price", "property_id", "state_code", 'lat','postal_code'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(X.shape[1])

69


In [255]:
def numFeat(data):
    filter = data.dtypes[[pd.api.types.is_numeric_dtype(dtype) for dtype in data.dtypes]].index.tolist()
    return data[filter]

def catFeat(data):
    filter = data.dtypes[data.dtypes == 'object'].index.tolist()
    return data[filter]


keep_num = FunctionTransformer(numFeat)
keep_cat = FunctionTransformer(catFeat)

In [256]:
# mlb = MultiLabelBinarizer()
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
cat_imputer = SimpleImputer()
num_imputer = SimpleImputer()
class_encoder = OneHotEncoder()
city_averager = TargetAverager("city")
state_averager = TargetAverager("state")
skbest = SelectKBest(k = 12)

In [257]:
class ToDenseTransformer:
    def transform(self, X, y=None, **fit_params):
        return X.toarray()

    def fit(self, X, y=None, **fit_params):
        return self

densifier = ToDenseTransformer()

In [258]:
# Potential models
ridge_model = Ridge()
lasso_model = Lasso()
rf_model = RandomForestRegressor(n_estimators=100)
gb_model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.01)
xgb_model = xgb.XGBRegressor()

In [259]:
num_pipe = Pipeline([
    ('NumFilter', keep_num),
    # ('Scaler', standard_scaler),
    ('Imputer', num_imputer),
    ("skbest", skbest)
])

cat_pipe = Pipeline([
    ('CatFilter', keep_cat),
    ('OHEncoder', class_encoder),
    # ('Densifier', densifier),
    # ('Imputer', cat_imputer)
])

# list_pipe = Pipeline([
#     ('ListFilter', keep_list),
#     ('MultiLabelBinarizer', mlb),
#     ('Imputer', imputer)
# ])

pre_processing_pipeline = FeatureUnion([("num", num_pipe), ("cat", cat_pipe)])

In [260]:
pipeline = Pipeline([("CityAverager", city_averager), ("StateAverager", state_averager), ("preprocessing", pre_processing_pipeline), ("model", rf_model)])

In [261]:
pipeline.fit(X_train, y_train)

In [262]:
print(skbest.get_feature_names_out())
print(X_train.info())

['x0' 'x1' 'x6' 'x7' 'x8' 'x35' 'x38' 'x39' 'x51' 'x56' 'x66' 'x67']
<class 'pandas.core.frame.DataFrame'>
Index: 1267 entries, 552 to 1126
Data columns (total 69 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   baths                        1267 non-null   int64  
 1   baths_full                   1267 non-null   int64  
 2   baths_half                   1267 non-null   int64  
 3   beds                         1267 non-null   int64  
 4   garage                       1267 non-null   int64  
 5   lot_sqft                     1267 non-null   float64
 6   sqft                         1267 non-null   float64
 7   stories                      1267 non-null   int64  
 8   type                         1267 non-null   object 
 9   year_built                   1267 non-null   int64  
 10  is_price_reduced             1267 non-null   bool   
 11  city                         1267 non-null   object 
 12  lon       

In [263]:
y_pred = pipeline.predict(X_test)

In [264]:
print("R2:" + str(r2_score(y_test, y_pred)))
print("RMSE:" + str(mean_squared_error(y_test, y_pred, squared=False)))

R2:0.6646226683383307
RMSE:137557.39225120976


In [265]:
params = [
    # {
    #     "model": [ridge_model, lasso_model],
    #     "model__alpha": [0.1, 0.5, 1],
    #     "preprocessing__num__skbest__k": [20, 28, 36, 44]
    # },
    {
        "model": [rf_model],
        # "model__n_estimators": [200, 250, 350, 500],
        "model__n_estimators": [100],
        "model__min_samples_split": [4, 5, 6, 7, 8, 9, 10],
        # "preprocessing__num__skbest__k": [28, 36, 44, 52, 60]
        "preprocessing__num__skbest__k": [44]
    },
    # {
    #     "model": [xgb_model],
    #     "model__n_estimators": [50, 75, 100],
    #     "model__max_leaves": [2, 3, 4, 0],
    #     "preprocessing__num__skbest__k": [8, 20, 28]
    # }
    # {
    #     "model": [gb_model],
    #     "model__n_estimators": [25, 50, 75, 100],
    #     "model__min_samples_split": [3, 4],
    #     # "model__learning_rate": [0.01, 0.1],
    #     "preprocessing__num__skbest__k": [28, 36, 44, 52]
    # }
]

grid_search = GridSearchCV(pipeline, params, verbose=3, scoring='neg_mean_squared_error', refit=True)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[CV 1/5] END model=RandomForestRegressor(), model__min_samples_split=4, model__n_estimators=100, preprocessing__num__skbest__k=44;, score=-35167969887.837 total time=   1.6s
[CV 2/5] END model=RandomForestRegressor(), model__min_samples_split=4, model__n_estimators=100, preprocessing__num__skbest__k=44;, score=-24335607318.455 total time=   1.5s
[CV 3/5] END model=RandomForestRegressor(), model__min_samples_split=4, model__n_estimators=100, preprocessing__num__skbest__k=44;, score=-21931289482.981 total time=   1.4s
[CV 4/5] END model=RandomForestRegressor(), model__min_samples_split=4, model__n_estimators=100, preprocessing__num__skbest__k=44;, score=-36242048886.994 total time=   1.5s
[CV 5/5] END model=RandomForestRegressor(), model__min_samples_split=4, model__n_estimators=100, preprocessing__num__skbest__k=44;, score=-23757111499.395 total time=   1.5s
[CV 1/5] END model=RandomForestRegressor(), model__min_samples_split=5, model__n_estimators=100, preprocessing__num__skbest__k=44;

We want to make sure that we save our models.  In the old days, one just simply pickled (serialized) the model.  Now, however, certain model types have their own save format.  If the model is from sklearn, it can be pickled, if it's xgboost, for example, the newest format to save it in is JSON, but it can also be pickled.  It's a good idea to stay with the most current methods. 
- you may want to create a new `models/` subdirectory in your repo to stay organized

In [266]:
y_pred = grid_search.predict(X_test)

print("R2:" + str(r2_score(y_test, y_pred)))
print("RMSE:" + str(mean_squared_error(y_test, y_pred, squared=False)))

R2:0.6960391295055837
RMSE:130956.15682667466


In [272]:
import pickle

with open('models/tuned_rf.pkl', 'wb') as handle:
    pickle.dump(grid_search, handle)

with open('models/tuned_rf.pkl', 'rb') as handle:
    grid_search = pickle.load(handle)

y_pred = grid_search.predict(X_test)

print("R2:" + str(r2_score(y_test, y_pred)))
print("RMSE:" + str(mean_squared_error(y_test, y_pred, squared=False)))

Once you've identified which model works the best, implement a prediction pipeline to make sure that you haven't leaked any data, and that the model could be easily deployed if desired.
- Your pipeline should load the data, process it, load your saved tuned model, and output a set of predictions
- Assume that the new data is in the same JSON format as your original data - you can use your original data to check that the pipeline works correctly
- Beware that a pipeline can only handle functions with fit and transform methods.
- Classes can be used to get around this, but now sklearn has a wrapper for user defined functions.
- You can develop your functions or classes in the notebook here, but once they are working, you should import them from `functions_variables.py` 

R2:0.6960391295055837
RMSE:130956.15682667466


Pipelines come from sklearn.  When a pipeline is pickled, all of the information in the pipeline is stored with it.  For example, if we were deploying a model, and we had fit a scaler on the training data, we would want the same, already fitted scaling object to transform the new data with.  This is all stored when the pipeline is pickled.
- save your final pipeline in your `models/` folder

In [268]:
# save your pipeline here