# Model Training

## Packages and Data

In [1]:
import os

import pandas as pd
import sklearn
from joblib import dump, load
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [2]:
# Load the training set, then separate the predictors from the target column.
train = pd.read_csv("../data/train.csv")

# Boolean mask that is True for every column except the target.
is_predictor = train.columns != 'SalePrice'
X = train.loc[:, is_predictor]

# Double brackets keep the target as a single-column DataFrame (not a Series).
y = train[['SalePrice']]

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=56,
)

In [4]:
# Display the predictor and target column indexes side by side.
predictor_columns = X.columns
target_columns = y.columns
(predictor_columns, target_columns)

(Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
        'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
        'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
        'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
        'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
        'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
        'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
        'GarageCond

In [5]:
# Display the dtype of every column in the raw training frame as a plain dict.
column_dtypes = train.dtypes
column_dtypes.to_dict()

{'Id': dtype('int64'),
 'MSSubClass': dtype('int64'),
 'MSZoning': dtype('O'),
 'LotFrontage': dtype('float64'),
 'LotArea': dtype('int64'),
 'Street': dtype('O'),
 'Alley': dtype('O'),
 'LotShape': dtype('O'),
 'LandContour': dtype('O'),
 'Utilities': dtype('O'),
 'LotConfig': dtype('O'),
 'LandSlope': dtype('O'),
 'Neighborhood': dtype('O'),
 'Condition1': dtype('O'),
 'Condition2': dtype('O'),
 'BldgType': dtype('O'),
 'HouseStyle': dtype('O'),
 'OverallQual': dtype('int64'),
 'OverallCond': dtype('int64'),
 'YearBuilt': dtype('int64'),
 'YearRemodAdd': dtype('int64'),
 'RoofStyle': dtype('O'),
 'RoofMatl': dtype('O'),
 'Exterior1st': dtype('O'),
 'Exterior2nd': dtype('O'),
 'MasVnrType': dtype('O'),
 'MasVnrArea': dtype('float64'),
 'ExterQual': dtype('O'),
 'ExterCond': dtype('O'),
 'Foundation': dtype('O'),
 'BsmtQual': dtype('O'),
 'BsmtCond': dtype('O'),
 'BsmtExposure': dtype('O'),
 'BsmtFinType1': dtype('O'),
 'BsmtFinSF1': dtype('int64'),
 'BsmtFinType2': dtype('O'),
 'BsmtF

## Train Initial Model

In [6]:
# Predictor columns used by the initial model.
var_list = [
    'OverallQual',
    'OverallCond',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
]

In [7]:
# Hyperparameters for the initial gradient-boosting regressor, gathered in one
# place and unpacked into the constructor below.
gbr_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "learning_rate": 0.05,
    "random_state": 56,
}
model = ensemble.GradientBoostingRegressor(**gbr_params)

In [8]:
# Restrict the training split to the selected predictor columns.
training_data = X_train.loc[:, var_list]

# Record each predictor's dtype as a {column name: dtype} mapping.
predictors_metadata = training_data.dtypes.to_dict()

In [9]:
# y_train is a single-column DataFrame; flatten it to a 1-D array before fitting.
target_1d = y_train.to_numpy().ravel()
model.fit(training_data, target_1d)

GradientBoostingRegressor(learning_rate=0.05, max_depth=5, n_estimators=200,
                          random_state=56)

In [10]:
# Score the fitted model on the held-out split, using the same predictor columns.
test_features = X_test[var_list]
model.score(test_features, y_test)

0.7084299719372535

In [11]:
# Write out the fitted model together with the predictor metadata
# (a mapping of predictor column name -> dtype).
# Create the output directory first so this cell also succeeds when
# "../models" does not exist yet (e.g. on a fresh checkout).
os.makedirs("../models", exist_ok=True)
dump(model, "../models/model1.joblib")
dump(predictors_metadata, "../models/model1_predictors_metadata.joblib")

['../models/model1_predictors_metadata.joblib']

In [12]:
# Optional round-trip check (disabled): uncomment the two lines below to read
# the serialized model and predictor metadata back in from "../models".
# NOTE: joblib.load is pickle-based; only load files from a trusted source.
#model2 = load("../models/model1.joblib")
#metadata = load("../models/model1_predictors_metadata.joblib")

## Train Better Model
Ideas to explore:
 - using additional data
 - predicting the log of the sale price
 - grouping string (categorical) data