In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import xgboost as xgb



In [2]:
### Root Mean Squared Log Error function for GridSearch
from sklearn.metrics import make_scorer
def rmsle(predicted, actual):
    """Return the root mean squared logarithmic error of two sequences.

    Each input is converted to a NumPy array, shifted by one and passed
    through the natural logarithm; the result is the square root of the
    mean squared difference between the two log-transformed arrays.
    The formula is symmetric, so swapping the arguments gives the same value.

    Raises:
        AssertionError: if the two inputs do not have the same length.
    """
    n_samples = len(predicted)
    assert n_samples == len(actual)
    log_pred, log_true = (
        np.log(np.array(values) + 1) for values in (predicted, actual)
    )
    squared_diff = (log_pred - log_true) ** 2
    return (squared_diff.sum() / n_samples) ** 0.5
# RMSLE is an error, so lower is better. greater_is_better=False makes the
# scorer return the negated RMSLE; GridSearchCV always maximises the score,
# so it then selects the candidate with the smallest error. Scores reported
# by the search are therefore negative RMSLE values.
rmsle_loss = make_scorer(rmsle, greater_is_better=False)

In [3]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
### Replace every missing value with the sentinel -1, then store
### MSSubClass as strings: its values are integers but they
### behave as unordered category labels.
train = train.fillna(-1).assign(
    MSSubClass=lambda frame: frame['MSSubClass'].astype(str)
)

In [5]:
# Every training column except the row identifier and the target.
features = [
    column for column in train.columns
    if column not in ('Id', 'SalePrice')
]

In [None]:
# Columns that are vectorized through the DictVectorizer further down.
cats = [
    'Alley', 'BldgType', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'BsmtQual', 'CentralAir', 'Condition1',
    'Condition2', 'Electrical', 'ExterCond',
    'Exterior1st', 'Exterior2nd', 'ExterQual',
    'Fence', 'FireplaceQu', 'Foundation',
    'Functional', 'GarageCond', 'GarageFinish',
    'GarageQual', 'GarageType', 'Heating',
    'HeatingQC', 'HouseStyle', 'KitchenQual',
    'LandContour', 'LandSlope', 'LotConfig',
    'LotShape', 'MasVnrType', 'MiscFeature',
    'MoSold', 'MSSubClass', 'MSZoning',
    'Neighborhood', 'PavedDrive', 'PoolQC',
    'RoofMatl', 'RoofStyle', 'SaleCondition',
    'SaleType', 'Street', 'Utilities',
]

# Columns that are stacked onto the feature matrix with their raw values.
nums = [
    'GarageYrBlt', 'LotFrontage', 'MasVnrArea',
    '1stFlrSF', '2ndFlrSF', '3SsnPorch',
    'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'EnclosedPorch', 'Fireplaces', 'FullBath',
    'GarageArea', 'GarageCars', 'GrLivArea',
    'HalfBath', 'KitchenAbvGr', 'LotArea',
    'LowQualFinSF', 'MiscVal', 'OpenPorchSF',
    'OverallCond', 'OverallQual', 'PoolArea',
    'ScreenPorch', 'TotalBsmtSF', 'TotRmsAbvGrd',
    'WoodDeckSF', 'YearBuilt', 'YearRemodAdd',
    'YrSold',
]

In [6]:
from sklearn.feature_extraction import DictVectorizer
# Turn each training row into a {column: value} dict once and reuse it, rather
# than transposing the frame and rebuilding the dicts for fit and again for
# transform. DictVectorizer one-hot encodes string values and passes numeric
# values through under the column name.
train_records = train[features].to_dict('records')
# `dv` stays fitted on the training rows so later data can be transformed
# into the same feature space.
dv = DictVectorizer()
data_matrix = dv.fit_transform(train_records)

In [None]:
# Encode the categorical columns with the already-fitted DictVectorizer, then
# append the numeric columns unchanged on the right. `cat_matrix` has to be
# built here: nothing else in the notebook defines it. This mirrors the way
# the test-set matrix is assembled (dv.transform on `cats`, hstack with `nums`).
cat_matrix = dv.transform(train[cats].to_dict('records'))
# .values hands scipy a plain ndarray instead of a DataFrame.
data_matrix = hstack([cat_matrix, train[nums].values])

In [7]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the densified feature matrix with PolynomialFeatures at its default
# settings. `poly` is kept around so the test set can be transformed with the
# same fitted expansion.
poly = PolynomialFeatures()
poly_matrix = poly.fit_transform(
    data_matrix.toarray()
)

In [8]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    poly_matrix,
    train['SalePrice'],
    test_size=0.33,
    random_state=42,
)

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
boost = xgb.XGBRegressor()
# n_estimators = [50, 100, 300, 400]
n_estimators = [50]
max_depth = [5, 8]
# learning_rate = [0.0001, 0.001, 0.01, 0.1]
learning_rate = [0.01, 0.1]
tuned_parameters = dict(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate)
grid = GridSearchCV(boost, tuned_parameters, cv=3, scoring=rmsle_loss, n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV]  n_estimators=50, learning_rate=0.01, max_depth=5, score=-0.925917728994, total= 1.9min
[CV] n_estimators=50, learning_rate=0.01, max_depth=5 ................
[CV]  n_estimators=50, learning_rate=0.01, max_depth=5, score=-0.926792663908, total= 1.9min
[CV] n_estimators=50, learning_rate=0.01, max_depth=8 ................
[CV]  n_estimators=50, learning_rate=0.01, max_depth=5, score=-0.952629809471, total= 1.9min
[CV] n_estimators=50, learning_rate=0.01, max_depth=8 ................
[CV]  n_estimators=50, learning_rate=0.01, max_depth=5, score=-0.944266610279, total= 1.9min
[CV] n_estimators=50, learning_rate=0.01, max_depth=8 ................


In [None]:
# Report the best mean CV score with its parameters, then one line per
# candidate: mean test score, its standard deviation, and the settings used.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means, stds, params = (
    grid.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

In [None]:
# Mean CV score against n_estimators, one line per learning rate, drawn for a
# single fixed max_depth.
# GridSearchCV evaluates candidates in ParameterGrid order: parameter names
# sorted alphabetically with the last one varying fastest. `means` therefore
# reshapes to (learning_rate, max_depth, n_estimators).
scores = np.array(means).reshape(len(learning_rate), len(max_depth), len(n_estimators))
# Index of the tree depth the curves are drawn for (second value when present).
depth_idx = min(1, len(max_depth) - 1)
for i, value in enumerate(learning_rate):
    # marker='o' keeps a series visible when n_estimators has a single value.
    plt.plot(n_estimators, scores[i][depth_idx], marker='o',
             label='learning_rate: ' + str(value))
plt.legend()
plt.title('max_depth = ' + str(max_depth[depth_idx]))
plt.xlabel('n_estimators')
# The search is scored with the RMSLE scorer, not mean squared error.
plt.ylabel('Mean CV score (RMSLE)');

In [None]:
# Mean CV score against n_estimators, one line per max_depth, drawn for a
# single fixed learning rate.
# GridSearchCV evaluates candidates in ParameterGrid order: parameter names
# sorted alphabetically with the last one varying fastest. `means` is laid out
# as (learning_rate, max_depth, n_estimators); reshaping with max_depth first
# would attach the scores to the wrong parameter values.
scores = np.array(means).reshape(len(learning_rate), len(max_depth), len(n_estimators))
# Index of the learning rate the curves are drawn for (second value when present).
rate_idx = min(1, len(learning_rate) - 1)
for i, value in enumerate(max_depth):
    # marker='o' keeps a series visible when n_estimators has a single value.
    plt.plot(n_estimators, scores[rate_idx][i], marker='o',
             label='max_depth: ' + str(value))
plt.legend()
plt.title('learning_rate = ' + str(learning_rate[rate_idx]))
# The x values plotted are n_estimators; max_depth is shown in the legend.
plt.xlabel('n_estimators')
# The search is scored with the RMSLE scorer, not mean squared error.
plt.ylabel('Mean CV score (RMSLE)');

In [None]:
%%time
boost = xgb.XGBRegressor(n_estimators=400, max_depth=2, learning_rate=0.1)
boost = boost.fit(X_train, y_train)

In [None]:
%%time
# Predict SalePrice for the held-out split. Despite its name, `train_preds`
# holds predictions for X_test (the hold-out rows), not for the training rows.
train_preds = boost.predict(X_test)

In [None]:
%%time
from sklearn.metrics import mean_squared_log_error
print mean_squared_log_error(y_test, train_preds)**0.5

In [None]:
# Give the test set the same preprocessing the training set received.
test = test.fillna(-1)
# MSSubClass was cast to str on the training frame before the DictVectorizer
# was fitted, so it was learned as one-hot "MSSubClass=<code>" features. Left
# as integers here it would be looked up as a numeric feature "MSSubClass",
# which the fitted vectorizer does not know and silently drops.
test['MSSubClass'] = test['MSSubClass'].astype(str)
# Categorical columns go through the fitted vectorizer; numeric columns are
# appended unchanged (.values hands scipy a plain ndarray).
test_matrix = dv.transform(test[cats].to_dict('records'))
test_data_matrix = hstack([test_matrix, test[nums].values])
# Reuse the polynomial expansion fitted on the training matrix.
test_poly = poly.transform(test_data_matrix.toarray())

In [None]:
# Predict SalePrice for the expanded test-set features with the fitted model.
preds = boost.predict(test_poly)

In [None]:
# Two-column submission table: each test Id next to its predicted SalePrice.
submission = pd.DataFrame(
    {'Id': test.Id, 'SalePrice': preds},
    columns=['Id', 'SalePrice'],
)

In [None]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
submission.to_csv('submission_xgb.csv', index=False)