In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import skew
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import xgboost as xgb



In [2]:
### Root Mean Squared Log Error function for GridSearch
from sklearn.metrics import make_scorer
def rmsle(predicted, actual):
    """Root Mean Squared Logarithmic Error between two sequences.

    Computes sqrt(sum((log(1 + predicted) - log(1 + actual))**2) / len(predicted)).
    The measure is symmetric in its two arguments.

    Parameters
    ----------
    predicted, actual : array-like of numbers
        Sequences of equal length. Values must be greater than -1 for the
        logarithm to be defined.

    Returns
    -------
    numpy floating scalar
        The RMSLE value.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(predicted) == len(actual), "predicted and actual must have the same length"
    # log1p(x) keeps precision for x close to 0, where log(x + 1) rounds
    # x + 1 to exactly 1.0 and returns 0.
    log_predicted = np.log1p(np.array(predicted))
    log_actual = np.log1p(np.array(actual))
    squared_diff = (log_predicted - log_actual) ** 2
    return (squared_diff.sum() / len(predicted)) ** 0.5
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, so the scorer reports the negated value (closer to 0 is better).
# The scorer calls rmsle(y_true, y_pred); rmsle is symmetric, so the swapped
# parameter names do not change the result.
rmsle_loss = make_scorer(rmsle, greater_is_better=False)

In [3]:
# Load the training and test tables from CSV files given by relative path.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
### Conversion to float32 and int32
# Narrow every 64-bit float / integer column of both frames to 32 bits.
# The (column, dtype) pairs are taken from the frame before any conversion,
# so each column is visited once.
for frame in (train, test):
    for col, kind in zip(frame.columns, frame.dtypes):
        if kind == np.float64:
            frame[col] = frame[col].astype(np.float32)
        elif kind == np.int64:
            frame[col] = frame[col].astype(np.int32)

In [5]:
# Column names treated as categorical features.
# NOTE(review): MoSold is listed here, but in the cells visible in this
# notebook only MSSubClass is cast to str before encoding - confirm that
# MoSold is meant to stay numeric.
cats = """
    Alley BldgType BsmtCond BsmtExposure
    BsmtFinType1 BsmtFinType2 BsmtQual CentralAir
    Condition1 Condition2 Electrical ExterCond
    Exterior1st Exterior2nd ExterQual Fence
    FireplaceQu Foundation Functional GarageCond
    GarageFinish GarageQual GarageType Heating
    HeatingQC HouseStyle KitchenQual LandContour
    LandSlope LotConfig LotShape MasVnrType
    MiscFeature MoSold MSSubClass MSZoning
    Neighborhood PavedDrive PoolQC RoofMatl RoofStyle
    SaleCondition SaleType Street Utilities
""".split()

# Column names treated as numeric features.
nums = """
    GarageYrBlt LotFrontage MasVnrArea 1stFlrSF
    2ndFlrSF 3SsnPorch BedroomAbvGr BsmtFinSF1
    BsmtFinSF2 BsmtFullBath BsmtHalfBath
    BsmtUnfSF EnclosedPorch Fireplaces FullBath
    GarageArea GarageCars GrLivArea HalfBath
    KitchenAbvGr LotArea LowQualFinSF MiscVal
    OpenPorchSF OverallCond OverallQual PoolArea
    ScreenPorch TotalBsmtSF TotRmsAbvGrd
    WoodDeckSF YearBuilt YearRemodAdd YrSold
""".split()

In [6]:
### Convert MSSubClass to strings: the codes are integers
### but are actually unordered categorical values.
### (Filling NAs with -1 is currently disabled, see the commented line.)
# train = train.fillna(-1)
train['MSSubClass'] = train['MSSubClass'].astype(str)

In [7]:
# Replace the target with log(1 + SalePrice).
# NOTE: this cell rewrites train in place, so running it twice applies the
# transform twice.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Skewness of each numeric column, computed on its non-missing values only.
skewness = train[nums].apply(lambda column: skew(column.dropna()))
# Keep the names of the columns whose skewness exceeds 0.75 ...
skewed_feats = skewness[skewness > 0.75].index

# ... and log-transform those columns as well.
train[skewed_feats] = np.log1p(train[skewed_feats])

In [8]:
# Every training column except the row identifier and the target.
excluded = ('Id', 'SalePrice')
features = [name for name in train.columns if name not in excluded]

In [9]:
from sklearn.feature_extraction import DictVectorizer
# One dict per training row (column name -> value) for the categorical columns.
cat_rows = train[cats].T.to_dict().values()
# Learn the feature mapping and encode the training rows in one pass;
# dv is kept so the same mapping can be applied to other data later.
dv = DictVectorizer()
cat_matrix = dv.fit_transform(cat_rows)

In [10]:
# Replace missing numeric values with the mean of their own column.
numeric_means = train[nums].mean()
train[nums] = train[nums].fillna(numeric_means)

In [11]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the numeric columns into polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
numeric_block = train[nums]
poly.fit(numeric_block)
poly_matrix = poly.transform(numeric_block)

In [12]:
# Place the encoded categorical columns and the polynomial numeric columns
# side by side in a single sparse matrix.
feature_blocks = [cat_matrix, poly_matrix]
data_matrix = hstack(feature_blocks)

In [13]:
# Hold out 33% of the rows for validation; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    data_matrix,
    train['SalePrice'],
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [14]:
%%time
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

CPU times: user 2.2 s, sys: 134 ms, total: 2.34 s
Wall time: 2.34 s


In [None]:
%%time
from sklearn.model_selection import GridSearchCV
boost = xgb.XGBRegressor()
n_estimators = [50, 100, 200, 300]
tuned_parameters = dict(n_estimators=n_estimators)
grid = GridSearchCV(boost, tuned_parameters, cv=3, scoring=rmsle_loss, n_jobs=-1, verbose=2)
grid.fit(data_matrix, train['SalePrice'])

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=50, total=  22.1s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=50, total=  22.4s
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=50, total=  22.7s
[CV] n_estimators=200 ................................................
[CV] ................................. n_estimators=100, total=  42.0s
[CV] n_estimators=200 ................................................
[CV] ................................. n_estimators=100, total=  43.0s
[CV] n_estimators

In [None]:
# summarize results
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
# Per-candidate mean / standard deviation of the test score and the
# parameters that produced them.
cv_results = grid.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

In [None]:
# plot
# Mean cross-validated test score for each n_estimators value; the error bars
# show the standard deviation of the score across the folds.
# The series is labelled so that plt.legend() has an entry to display.
plt.errorbar(n_estimators, means, yerr=stds, label='mean CV score +/- 1 std')
plt.legend()
plt.xlabel('n_estimators')
# The grid-search scorer reports a negated loss, so values closer to 0 are better.
plt.ylabel('Mean CV test score (negated loss)')
# print scores

In [18]:
%%time
params = {}
params['eta'] = 0.1
params['objective'] = 'reg:linear'
params['eval_metric'] = 'rmse'
params['max_depth'] = 2
params['silent'] = 1
params['learning_rate'] = 0.1
params['subsample']= 0.8

watchlist = [(d_train, 'train'), (d_test, 'test')]
boost = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

[0]	train-rmse:10.3843	test-rmse:10.372
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[10]	train-rmse:3.63794	test-rmse:3.63449
[20]	train-rmse:1.2848	test-rmse:1.29076
[30]	train-rmse:0.468604	test-rmse:0.480204
[40]	train-rmse:0.197933	test-rmse:0.224493
[50]	train-rmse:0.1231	test-rmse:0.161746
[60]	train-rmse:0.105212	test-rmse:0.149469
[70]	train-rmse:0.099518	test-rmse:0.146403
[80]	train-rmse:0.095319	test-rmse:0.144984
[90]	train-rmse:0.092218	test-rmse:0.143405
[100]	train-rmse:0.089085	test-rmse:0.142976
[110]	train-rmse:0.086647	test-rmse:0.142476
[120]	train-rmse:0.084124	test-rmse:0.141876
[130]	train-rmse:0.082318	test-rmse:0.141479
[140]	train-rmse:0.080699	test-rmse:0.140976
[150]	train-rmse:0.079011	test-rmse:0.140629
[160]	train-rmse:0.07726	test-rmse:0.140055
[170]	train-rmse:0.075685	test-rmse:0.140689
[180]	train-rmse:0.074357	test-rmse:0.140709
[190]	train-rmse:0.0727

In [19]:
%%time
# Predict on d_test, the held-out split built from X_test / y_test. Despite
# its name, train_preds holds validation-set predictions, expressed in
# log1p(SalePrice) units like the labels the model was trained on.
# NOTE(review): training used early stopping; confirm whether predict() uses
# the best iteration or the last one in the installed xgboost version.
train_preds = boost.predict(d_test)

CPU times: user 4.81 ms, sys: 1.63 ms, total: 6.44 ms
Wall time: 5.08 ms


In [20]:
%%time
from sklearn.metrics import mean_squared_log_error
print rmsle(y_test, train_preds)

0.0107893111323
CPU times: user 747 µs, sys: 581 µs, total: 1.33 ms
Wall time: 789 µs


In [None]:
%%time
test['MSSubClass'] = test['MSSubClass'].astype(str)
test_cat_matrix = dv.transform(test[cats].T.to_dict().values())
test[nums] = test[nums].fillna(test[nums].mean())
test_poly = poly.transform(test[nums])
test_data_matrix = hstack([test_cat_matrix, test_poly])
d_test = xgb.DMatrix(test_data_matrix)

In [None]:
# The model was trained on log1p(SalePrice); expm1 maps its predictions back
# to prices.
log_preds = boost.predict(d_test)
preds = np.expm1(log_preds)

In [None]:
# Two-column submission table: the test Ids next to the predicted prices.
submission = pd.DataFrame(
    {'Id': test.Id, 'SalePrice': preds},
    columns=['Id', 'SalePrice'],
)

In [None]:
# Write the submission table to disk without the index column.
output_path = 'submission_xgb.csv'
submission.to_csv(output_path, index=False)