In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score,KFold
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor



import warnings


warnings.filterwarnings("ignore")

# environment settings
data_path = 'Data/'

# Deserialize previously saved data from "preprocessing"
with open(data_path+'train_pp.obj', 'rb') as train_pp, \
open(data_path+'test_pp.obj','rb') as test_pp:
    train_df = pickle.load(train_pp)
    test_df = pickle.load(test_pp)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

In [2]:
X = train_df.loc[:,'Id':'SaleCondition_Partial']
y = train_df['SalePrice']
print("Shape of training set {}.\nShape of test set {}".format(X.shape,y.shape))

Shape of training set (1456, 304).
Shape of test set (1456,)


In [3]:
scaler = RobustScaler()
X_scaled=scaler.fit(X).transform(X)
test_scaled=scaler.transform(test_df)

In [4]:
xgb= XGBRegressor(colsample_bytree=0.1, gamma=0.03, 
                             learning_rate=0.02, max_depth=3, 
                             n_estimators=3000,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

kf = KFold(10, shuffle=True, random_state=42).get_n_splits(X)
rmse_cv_xgb= np.sqrt(-cross_val_score(xgb, X, y, scoring="neg_mean_squared_error", cv = kf))
print("The 10-fold crossvalidation RMSE of XGB is {:.5f} +/- {:.3f}".format(rmse_cv_xgb.mean(),
                                                                               rmse_cv_xgb.std()))

The 10-fold crossvalidation RMSE of XGB is 0.11041 +/- 0.015


The 10-fold crossvalidation RMSE of XGB is 0.11410 +/- 0.015 <br>
(colsample_bytree=0.4603, gamma=0.0468, <br>
                             learning_rate=0.05, max_depth=3 , <br>
                             min_child_weight=1.7817, n_estimators=2200,<br>
                             reg_alpha=0.4640, reg_lambda=0.8571, <br>
                             subsample=0.5213, silent=1,<br>
                             random_state =7, nthread = -1)

In [5]:
#Prediction on real test set using XGB
xgb.fit(X,y)
pred_results =xgb.predict(test_df)
pred_results = np.expm1(pred_results)
result_df = pd.DataFrame(data={'Id': test_df["Id"].values,
                               'SalePrice': pred_results})
#Create output csv file
result_df.to_csv(data_path+"outputs/xgb", index=False)