In [21]:
# Imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Load the Ames dataset
# Dataset description : https://www.kaggle.com/c/house-prices-advanced-regression-techniques
housing_data = pd.read_csv('../input/ames_unprocessed_data.csv', sep=',')

# One Hot encoding
housing_data_oh = pd.get_dummies(housing_data)

# Create arrays for the features and the target: X, y
X, y = housing_data_oh.iloc[:,:-1], housing_data_oh.iloc[:,-1]

# Create the training and test sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=123)

In [4]:
# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',n_estimators=10, seed=123)

# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)

# Compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.014262


### Using Standardized data

In [10]:
steps = [('scaler', StandardScaler()),('xgbr', xgb.XGBRegressor(objective='reg:squarederror',n_estimators=10, seed=123))]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
preds_bis = pipeline.predict(X_test)
rmse_bis = np.sqrt(mean_squared_error(y_test, preds_bis))
print("RMSE: %f" % (rmse_bis))

RMSE: 0.014262


### Hyperparameters tunning

In [24]:
# GridSearch
housing_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
gbm_param_grid = {
    "learning_rate": [0.01, 0.1, 0.5, 0.9],
    "n_estimators": [200],
    "subsample": [0.3, 0.5, 0.9]}
gbm = xgb.XGBRegressor()
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X_train, y_train)
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best parameters found:  {'learning_rate': 0.9, 'n_estimators': 200, 'subsample': 0.9}
Lowest RMSE found:  7.182564023155264e-06
