# In [None]:
import xgboost as xgb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training set and keep the columns at positions 1..80, discarding
# the first column (presumably a row identifier -- confirm against train.csv).
# .iloc is used for the positional slice: DataFrame.ix is no longer available
# in current pandas releases.
data = pd.read_csv("train.csv")
data = data.iloc[:, 1:81]

# Regression target.
y = data['SalePrice']

# One-hot encode Neighborhood; the indicator columns are named
# "Neighborhood_<value>".
dummies = data['Neighborhood'].str.get_dummies()

dummies.head()

dummies.columns = ["Neighborhood_" + col for col in dummies.columns]

# Swap the original categorical column for its indicator columns.
data = pd.concat([data, dummies], axis=1)
data = data.drop("Neighborhood", axis=1)

def cat_to_columns(data, column):
    """Replace a string column of *data* with 0/1 indicator columns.

    The indicators come from ``Series.str.get_dummies`` and are named
    ``column + value`` (no separator). They are appended after the remaining
    columns of *data*, and the original *column* is removed.

    Args:
        data: DataFrame containing *column*.
        column: Name of the string-valued column to encode.

    Returns:
        A new DataFrame; *data* itself is left untouched.
    """
    indicators = data[column].str.get_dummies().add_prefix(column)
    remaining = data.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

# One-hot encode the remaining categorical features, one column at a time.
for categorical in ("HouseStyle", "BldgType", "MSZoning", "PavedDrive"):
    data = cat_to_columns(data, categorical)

# Numeric predictors taken from the data frame unchanged.
columns = [
    'MSSubClass',
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'Fireplaces',
    'GarageArea',
]

# Name prefixes of the one-hot indicator columns to include. startswith() is
# case-sensitive, so each prefix must be spelled exactly like the encoded
# source column: 'HouseStyle', not 'Housestyle', otherwise its indicator
# columns are silently left out of X.
# NOTE(review): BldgType is one-hot encoded as well but is not selected here --
# confirm whether leaving it out is intentional.
encoded_prefixes = ['Neighborhood', 'HouseStyle', 'MSZoning', 'PavedDrive']
encoded_columns = [
    col
    for prefix in encoded_prefixes
    for col in data.columns
    if col.startswith(prefix)
]

# Feature matrix: numeric predictors followed by the indicator columns.
X = pd.concat([data[columns], data[encoded_columns]], axis=1)

X.head()

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Linear booster fitted on the training rows only.
xg_reg = xgb.XGBRegressor(
    objective="reg:linear",
    booster="gblinear",
    seed=123,
)
xg_reg.fit(X_train, y_train)

# RMSE on the held-out rows.
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(format(rmse))

from sklearn.linear_model import LinearRegression

# Least-squares baseline. Missing values are replaced with 0 beforehand
# (presumably because LinearRegression does not accept NaN -- confirm).
X2 = X.fillna(0)
model = LinearRegression()
model.fit(X2, y)

# In-sample RMSE: the model is scored on the same rows it was fitted on.
preds2 = model.predict(X2)
rmse = np.sqrt(mean_squared_error(y, preds2))
print(rmse)

# Tree booster fitted on the full data set; the RMSE printed below is therefore
# an in-sample (training) error.
# NOTE(review): newer XGBoost releases name this objective
# 'reg:squarederror' -- confirm the installed version before renaming it.
xg_reg = xgb.XGBRegressor(objective="reg:linear", booster="gbtree", seed=123)
xg_reg.fit(X, y)

# Predict on the same matrix the model was fitted on. X2 is X with NaN
# replaced by 0, so scoring on X2 would feed the model different inputs than
# it was trained on wherever X has missing entries.
preds3 = xg_reg.predict(X)
rmse = np.sqrt(mean_squared_error(y, preds3))
print(rmse)

import matplotlib
%matplotlib inline

# Train a small booster through the native API: 10 rounds of depth-2 trees.
params = {"objective": "reg:linear", "max_depth": 2}
housing_dmatrix = xgb.DMatrix(data=X, label=y)
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# (tree index, extra plot_tree options) for each figure: the first tree, the
# fifth tree, and the last tree drawn sideways.
tree_plots = [
    (0, {}),
    (4, {}),
    (9, {"rankdir": "LR"}),
]

for tree_index, plot_options in tree_plots:
    xgb.plot_tree(xg_reg, num_trees=tree_index, **plot_options)
    # Enlarge the figure plot_tree just drew into before showing it.
    fig = plt.gcf()
    fig.set_size_inches(15, 10)
    plt.show()

# Retrain with deeper trees (max_depth=4, 10 rounds) on the full data set.
params = {"objective": "reg:linear", "max_depth": 4}
housing_dmatrix = xgb.DMatrix(data=X, label=y)
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Chart the feature importances of the trained booster on an enlarged figure.
xgb.plot_importance(xg_reg)
fig = plt.gcf()
fig.set_size_inches(15, 10)
plt.show()

# Data and booster parameters for cross-validation.
housing_dmatrix = xgb.DMatrix(data=X, label=y)
params = {"objective": "reg:linear", "max_depth": 4}

# 4-fold cross-validation over 5 boosting rounds, scored with RMSE; the seed
# is fixed at 123.
cv_options = {
    "nfold": 4,
    "num_boost_round": 5,
    "metrics": "rmse",
    "as_pandas": True,
    "seed": 123,
}
cv_results = xgb.cv(params=params, dtrain=housing_dmatrix, **cv_options)

# Print the complete cross-validation results.
print(cv_results)

from sklearn.model_selection import GridSearchCV

# DMatrix of the full data set (the grid search below is fitted on X and y
# directly, not on this object).
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Hyper-parameter combinations to try: 2 x 1 x 2 = 4 candidates.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5],
}

# Exhaustive search with 4-fold cross-validation, scored by negated MSE.
gbm = xgb.XGBRegressor()
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring="neg_mean_squared_error",
    cv=4,
    verbose=1,
)
grid_mse.fit(X, y)

# The score is a negated MSE, so take its magnitude before the square root.
best_rmse = np.sqrt(np.abs(grid_mse.best_score_))
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", best_rmse)