## Installing library
import sys
<br>!{sys.executable} -m pip install xgboost

In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data = pd.read_csv("cleaned_data.csv", index_col = 0)
variables = data.drop(columns=["neighbourhood_group","neighbourhood","room_type","price","host_name","price"], axis=1)
price=pd.DataFrame(data['price'])

In [4]:
#split the data
from sklearn.model_selection import train_test_split

X = variables
y = price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (6324, 55) (6324, 1)
Test Set  : (1581, 55) (1581, 1)


## Default Model

In [5]:
xgb_model = xgb.XGBRegressor(objective="reg:linear", random_state=42)

xgbr = xgb.XGBRegressor(objective='reg:squarederror')
xgbr.fit(X_train, y_train)
 
ypred = xgbr.predict(X_test)
rmse = mean_squared_error(y_test, ypred,squared=False)
print("RMSE:" ,rmse)

RMSE: 319.8781306836073


##  Using Grid Search to find best parameters and lowest RMSE

In [6]:
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}
xgbr = xgb.XGBRegressor(seed = 20)
clf = GridSearchCV(estimator=xgbr, param_grid=params,
                   scoring='neg_mean_squared_error',verbose=1)
clf.fit(X, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500}
Lowest RMSE:  321.2036517878752


## Random Grid Search to find best parameters and lowest RMSE

In [7]:
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
xgbr = xgb.XGBRegressor(seed = 20)
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=25,
                         verbose=1)
clf.fit(X, y)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters: {'subsample': 0.7999999999999999, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.6}
Lowest RMSE:  321.6305588865265
