In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the preprocessed train/test splits; both CSV paths are relative to the
# notebook's working directory.
TRAIN_CSV = 'preprocessedtraindata.csv'
TEST_CSV = 'preprocessedtestdata.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Separate the regression target ('Price') from the feature columns of each split.
y_train = train['Price']
X_train = train.drop(columns=['Price'])

y_test = test['Price']
X_test = test.drop(columns=['Price'])

In [4]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score
def predict(ml_model, train_xy=None, test_xy=None):
    """Fit ``ml_model``, print an evaluation report and return the fitted model.

    Parameters
    ----------
    ml_model : estimator
        Unfitted estimator exposing ``fit``, ``score`` and ``predict``.
    train_xy : tuple of (features, target), optional
        Data to fit on. When omitted, the notebook-level ``X_train`` and
        ``y_train`` are used (the original behaviour).
    test_xy : tuple of (features, target), optional
        Data to evaluate on. When omitted, the notebook-level ``X_test`` and
        ``y_test`` are used (the original behaviour).

    Returns
    -------
    The object returned by ``ml_model.fit``.

    Notes
    -----
    Prints the estimator, its ``score`` on both splits, the test predictions,
    and the R2 / MAE / MSE / RMSE of those predictions.
    """
    # Fall back to the notebook globals only when no explicit data is passed,
    # so existing ``predict(model)`` calls keep working unchanged.
    X_fit, y_fit = (X_train, y_train) if train_xy is None else train_xy
    X_eval, y_eval = (X_test, y_test) if test_xy is None else test_xy

    print('Model is: {}'.format(ml_model))
    model = ml_model.fit(X_fit, y_fit)
    print("Training score: {}".format(model.score(X_fit, y_fit)))
    print("Test score:{}".format(model.score(X_eval, y_eval)))
    predictions = model.predict(X_eval)
    print("Predictions are: {}".format(predictions))
    print('\n')
    r2score = r2_score(y_eval, predictions)
    print("r2 score is: {}".format(r2score))

    print('MAE:{}'.format(mean_absolute_error(y_eval, predictions)))
    # Compute MSE once and derive RMSE from it instead of recomputing it.
    mse = mean_squared_error(y_eval, predictions)
    print('MSE:{}'.format(mse))
    print('RMSE:{}'.format(np.sqrt(mse)))

    return model

# Importing Machine Learning Model Libraries



In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Random Forest Regressor



In [6]:
# Baseline: random forest with default hyperparameters.
rf_regressor = RandomForestRegressor()
predict(rf_regressor)

Model is: RandomForestRegressor()
Training score: 0.9517296891423164
Test score:0.8098811689804944
Predictions are: [7393.33 9668.82 8301.24 ... 4841.8  3877.44 4151.92]


r2 score is: 0.8098811689804944
MAE:1179.6628019601171
MSE:3923841.6036021477
RMSE:1980.8689011648771


RandomForestRegressor()

# Decision Tree Regressor

In [7]:
# Baseline: single decision tree with default hyperparameters.
tree_regressor = DecisionTreeRegressor()
predict(tree_regressor)


Model is: DecisionTreeRegressor()
Training score: 0.9705440821762137
Test score:0.6860881153869538
Predictions are: [7295. 8018. 8476. ... 4804. 3873. 4409.]


r2 score is: 0.6860881153869538
MAE:1373.5054945054944
MSE:6478792.7955619255
RMSE:2545.3472838813027


DecisionTreeRegressor()

# XGBoost Regressor



In [8]:
# Baseline: XGBoost regressor with library-default hyperparameters.
xgb_regressor = XGBRegressor()
predict(xgb_regressor)

Model is: XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Training score: 0.9376103949657766
Test score:0.839765086215697
Predictions are: [8700.959  9433.354  8514.881  ... 4763.8467 3753.632  4421.112 ]


r2 score is: 0.839765086215697
MAE:1159.965198587927
MSE:3307070.7287903805
RMSE:1818.5353251422916


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

# Linear Regression

In [9]:
# Baseline: ordinary least-squares linear regression.
linear_regressor = LinearRegression()
predict(linear_regressor)

Model is: LinearRegression()
Training score: 0.5691360451875732
Test score:0.5915918101530742
Predictions are: [ 6371.78878938 11441.03878938 10121.03878938 ...  3224.53878938
  5675.53878938  4232.53878938]


r2 score is: 0.5915918101530742
MAE:1960.142852835532
MSE:8429091.626430197
RMSE:2903.289793739198


LinearRegression()

# Support Vector Regression

In [10]:
# Baseline: support vector regression with default hyperparameters.
sv_regressor = SVR()
predict(sv_regressor)

Model is: SVR()
Training score: 0.050371821275320805
Test score:0.05653888321299427
Predictions are: [7831.20733187 8446.98978809 8377.76457094 ... 7919.36665243 7945.64429697
 7931.90439889]


r2 score is: 0.05653888321299427
MAE:3403.751032292937
MSE:19471990.026332457
RMSE:4412.70778845965


SVR()

# K-Nearest Neighbours



In [11]:
# Baseline: k-nearest-neighbours regression with default hyperparameters.
knn_regressor = KNeighborsRegressor()
predict(knn_regressor)

Model is: KNeighborsRegressor()
Training score: 0.8190730329140331
Test score:0.749717394869148
Predictions are: [6156.2 9088.4 9696.2 ... 4888.  3853.8 4450. ]


r2 score is: 0.749717394869148
MAE:1406.1429526994746
MSE:5165555.107845199
RMSE:2272.78575933703


KNeighborsRegressor()

# XGBRegressor gave the best test-set results (highest R², lowest MAE and RMSE), so we select it for tuning

In [12]:
# Refit a default XGBRegressor and keep the fitted model in `model`; it is the
# estimator handed to the hyperparameter search.
xgb_baseline = XGBRegressor()
model = predict(xgb_baseline)

Model is: XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Training score: 0.9376103949657766
Test score:0.839765086215697
Predictions are: [8700.959  9433.354  8514.881  ... 4763.8467 3753.632  4421.112 ]


r2 score is: 0.839765086215697
MAE:1159.965198587927
MSE:3307070.7287903805
RMSE:1818.5353251422916


# Hyperparameter Optimization

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
# Candidate values for each XGBoost hyperparameter; the randomized search
# samples combinations from these lists.
random_grid = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5, 7, 9],
    subsample=[0.3, 0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9, 1],
    n_estimators=[100, 200],
    objective=['reg:squarederror'],
)

In [16]:
# Randomized hyperparameter search over `random_grid`: 10 sampled parameter
# combinations (n_iter), each evaluated with 5-fold cross-validation and
# ranked by negative mean absolute error.
xgb_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    scoring='neg_mean_absolute_error',
    n_iter=10,
    cv=5,
    verbose=2,
)

In [17]:

# Run the randomized search on the training data: every sampled candidate is
# fitted once per cross-validation fold.
xgb_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=100, objective=reg:squarederror, subsample=0.3; total time=   1.2s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=100, objective=reg:squarederror, subsample=0.3; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=100, objective=reg:squarederror, subsample=0.3; total time=   0.8s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=100, objective=reg:squarederror, subsample=0.3; total time=   0.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, min_child_weight=7, n_estimators=100, objective=reg:squarederror, subsample=0.3; total time=   1.1s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=10, min_child_weight=1, n_estimators=100, objective=reg:squ

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1,
                                          enable_categorical=False, gamma=0,
                                          gpu_id=-1, importance_type=None,
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimato...
                                          reg_lambda=1, scale_pos_weight=1,
                                          subsample=1, tree_method='exac

In [18]:
# Hyperparameter combination with the best cross-validated score in the search.
xgb_random.best_params_

{'subsample': 0.5,
 'objective': 'reg:squarederror',
 'n_estimators': 100,
 'min_child_weight': 1,
 'max_depth': 10,
 'learning_rate': 0.1,
 'colsample_bytree': 0.5}

In [19]:
# Predict test-set prices through the search object, which delegates to the
# best estimator it found (relies on RandomizedSearchCV's refit behaviour,
# which is not overridden when the search is constructed).
prediction = xgb_random.predict(X_test)

In [20]:
# Score the tuned model's test-set predictions with the same metrics that
# `predict` reports for the baseline models.
r2score = r2_score(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)

print("r2 score is: {}".format(r2score))
print('MAE:{}'.format(mae))
print('MSE:{}'.format(mse))
print('RMSE:{}'.format(rmse))

r2 score is: 0.8249601096712698
MAE:1208.512085252553
MSE:3612629.0082825227
RMSE:1900.6917183705839


# Saving Model

In [23]:
import pickle

# Persist the fitted search object. The context manager closes (and therefore
# flushes) the file as soon as the dump finishes, so the pickle on disk is
# complete before any later cell reads it back.
# NOTE(review): the recorded test metrics of the tuned search (R2 ~0.825) are
# slightly below those of the default XGBRegressor (R2 ~0.840) -- confirm this
# is the model that should be saved.
with open('optimizedmodel.pkl', 'wb') as file:
    pickle.dump(xgb_random, file)

In [25]:
# Reload the pickled search object from disk into `xgb`. The handle gets its
# own name so the fitted estimator bound to `model` is not overwritten, and
# the `with` block guarantees the file is closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('optimizedmodel.pkl', 'rb') as model_file:
    xgb = pickle.load(model_file)