In [1]:
import pandas as pd 
import numpy as np
import sklearn.linear_model as linear_model
from sklearn.linear_model import Lasso, BayesianRidge, ElasticNet
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer, r2_score
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb


# Import Data 

In [51]:
train =pd.read_csv('X_train.csv')

In [52]:
test = pd.read_csv('X_test.csv')

# Creating X and Y variables for Model Building 

In [53]:
X_train = train.drop(['SalePrice'], axis=1)

In [54]:
y_train = train.SalePrice

In [55]:
X_test = test.copy()

# Cross Validation 

In [56]:


def test_model(model, X_train=X_train, y_train=y_train):
    cv = KFold(n_splits = 3, shuffle=True, random_state = 45)
    r2 = make_scorer(r2_score)
    r2_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring = r2)
    score = [r2_val_score.mean()]
    return score

# Linear Regression

In [11]:

LR = linear_model.LinearRegression()
test_model(LR)

[-8.543231864770832e+21]

# Ridge and Lasso and ElasticNet

In [12]:
rdg = linear_model.Ridge(alpha=0.005, random_state = 4)
test_model(rdg)

[0.8454970710873354]

In [13]:
lasso = linear_model.Lasso(alpha=1e-4)
test_model(lasso)

[0.8524635789112677]

In [14]:
ENet = linear_model.ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)
test_model(ENet)

[0.8639497050512198]

# Support Vector Machine 

In [15]:

svr_reg = SVR(kernel='rbf')
test_model(svr_reg)

[0.8692210493859426]

# Decision Tree Regressor 

In [16]:

dt_reg = DecisionTreeRegressor(random_state=21)
test_model(dt_reg)

[0.6691992174595726]

# Random Forest Regressor 

In [17]:

rf_reg = RandomForestRegressor(n_estimators = 1000, random_state=51)
test_model(rf_reg)

[0.8657457964650583]

# Bagging & Boosting

In [18]:
br_reg = BaggingRegressor(n_estimators=1000, random_state=51)
test_model(br_reg)

[0.8660357027386202]

In [19]:
gbr_reg = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, loss='ls', random_state=51)
test_model(gbr_reg)

[0.8844778922047046]

# XGBoost

In [20]:
import xgboost
xgb_reg = xgboost.XGBRegressor(bbooster='gbtree', random_state=51)
test_model(xgb_reg)



Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[0.8583970509260753]

# Hyperparameter Tunning 

we can all algorithm gives almost same accuracy hence will perform Hyperparameter Tunning to increase accuracy

# Support Vector Machine Hyperparameter Tunning

In [21]:
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'rbf'],
         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
         'C': [0.1, 1, 10, 100, 1000],
         'epsilon': [1, 0.2, 0.1, 0.01, 0.001, 0.0001]}
rand_search = RandomizedSearchCV(svr_reg, param_distributions=params, n_jobs=-1, cv=10)
rand_search.fit(X_train, y_train)
rand_search.best_score_

0.8709721519881256

In [22]:
rand_search.best_params_

{'kernel': 'sigmoid', 'gamma': 0.001, 'epsilon': 0.001, 'C': 1000}

In [23]:
svr_reg1 = SVR(kernel= 'sigmoid', gamma = 0.001, epsilon = 0.001, C = 1000)
test_model(svr_reg1)

[0.87139754866591]

can see there is 1 % accuracy increase after tunning

# GradientBoosting  Hyperparameter Tunning

In [28]:
gbr_reg2 = GradientBoostingRegressor()
params = { 'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': range(99, 2001, 80),
              'criterion' :['friedman_mse', 'mse', 'mae'], 'min_samples_split': [2,3], 'min_samples_leaf': [0.05,0.5],
              'max_features':['auto', 'sqrt', 'log2'],'alpha':[0.005, 0.05],
              'learning_rate': [0.2, 0.1, 0.01, 0.05] } 

rand_search_gbr2 = RandomizedSearchCV(estimator = gbr_reg2, param_distributions=params, n_iter=100, n_jobs=-1, 
                                     cv=10, verbose=11, random_state=51, return_train_score =True, 
                                     scoring='neg_mean_absolute_error') 

rand_search_gbr2.fit(X_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits




RandomizedSearchCV(cv=10, estimator=GradientBoostingRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'alpha': [0.005, 0.05],
                                        'criterion': ['friedman_mse', 'mse',
                                                      'mae'],
                                        'learning_rate': [0.2, 0.1, 0.01, 0.05],
                                        'loss': ['ls', 'lad', 'huber',
                                                 'quantile'],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [0.05, 0.5],
                                        'min_samples_split': [2, 3],
                                        'n_estimators': range(99, 2001, 80)},
                   random_state=51, return_train_score=True,
                   scoring='neg_mean_absolute_error', ve

In [29]:
rand_search_gbr2.best_score_

-0.08313443655885186

In [30]:
rand_search_gbr2.best_params_

{'n_estimators': 1699,
 'min_samples_split': 2,
 'min_samples_leaf': 0.05,
 'max_features': 'sqrt',
 'loss': 'ls',
 'learning_rate': 0.05,
 'criterion': 'mae',
 'alpha': 0.05}

In [57]:
gbr_reg2 = GradientBoostingRegressor(n_estimators=1699, random_state=51, min_samples_split= 2,
                                  min_samples_leaf = 0.05, max_features = 'sqrt', loss= 'lad',learning_rate= 0.05,
                                   criterion = 'mae', alpha = 0.05)
test_model(gbr_reg2)



[0.8901541473603126]

In [None]:
we can see all alogorithm gives almost same accuracy , among all Linear Regression is giving more accuracy hence our best model is "GradientBoosting"

# will still check with feature selection ,can improve accuracy or not 

for that we will define our X and Y varible again

In [None]:
train =pd.read_csv('X_train.csv')
test =pd.read_csv('X_test.csv')

# Import feature obtained after Feature selection

In [35]:
selected_feature = ['LotShape', 'BldgType', 'OverallQual', 'YearBuilt', 'YearRemodAdd',
       'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC',
       'CentralAir', '1stFlrSF', 'GrLivArea', 'FullBath', 'KitchenQual',
       'Functional', 'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars','GarageArea','GarageYrBlt',
       'PavedDrive', 'SaleCondition']

In [37]:
X_train = train[selected_feature]
X_test = test[selected_feature]
y_train = train.SalePrice

# Cross Validation

In [38]:
def test_model(model, X_train=X_train, y_train=y_train):
    cv = KFold(n_splits = 3, shuffle=True, random_state = 45)
    r2 = make_scorer(r2_score)
    r2_val_score = cross_val_score(model, X_train, y_train, cv=cv, scoring = r2)
    score = [r2_val_score.mean()]
    return score

# Linear Regression

In [39]:
LRF = linear_model.LinearRegression()
test_model(LRF)

[0.8563746218352245]

# Ridge and Lasso and ElasticNet

In [40]:
rdgF = linear_model.Ridge(alpha=0.005, random_state = 4)
test_model(rdgF)

[0.8563828583043618]

In [44]:
lassoF = linear_model.Lasso(alpha=1e-4)
test_model(lassoF)

[0.8566069513078524]

In [43]:
ENetF = linear_model.ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)
test_model(ENetF)

[0.8568969007536089]

# Support Vector Machine

In [45]:

svr_regF = SVR(kernel='rbf')
test_model(svr_regF)

[0.8496693475665772]

# Random Forest Regressor¶

In [46]:

rf_regF = RandomForestRegressor(n_estimators = 1000, random_state=51)
test_model(rf_regF)

[0.8561674394928668]

# Bagging & Boosting

In [47]:
br_regF = BaggingRegressor(n_estimators=1000, random_state=51)
test_model(br_regF)

[0.8563557064429409]

In [48]:
gbr_regF = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, loss='ls', random_state=51)
test_model(gbr_regF)

[0.8569433944817103]

# # XGBoost

In [49]:
import xgboost
xgb_regF = xgboost.XGBRegressor(bbooster='gbtree', random_state=51)
test_model(xgb_regF)


Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "bbooster" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[0.8414048834228062]

we can see all model accuracy has increased after Feature selection and all model giving almost same accuracy and in which , 
GradientBoostingregressor, Lasso, Ridge, Elasticnet are being remain high performance model throughout test 

# But to finalise model , GradientBoosting Model gives excellent accuracy in over all

In [58]:
gbr_reg2.fit(X_train,y_train)
y_pred = np.exp(gbr_reg2.predict(X_test)).round(2)



# For Model Save 

In [59]:

import pickle 

In [60]:

pickle.dump(gbr_reg2, open('model_house_price_prediction.csv', 'wb'))
model_house_price_prediction = pickle.load(open('model_house_price_prediction.csv', 'rb'))
model_house_price_prediction.predict(X_test)

array([11.79897599, 11.96888619, 12.19088503, ..., 12.01262345,
       11.72899386, 12.38439546])

In [61]:
test_model(model_house_price_prediction)



[0.8901541473603126]

# GradientBoostingRegressor Accuracy  = 89.013% 