In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
#importing dataset
input_train=pd.read_csv('train.csv')
input_test=pd.read_csv('test.csv')

In [3]:
train=input_train.drop(['Id','SalePrice'],axis=1)
test=input_test.drop(['Id'],axis=1)
y=input_train['SalePrice']#target value 

In [4]:
print('train shape',train.shape)
print('test shape',test.shape)

train shape (1460, 79)
test shape (1459, 79)


In [5]:
#null values in train data
for val in train.columns:
    print(val,train[val].isnull().sum())

MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
Street 0
Alley 1369
LotShape 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 8
MasVnrArea 8
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinSF1 0
BsmtFinType2 38
BsmtFinSF2 0
BsmtUnfSF 0
TotalBsmtSF 0
Heating 0
HeatingQC 0
CentralAir 0
Electrical 1
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 0
BsmtHalfBath 0
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 0
TotRmsAbvGrd 0
Functional 0
Fireplaces 0
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageCars 0
GarageArea 0
GarageQual 81
GarageCond 81
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1453
Fence 1179
MiscFeature 1406
MiscVal 0
MoSold 0
YrSold 0
S

In [6]:
#null values in test data
for val in test.columns:
    print(val,test[val].isnull().sum())

MSSubClass 0
MSZoning 4
LotFrontage 227
LotArea 0
Street 0
Alley 1352
LotShape 0
LandContour 0
Utilities 2
LotConfig 0
LandSlope 0
Neighborhood 0
Condition1 0
Condition2 0
BldgType 0
HouseStyle 0
OverallQual 0
OverallCond 0
YearBuilt 0
YearRemodAdd 0
RoofStyle 0
RoofMatl 0
Exterior1st 1
Exterior2nd 1
MasVnrType 16
MasVnrArea 15
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 44
BsmtCond 45
BsmtExposure 44
BsmtFinType1 42
BsmtFinSF1 1
BsmtFinType2 42
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
Heating 0
HeatingQC 0
CentralAir 0
Electrical 0
1stFlrSF 0
2ndFlrSF 0
LowQualFinSF 0
GrLivArea 0
BsmtFullBath 2
BsmtHalfBath 2
FullBath 0
HalfBath 0
BedroomAbvGr 0
KitchenAbvGr 0
KitchenQual 1
TotRmsAbvGrd 0
Functional 2
Fireplaces 0
FireplaceQu 730
GarageType 76
GarageYrBlt 78
GarageFinish 78
GarageCars 1
GarageArea 1
GarageQual 78
GarageCond 78
PavedDrive 0
WoodDeckSF 0
OpenPorchSF 0
EnclosedPorch 0
3SsnPorch 0
ScreenPorch 0
PoolArea 0
PoolQC 1456
Fence 1169
MiscFeature 1408
MiscVal 0
MoSold 0
YrSold 0

In [7]:
#null limit is maximum number of null values that particular column can hold if  exceeds then drop column
null_limit=500 
train_remove_cols=[val for val in train.columns if train[val].isnull().sum()>null_limit]
test_remove_cols=[val for val in test.columns if test[val].isnull().sum()>null_limit]

In [8]:
print(train_remove_cols)
print(test_remove_cols)

['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [9]:
#removing columns with high null values
train.drop(train_remove_cols,axis=1,inplace=True)
test.drop(train_remove_cols,axis=1,inplace=True)

In [10]:
#columns with null values
train_null_cols=[val for val in train.columns if train[val].isnull().any()]
test_null_cols=[val for val in test.columns if test[val].isnull().any()]

In [11]:
#HANDLING NULL VALUES
#if column contain categorical variable then fill null value of column by most frequent class
#if column contain numerical varible and if number of unique values is greater than "fill_limit" then fill
#null values by mean of column else fill null value with most frequent unique value
#also numerical column is considered categorial if number of unique values in column is less than equal to "fill_limit" 

In [12]:
fill_limit=5
for col in train_null_cols:
    if train[col].dtypes=='object':
        mode=train[col].mode()[0]
        train[col].fillna(mode,inplace=True)
    else:
        if len(train[col].unique())>fill_limit:
            mean=train[col].mean()
            train[col].fillna(mean,inplace=True)
        else:
            mode=train[col].mode()[0]
            train[col].fillna(mode,inplace=True)
for col in test_null_cols:
    if test[col].dtypes=='object':
        mode=test[col].mode()[0]
        test[col].fillna(mode,inplace=True)
    else:
        if len(test[col].unique())>fill_limit:
            mean=test[col].mean()
            test[col].fillna(mean,inplace=True)
        else:
            mode=test[col].mode()[0]
            test[col].fillna(mode,inplace=True)

In [13]:
#HANDLING CATEGORICAL COLUMNS
#columns with categorical values
categorical_cols=[col for col in train.columns if train[col].dtypes==object]

In [14]:
#number of differnt classes in categorical columns
for col in categorical_cols:
    print(col,len(train[col].unique())) 

MSZoning 5
Street 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinType2 6
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
KitchenQual 4
Functional 7
GarageType 6
GarageFinish 3
GarageQual 5
GarageCond 5
PavedDrive 3
SaleType 9
SaleCondition 6


In [15]:
#dummy_cols is categorical columns to be one hot encoded if number of unique values in particular column is less than or equal to "fill_limit"
#label_cols is categorical columns to be label encoded if number of unique values in particular column is greater than "fill_limit"     
dummy_cols=[col for col in categorical_cols if len(train[col].unique())<=fill_limit]
label_cols=[col for col in categorical_cols if len(train[col].unique())>fill_limit]

In [16]:
train_dummy=train[dummy_cols]
test_dummy=test[dummy_cols]

In [17]:
#one hot encoding dummy_cols
import sklearn.preprocessing as skp
one_hot=skp.OneHotEncoder(sparse=False,drop='first')
train_dummy=pd.DataFrame(one_hot.fit_transform(train_dummy))
test_dummy=pd.DataFrame(one_hot.transform(test_dummy))

In [18]:
#label encoding label_cols
for col in label_cols:
    a=skp.LabelEncoder()
    train[col]=a.fit_transform(train[col])
    test[col]=a.transform(test[col])

In [19]:
#dropping dummy_cols 
train.drop(dummy_cols,inplace=True,axis=1)
test.drop(dummy_cols,inplace=True,axis=1)

In [20]:
#feature scalling 
#no feature scalling on one hot encoded columns
scx=skp.StandardScaler()
temp_train=pd.DataFrame(scx.fit_transform(train))
temp_test=pd.DataFrame(scx.transform(test))
#feature scalling target value
scy=skp.StandardScaler()
y=scy.fit_transform(input_train[['SalePrice']]).ravel()

In [21]:
#preserving column names
temp_train.columns=train.columns
temp_test.columns=test.columns

In [22]:
#final train and test dataset with feature scalling 
ftrain=pd.concat([temp_train,train_dummy],axis=1)
ftest=pd.concat([temp_test,test_dummy],axis=1)

In [23]:
#MODEL PERFORMANCE TEST
# K FOLD CROSS VALIDATION FOR DIFFRENT REGRESSION MODEL
#no hyper parameter tuning now
#selecting model with highest r2 score

In [25]:
X=ftrain.copy()
Xtest=ftest.copy()
print(X.shape)

(1460, 117)


In [26]:
import sklearn.model_selection as skms

In [27]:
#LINEAR REGRESSION
import sklearn.linear_model as sklm
model_linear_reg=sklm.LinearRegression()
score=skms.cross_val_score(estimator=model_linear_reg,X=X,y=y,cv=5,n_jobs=-1)
score_linear_regression=score.mean()
print('r2score linear regression',score_linear_regression)

r2score linear regression -2.9684820077685427e+18


In [28]:
#RIDGE LINEAR REGRESSION
model_ridge_linear=sklm.Ridge()
score=skms.cross_val_score(estimator=model_ridge_linear,X=X,y=y,cv=5,n_jobs=-1)
score_ridge_linear=score.mean()
print('r2score ridge linear regression',score_ridge_linear)

r2score ridge linear regression 0.8192265611524991


In [29]:
#SUPPORT VECTOR REGRESSION
#SVR LINEAR
import sklearn.svm as skvm
model_svr_linear=skvm.SVR(kernel='linear')
score=skms.cross_val_score(estimator=model_svr_linear,X=X,y=y,cv=5,n_jobs=-1)
score_svr_linear=score.mean()
print('r2score SVR linear',score_svr_linear)

r2score SVR linear 0.8341747553518918


In [30]:
#SVR RBF kernel 
model_svr_rbf=skvm.SVR(kernel='rbf')
score=skms.cross_val_score(estimator=model_svr_rbf,X=X,y=y,cv=5,n_jobs=-1)
score_svr_rbf=score.mean()
print('r2score SVR kernel RBF',score_svr_rbf)

r2score SVR kernel RBF 0.8132528702681293


In [31]:
#DECISION TREE REGRESSOR
import sklearn.tree as skt
model_decision_tree=skt.DecisionTreeRegressor()
score=skms.cross_val_score(estimator=model_decision_tree,X=X,y=y,cv=5,n_jobs=-1)
score_decsion_tree=score.mean()
print('r2score decision tree',score_decsion_tree)

r2score decision tree 0.6967528626612628


In [32]:
#RANDOM FOREST REGRESSOR
import sklearn.ensemble as ske
model_random_forest=ske.RandomForestRegressor()
score=skms.cross_val_score(estimator=model_random_forest,X=X,y=y,cv=5,n_jobs=-1)
score_random_forest=score.mean()
print('r2score random forest',score_random_forest)

r2score random forest 0.8575584667042531


In [33]:
#XGBOOST REGRESSOR
import xgboost as xgb
model_xgb=xgb.XGBRegressor()
score=skms.cross_val_score(estimator=model_xgb,X=X,y=y,cv=5,n_jobs=-1)
score_xgb=score.mean()
print('r2score XGBoost',score_xgb)

r2score XGBoost 0.8711063580111891


In [34]:
print('r2score linear regression',score_linear_regression)
print('r2score ridge linear regression',score_ridge_linear)
print('r2score SVR linear',score_svr_linear)
print('r2score SVR kernel RBF',score_svr_rbf)
print('r2score decision tree',score_decsion_tree)
print('r2score random forest',score_random_forest)
print('r2score XGBoost',score_xgb)

r2score linear regression -2.9684820077685427e+18
r2score ridge linear regression 0.8192265611524991
r2score SVR linear 0.8341747553518918
r2score SVR kernel RBF 0.8132528702681293
r2score decision tree 0.6967528626612628
r2score random forest 0.8575584667042531
r2score XGBoost 0.8711063580111891


In [35]:
#XGBoost regressor is best model with highest r2 score
#hyperparameter tuning 
#rgrid search 
#finding best parameters which can fit model best

In [36]:
n_estimators=[20*i for i in range(1,31)]
max_depth=[i for i in range(1,51)]
max_depth.append(None)
model=xgb.XGBRegressor(random_state=0)
parameter={'n_estimators':n_estimators ,'max_depth':max_depth,'Learning_rate':[0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
grid_search = skms.RandomizedSearchCV(estimator=model,param_distributions = parameter,cv = 5,n_jobs = -1)
grid_search.fit(X,y)
print('best_score grid search',grid_search.best_score_)
print('best_parameters',grid_search.best_params_)

Parameters: { Learning_rate } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


best_score grid search 0.8474636296562343
best_parameters {'n_estimators': 420, 'max_depth': 19, 'Learning_rate': 0.4}


In [37]:
best_model=grid_search.best_estimator_
best_model.fit(X,y)

Parameters: { Learning_rate } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(Learning_rate=0.4, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=19, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=420, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [38]:
import sklearn.metrics as skmet
ypred=best_model.predict(X)

In [39]:
print('root mean squred log error',np.sqrt(skmet.mean_squared_log_error(scy.inverse_transform(y),scy.inverse_transform(ypred))))

root mean squred log error 0.0003147649816284884


In [40]:
print('r2 score',skmet.r2_score(y,ypred))

r2 score 0.9999997840568646


In [41]:
test_prediction=scy.inverse_transform(best_model.predict(Xtest))

In [42]:
new=pd.DataFrame()
new['Id']=input_test['Id']
new['SalePrice']=test_prediction
new.to_csv('mysubmission.csv',index=False)