## IOWA HOUSE PRICE PREDICTION

##### In this problem we have two datasets: a training set (81 variables and 1460 observations) and a testing set (80 variables and 1459 observations).
##### SalePrice is the target variable.

In [2]:
#we will import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score , mean_squared_error
import xgboost
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter('ignore')
# NOTE(review): no cell visible here imports numpy directly, so `np` used further down
# presumably comes from this magic -- consider explicit `import numpy as np` instead.
%pylab inline                           

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Load the training and test sets from their CSV files.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('attachment_train__2_.csv', 'attachment_test.csv'))
# Keep a handle on the target column before it is removed from the feature frame.
labels = train['SalePrice']

In [4]:
# Stack train and test row-wise (fresh 0..n-1 index) so that cleaning and encoding are
# applied to both at once, then remove the target so only predictor columns remain.
# `columns=` replaces the positional axis argument of `drop('SalePrice', 1)`, which
# recent pandas releases no longer accept.
data1 = pd.concat([train, test], ignore_index=True).drop(columns='SalePrice')

In [5]:
# Sanity check: row count of the combined frame, then of the training frame.
for frame in (data1, train):
    print(len(frame))

2919
1460


### Data Cleansing

In [6]:
# Count missing values per column and show only the columns that have any.
nulls = data1.isnull().sum()
nulls.loc[nulls > 0]

Alley           2721
BsmtCond          82
BsmtExposure      82
BsmtFinSF1         1
BsmtFinSF2         1
BsmtFinType1      79
BsmtFinType2      80
BsmtFullBath       2
BsmtHalfBath       2
BsmtQual          81
BsmtUnfSF          1
Electrical         1
Exterior1st        1
Exterior2nd        1
Fence           2348
FireplaceQu     1420
Functional         2
GarageArea         1
GarageCars         1
GarageCond       159
GarageFinish     159
GarageQual       159
GarageType       157
GarageYrBlt      159
KitchenQual        1
LotFrontage      486
MSZoning           4
MasVnrArea        23
MasVnrType        24
MiscFeature     2814
PoolQC          2909
SaleType           1
TotalBsmtSF        1
Utilities          2
dtype: int64

In [7]:
# Discard the columns that have more than 1000 missing values.
mostly_missing = ['PoolQC', 'MiscFeature', 'FireplaceQu', 'Fence', 'Alley']
data1 = data1.drop(columns=mostly_missing)

In [8]:
# Split the column names by dtype: object columns are treated as categorical,
# every other column as non-categorical.
is_object = data1.dtypes == 'O'
categorical = data1.columns[is_object]
non_categorical = data1.columns[~is_object]

In [9]:
# Show the names of the object-dtype (categorical) columns.
categorical

Index(['BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical',
       'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Foundation',
       'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
       'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour',
       'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType',
       'Neighborhood', 'PavedDrive', 'RoofMatl', 'RoofStyle', 'SaleCondition',
       'SaleType', 'Street', 'Utilities'],
      dtype='object')

In [10]:
# Impute every remaining missing value with a per-column statistic computed on the
# combined train+test frame: the mode for categorical columns, the median for numeric ones.
# NOTE(review): for the basement/garage/masonry columns a missing value may mean
# "feature absent" rather than "unknown" -- confirm against the data description.
mode_filled = [
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
    'Electrical', 'Exterior1st', 'Exterior2nd', 'Functional',
    'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
    'KitchenQual', 'MSZoning', 'MasVnrType', 'SaleType', 'Utilities',
]
median_filled = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'GarageArea', 'GarageCars', 'GarageYrBlt',
    'LotFrontage', 'MasVnrArea', 'TotalBsmtSF',
]

for column in mode_filled:
    # mode() returns a Series (ties are possible); take its first entry.
    data1[column] = data1[column].fillna(data1[column].mode()[0])
for column in median_filled:
    data1[column] = data1[column].fillna(data1[column].median())

# Every column that reported missing values is either dropped or listed above.
assert not data1.isnull().values.any(), "some columns still contain missing values"

**Note:** the training set has only 1460 observations, so instead of dropping rows we keep every observation and replace its missing values with the mode (categorical variables) or the median (numeric variables).

In [11]:
# One-hot encode every categorical column; drop_first omits the first level of each
# column from the generated dummies.
data1 = pd.get_dummies(data=data1, columns=list(categorical), drop_first=True)

In [12]:
# (rows, columns) of the frame after one-hot encoding.
data1.shape

(2919, 233)

In [13]:
# Keep only the first occurrence of each column name before the dimensionality reduction.
is_repeat = data1.columns.duplicated()
final_data1 = data1.loc[:, ~is_repeat]

In [14]:
# (rows, columns) after removing duplicated column names.
final_data1.shape

(2919, 233)

In [15]:
# PCA (principal component analysis) is used to reduce the number of variables.
# NOTE(review): PCA is already imported in the first cell, so this re-import is redundant.
from sklearn.decomposition import PCA
pca = PCA(whiten=True) # whitened output; NOTE(review): any accuracy benefit is assumed, not measured in this notebook

In [16]:
# Fit PCA on the whole feature matrix and keep the per-component explained-variance ratios.
# NOTE(review): no visible cell standardises the features before PCA, so columns with a
# large numeric scale may dominate the components -- confirm this is intended.
variance = pd.DataFrame(pca.fit(final_data1).explained_variance_ratio_)

In [17]:
# Cumulative share of the variance explained by the leading components.
pca.explained_variance_ratio_.cumsum()

array([0.96439327, 0.97538589, 0.98290966, 0.98825137, 0.99307069,
       0.99727247, 0.99812826, 0.99865352, 0.99913816, 0.99949109,
       0.99970898, 0.99977679, 0.99983461, 0.99988191, 0.99992432,
       0.99994851, 0.99996724, 0.99998022, 0.99998995, 0.99999428,
       0.99999744, 0.99999964, 0.99999976, 0.99999979, 0.99999981,
       0.99999983, 0.99999984, 0.99999986, 0.99999986, 0.99999987,
       0.99999987, 0.99999988, 0.99999988, 0.99999989, 0.99999989,
       0.9999999 , 0.9999999 , 0.9999999 , 0.99999991, 0.99999991,
       0.99999991, 0.99999992, 0.99999992, 0.99999992, 0.99999992,
       0.99999993, 0.99999993, 0.99999993, 0.99999993, 0.99999994,
       0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999995,
       0.99999995, 0.99999995, 0.99999995, 0.99999995, 0.99999995,
       0.99999996, 0.99999996, 0.99999996, 0.99999996, 0.99999996,
       0.99999996, 0.99999996, 0.99999996, 0.99999996, 0.99999997,
       0.99999997, 0.99999997, 0.99999997, 0.99999997, 0.99999

Here we can see the cumulative explained variance of the principal components. The first 21 components capture almost all of the variance; after them the cumulative variance barely changes. So we keep the first 21 components, which explain most of the data. This is called dimensionality reduction: we reduce the large number of features to **n** components that capture most of the variance in the data. This can improve the accuracy of the model.

In [18]:
# Keep only the first 21 principal components and project every row onto them.
# random_state pins the result in case scikit-learn selects its randomized SVD solver
# for this input size, so a Restart & Run All reproduces the same components.
pca = PCA(n_components=21, whiten=True, random_state=0)
pca_data = pca.fit(final_data1).transform(final_data1)

In [19]:
# Split the projected rows back into train and test. The combined frame was built as
# [train, test] with a fresh index, so the first len(train) rows are the training rows.
# Using len(train) instead of a hard-coded 1460 keeps the split right if the data changes.
n_train = len(train)
train_pca = pca_data[:n_train, :]
test_pca = pca_data[n_train:, :]
assert len(test_pca) == len(test), "train/test split does not match the source frames"

### Creating and selecting a model

In [20]:
# Instantiate both candidate regressors with their default settings.
lr, xgbr = LinearRegression(), xgboost.XGBRegressor()

In [21]:
# Train each regressor on the PCA-transformed training rows.
for regressor in (lr, xgbr):
    regressor.fit(train_pca, labels)
# Echo the fitted XGBoost model, as the original cell's last expression did.
xgbr

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [22]:
# Predict SalePrice for the test rows with each fitted model.
y_predlr, y_predxgb = (fitted.predict(test_pca) for fitted in (lr, xgbr))

To test my models, I uploaded my predictions to the Kaggle competition, which calculated the RMSE score. XGBoost got the better score, 0.191312, and my linear regression model got 0.20.
But we can improve the RMSE score of the XGBoost model with a randomized search.

In [23]:
# Pair the XGBoost predictions with the Ids taken from the sample submission file.
sample = pd.read_csv('sample_submission.csv')
ids = sample.loc[:, 'Id']
y_predxgb = pd.DataFrame(data=y_predxgb)
result1 = pd.concat(objs=[ids, y_predxgb], axis='columns')

In [24]:
# Label the two columns and write the XGBoost predictions to disk without the index.
submission_columns = ['Id', 'SalePrice']
result1.columns = submission_columns
result1.to_csv(path_or_buf='Submission.csv', index=False)

In [38]:
# Build and write the submission file for the linear-regression predictions.
sample = pd.read_csv('sample_submission.csv')
ids = sample.loc[:, 'Id']
y_predlr = pd.DataFrame(data=y_predlr)
result1 = pd.concat(objs=[ids, y_predlr], axis='columns')
result1.columns = ['Id', 'SalePrice']
result1.to_csv(path_or_buf='Submissionlr.csv', index=False)

# We will try RandomizedSearchCV to improve the score

In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
# Candidate values sampled by the randomized hyper-parameter search.
# NOTE(review): 'gblinear' and 'gradient_based' may not be compatible with every other
# setting listed here -- confirm against the XGBoost parameter documentation.
params = dict(
    n_estimators=[100, 300, 500, 900, 1100, 1500],
    base_score=[0.25, 0.5, 0.75, 1],
    booster=['gbtree', 'gblinear'],
    learning_rate=[0.01, 0.1, 0.15, 0.2, 0.25],
    max_depth=[3, 5, 6, 9, 10, 11, 15],
    min_child_weight=[1, 2, 3, 4, 5],
    gamma=[0.0, 0.1, 0.3, 0.5, 0.7],
    colsample_bytree=[0.1, 0.2, 0.3, 0.4, 0.5],
    sampling_method=['uniform', 'gradient_based'],
)

In [27]:
# Randomized hyper-parameter search over `params`: 50 sampled candidates, 5-fold
# cross-validation, scored by negative mean absolute error, run on 4 workers.
# random_state fixes which candidates are sampled so the search is reproducible.
xgbr1 = xgboost.XGBRegressor()
model = RandomizedSearchCV(
    estimator=xgbr1,
    param_distributions=params,
    n_iter=50,
    cv=5,
    n_jobs=4,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    verbose=5,
    random_state=42,
)

In [28]:
# Run the randomized search with 5-fold cross-validation (cv=5 was set when `model` was built).
model.fit(train_pca,labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   13.9s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  9.7min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 13.4min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_con...
                                                             0.5],
                                        'gamma': [0.0, 0.1, 0.3, 0.5, 0.7],
                                        'learning_rate': [0.01, 0.1, 0.15, 0.2,
          

In [29]:
# Show the hyper-parameter combination that the search selected as best.
model.best_params_

{'sampling_method': 'uniform',
 'n_estimators': 900,
 'min_child_weight': 3,
 'max_depth': 3,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.5,
 'booster': 'gbtree',
 'base_score': 0.5}

In [30]:
# Estimator built with the best parameters found by the search.
xgbr2=model.best_estimator_

In [31]:
# `model` was created without overriding refit (default True), so the search already
# retrained best_estimator_ on the full (train_pca, labels) data when it finished.
# Fitting it again on the same data only repeats that work, so just display it.
xgbr2

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             n_estimators=900, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, sampling_method='uniform', scale_pos_weight=1,
             subsample=1, tree_method=None, validate_parameters=False,
             verbosity=None)

In [32]:
# Predict SalePrice for the test rows with the tuned model.
y_predxgbr2 = xgbr2.predict(test_pca)

In [33]:
# Preview the tuned model's predictions.
y_predxgbr2

array([118975.8 , 236520.6 , 188579.73, ..., 168329.22, 125381.56,
       257143.33], dtype=float32)

In [34]:
# Pair the tuned-model predictions with the Ids from the sample submission file.
sample = pd.read_csv('sample_submission.csv')
y_predxgbr2 = pd.DataFrame(data=y_predxgbr2)
result2 = pd.concat(objs=[sample.loc[:, 'Id'], y_predxgbr2], axis='columns')

In [35]:
# Label the two submission columns.
result2.columns = ['Id','SalePrice']

In [36]:
# Write the tuned-model submission file, leaving out the DataFrame index.
result2.to_csv('Submission2.csv',index=False)

This time we got an RMSE score of 0.18436, which is an improvement from 0.19132.

This is my Kaggle submissions page for the competition:
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/submissions