In [276]:
import pandas as pd
import numpy as np

train = pd.read_csv('./train.csv')
X_test = pd.read_csv('./test.csv')

In [277]:
# Have a look at the first five rows
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [278]:
# Drop column id
train.drop(labels='Id', axis=1, inplace=True)

In [279]:
# Have a look at unique values of columns having missing data and filter numeric and categorical columns having missing data
numeric_missing = []
cat_missing = []
for i in train.columns[train.isna().any()]:
    print(train[i].unique())
    if train[i].dtype != 'object':
        numeric_missing.append(i)
    else:
        cat_missing.append(i)
        
# Filter the numeric columns having missing data
print(numeric_missing, cat_missing)

[ 65.  80.  68.  60.  84.  85.  75.  nan  51.  50.  70.  91.  72.  66.
 101.  57.  44. 110.  98.  47. 108. 112.  74. 115.  61.  48.  33.  52.
 100.  24.  89.  63.  76.  81.  95.  69.  21.  32.  78. 121. 122.  40.
 105.  73.  77.  64.  94.  34.  90.  55.  88.  82.  71. 120. 107.  92.
 134.  62.  86. 141.  97.  54.  41.  79. 174.  99.  67.  83.  43. 103.
  93.  30. 129. 140.  35.  37. 118.  87. 116. 150. 111.  49.  96.  59.
  36.  56. 102.  58.  38. 109. 130.  53. 137.  45. 106. 104.  42.  39.
 144. 114. 128. 149. 313. 168. 182. 138. 160. 152. 124. 153.  46.]
[nan 'Grvl' 'Pave']
['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
[1.960e+02 0.000e+00 1.620e+02 3.500e+02 1.860e+02 2.400e+02 2.860e+02
 3.060e+02 2.120e+02 1.800e+02 3.800e+02 2.810e+02 6.400e+02 2.000e+02
 2.460e+02 1.320e+02 6.500e+02 1.010e+02 4.120e+02 2.720e+02 4.560e+02
 1.031e+03 1.780e+02 5.730e+02 3.440e+02 2.870e+02 1.670e+02 1.115e+03
 4.000e+01 1.040e+02 5.760e+02 4.430e+02 4.680e+02 6.600e+01 2.200e+01
 2.840e+02 7.600e+01 

In [280]:
#Numerical features to pass down the numerical pipeline 
numeric_features = train._get_numeric_data().columns

#Categrical features to pass down the categorical pipeline 
categorical_features = list(set(train.columns) - set(numeric_features))

In [281]:
print(numeric_features)
print(categorical_features)

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')
['Utilities', 'GarageCond', 'Street', 'Heating', 'KitchenQual', 'GarageType', 'LandSlope', 'Neighborhood', 'PoolQC', 'LotConfig', 'PavedDrive', 'GarageFinish', 'SaleType', 'Condition2', 'ExterCond', 'BsmtCond', 'HeatingQC', 'Alley', 'Exterior2nd', 'Exterior1st', 'BsmtExposure', 'Fence', 'MiscFeature', 'BsmtFinType2', 'BldgType', 'RoofStyle', 'FireplaceQu', 'SaleCondition', 'RoofMatl', 'Condition1', 'MSZon

In [282]:
# Imputing missing data from numeric columns
from sklearn.impute import KNNImputer, SimpleImputer

# Impute missing numeric columns
numeric_imputer = KNNImputer()
train[numeric_missing] = numeric_imputer.fit_transform(train[numeric_missing])

# Impute missing categorical columns
train[cat_missing] = (train[cat_missing].fillna('Unknown'))

In [283]:
# Check if any missing data
for i in cat_missing:
    print(train[i].unique())

['Unknown' 'Grvl' 'Pave']
['BrkFace' 'None' 'Stone' 'BrkCmn' 'Unknown']
['Gd' 'TA' 'Ex' 'Unknown' 'Fa']
['TA' 'Gd' 'Unknown' 'Fa' 'Po']
['No' 'Gd' 'Mn' 'Av' 'Unknown']
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'Unknown' 'LwQ']
['Unf' 'BLQ' 'Unknown' 'ALQ' 'Rec' 'LwQ' 'GLQ']
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' 'Unknown']
['Unknown' 'TA' 'Gd' 'Fa' 'Ex' 'Po']
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' 'Unknown' 'Basment' '2Types']
['RFn' 'Unf' 'Fin' 'Unknown']
['TA' 'Fa' 'Gd' 'Unknown' 'Ex' 'Po']
['TA' 'Fa' 'Unknown' 'Gd' 'Po' 'Ex']
['Unknown' 'Ex' 'Fa' 'Gd']
['Unknown' 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
['Unknown' 'Shed' 'Gar2' 'Othr' 'TenC']


In [284]:
# Encoding categorical columns using get_dummies embedded in pandas
dummies = pd.get_dummies(train[categorical_features], drop_first=True)
X_train = train.drop(columns='SalePrice')
y_train = train['SalePrice']
X_train = X_train.drop(columns=categorical_features)
X_train = X_train.join(dummies)

In [285]:
# StandardScale the dataset and convert it back to dataframe type
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
scaled_X_train = sc.fit_transform(X_train)
X_train = pd.DataFrame(scaled_X_train, index=X_train.index, columns=X_train.columns)

In [286]:
# Check if X_train is in dataframe form
X_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Functional_Sev,Functional_Typ,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageQual_Unknown
0,0.073375,-0.208075,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.510835,0.575425,-0.288653,...,-0.02618,0.271163,-0.098397,1.411313,-1.278819,-0.184376,-0.098397,-0.045376,0.337126,-0.24236
1,-0.872563,0.458403,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574288,1.171992,-0.288653,...,-0.02618,0.271163,-0.098397,-0.70856,0.781971,-0.184376,-0.098397,-0.045376,0.337126,-0.24236
2,0.073375,-0.074779,0.07348,0.651479,-0.5172,0.984752,0.830215,0.3226,0.092907,-0.288653,...,-0.02618,0.271163,-0.098397,1.411313,-1.278819,-0.184376,-0.098397,-0.045376,0.337126,-0.24236
3,0.309859,-0.430234,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.574288,-0.499274,-0.288653,...,-0.02618,0.271163,-0.098397,-0.70856,0.781971,-0.184376,-0.098397,-0.045376,0.337126,-0.24236
4,0.073375,0.63613,0.375148,1.374795,-0.5172,0.951632,0.733308,1.363432,0.463568,-0.288653,...,-0.02618,0.271163,-0.098397,1.411313,-1.278819,-0.184376,-0.098397,-0.045376,0.337126,-0.24236


In [287]:
# Import XGBGeressor and GridSearchCV libraries
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

parameters = {'objective':['reg:linear'],
              'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 5, 10],
              'silent': [1],
              'n_estimators': [500, 600, 700],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
                'reg_alpha': [1.1, 1.2, 1.3],
                'reg_lambda': [1.1, 1.2, 1.3],
                'subsample': [0.7, 0.8, 0.9]
             }

xgb_grid = GridSearchCV(XGBRegressor(),
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

# Try fitting training data sets with all parameters
xgb_grid.fit(X_train,y_train)

# Print the best parameters
print(xgb_grid.best_params_)

#Fit the training tests using the best parameters
gbm = XGBRegressor(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

# Print the accuracy of prediction
predictions = gbm.predict(X_test)
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

Fitting 2 folds for each of 43740 candidates, totalling 87480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


KeyboardInterrupt: 