In [1]:
import pandas as pd
import numpy as np
import DataScience
import sidetable

train = pd.read_csv('./train.csv')
X_test = pd.read_csv('./test.csv')

In [2]:
# Have a look at the first five rows
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Drop column id
train.set_index(drop=True, inplace=True, keys='Id')
X_test.set_index(drop=True, inplace=True, keys='Id')

In [4]:
# Have a look at unique values of columns having missing data and filter numeric and categorical columns having missing data
numeric_missing = []
cat_missing = []
for i in train.columns[train.isna().any()]:
    print(train[i].unique())
    if train[i].dtype != 'object':
        numeric_missing.append(i)
    else:
        cat_missing.append(i)
        
# Filter the numeric columns having missing data
print(numeric_missing, cat_missing)

[ 65.  80.  68.  60.  84.  85.  75.  nan  51.  50.  70.  91.  72.  66.
 101.  57.  44. 110.  98.  47. 108. 112.  74. 115.  61.  48.  33.  52.
 100.  24.  89.  63.  76.  81.  95.  69.  21.  32.  78. 121. 122.  40.
 105.  73.  77.  64.  94.  34.  90.  55.  88.  82.  71. 120. 107.  92.
 134.  62.  86. 141.  97.  54.  41.  79. 174.  99.  67.  83.  43. 103.
  93.  30. 129. 140.  35.  37. 118.  87. 116. 150. 111.  49.  96.  59.
  36.  56. 102.  58.  38. 109. 130.  53. 137.  45. 106. 104.  42.  39.
 144. 114. 128. 149. 313. 168. 182. 138. 160. 152. 124. 153.  46.]
[nan 'Grvl' 'Pave']
['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
[1.960e+02 0.000e+00 1.620e+02 3.500e+02 1.860e+02 2.400e+02 2.860e+02
 3.060e+02 2.120e+02 1.800e+02 3.800e+02 2.810e+02 6.400e+02 2.000e+02
 2.460e+02 1.320e+02 6.500e+02 1.010e+02 4.120e+02 2.720e+02 4.560e+02
 1.031e+03 1.780e+02 5.730e+02 3.440e+02 2.870e+02 1.670e+02 1.115e+03
 4.000e+01 1.040e+02 5.760e+02 4.430e+02 4.680e+02 6.600e+01 2.200e+01
 2.840e+02 7.600e+01 

In [5]:
# Retrieve numeric features from columns
num_cols = list(X_test._get_numeric_data().columns)

# Retrieve categorical features from columns
cat_cols = list(set(X_test.columns) - set(num_cols))

In [6]:
print('Numerical features', num_cols, sep='\n')
print()
print('Categorical features', cat_cols, sep='\n')

Numerical features
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']

Categorical features
['Neighborhood', 'Condition2', 'LotConfig', 'PavedDrive', 'HeatingQC', 'BldgType', 'Condition1', 'Heating', 'RoofStyle', 'Electrical', 'ExterQual', 'BsmtExposure', 'GarageType', 'LandSlope', 'Functional', 'ExterCond', 'MSZoning', 'KitchenQual', 'HouseStyle', 'Alley', 'RoofMatl', 'LotShape', 'BsmtCond', 'Utilities', 'PoolQC', 'GarageCond', 'Street', 'BsmtFinType1', 'MiscFeature', 'SaleCondition', 'LandContour', 'BsmtFinType2', 'FireplaceQu', 

In [7]:
# # Impute missing categorical data
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='most_frequent')
# train[cat_cols] = imputer.fit_transform(train[cat_cols])
# X_test[cat_cols] = imputer.transform(X_test[cat_cols])

In [8]:
# Encoding categorical columns using get_dummies embedded in pandas
X_train = train.drop(columns='SalePrice')
y_train = train['SalePrice'].copy()
X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

In [9]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,1,0,0,1,0,0,0,1,0
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,0,1,0,0,1,0
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,1,0,0,1,0,0,0,1,0
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,0,1,0,0,0,1
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,1,0,0,1,0,0,0,1,0


In [10]:
# Predict the house prices using XGboost Regressor hypertuned by GridSearch
predictions = DataScience.xgb_regressor(X_train, y_train, X_test)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.0s


KeyboardInterrupt: 

In [None]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['SalePrice'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [None]:
# Verify the prediction's format
print(predictions.head())