In [1]:
import numpy as np
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Import Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.drop('SalePrice', axis = 1)
X_test = test
y_train = train['SalePrice']

## Combine train and test data

In [3]:
# 合并一下train和test，两个一起处理，之后再分开
X = X_train.append(X_test, ignore_index = True)
# Drop id
X.drop('Id', axis = 1, inplace = True)

print(X_train.shape, X_test.shape, X.shape)
X.head()

(1460, 80) (1459, 80) (2919, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
# 0 - 1459 是train
# 1460 开始是test

## Numerical columns and Categorical columns

In [5]:
# Numerical Columns
num_col = X.columns[(X.dtypes != 'object').values.tolist()].tolist()
cat_col = X.columns[(X.dtypes == 'object').values.tolist()].tolist()

In [6]:
# Check if we catch every column
print('num_col:', len(num_col))
print('cat_col:', len(cat_col))
print('X:', len(X.columns))

num_col: 36
cat_col: 43
X: 79


# Data preprocessing

## Missingness

In [7]:
missing = X.isnull().sum(axis = 0)
missing = (missing[missing != 0])/X.shape[0]
pd.DataFrame(missing, columns = ['Missing Percent']).sort_values('Missing Percent', ascending = False)

Unnamed: 0,Missing Percent
PoolQC,0.996574
MiscFeature,0.964029
Alley,0.932169
Fence,0.804385
FireplaceQu,0.486468
LotFrontage,0.166495
GarageFinish,0.054471
GarageQual,0.054471
GarageCond,0.054471
GarageYrBlt,0.054471


### Impute missingness Type 1 (miss是因为没有）

In [8]:
# Alley
X['Alley'].fillna('No_Alley', inplace = True)

# Bsmt相关
Bsmt_col = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for i in Bsmt_col:
    X[i].fillna('No_Bsmt', inplace = True)

# Garage相关
Garage_col = ['GarageType','GarageFinish', 'GarageQual', 'GarageCond']
for k in Garage_col:
    X[k].fillna('No_Garage', inplace = True)
    
Garage_col2 = ['GarageYrBlt','GarageCars', 'GarageArea']
for a in Garage_col2:
    X[a].fillna(0, inplace = True)

# FireplaceQu
X['FireplaceQu'].fillna('No_Fireplace', inplace = True)

# PoolQC
X['PoolQC'].fillna('No_Pool', inplace = True)

# Fence
X['Fence'].fillna('No_Fence', inplace = True)

# MiscFeature
X['MiscFeature'].fillna('No_MiscFeature', inplace = True)

In [9]:
# Check missingness again
missing = X.isnull().sum(axis = 0)
missing = (missing[missing != 0])/X.shape[0]
pd.DataFrame(missing, columns = ['Missing Percent']).sort_values('Missing Percent', ascending = False)

Unnamed: 0,Missing Percent
LotFrontage,0.166495
MasVnrType,0.008222
MasVnrArea,0.007879
MSZoning,0.00137
Utilities,0.000685
Functional,0.000685
BsmtHalfBath,0.000685
BsmtFullBath,0.000685
Electrical,0.000343
KitchenQual,0.000343


### Impute missingness Type 2 (median)

In [10]:
missing_col = missing.index.tolist()
num_missing_col = []
for i in missing_col:
    if i in num_col:
        num_missing_col.append(i)
num_missing_col

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath']

In [11]:
for i in num_missing_col:
    X[i] = X.groupby(['Neighborhood'])[i].apply(lambda x: x.fillna(x.median()))

### Impute missingness Type 3 (mode)

In [12]:
missing_col = missing.index.tolist()
cat_missing_col = []
for i in missing_col:
    if i in cat_col:
        cat_missing_col.append(i)
cat_missing_col

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Electrical',
 'KitchenQual',
 'Functional',
 'SaleType']

In [13]:
for i in cat_missing_col:
    X[i] = X.groupby(['Neighborhood'])[i].apply(lambda x:x.fillna(x.value_counts().index[0]))

### Check missingness again

In [14]:
X.isnull().sum().sum()

0

In [15]:
# OK everything clear!

## Nominal / Ordinal

### Onehot

In [16]:
onehot_col = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','Neighborhood',
              'Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
              'Exterior2nd','MasVnrType','Foundation','Heating','CentralAir','Electrical','GarageType',
              'MiscFeature','SaleType','SaleCondition']

In [17]:
X = pd.get_dummies(X, columns = onehot_col, drop_first=True)

In [18]:
X.head()

Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,Gtl,7,5,2003,2003,196.0,Gd,TA,...,0,0,0,0,1,0,0,0,1,0
1,80.0,9600,Gtl,6,8,1976,1976,0.0,TA,TA,...,0,0,0,0,1,0,0,0,1,0
2,68.0,11250,Gtl,7,5,2001,2002,162.0,Gd,TA,...,0,0,0,0,1,0,0,0,1,0
3,60.0,9550,Gtl,7,5,1915,1970,0.0,TA,TA,...,0,0,0,0,1,0,0,0,0,0
4,84.0,14260,Gtl,8,5,2000,2000,350.0,Gd,TA,...,0,0,0,0,1,0,0,0,1,0


### Order

In [19]:
X['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0}, inplace = True)

X['BsmtExposure'].replace({'Gd':4, 'Av':3, 'Mn':2, 'No':1, 'No_Bsmt':0}, inplace = True)

X['Functional'].replace({'Typ':7, 'Min1':6, 'Min2':5, 'Mod':4, 'Maj1':3, 
                        'Maj2':2, 'Sev':1, 'Sal':0}, inplace = True)

X['FireplaceQu'].replace({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Fireplace':0}, inplace = True)

X['GarageFinish'].replace({'Fin':3, 'RFn':2, 'Unf':1, 'No_Garage':0}, inplace = True)

X['PavedDrive'].replace({'Y':2, 'P':1, 'N':0}, inplace = True)

X['PoolQC'].replace({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Pool':0}, inplace = True)

X['Fence'].replace({'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'No_Fence':0}, inplace = True)

replace_dict1 = {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1, 'Po':0}
type1_col = ['ExterQual', 'ExterCond', 'HeatingQC','KitchenQual']
for i in type1_col:
    X[i].replace(replace_dict1, inplace = True)

replace_dict2 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Bsmt':0}
type2_col = ['BsmtQual', 'BsmtCond']
for i in type2_col:
    X[i].replace(replace_dict2, inplace = True)

replace_dict3 = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No_Bsmt':0}
type3_col = ['BsmtFinType1','BsmtFinType2']
for i in type3_col:
    X[i].replace(replace_dict3, inplace = True)

replace_dict4 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Garage':0}
type4_col = ['GarageQual','GarageCond']
for i in type4_col:
    X[i].replace(replace_dict4, inplace = True)

In [20]:
# Check if we convert everything to numerical values
(X.dtypes).loc[(X.dtypes == object).tolist()]


Series([], dtype: object)

In [21]:
# Everything clear!

# Feature Engineering

In [22]:
print(X.shape)
X.head()

(2919, 218)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,0,7,5,2003,2003,196.0,3,2,...,0,0,0,0,1,0,0,0,1,0
1,80.0,9600,0,6,8,1976,1976,0.0,2,2,...,0,0,0,0,1,0,0,0,1,0
2,68.0,11250,0,7,5,2001,2002,162.0,3,2,...,0,0,0,0,1,0,0,0,1,0
3,60.0,9550,0,7,5,1915,1970,0.0,2,2,...,0,0,0,0,1,0,0,0,0,0
4,84.0,14260,0,8,5,2000,2000,350.0,3,2,...,0,0,0,0,1,0,0,0,1,0


### Generate new features

In [23]:
################## 所有关于时间的 ##################
X['Sell_from_Build'] = X['YrSold'] - X['YearBuilt']
X['Sell_from_Remod'] = X['YrSold'] - X['YearRemodAdd']
X['Sell_from_Garage'] = X['YrSold'] - X['GarageYrBlt']
X['Remod_from_Build'] = X['YearRemodAdd'] - X['YearBuilt']
X['Garage_from_Build'] = X['GarageYrBlt'] - X['YearBuilt']
# 注意那个GarageYrBlt没有Garage的就给我弄成0了！！

# Avoid negative number for ages
for i in ['Sell_from_Build','Sell_from_Remod','Sell_from_Garage','Remod_from_Build','Garage_from_Build']:
    X[i] = X[i].map(lambda x: 0 if x < 0 else x)

# Update num_col
num_col += ['Sell_from_Build','Sell_from_Remod','Sell_from_Garage','Remod_from_Build','Garage_from_Build']

In [24]:
################## 所有关于Bath的 ##################
X['BsmtBath'] = X['BsmtFullBath'] + 0.5 * X['BsmtHalfBath']
X['Bath'] = X['FullBath'] + 0.5 * X['HalfBath']    

In [25]:
################## 所有Cond+Qual ##################
X['Overall'] = X['OverallCond'] + X['OverallQual']
X['Exter'] = X['ExterCond'] + X['ExterQual']
X['Bsmt'] = X['BsmtCond'] + X['BsmtQual']
X['Garage'] = X['GarageQual'] + X['GarageCond']

In [26]:
print(X.shape)
X.head()

(2919, 229)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,Sell_from_Remod,Sell_from_Garage,Remod_from_Build,Garage_from_Build,BsmtBath,Bath,Overall,Exter,Bsmt,Garage
0,65.0,8450,0,7,5,2003,2003,196.0,3,2,...,5,5.0,0,0.0,1.0,2.5,12,5,7,6
1,80.0,9600,0,6,8,1976,1976,0.0,2,2,...,31,31.0,0,0.0,0.5,2.0,14,4,7,6
2,68.0,11250,0,7,5,2001,2002,162.0,3,2,...,6,7.0,1,0.0,1.0,2.5,12,5,7,6
3,60.0,9550,0,7,5,1915,1970,0.0,2,2,...,36,8.0,55,83.0,1.0,1.0,12,4,7,6
4,84.0,14260,0,8,5,2000,2000,350.0,3,2,...,8,8.0,0,0.0,1.0,2.5,13,5,7,6


### Skewness 

In [27]:
num_col.remove('MSSubClass')

In [28]:
col_name = []
col_skew = []
for i in num_col:
    col_name.append(i)
    col_skew.append(X[i].skew())
skew_df = pd.DataFrame({'Column':col_name, 'Skewness':col_skew}).sort_values('Skewness', ascending = False)
skew_df.head()

Unnamed: 0,Column,Skewness
32,MiscVal,21.95848
31,PoolArea,16.907017
1,LotArea,12.829025
13,LowQualFinSF,12.094977
29,3SsnPorch,11.381914


In [29]:
skewed_features = skew_df[abs(skew_df['Skewness']) >= 0.5]['Column'].tolist()


In [30]:
X[skewed_features] = np.log1p(X[skewed_features])

### StandardSclaer

In [31]:
from sklearn import preprocessing
sd = preprocessing.StandardScaler()

In [32]:
X[num_col] = sd.fit_transform(X[num_col])

In [33]:
print(X.shape)
X.head()

(2919, 229)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,Sell_from_Remod,Sell_from_Garage,Remod_from_Build,Garage_from_Build,BsmtBath,Bath,Overall,Exter,Bsmt,Garage
0,-0.052108,-0.103719,0,0.646183,-0.435439,1.040634,0.896833,1.217601,3,2,...,-0.887142,-0.804089,-0.738006,-0.41367,1.0,2.5,12,5,7,6
1,0.570217,0.146544,0,-0.063185,1.90637,0.161526,-0.395604,-0.799566,2,2,...,0.357658,0.211138,-0.738006,-0.41367,0.5,2.0,14,4,7,6
2,0.082971,0.457629,0,0.646183,-0.435439,0.975922,0.848965,1.145266,3,2,...,-0.839265,-0.629617,-0.310935,-0.41367,1.0,2.5,12,5,7,6
3,-0.291506,0.136301,0,0.646183,-0.435439,-1.869638,-0.682812,-0.799566,2,2,...,0.597043,-0.558184,1.742149,3.353037,1.0,1.0,12,4,7,6
4,0.716693,0.922662,0,1.355551,-0.435439,0.943542,0.753229,1.438126,3,2,...,-0.743511,-0.558184,-0.738006,-0.41367,1.0,2.5,13,5,7,6


### Delete correlated columns

In [34]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  35


In [35]:
X.drop(labels = corr_features, axis = 1, inplace = True)

## Back to Train and Test

In [36]:
X_train = X[:1460]
X_test = X[1460:]
y_train = np.log(y_train)

In [37]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(2919, 194)
(1460, 194)
(1459, 194)


In [39]:
from sklearn import ensemble
t = ensemble.RandomForestRegressor()
t.fit(X_train, y_train)

RandomForestRegressor()

In [40]:
sum(t.feature_importances_[:35])

0.9067357148692239

# Feature Selection

In [125]:
from sklearn import ensemble
from sklearn import feature_selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


sfs = SFS(
    ensemble.RandomForestRegressor(n_jobs = -1),
    k_features = 16,
    forward = True,
    floating = False,
    scoring = 'r2',
    verbose = 2,
    cv = 5,
    n_jobs = -1
)
sfs = sfs.fit(X_train.to_numpy(), y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 194 out of 194 | elapsed:   41.5s finished

[2020-08-31 21:16:19] Features: 1/16 -- score: 0.664508439052406[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 193 out of 193 | elapsed:   44.7s finished

[2020-08-31 21:17:04] Features: 2/16 -- score: 0.710760350169798[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:   45.7s finished

[2020-08-31 21:17:50] Features: 3/16 -- score: 0.7399043

In [128]:
# Get the selected feature
selected_feat = X_train.columns[list(sfs.k_feature_idx_)].tolist()
selected_feat

['LotArea',
 'LandSlope',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtExposure',
 'BsmtFinType1',
 '1stFlrSF',
 'GrLivArea',
 'Fireplaces',
 'GarageFinish',
 'GarageCars',
 'MSZoning_RL',
 'Heating_OthW',
 'CentralAir_Y',
 'Overall']

In [41]:
selected_feat = ['LotArea',
 'LandSlope',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtExposure',
 'BsmtFinType1',
 '1stFlrSF',
 'GrLivArea',
 'Fireplaces',
 'GarageFinish',
 'GarageCars',
 'MSZoning_RL',
 'Heating_OthW',
 'CentralAir_Y',
 'Overall']

## Get X and y

In [42]:
y_train = np.log(train['SalePrice'])

In [43]:
X_train_selected = X_train[selected_feat]
X_test_selected = X_test[selected_feat]

# Model Selection

In [117]:
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
import xgboost

def neg_rmse(y_true, y_pred):
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))


MLA = [
    #Linear Model
    linear_model.Lasso(),
    linear_model.Ridge(),
    linear_model.ElasticNet(),

    #Kernel Ridge
    KernelRidge(),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor(
    n_estimators = 400, 
    max_depth = 29,
    max_features = 'log2',
    min_samples_split = 4
),
    ensemble.GradientBoostingRegressor(
    n_estimators = 300,
    max_depth = 5,
    criterion = 'mae',
    max_features='sqrt',
    min_samples_split = 50
),
    
    #XGB
    xgboost.XGBRegressor(
    booster = 'gbtree',
    eta = 0.1,
    max_depth = 5,
    gamma = 0,
    subsample = 0.5,
    sampling_method = 'uniform',
    alpha = 0
)
]
scorer = make_scorer(neg_rmse)
df_columns = ['Name', 'Parameters', 'CV score mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = (cross_val_score(model, X_train_selected, y_train, cv = 5, scoring=scorer)).mean()
    df.loc[i,:] = [model_name, model_parameters, cv_results]
    

df.sort_values('CV score mean', ascending = False)

Unnamed: 0,Name,Parameters,CV score mean
7,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-0.130268
6,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",-0.131357
5,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",-0.135598
1,Ridge,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.14102
4,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",-0.19273
3,KernelRidge,"{'alpha': 1, 'coef0': 1, 'degree': 3, 'gamma':...",-0.219634
0,Lasso,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399228
2,ElasticNet,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399228


- GradientBoostingRegressor
- Ridge
- RandomForestRegressor
- XGBregressor

# Model tuning

In [62]:
# GradientBoostingRegressor
gb_reg = ensemble.GradientBoostingRegressor(
    n_estimators = 300,
    max_depth = 5,
    criterion = 'mae',
    max_features='sqrt',
    min_samples_split = 50
)
parameters = {
    'min_samples_split':[50, 55, 60]
}
cv = GridSearchCV(estimator=gb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'min_samples_split': 50}

In [118]:
# # Ridge
# ridge_reg = linear_model.Ridge(
#     alpha = 1,
#     solver = 'sparse_cg'
# )
# parameters = {
#     'alpha':[0.98, 0.99, 1],
#     'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
# }
# cv = GridSearchCV(estimator=ridge_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

# cv.fit(X_train_selected, y_train)

# cv.best_params_

In [116]:
# RandomForestRegressor
rf_reg = ensemble.RandomForestRegressor(
    n_estimators = 400, 
    max_depth = 29,
    max_features = 'log2',
    min_samples_split = 4
)
parameters = {
    'min_samples_split':[3, 4, 5]
}
cv = GridSearchCV(estimator=rf_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'min_samples_split': 4}

In [93]:
# XGBregressor
xgb_reg = xgboost.XGBRegressor(
    booster = 'gbtree',
    eta = 0.1,
    max_depth = 5,
    gamma = 0,
    subsample = 0.5,
    sampling_method = 'uniform',
    alpha = 0
)
parameters = {
    'alpha':[0, 0.1],
    'lambda':[0.9, 1, 1.1]
}
cv = GridSearchCV(estimator = xgb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'alpha': 0, 'lambda': 1}

# Train the Models

In [119]:
# GradientBoostingRegressor
gb_reg = ensemble.GradientBoostingRegressor(
    n_estimators = 300,
    max_depth = 5,
    criterion = 'mae',
    max_features='sqrt',
    min_samples_split = 50
)

# RandomForestRegressor
rf_reg = ensemble.RandomForestRegressor(
    n_estimators = 400, 
    max_depth = 29,
    max_features = 'log2',
    min_samples_split = 4
)

# XGBRegressor
xgb_reg = xgboost.XGBRegressor(
    booster = 'gbtree',
    eta = 0.1,
    max_depth = 5,
    gamma = 0,
    subsample = 0.5,
    sampling_method = 'uniform',
    alpha = 0
)

# Train the models 
gb_reg.fit(X_train_selected, y_train)
rf_reg.fit(X_train_selected, y_train)
xgb_reg.fit(X_train_selected, y_train)

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, sampling_method='uniform',
             scale_pos_weight=1, subsample=0.5, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [123]:
np.exp(gb_reg.predict(X_test_selected))

array([122721.5235134 , 155107.3408331 , 186919.85893281, ...,
       165814.42878139, 127442.64653689, 251102.95302876])

In [124]:
np.exp(rf_reg.predict(X_test_selected))

array([119047.82130093, 150325.0985036 , 177208.31650051, ...,
       160009.96302516, 117302.4504959 , 242780.70441905])

In [125]:
np.exp(xgb_reg.predict(X_test_selected))

array([116909.46, 155811.14, 181061.53, ..., 153363.47, 125078.67,
       228245.86], dtype=float32)

# Model Ensemble

In [102]:
prediction = 

result = pd.DataFrame({'Id': test['Id'], 'SalePrice': prediction})
result.head()

In [151]:
# result.to_csv('result.csv',index = False)