In [1]:
import numpy as np
import pandas as pd
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Import Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.drop('SalePrice', axis = 1)
X_test = test
y_train = train['SalePrice']

## Combine train and test data

In [3]:
# 合并一下train和test，两个一起处理，之后再分开
X = X_train.append(X_test, ignore_index = True)
# Drop id
X.drop('Id', axis = 1, inplace = True)

print(X_train.shape, X_test.shape, X.shape)
X.head()

(1460, 80) (1459, 80) (2919, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
# 0 - 1459 是train
# 1460 开始是test

## Numerical columns and Categorical columns

In [5]:
# Numerical Columns
num_col = X.columns[(X.dtypes != 'object').values.tolist()].tolist()
cat_col = X.columns[(X.dtypes == 'object').values.tolist()].tolist()

In [6]:
# Check if we catch every column
print('num_col:', len(num_col))
print('cat_col:', len(cat_col))
print('X:', len(X.columns))

num_col: 36
cat_col: 43
X: 79


# Data preprocessing

## Missingness

In [7]:
missing = X.isnull().sum(axis = 0)
missing = (missing[missing != 0])/X.shape[0]
pd.DataFrame(missing, columns = ['Missing Percent']).sort_values('Missing Percent', ascending = False)

Unnamed: 0,Missing Percent
PoolQC,0.996574
MiscFeature,0.964029
Alley,0.932169
Fence,0.804385
FireplaceQu,0.486468
LotFrontage,0.166495
GarageFinish,0.054471
GarageQual,0.054471
GarageCond,0.054471
GarageYrBlt,0.054471


### Impute missingness Type 1 (miss是因为没有）

In [8]:
# Alley
X['Alley'].fillna('No_Alley', inplace = True)

# Bsmt相关
Bsmt_col = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for i in Bsmt_col:
    X[i].fillna('No_Bsmt', inplace = True)

# Garage相关
Garage_col = ['GarageType','GarageFinish', 'GarageQual', 'GarageCond']
for k in Garage_col:
    X[k].fillna('No_Garage', inplace = True)
    
Garage_col2 = ['GarageYrBlt','GarageCars', 'GarageArea']
for a in Garage_col2:
    X[a].fillna(0, inplace = True)

# FireplaceQu
X['FireplaceQu'].fillna('No_Fireplace', inplace = True)

# PoolQC
X['PoolQC'].fillna('No_Pool', inplace = True)

# Fence
X['Fence'].fillna('No_Fence', inplace = True)

# MiscFeature
X['MiscFeature'].fillna('No_MiscFeature', inplace = True)

In [9]:
# Check missingness again
missing = X.isnull().sum(axis = 0)
missing = (missing[missing != 0])/X.shape[0]
pd.DataFrame(missing, columns = ['Missing Percent']).sort_values('Missing Percent', ascending = False)

Unnamed: 0,Missing Percent
LotFrontage,0.166495
MasVnrType,0.008222
MasVnrArea,0.007879
MSZoning,0.00137
Utilities,0.000685
Functional,0.000685
BsmtHalfBath,0.000685
BsmtFullBath,0.000685
Electrical,0.000343
KitchenQual,0.000343


### Impute missingness Type 2 (median)

In [10]:
missing_col = missing.index.tolist()
num_missing_col = []
for i in missing_col:
    if i in num_col:
        num_missing_col.append(i)
num_missing_col

['LotFrontage',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath']

In [11]:
for i in num_missing_col:
    X[i] = X.groupby(['Neighborhood'])[i].apply(lambda x: x.fillna(x.median()))

### Impute missingness Type 3 (mode)

In [12]:
missing_col = missing.index.tolist()
cat_missing_col = []
for i in missing_col:
    if i in cat_col:
        cat_missing_col.append(i)
cat_missing_col

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Electrical',
 'KitchenQual',
 'Functional',
 'SaleType']

In [13]:
for i in cat_missing_col:
    X[i] = X.groupby(['Neighborhood'])[i].apply(lambda x:x.fillna(x.value_counts().index[0]))

### Check missingness again

In [14]:
X.isnull().sum().sum()

0

In [15]:
# OK everything clear!

## Nominal / Ordinal

### Onehot

In [16]:
onehot_col = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','Neighborhood',
              'Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
              'Exterior2nd','MasVnrType','Foundation','Heating','CentralAir','Electrical','GarageType',
              'MiscFeature','SaleType','SaleCondition']

In [17]:
X = pd.get_dummies(X, columns = onehot_col)

In [18]:
X.head()

Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,Gtl,7,5,2003,2003,196.0,Gd,TA,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,Gtl,6,8,1976,1976,0.0,TA,TA,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,Gtl,7,5,2001,2002,162.0,Gd,TA,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,Gtl,7,5,1915,1970,0.0,TA,TA,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,Gtl,8,5,2000,2000,350.0,Gd,TA,...,0,0,0,1,0,0,0,0,1,0


### Order

In [19]:
X['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0}, inplace = True)

X['BsmtExposure'].replace({'Gd':4, 'Av':3, 'Mn':2, 'No':1, 'No_Bsmt':0}, inplace = True)

X['Functional'].replace({'Typ':7, 'Min1':6, 'Min2':5, 'Mod':4, 'Maj1':3, 
                        'Maj2':2, 'Sev':1, 'Sal':0}, inplace = True)

X['FireplaceQu'].replace({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Fireplace':0}, inplace = True)

X['GarageFinish'].replace({'Fin':3, 'RFn':2, 'Unf':1, 'No_Garage':0}, inplace = True)

X['PavedDrive'].replace({'Y':2, 'P':1, 'N':0}, inplace = True)

X['PoolQC'].replace({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Pool':0}, inplace = True)

X['Fence'].replace({'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'No_Fence':0}, inplace = True)

replace_dict1 = {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1, 'Po':0}
type1_col = ['ExterQual', 'ExterCond', 'HeatingQC','KitchenQual']
for i in type1_col:
    X[i].replace(replace_dict1, inplace = True)

replace_dict2 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Bsmt':0}
type2_col = ['BsmtQual', 'BsmtCond']
for i in type2_col:
    X[i].replace(replace_dict2, inplace = True)

replace_dict3 = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No_Bsmt':0}
type3_col = ['BsmtFinType1','BsmtFinType2']
for i in type3_col:
    X[i].replace(replace_dict3, inplace = True)

replace_dict4 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'No_Garage':0}
type4_col = ['GarageQual','GarageCond']
for i in type4_col:
    X[i].replace(replace_dict4, inplace = True)

In [20]:
# Check if we convert everything to numerical values
(X.dtypes).loc[(X.dtypes == object).tolist()]


Series([], dtype: object)

In [21]:
# Everything clear!

# Feature Engineering

In [22]:
print(X.shape)
X.head()

(2919, 244)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,0,7,5,2003,2003,196.0,3,2,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,0,6,8,1976,1976,0.0,2,2,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,0,7,5,2001,2002,162.0,3,2,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,0,7,5,1915,1970,0.0,2,2,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,0,8,5,2000,2000,350.0,3,2,...,0,0,0,1,0,0,0,0,1,0


### Generate new features

In [23]:
################## 所有关于时间的 ##################
X['Sell_from_Build'] = X['YrSold'] - X['YearBuilt']
X['Sell_from_Remod'] = X['YrSold'] - X['YearRemodAdd']
X['Sell_from_Garage'] = X['YrSold'] - X['GarageYrBlt']
X['Remod_from_Build'] = X['YearRemodAdd'] - X['YearBuilt']
X['Garage_from_Build'] = X['GarageYrBlt'] - X['YearBuilt']
# 注意那个GarageYrBlt没有Garage的就给我弄成0了！！

# Avoid negative number for ages
for i in ['Sell_from_Build','Sell_from_Remod','Sell_from_Garage','Remod_from_Build','Garage_from_Build']:
    X[i] = X[i].map(lambda x: 0 if x < 0 else x)

# Update num_col
num_col += ['Sell_from_Build','Sell_from_Remod','Sell_from_Garage','Remod_from_Build','Garage_from_Build']

In [24]:
################## 所有关于Bath的 ##################
X['BsmtBath'] = X['BsmtFullBath'] + 0.5 * X['BsmtHalfBath']
X['Bath'] = X['FullBath'] + 0.5 * X['HalfBath']
X['Total Bath'] = X['BsmtBath'] + X['Bath']
# Update num_col
num_col += ['BsmtBath', 'Bath']

In [25]:
################## 所有Cond+Qual ##################
X['Overall'] = X['OverallCond'] + X['OverallQual']
X['Exter'] = X['ExterCond'] + X['ExterQual']
X['Bsmt'] = X['BsmtCond'] + X['BsmtQual']
X['Garage'] = X['GarageQual'] + X['GarageCond']

# Update num_col
num_col += ['Overall', 'Exter', 'Bsmt', 'Garage']

In [26]:
################## 其他一些可以加起来的 ##################
X['TotalSF']=X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
X['Total_sqr_footage'] = (X['BsmtFinSF1'] + X['BsmtFinSF2'] +
                                 X['1stFlrSF'] + X['2ndFlrSF'])
X['Total_porch_sf'] = (X['OpenPorchSF'] + X['3SsnPorch'] +
                              X['EnclosedPorch'] + X['ScreenPorch'] +
                              X['WoodDeckSF'])
X['haspool'] = X['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
X['has2ndfloor'] = X['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
X['hasgarage'] = X['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
X['hasbsmt'] = X['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
X['hasfireplace'] = X['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
num_col += ['TotalSF', 'Total_sqr_footage', 'Total Bath', 'Total_porch_sf']

In [27]:
print(X.shape)
X.head()

(2919, 264)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,Bsmt,Garage,TotalSF,Total_sqr_footage,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,65.0,8450,0,7,5,2003,2003,196.0,3,2,...,7,6,2566.0,2416.0,61,0,1,1,1,0
1,80.0,9600,0,6,8,1976,1976,0.0,2,2,...,7,6,2524.0,2240.0,298,0,0,1,1,1
2,68.0,11250,0,7,5,2001,2002,162.0,3,2,...,7,6,2706.0,2272.0,42,0,1,1,1,1
3,60.0,9550,0,7,5,1915,1970,0.0,2,2,...,7,6,2473.0,1933.0,307,0,1,1,1,1
4,84.0,14260,0,8,5,2000,2000,350.0,3,2,...,7,6,3343.0,2853.0,276,0,1,1,1,1


### Skewness 

In [28]:
num_col.remove('MSSubClass')

In [29]:
col_name = []
col_skew = []
for i in num_col:
    col_name.append(i)
    col_skew.append(X[i].skew())
skew_df = pd.DataFrame({'Column':col_name, 'Skewness':col_skew}).sort_values('Skewness', ascending = False)
skew_df.head()

Unnamed: 0,Column,Skewness
32,MiscVal,21.95848
31,PoolArea,16.907017
1,LotArea,12.829025
13,LowQualFinSF,12.094977
29,3SsnPorch,11.381914


In [30]:
skewed_features = skew_df[abs(skew_df['Skewness']) >= 0.5]['Column'].tolist()

In [31]:
X[skewed_features] = np.log1p(X[skewed_features])

### RobustScaler

In [32]:
from sklearn import preprocessing
sd = preprocessing.RobustScaler()

In [33]:
X[num_col] = sd.fit_transform(X[num_col])

In [34]:
print(X.shape)
X.head()

(2919, 264)


Unnamed: 0,LotFrontage,LotArea,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,Bsmt,Garage,TotalSF,Total_sqr_footage,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace
0,-0.257516,-0.256995,0,0.5,0.0,0.628146,0.25641,1.034716,3,2,...,0.0,0.0,0.116926,0.571388,-0.576688,0,1,1,1,0
1,0.464671,0.035356,0,0.0,2.630317,0.063243,-0.435897,0.0,2,2,...,0.0,0.0,0.075936,0.410124,0.350262,0,0,1,1,1
2,-0.100761,0.398758,0,0.5,0.0,0.586563,0.230769,0.997612,3,2,...,0.0,0.0,0.248871,0.440366,-0.792286,0,1,1,1,1
3,-0.535329,0.023391,0,0.5,0.0,-1.241957,-0.589744,0.0,2,2,...,0.0,0.0,0.025236,0.095868,0.367734,0,1,1,1,1
4,0.634651,0.941998,0,1.0,0.0,0.565756,0.179487,1.147836,3,2,...,0.0,0.0,0.773956,0.925878,0.305233,0,1,1,1,1


### Delete skewed columns

In [35]:
to_remove = X[num_col].skew()[(abs(X[num_col].skew()) > 1)].index.tolist()

In [36]:
to_remove

['LotFrontage',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'LowQualFinSF',
 'BsmtHalfBath',
 'KitchenAbvGr',
 'GarageYrBlt',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'Garage_from_Build',
 'Overall',
 'Bsmt',
 'Garage',
 'Total_porch_sf']

In [37]:
X.drop(to_remove, axis = 1, inplace = True)

### Delete correlated columns

In [38]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  42


In [39]:
X.drop(labels = corr_features, axis = 1, inplace = True)

## Back to Train and Test

In [40]:
X_train = X[:1460]
X_test = X[1460:]
y_train = np.log(y_train)

In [41]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(2919, 204)
(1460, 204)
(1459, 204)


### Drop outliers

In [42]:
X_train.drop([1298, 523], inplace = True)
y_train.drop([1298, 523], inplace = True)

# Feature Selection

In [43]:
import xgboost
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer
def neg_rmse(y_true, y_pred):
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))


scorer = make_scorer(neg_rmse)
xgb_reg = xgboost.XGBRegressor()
selector = RFECV(xgb_reg, cv = 5, n_jobs = -1, scoring = scorer)
selector = selector.fit(X_train, y_train)



selected_feat = X_train.columns.values[selector.support_]
print('There are {} selected features'.format(selector.n_features_))
selected_feat

There are 41 selected features


array(['LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', '1stFlrSF',
       'GrLivArea', 'BedroomAbvGr', 'KitchenQual', 'Functional',
       'Fireplaces', 'GarageFinish', 'GarageCars', 'GarageQual',
       'MSSubClass_30', 'MSSubClass_90', 'MSZoning_C (all)',
       'MSZoning_FV', 'Alley_Pave', 'LotConfig_CulDSac',
       'Neighborhood_BrkSide', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', 'Neighborhood_MeadowV',
       'Neighborhood_OldTown', 'Neighborhood_StoneBr',
       'Condition1_Artery', 'Condition1_Feedr', 'Condition1_Norm',
       'Condition1_PosA', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace',
       'Heating_Grav', 'CentralAir_N', 'SaleType_New',
       'SaleCondition_Abnorml', 'SaleCondition_Family', 'Total Bath'],
      dtype=object)

## Get X and y

In [44]:
y_train = np.log(train['SalePrice'])
y_train.drop([1298, 523], inplace = True)

In [45]:
X_train_selected = X_train[selected_feat]
X_test_selected = X_test[selected_feat]

In [46]:
########################## 到这边应该都没有问题了 ###########################

# Model Selection

In [47]:
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
import xgboost

def neg_rmse(y_true, y_pred):
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))


MLA = [
    #Linear Model
    linear_model.Lasso(),
    linear_model.Ridge(
    alpha=0.98, 
    solver='sparse_cg'
),
    linear_model.ElasticNet(),

    #Kernel Ridge
    KernelRidge(),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor(
),
    ensemble.GradientBoostingRegressor(
    n_estimators=200,
    max_depth = 3,
    min_samples_split = 3,
    max_features = 'auto',
    criterion = 'mse'
    
),
    
    #XGB
    xgboost.XGBRegressor(
    eta = 0.1,
    max_depth = 3,
    gamma = 0,
    alpha = 0,
    tree_method = 'auto' 
)
]
scorer = make_scorer(neg_rmse)
df_columns = ['Name', 'Parameters', 'CV score mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = (cross_val_score(model, X_train_selected, y_train, cv = 5, scoring=scorer)).mean()
    df.loc[i,:] = [model_name, model_parameters, cv_results]
    

df.sort_values('CV score mean', ascending = False)

Unnamed: 0,Name,Parameters,CV score mean
1,Ridge,"{'alpha': 0.98, 'copy_X': True, 'fit_intercept...",-0.117052
6,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",-0.120737
7,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-0.122524
5,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",-0.132538
4,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",-0.200309
0,Lasso,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399471
2,ElasticNet,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399471
3,KernelRidge,"{'alpha': 1, 'coef0': 1, 'degree': 3, 'gamma':...",-0.759288


- Ridge
- GradientBoosting Regressor
- XGBregressor
- RandomForestRegressor

# Model tuning

In [48]:
# Ridge
ridge_reg = linear_model.Ridge(
    alpha=0.98, 
    solver='sparse_cg'
)
parameters = {
    'alpha':[0.9, 0.95, 1, 0.96, 0.97, 0.94, 0.98],
    'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}
cv = GridSearchCV(estimator=ridge_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'alpha': 1, 'solver': 'lsqr'}

In [49]:
# GradientBoostingRegressor
gb_reg = ensemble.GradientBoostingRegressor(
    n_estimators=200,
    max_depth = 3,
    min_samples_split = 3,
    max_features = 'auto',
    criterion = 'mse'
    
)
parameters = {
    'criterion':['mse', 'mae']
}
cv = GridSearchCV(estimator=gb_reg, param_grid=parameters, scoring=scorer, n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'criterion': 'mse'}

In [50]:
# # RandomForestRegressor
# rf_reg = ensemble.RandomForestRegressor(
#      n_estimators = 400, 
#      max_depth = 45,
#      max_features = 'sqrt',
#      min_samples_split = 3
# )
# parameters = {
#     'min_samples_split':[2,3,4]
# }
# cv = GridSearchCV(estimator=rf_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

# cv.fit(X_train_selected, y_train)

# cv.best_params_

In [79]:
# XGBregressor
xgb_reg = xgboost.XGBRegressor(
    eta = 0.1,
    max_depth = 3,
    gamma = 0,
    alpha = 0,
    tree_method = 'auto' 
)
parameters = {
    'process_type':['default', 'update']
}
cv = GridSearchCV(estimator = xgb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_train_selected, y_train)

cv.best_params_

{'process_type': 'default'}

# Train the Models

In [176]:
from sklearn import ensemble
from sklearn import linear_model
import xgboost

xgb_reg = xgboost.XGBRegressor(
    eta = 0.1,
    max_depth = 3,
    gamma = 0,
    alpha = 0,
    tree_method = 'auto' 
)

ridge_reg = linear_model.Ridge(
    alpha=0.98, 
    solver='sparse_cg'
)

gb_reg = ensemble.GradientBoostingRegressor(
    n_estimators=200,
    max_depth = 3,
    min_samples_split = 3,
    max_features = 'auto',
    criterion = 'mse'
)


# Train the models 
gb_reg.fit(X_train_selected, y_train)
ridge_reg.fit(X_train_selected, y_train)
xgb_reg.fit(X_train_selected, y_train)

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='auto', validate_parameters=1, verbosity=None)

# Model Ensemble

In [177]:
gb_pred = gb_reg.predict(X_train_selected)
ridge_pred = ridge_reg.predict(X_train_selected)
xgb_pred = xgb_reg.predict(X_train_selected)

In [178]:
new_X_train = pd.DataFrame({'gb':gb_pred, 'ridge':ridge_pred, 'xgb':xgb_pred})
new_X_train

Unnamed: 0,gb,ridge,xgb
0,12.223661,12.240627,12.227237
1,12.057255,12.120899,12.042456
2,12.247426,12.327717,12.286811
3,11.967368,12.178712,12.070547
4,12.579206,12.570559,12.612628
...,...,...,...
1453,12.087070,12.044300,12.029243
1454,12.189763,12.326564,12.231056
1455,12.497206,12.425185,12.411146
1456,11.852970,11.881718,11.836453


In [179]:
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score
import xgboost

def neg_rmse(y_true, y_pred):
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))


MLA = [
    #Linear Model
    linear_model.Lasso(),
    linear_model.Ridge(),
    linear_model.ElasticNet(),

    #Kernel Ridge
    KernelRidge(alpha=0.01),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    
    #XGB
    xgboost.XGBRegressor()
]
scorer = make_scorer(neg_rmse)
df_columns = ['Name', 'Parameters', 'CV score mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = (cross_val_score(model, new_X_train, y_train, cv = 5, scoring=scorer)).mean()
    df.loc[i,:] = [model_name, model_parameters, cv_results]
    

df.sort_values('CV score mean', ascending = False)

Unnamed: 0,Name,Parameters,CV score mean
3,KernelRidge,"{'alpha': 0.01, 'coef0': 1, 'degree': 3, 'gamm...",-0.0718899
1,Ridge,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.0744481
6,GradientBoostingRegressor,"{'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': ...",-0.0768481
5,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",-0.0806734
7,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-0.085367
4,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",-0.105341
0,Lasso,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399471
2,ElasticNet,"{'alpha': 1.0, 'copy_X': True, 'fit_intercept'...",-0.399471


In [180]:
from sklearn import linear_model
ridge = KernelRidge()

In [181]:
ridge.fit(new_X_train, y_train)

KernelRidge()

In [182]:
gb_pred = gb_reg.predict(X_test_selected)
ridge_pred = ridge_reg.predict(X_test_selected)
xgb_pred = xgb_reg.predict(X_test_selected)
new_X_test = pd.DataFrame({'gb':gb_pred, 'ridge':ridge_pred, 'xgb':xgb_pred})
new_X_test

Unnamed: 0,gb,ridge,xgb
0,11.714760,11.601386,11.691289
1,11.960043,11.989370,11.955441
2,12.133566,12.098096,12.060892
3,12.133768,12.157754,12.094581
4,12.134151,12.229529,12.197532
...,...,...,...
1454,11.203699,11.306467,11.289469
1455,11.338167,11.273463,11.363145
1456,11.962986,11.948905,11.915127
1457,11.693084,11.718857,11.746433


In [183]:
ridge.predict(new_X_test)

array([11.72071721, 11.9585032 , 12.13489086, ..., 11.96338609,
       11.69226016, 12.3652939 ])

In [184]:
prediction = np.exp(ridge.predict(new_X_test))
prediction

array([123095.6794611 , 156139.2010135 , 186258.51656481, ...,
       156903.47585616, 119642.11150359, 234519.47196243])

In [115]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def neg_rmse(y_true, y_pred):
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))
scorer = make_scorer(neg_rmse)

ridge_reg = KernelRidge(
    alpha=0.01,
    
)
parameters = {
    'kernel':['RBF', 'linear', 'polynominal', 'sigmoid', 'laplacian', ''],
    
}    
cv = GridSearchCV(estimator = ridge_reg, param_grid=parameters, scoring=scorer, n_jobs = -1, cv = 6)

cv.fit(new_X_train, y_train)

cv.best_params_

{'kernel': 'linear'}

# Make Predictions

In [203]:
prediction = gb_reg.predict(X_test_selected)
prediction 

array([116164.2420624 , 158018.97442903, 179653.82831848, ...,
       153859.27553661, 122824.16963526, 222834.86852385])

In [204]:
# prediction = np.exp(ridge_reg.predict(X_test_selected))

result = pd.DataFrame({'Id': test['Id'], 'SalePrice': prediction})
result.head()

Unnamed: 0,Id,SalePrice
0,1461,116164.242062
1,1462,158018.974429
2,1463,179653.828318
3,1464,185769.024418
4,1465,196862.385703


In [205]:
result.to_csv('result.csv',index = False)