In [129]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import ensemble
from sklearn import feature_selection
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score

import xgboost

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [130]:
train = pd.read_csv('train.csv')
train.drop('Id', axis = 1, inplace = True)

X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']


In [131]:
# Log Transform 'SalePrice', and then transform back after prediction
y = np.log(y)

In [132]:
def prepare_X(X, y):
    #########################去掉Outlier###############################
    
    outliers = [1298]
    X.drop(outliers, inplace = True)
    y = y.drop(outliers)
    
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF',
             'OpenPorchSF','LotFrontage','MSSubClass', 'LotShape','LandContour','LandSlope','Condition1','BldgType',
             'OverallCond','RoofStyle', 'ExterCond', 'BsmtQual','BsmtCond', 'BsmtFinSF2', 'BsmtUnfSF','HeatingQC',
             'Electrical', 'LowQualFinSF', 'Functional','GarageType', 'GarageFinish','GarageCond', 'YrSold'
    ]
    X_prepared = X[cols] 
    
    #########################填补Missingness###############################
    X_prepared['BsmtFinSF2'].fillna(0, inplace = True)
    
    num_col = ['LotFrontage', 'MasVnrArea']
    
    for num in num_col:
        X_prepared[num] = X.groupby(['Neighborhood'])[num].apply(lambda x: x.fillna(x.median()))
    
    cat_col = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    
    for cat in cat_col:
        X_prepared[cat] = X.groupby(['Neighborhood'])[cat].apply(lambda x:x.fillna(x.value_counts().index[0]))
    
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].fillna('No_Basement')
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].fillna('No_Basement')
    
    #########################处理一下Column############################
    
    ########Categorical######
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    Qual_dict2 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po':0}
    Qual_dict3 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No_Basement':0}
    Qual_dict4 = {'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Basement':0}
    Qual_dict5 = {'Fin':2, 'RFn':1, 'Unf':0}
    
    # GarageCond
    X_prepared['GarageCond'] = X_prepared['GarageCond'].replace(Qual_dict2)
    # GarageQual
    X_prepared['GarageQual'] = X_prepared['GarageQual'].replace(Qual_dict2)
    # GarageFinish
    X_prepared['GarageFinish'] = X_prepared['GarageFinish'].replace(Qual_dict5)
    # HeatingQC
    X_prepared['HeatingQC'] = X_prepared['HeatingQC'].replace(Qual_dict2)
    # BsmtCond
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].replace(Qual_dict4)
    # BsmtQual
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].replace(Qual_dict3)
    # ExterCond
    X_prepared['ExterCond'] = X_prepared['ExterCond'].replace(Qual_dict2)
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # LandSlope
    X_prepared['LandSlope'] = X_prepared['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0})
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                'PavedDrive', 'SaleType','MSSubClass', 'LotShape', 'LandContour','Condition1','BldgType',
                'RoofStyle','Electrical', 'Functional','GarageType'
                    ]
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col, drop_first=True)
    
    ########Numerical########
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF', 'TotalBsmtSF', 'OpenPorchSF', 'LotFrontage', 
                     'MasVnrArea', 'BsmtFinSF2']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
        
    #########################新feature################################
    
    X_prepared['Year_to_Sold'] = X_prepared['YrSold'] - X_prepared['YearBuilt']
    X_prepared['has_2ndsf'] = (X_prepared['2ndFlrSF'] != 0).astype(np.int)
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    X_prepared['Overall'] = X_prepared['OverallCond'] + X_prepared['OverallQual']
    X_prepared['has_MasVnr'] = (X_prepared['MasVnrArea'] == 0).astype(np.int)
    X_prepared['Exter'] = X_prepared['ExterCond'] + X_prepared['ExterQual']
    X_prepared['Bsmt'] = X_prepared['BsmtCond'] + X_prepared['BsmtQual']
    X_prepared['has_BsmtFinSF2'] = (X_prepared['BsmtFinSF2'] != 0).astype(np.int)
    X_prepared['has_lowqualitysf'] = (X_prepared['LowQualFinSF'] != 0).astype(np.int)
    X_prepared['Garage'] = X_prepared['GarageQual'] + X_prepared['GarageCond']
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt', 'PoolArea', 'LowQualFinSF', 'YrSold']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    return X_prepared, y


In [133]:
X_prepared, y_prepared = prepare_X(X, y)
X_prepared.head()


Unnamed: 0,OverallQual,ExterQual,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,TotalBsmtSF,...,has_pool,has_remodel,BsmtBath,Overall,has_MasVnr,Exter,Bsmt,has_BsmtFinSF2,has_lowqualitysf,Garage
0,7,2,6.753438,1710,1,0,2,8,0,6.753438,...,1,0,1.0,12,0,4,6,0,0,4
1,6,1,7.141245,1262,0,1,1,6,1,7.141245,...,1,0,0.5,14,1,3,6,0,0,4
2,7,2,6.82546,1786,1,0,2,6,1,6.82546,...,1,1,1.0,12,0,4,6,0,0,4
3,7,1,6.869014,1717,1,0,2,7,1,6.629363,...,1,1,1.0,12,1,3,6,0,0,4
4,8,2,7.044033,2198,1,0,2,9,1,7.044033,...,1,0,1.0,13,0,4,6,0,0,4


In [134]:
# Feature Selection
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
def neg_rmse(estimator, X, y):
    y_true = y
    y_pred = estimator.predict(X)
    return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))

negative_rmse = make_scorer(neg_rmse)

estimator = ensemble.GradientBoostingRegressor()
selector = feature_selection.RFECV(estimator, cv = 5, n_jobs = -1, scoring = neg_rmse)
selector = selector.fit(X_prepared, y_prepared)

print("The number of selected features is: {}".format(selector.n_features_))

features_keep = X_prepared.columns.values[selector.support_]

X_prepared = selector.transform(X_prepared)

The number of selected features is: 123


In [135]:
# 随便找一个试试
gb_reg = ensemble.GradientBoostingRegressor()
gb_reg.fit(X_prepared, y_prepared)

GradientBoostingRegressor()

In [136]:
cv_score = (cross_val_score(gb_reg, X_prepared, y_prepared, cv = 5)).mean()
mse = mean_squared_error(gb_reg.predict(X_prepared), y_prepared)

print('CV score is:', cv_score)
print('MSE is:', mse)
print('RMSE is:', mse ** 0.5)

CV score is: 0.8912044109407364
MSE is: 0.007240481637532624
RMSE is: 0.08509101972319184


In [137]:
# Check feature importances
feature_importance = pd.DataFrame({'Name': features_keep, 'Importance': gb_reg.feature_importances_}).sort_values('Importance'
                                                                                    ,ascending = False)
feature_importance.head()

Unnamed: 0,Name,Importance
0,OverallQual,0.398491
3,GrLivArea,0.151807
9,TotalBsmtSF,0.059513
10,GarageArea,0.045042
6,KitchenQual,0.044517


In [138]:
feature_importance

Unnamed: 0,Name,Importance
0,OverallQual,0.398491
3,GrLivArea,0.151807
9,TotalBsmtSF,0.059513
10,GarageArea,0.045042
6,KitchenQual,0.044517
...,...,...
70,PavedDrive_P,0.000000
97,Electrical_FuseP,0.000000
98,Electrical_Mix,0.000000
48,Neighborhood_NWAmes,0.000000


# Model Selection

In [139]:
# from sklearn import linear_model
# from sklearn import tree
# from sklearn import ensemble
# from sklearn.kernel_ridge import KernelRidge

# def neg_rmse(y_true, y_pred):
#     return (-1) * np.sqrt(np.mean((y_true - y_pred) ** 2))


# MLA = [
#     #Linear Model
#     linear_model.Lasso(),
#     linear_model.Ridge(),
#     linear_model.ElasticNet(),

#     #Kernel Ridge
#     KernelRidge(),
    
#     #Tree
#     tree.DecisionTreeRegressor(),
    
#     #Ensemble
#     ensemble.RandomForestRegressor(),
#     ensemble.GradientBoostingRegressor(),
    
#     #XGB
#     xgboost.XGBRegressor()
# ]
# scorer = make_scorer(neg_rmse)
# df_columns = ['Name', 'Parameters', 'CV score mean']
# df = pd.DataFrame(columns = df_columns)
# from sklearn.model_selection import cross_validate
# for i in np.arange(len(MLA)):
#     model = MLA[i]
#     model_name = model.__class__.__name__
#     model_parameters = str(model.get_params())
#     cv_results = (cross_val_score(model, X_prepared, y_prepared, cv = 5, scoring=scorer)).mean()
#     df.loc[i,:] = [model_name, model_parameters, cv_results]
    

# df.sort_values('CV score mean', ascending = False)


# Search for hyperparameters

## XGB

In [140]:
# #Search for parameters
# xgb_reg = xgboost.XGBRegressor(
#     eta = 0.2,
#     max_depth = 4
# )
# parameters = {
#     'subsample':[0.5, 0.8, 1]
# }
# cv = GridSearchCV(estimator=xgb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

# cv.fit(X_prepared, y_prepared)

# cv.best_params_

## Gradient Boosting Regressor

In [141]:
# #Search for parameters
# gb_reg = ensemble.GradientBoostingRegressor(
#     n_estimators=800,
#     learning_rate=0.05,
#     subsample=0.6,
#     criterion = 'mse'
# )
# parameters = {
#     'criterion':['friedman_mse', 'mse', 'mae']
# }
# cv = GridSearchCV(estimator=gb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

# cv.fit(X_prepared, y_prepared)

# cv.best_params_

## Random Forest Regressor

In [142]:
# #Search for parameters
# rf_reg = ensemble.RandomForestRegressor(
#     n_estimators=400,
#     ccp_alpha=0.01,
#     max_features='log2'
# )
# parameters = {
#     'max_features':['auto', 'sqrt', 'log2']
# }
# cv = GridSearchCV(estimator=gb_reg, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

# cv.fit(X_prepared, y_prepared)

# cv.best_params_

# Train the model

In [143]:
xgb_reg = xgboost.XGBRegressor(
    eta = 0.2,
    max_depth = 4
)
gb_reg = ensemble.GradientBoostingRegressor(
    n_estimators=800,
    learning_rate=0.05,
    subsample=0.6,
    criterion = 'mse'
)
rf_reg = ensemble.RandomForestRegressor(
    n_estimators=400,
    ccp_alpha=0.01,
    max_features='log2'
)

models = [xgb_reg, gb_reg, rf_reg]
for i in models:
    i.fit(X_prepared, y_prepared)

# Prediction

In [149]:
test = pd.read_csv('test.csv')


In [150]:
def prepare_test(X):
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF',
             'OpenPorchSF','LotFrontage','MSSubClass', 'LotShape','LandContour','LandSlope','Condition1','BldgType',
             'OverallCond','RoofStyle', 'ExterCond', 'BsmtQual','BsmtCond', 'BsmtFinSF2', 'BsmtUnfSF','HeatingQC',
             'Electrical', 'LowQualFinSF', 'Functional','GarageType', 'GarageFinish','GarageCond','GarageQual',
             'MasVnrArea','YrSold'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Missingness############################
    
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].fillna('No_Basement')
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].fillna('No_Basement')
    
    # Categorical的用mode， 根据 Neighborhood
    # Numerical的用median， 根据 Neighborhood
    X_prepared['GarageFinish'] = X.groupby(['Neighborhood'])['GarageFinish'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    X_prepared['GarageCond'] = X.groupby(['Neighborhood'])['GarageCond'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    X_prepared['GarageQual'] = X.groupby(['Neighborhood'])['GarageQual'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    
    mode_col = ['BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'TotalBsmtSF']
    median_col = ['GarageArea', 'BsmtFinSF1', 'LotFrontage', 'MasVnrArea', 'BsmtFinSF2', 'BsmtUnfSF' 
                     ]
                     #'GarageFinish', 'MasVnrArea']

    for i in mode_col:
        X_prepared[i] = X.groupby(['Neighborhood'])[i].apply(lambda x:x.fillna(x.value_counts().index[0]))
    for k in median_col:
        X_prepared[k] = X.groupby(['Neighborhood'])[i].apply(lambda x: x.fillna(x.median()))
    
    #########################处理一下Column############################
    
    ########Categorical######
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    Qual_dict2 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po':0}
    Qual_dict3 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No_Basement':0}
    Qual_dict4 = {'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Basement':0}
    Qual_dict5 = {'Fin':2, 'RFn':1, 'Unf':0}
    
    # GarageCond
    X_prepared['GarageCond'] = X_prepared['GarageCond'].replace(Qual_dict2)
    # GarageQual
    X_prepared['GarageQual'] = X_prepared['GarageQual'].replace(Qual_dict2)
    # GarageFinish
    X_prepared['GarageFinish'] = X_prepared['GarageFinish'].replace(Qual_dict5)
    # HeatingQC
    X_prepared['HeatingQC'] = X_prepared['HeatingQC'].replace(Qual_dict2)
    # BsmtCond
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].replace(Qual_dict4)
    # BsmtQual
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].replace(Qual_dict3)
    # ExterCond
    X_prepared['ExterCond'] = X_prepared['ExterCond'].replace(Qual_dict2)
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # LandSlope
    X_prepared['LandSlope'] = X_prepared['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0})
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                'PavedDrive', 'SaleType','MSSubClass', 'LotShape', 'LandContour','Condition1','BldgType',
                'RoofStyle','Electrical', 'Functional','GarageType'
                    ]
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col, drop_first=True)
    
    ########Numerical########
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF', 'TotalBsmtSF', 'OpenPorchSF', 'LotFrontage', 
                     'MasVnrArea', 'BsmtFinSF2']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
        
    #########################新feature################################
    
    X_prepared['Year_to_Sold'] = X_prepared['YrSold'] - X_prepared['YearBuilt']
    X_prepared['has_2ndsf'] = (X_prepared['2ndFlrSF'] != 0).astype(np.int)
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    X_prepared['Overall'] = X_prepared['OverallCond'] + X_prepared['OverallQual']
    X_prepared['has_MasVnr'] = (X_prepared['MasVnrArea'] == 0).astype(np.int)
    X_prepared['Exter'] = X_prepared['ExterCond'] + X_prepared['ExterQual']
    X_prepared['Bsmt'] = X_prepared['BsmtCond'] + X_prepared['BsmtQual']
    X_prepared['has_BsmtFinSF2'] = (X_prepared['BsmtFinSF2'] != 0).astype(np.int)
    X_prepared['has_lowqualitysf'] = (X_prepared['LowQualFinSF'] != 0).astype(np.int)
    X_prepared['Garage'] = X_prepared['GarageQual'] + X_prepared['GarageCond']
    
    #########################不要的column扔了############################
    
    to_drop = ['YearBuilt', 'PoolArea', 'LowQualFinSF', 'YrSold']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    
    #########################test中没有的类型填一下############################
    
    miss_col = [
        'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other'
    ]
    for c in miss_col:
        X_prepared[c] = 0
    return X_prepared

In [152]:
test_prepared = prepare_test(test)
test_prepared = selector.transform(test_prepared)

In [153]:
# for i in test_prepared.columns:
#     if test_prepared[i].isnull().sum() > 0:
#         print(i)

In [154]:
gb_prediction = np.exp(gb_reg.predict(test_prepared))
gb_prediction

array([126821.69926451, 146721.08458383, 138609.91920603, ...,
       164049.0349844 , 119768.66414387, 164965.1741383 ])

In [155]:
rf_prediction = np.exp(rf_reg.predict(test_prepared))
rf_prediction

array([135416.76925428, 145794.11233214, 157289.81003322, ...,
       144782.97379355, 141021.47136318, 169411.01498228])

In [156]:
xgb_prediction = np.exp(xgb_reg.predict(test_prepared))
xgb_prediction

array([132978.17, 166191.81, 166631.89, ..., 152824.72, 127909.35,
       198247.83], dtype=float32)

In [157]:
predictions = (gb_prediction + rf_prediction + xgb_prediction)/3
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})


In [158]:
result.head()


Unnamed: 0,Id,SalePrice
0,1461,131738.880131
1,1462,152902.336472
2,1463,154177.206621
3,1464,161715.980558
4,1465,181905.339708


In [None]:
# result.to_csv('result.csv',index = False)


In [None]:
# set(X_prepared.columns) - set(test_prepared.columns)