In [261]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import ensemble
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.decomposition import PCA

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [283]:
train = pd.read_csv('train.csv')
train.drop('Id', axis = 1, inplace = True)

X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']


In [284]:
def prepare_X(X, y):
    #########################去掉Outlier###############################
    
    outliers = [1298, 495, 249, 313, 335, 706, 185]
    X.drop(outliers, inplace = True)
    y = y.drop(outliers)
    
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF',
             'OpenPorchSF','LotFrontage','MSSubClass', 'LotShape','LandContour','LandSlope','Condition1','BldgType',
             'OverallCond','RoofStyle', 'ExterCond', 'BsmtQual','BsmtCond', 'BsmtFinSF2', 'BsmtUnfSF','HeatingQC',
             'Electrical', 'LowQualFinSF', 'Functional','GarageType', 'GarageFinish','GarageCond'
    ]
    X_prepared = X[cols] 
    
    #########################填补Missingness###############################
    X_prepared['BsmtFinSF2'].fillna(0, inplace = True)
    
    num_col = ['LotFrontage', 'MasVnrArea']
    
    for num in num_col:
        X_prepared[num] = X.groupby(['Neighborhood'])[num].apply(lambda x: x.fillna(x.median()))
    
    cat_col = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    
    for cat in cat_col:
        X_prepared[cat] = X.groupby(['Neighborhood'])[cat].apply(lambda x:x.fillna(x.value_counts().index[0]))
    
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].fillna('No_Basement')
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].fillna('No_Basement')
    
    #########################处理一下Column############################
    
    ########Categorical######
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    Qual_dict2 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po':0}
    Qual_dict3 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No_Basement':0}
    Qual_dict4 = {'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Basement':0}
    Qual_dict5 = {'Fin':2, 'RFn':1, 'Unf':0}
    
    # GarageCond
    X_prepared['GarageCond'] = X_prepared['GarageCond'].replace(Qual_dict2)
    # GarageQual
    X_prepared['GarageQual'] = X_prepared['GarageQual'].replace(Qual_dict2)
    # GarageFinish
    X_prepared['GarageFinish'] = X_prepared['GarageFinish'].replace(Qual_dict5)
    # HeatingQC
    X_prepared['HeatingQC'] = X_prepared['HeatingQC'].replace(Qual_dict2)
    # BsmtCond
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].replace(Qual_dict4)
    # BsmtQual
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].replace(Qual_dict3)
    # ExterCond
    X_prepared['ExterCond'] = X_prepared['ExterCond'].replace(Qual_dict2)
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # LandSlope
    X_prepared['LandSlope'] = X_prepared['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0})
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                'PavedDrive', 'SaleType','MSSubClass', 'LotShape', 'LandContour','Condition1','BldgType',
                'RoofStyle','Electrical', 'Functional','GarageType'
                    ]
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    
    ########Numerical########
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF', 'TotalBsmtSF', 'OpenPorchSF', 'LotFrontage', 
                     'MasVnrArea', 'BsmtFinSF2']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
        
    #########################新feature################################
    
    X['has_2ndsf'] = (X['2ndFlrSF'] != 0).astype(np.int)
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    X_prepared['Overall'] = X_prepared['OverallCond'] + X_prepared['OverallQual']
    X_prepared['has_MasVnr'] = (X_prepared['MasVnrArea'] == 0).astype(np.int)
    X_prepared['Exter'] = X_prepared['ExterCond'] + X_prepared['ExterQual']
    X_prepared['Bsmt'] = X_prepared['BsmtCond'] + X_prepared['BsmtQual']
    X_prepared['has_BsmtFinSF2'] = (X_prepared['BsmtFinSF2'] != 0).astype(np.int)
    X_prepared['has_lowqualitysf'] = (X_prepared['LowQualFinSF'] != 0).astype(np.int)
    X_prepared['Garage'] = X_prepared['GarageQual'] + X_prepared['GarageCond']
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt', 'PoolArea', 'LowQualFinSF']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    return X_prepared, y


In [285]:
X_prepared, y_prepared = prepare_X(X, y)
X_prepared.head()


Unnamed: 0,OverallQual,ExterQual,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,TotalBsmtSF,...,has_pool,has_remodel,BsmtBath,Overall,has_MasVnr,Exter,Bsmt,has_BsmtFinSF2,has_lowqualitysf,Garage
0,7,2,6.753438,1710,1,0,2,8,0,6.753438,...,1,0,1.0,12,0,4,6,0,0,4
1,6,1,7.141245,1262,0,1,1,6,1,7.141245,...,1,0,0.5,14,1,3,6,0,0,4
2,7,2,6.82546,1786,1,0,2,6,1,6.82546,...,1,1,1.0,12,0,4,6,0,0,4
3,7,1,6.869014,1717,1,0,2,7,1,6.629363,...,1,1,1.0,12,1,3,6,0,0,4
4,8,2,7.044033,2198,1,0,2,9,1,7.044033,...,1,0,1.0,13,0,4,6,0,0,4


In [286]:
rf_reg = ensemble.RandomForestRegressor(
    warm_start=True,
    ccp_alpha=0.1,
    criterion='mae',
    max_depth = 50,
    n_estimators = 100
)
rf_reg.fit(X_prepared, y_prepared)

RandomForestRegressor(ccp_alpha=0.1, criterion='mae', max_depth=50,
                      warm_start=True)

In [277]:
(cross_val_score(rf_reg, X_prepared, y_prepared, cv = 5)).mean()

0.8637218735607772

In [287]:
mean_squared_error(rf_reg.predict(X_prepared), y_prepared)


122194016.62394887

In [288]:
# Check feature importances
feature_importance = pd.DataFrame({'Name': X_prepared.columns, 'Importance': rf_reg.feature_importances_}).sort_values('Importance'
                                                                                    ,ascending = False)
feature_importance.head()

Unnamed: 0,Name,Importance
0,OverallQual,0.363757
3,GrLivArea,0.110018
9,TotalBsmtSF,0.062856
10,GarageArea,0.048251
12,BsmtFinSF1,0.036745


In [272]:
#Search for parameters
rf_reg1 = ensemble.RandomForestRegressor(
    warm_start=True,
    ccp_alpha=0.1,
    criterion='mae',
    max_depth = 50,
    n_estimators = 100
)
parameters = {
    'max_depth':[45, 50, 55],
    'n_estimators':[100, 200, 400]
}
cv = GridSearchCV(estimator=rf_reg1, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

cv.fit(X_prepared, y_prepared)

cv.best_params_

{'max_depth': 50, 'n_estimators': 100}

In [None]:
    max_features = 'sqrt', 
    n_estimators = 400,
    max_depth = 100

# Model Selection

In [12]:
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn import ensemble

In [13]:
MLA = [
    #Linear Model
    linear_model.LinearRegression(),
    
    #svm
    svm.SVR(),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor()
]

df_columns = ['Name', 'Parameters', 'Train RMSE Mean', 'Test RMSE Mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = cross_validate(model, X_prepared, y_prepared, cv = 5, return_train_score=True)
    train_accuracy = cv_results['train_score'].mean()
    test_accuracy = cv_results['test_score'].mean()
    df.loc[i,:] = [model_name, model_parameters, train_accuracy, test_accuracy]

In [14]:
df.sort_values('Test RMSE Mean', ascending = False)

Unnamed: 0,Name,Parameters,Train RMSE Mean,Test RMSE Mean
3,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.980025,0.867163
0,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.880025,0.839545
2,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",0.999995,0.744716
1,SVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",-0.0486991,-0.0505531


# Prediction

In [396]:
test = pd.read_csv('test.csv')


In [397]:
def prepare_test(X):
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF',
             'OpenPorchSF','LotFrontage','MSSubClass', 'LotShape','LandContour','LandSlope','Condition1','BldgType',
             'OverallCond','RoofStyle', 'ExterCond', 'BsmtQual','BsmtCond', 'BsmtFinSF2', 'BsmtUnfSF','HeatingQC',
             'Electrical', 'LowQualFinSF', 'Functional','GarageType', 'GarageFinish','GarageCond','GarageQual',
             'MasVnrArea'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Missingness############################
    
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].fillna('No_Basement')
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].fillna('No_Basement')
    
    # Categorical的用mode， 根据 Neighborhood
    # Numerical的用median， 根据 Neighborhood
    X_prepared['GarageFinish'] = X.groupby(['Neighborhood'])['GarageFinish'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    X_prepared['GarageCond'] = X.groupby(['Neighborhood'])['GarageCond'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    X_prepared['GarageQual'] = X.groupby(['Neighborhood'])['GarageQual'].apply(lambda x:x.fillna(x.value_counts().index[0]))
    
    mode_col = ['BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'TotalBsmtSF']
    median_col = ['GarageArea', 'BsmtFinSF1', 'LotFrontage', 'MasVnrArea', 'BsmtFinSF2', 'BsmtUnfSF' 
                     ]
                     #'GarageFinish', 'MasVnrArea']

    for i in mode_col:
        X_prepared[i] = X.groupby(['Neighborhood'])[i].apply(lambda x:x.fillna(x.value_counts().index[0]))
    for k in median_col:
        X_prepared[k] = X.groupby(['Neighborhood'])[i].apply(lambda x: x.fillna(x.median()))
    
    #########################处理一下Column############################
    
    ########Categorical######
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    Qual_dict2 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po':0}
    Qual_dict3 = {'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'No_Basement':0}
    Qual_dict4 = {'Gd':4, 'TA': 3, 'Fa': 2, 'Po': 1, 'No_Basement':0}
    Qual_dict5 = {'Fin':2, 'RFn':1, 'Unf':0}
    
    # GarageCond
    X_prepared['GarageCond'] = X_prepared['GarageCond'].replace(Qual_dict2)
    # GarageQual
    X_prepared['GarageQual'] = X_prepared['GarageQual'].replace(Qual_dict2)
    # GarageFinish
    X_prepared['GarageFinish'] = X_prepared['GarageFinish'].replace(Qual_dict5)
    # HeatingQC
    X_prepared['HeatingQC'] = X_prepared['HeatingQC'].replace(Qual_dict2)
    # BsmtCond
    X_prepared['BsmtCond'] = X_prepared['BsmtCond'].replace(Qual_dict4)
    # BsmtQual
    X_prepared['BsmtQual'] = X_prepared['BsmtQual'].replace(Qual_dict3)
    # ExterCond
    X_prepared['ExterCond'] = X_prepared['ExterCond'].replace(Qual_dict2)
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # LandSlope
    X_prepared['LandSlope'] = X_prepared['LandSlope'].replace({'Sev':2, 'Mod':1, 'Gtl':0})
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                'PavedDrive', 'SaleType','MSSubClass', 'LotShape', 'LandContour','Condition1','BldgType',
                'RoofStyle','Electrical', 'Functional','GarageType'
                    ]
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    
    ########Numerical########
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF', 'TotalBsmtSF', 'OpenPorchSF', 'LotFrontage', 
                     'MasVnrArea', 'BsmtFinSF2']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
        
    #########################新feature################################
    
    X['has_2ndsf'] = (X['2ndFlrSF'] != 0).astype(np.int)
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    X_prepared['Overall'] = X_prepared['OverallCond'] + X_prepared['OverallQual']
    X_prepared['has_MasVnr'] = (X_prepared['MasVnrArea'] == 0).astype(np.int)
    X_prepared['Exter'] = X_prepared['ExterCond'] + X_prepared['ExterQual']
    X_prepared['Bsmt'] = X_prepared['BsmtCond'] + X_prepared['BsmtQual']
    X_prepared['has_BsmtFinSF2'] = (X_prepared['BsmtFinSF2'] != 0).astype(np.int)
    X_prepared['has_lowqualitysf'] = (X_prepared['LowQualFinSF'] != 0).astype(np.int)
    X_prepared['Garage'] = X_prepared['GarageQual'] + X_prepared['GarageCond']
    
    #########################不要的column扔了############################
    
    to_drop = ['YearBuilt', 'PoolArea', 'LowQualFinSF']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    
    #########################test中没有的类型填一下############################
    
    miss_col = [
        'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other'
    ]
    for c in miss_col:
        X_prepared[c] = 0
    return X_prepared

In [398]:
test_prepared = prepare_test(test)


In [399]:
for i in test_prepared.columns:
    if test_prepared[i].isnull().sum() > 0:
        print(i)

In [400]:
predictions = rf_reg.predict(test_prepared)
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})


In [401]:
result.head()


Unnamed: 0,Id,SalePrice
0,1461,114198.28
1,1462,143735.77
2,1463,160085.32
3,1464,173835.82
4,1465,245838.53


In [21]:
# result.to_csv('result.csv',index = False)


In [390]:
set(X_prepared.columns) - set(test_prepared.columns)

{'Electrical_Mix'}