In [363]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import ensemble
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [422]:
train = pd.read_csv('train.csv')
train.drop('Id', axis = 1, inplace = True)

X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']


- FullBath不行
- MasVnrArea 似乎也不可以


In [423]:
def prepare_X(X, y):
    #########################去掉Outlier###############################
    
    outliers = [1298, 495]
    X.drop(outliers, inplace = True)
    y = y.drop(outliers)
    
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Column############################
    
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace({'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0})
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace({'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0})
    
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                    'PavedDrive', 'SaleType']
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    
    #########################新feature################################
   
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    X_prepared['Total_Porch'] = X_prepared['OpenPorchSF'] + X_prepared['EnclosedPorch'] + (
                                X_prepared['3SsnPorch']+ X_prepared['ScreenPorch'])
    X_prepared['has_porch'] = (X_prepared['Total_Porch'] == 0).astype(np.int)
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt', 'PoolArea']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    return X_prepared, y


In [424]:
X_prepared, y_prepared = prepare_X(X, y)
X_prepared.head()


Unnamed: 0,OverallQual,ExterQual,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Bath,has_pool,has_remodel,BsmtBath,Total_Porch,has_porch
0,7,2,856,1710,1,0,2,8,0,856,...,0,0,0,1,2.5,1,0,1.0,61,0
1,6,1,1262,1262,0,1,1,6,1,1262,...,0,0,0,1,2.0,1,0,0.5,0,1
2,7,2,920,1786,1,0,2,6,1,920,...,0,0,0,1,2.5,1,1,1.0,42,0
3,7,1,961,1717,1,0,2,7,1,756,...,0,0,0,1,1.0,1,1,1.0,307,0
4,8,2,1145,2198,1,0,2,9,1,1145,...,0,0,0,1,2.5,1,0,1.0,84,0


In [425]:
rf_reg = ensemble.RandomForestRegressor(max_features = 'sqrt', n_estimators = 400)
rf_reg.fit(X_prepared, y_prepared)


RandomForestRegressor(max_features='sqrt', n_estimators=400)

In [426]:
rf_reg.score(X_prepared, y_prepared)


0.982118840569148

In [427]:
mean_squared_error(rf_reg.predict(X_prepared), y_prepared)


112660285.87620212

In [428]:
# Check feature importances
feature_importance = pd.DataFrame({'Name': X_prepared.columns, 'Importance': rf_reg.feature_importances_}).sort_values('Importance'
                                                                                    ,ascending = False)
feature_importance.head()

Unnamed: 0,Name,Importance
0,OverallQual,0.11288
3,GrLivArea,0.09764
9,TotalBsmtSF,0.079218
1,ExterQual,0.072492
10,GarageArea,0.067396


In [429]:
# Search for parameters
rf_reg1 = ensemble.RandomForestRegressor(max_features = 'sqrt', n_estimators = 400)
parameters = {
    'n_estimators': [300, 400, 500]
}
cv = GridSearchCV(estimator=rf_reg1, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

In [430]:
cv.fit(X_prepared, y_prepared)

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_features='sqrt',
                                             n_estimators=400),
             n_jobs=-1, param_grid={'n_estimators': [300, 400, 500]},
             scoring='r2')

In [431]:
cv.best_params_

{'n_estimators': 500}

# Model Selection

In [432]:
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn import ensemble

In [433]:
MLA = [
    #Linear Model
    linear_model.LinearRegression(),
    
    #svm
    svm.SVR(),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor()
]

df_columns = ['Name', 'Parameters', 'Train RMSE Mean', 'Test RMSE Mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = cross_validate(model, X_prepared, y_prepared, cv = 5, return_train_score=True)
    train_accuracy = cv_results['train_score'].mean()
    test_accuracy = cv_results['test_score'].mean()
    df.loc[i,:] = [model_name, model_parameters, train_accuracy, test_accuracy]

In [434]:
df.sort_values('Test RMSE Mean', ascending = False)

Unnamed: 0,Name,Parameters,Train RMSE Mean,Test RMSE Mean
3,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.980213,0.860833
2,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",0.999993,0.729276
1,SVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",-0.0495093,-0.0513542
0,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.877708,-298853000000.0


# Prediction

In [414]:
test = pd.read_csv('test.csv')


In [415]:
def prepare_test(X):
    #########################选择合适的features#########################
    
    cols = [
        'OverallQual',
        'ExterQual',
        'CentralAir',
        '1stFlrSF',
        'GrLivArea',
        'BsmtFullBath',
        'BsmtHalfBath',
        'KitchenQual',
        'TotRmsAbvGrd',
        'Fireplaces',
        'PavedDrive',
        'TotalBsmtSF',
        'GarageArea',
        'YearRemodAdd',
        'YearBuilt',
        'BsmtFinSF1',
        '2ndFlrSF',
        'FullBath',
        'HalfBath',
        'BedroomAbvGr',
        'KitchenAbvGr',
        'PoolArea',
        'MSZoning',
        'Neighborhood',
        'Exterior1st',
        'Exterior2nd',
        'Foundation'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Column############################
    # Missingness
    mode_col = ['BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'TotalBsmtSF']
    median_col = ['GarageArea', 'BsmtFinSF1']

    for i in mode_col:
        X_prepared[i] = X_prepared[i].fillna((X_prepared[i].mode())[0])
    for k in median_col:
        X_prepared[k] = X_prepared[k].fillna(X_prepared[k].median())
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace({'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1})
    # CentralAir
    X_prepared['CentralAir'] = X_prepared['CentralAir'].replace({'Y':1,'N': 0})
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace({'Ex':4, 'Gd': 3, 'TA': 2, 'Fa': 1})
    # PavedDrive
    X_prepared['PavedDrive'] = X_prepared['PavedDrive'].replace({'Y':2, 'P':1, 'N': 0})    
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation']
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    #########################新feature################################
   
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    
    #########################test中没有的类型填一下############################
    
    miss_col = ['3SsnPorch',
                'CentralAir_N',
                'CentralAir_Y',
                'EnclosedPorch',
                'OpenPorchSF',
                'PavedDrive_N',
                'PavedDrive_P',
                'PavedDrive_Y',
                'SaleType_COD',
                'SaleType_CWD',
                'SaleType_Con',
                'SaleType_ConLD',
                'SaleType_ConLI',
                'SaleType_ConLw',
                'SaleType_New',
                'SaleType_Oth',
                'SaleType_WD',
                'ScreenPorch',
                'Total_Porch',
                'has_porch'
               ]
    for c in miss_col:
        X_prepared[c] = 0
    return X_prepared

In [416]:
test_prepared = prepare_test(test)


In [417]:
test_prepared

Unnamed: 0,OverallQual,ExterQual,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,...,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,ScreenPorch,Total_Porch,has_porch
0,5,2,1,896,896,0.0,0.0,2,5,0,...,0,0,0,0,0,0,0,0,0,0
1,6,2,1,1329,1329,0.0,0.0,3,6,0,...,0,0,0,0,0,0,0,0,0,0
2,5,2,1,928,1629,0.0,0.0,2,6,1,...,0,0,0,0,0,0,0,0,0,0
3,6,2,1,926,1604,0.0,0.0,3,7,1,...,0,0,0,0,0,0,0,0,0,0
4,8,3,1,1280,1280,0.0,0.0,3,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,4,2,1,546,1092,0.0,0.0,2,5,0,...,0,0,0,0,0,0,0,0,0,0
1455,4,2,1,546,1092,0.0,0.0,2,6,0,...,0,0,0,0,0,0,0,0,0,0
1456,5,2,1,1224,1224,1.0,0.0,2,7,1,...,0,0,0,0,0,0,0,0,0,0
1457,5,2,1,970,970,0.0,1.0,2,6,0,...,0,0,0,0,0,0,0,0,0,0


In [418]:
predictions = rf_reg.predict(test_prepared)
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})


In [419]:
result.head()


Unnamed: 0,Id,SalePrice
0,1461,142874.286667
1,1462,146251.615
2,1463,141466.5175
3,1464,144443.15
4,1465,183810.015


In [413]:
result.to_csv('result.csv',index = False)


In [406]:
set(X_prepared.columns) - set(test_prepared.columns)

{'3SsnPorch',
 'CentralAir_N',
 'CentralAir_Y',
 'EnclosedPorch',
 'OpenPorchSF',
 'PavedDrive_N',
 'PavedDrive_P',
 'PavedDrive_Y',
 'SaleType_COD',
 'SaleType_CWD',
 'SaleType_Con',
 'SaleType_ConLD',
 'SaleType_ConLI',
 'SaleType_ConLw',
 'SaleType_New',
 'SaleType_Oth',
 'SaleType_WD',
 'ScreenPorch',
 'Total_Porch',
 'has_porch'}