In [105]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import ensemble
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.decomposition import PCA

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [120]:
train = pd.read_csv('train.csv')
train.drop('Id', axis = 1, inplace = True)

X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']


In [121]:
def prepare_X(X, y):
    #########################去掉Outlier###############################
    
    outliers = [1298, 495, 249, 313, 335, 706]
    X.drop(outliers, inplace = True)
    y = y.drop(outliers)
    
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Column############################
    
    ########Categorical######
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                    'PavedDrive', 'SaleType']
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    
    ########Numerical########
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF', 'TotalBsmtSF']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
        
    #########################新feature################################
   
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt', 'PoolArea']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    return X_prepared, y


In [122]:
X_prepared, y_prepared = prepare_X(X, y)
X_prepared.head()


Unnamed: 0,OverallQual,ExterQual,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,TotalBsmtSF,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Bath,has_pool,has_remodel,BsmtBath
0,7,2,6.753438,1710,1,0,2,8,0,6.753438,...,0,0,0,0,0,1,2.5,1,0,1.0
1,6,1,7.141245,1262,0,1,1,6,1,7.141245,...,0,0,0,0,0,1,2.0,1,0,0.5
2,7,2,6.82546,1786,1,0,2,6,1,6.82546,...,0,0,0,0,0,1,2.5,1,1,1.0
3,7,1,6.869014,1717,1,0,2,7,1,6.629363,...,0,0,0,0,0,1,1.0,1,1,1.0
4,8,2,7.044033,2198,1,0,2,9,1,7.044033,...,0,0,0,0,0,1,2.5,1,0,1.0


In [123]:
rf_reg = ensemble.RandomForestRegressor(
    max_features = 'sqrt', 
    n_estimators = 400,
    max_depth = 100,
    warm_start = True,
    ccp_alpha = 0.1
)
rf_reg.fit(X_prepared, y_prepared)

RandomForestRegressor(ccp_alpha=0.1, max_depth=100, max_features='sqrt',
                      n_estimators=400, warm_start=True)

In [124]:
(cross_val_score(rf_reg, X_prepared, y_prepared, cv = 5)).mean()

0.8752102477981861

In [125]:
mean_squared_error(rf_reg.predict(X_prepared), y_prepared)


106587327.73110396

In [126]:
# Check feature importances
feature_importance = pd.DataFrame({'Name': X_prepared.columns, 'Importance': rf_reg.feature_importances_}).sort_values('Importance'
                                                                                    ,ascending = False)
feature_importance.head()

Unnamed: 0,Name,Importance
0,OverallQual,0.133561
3,GrLivArea,0.098977
9,TotalBsmtSF,0.078774
10,GarageArea,0.074706
6,KitchenQual,0.065393


In [9]:
# Search for parameters
rf_reg1 = ensemble.RandomForestRegressor(
    max_features = 'sqrt', 
    n_estimators = 400,
    max_depth = 100
)
parameters = {
    'warm_start': [True, False],
    'ccp_alpha':[0.001, 0.01, 0.1]
}
cv = GridSearchCV(estimator=rf_reg1, param_grid=parameters, scoring='r2', n_jobs = -1, cv = 5)

In [10]:
cv.fit(X_prepared, y_prepared)

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_depth=100, max_features='sqrt',
                                             n_estimators=400),
             n_jobs=-1,
             param_grid={'ccp_alpha': [0.001, 0.01, 0.1],
                         'warm_start': [True, False]},
             scoring='r2')

In [11]:
cv.best_params_

{'ccp_alpha': 0.1, 'warm_start': True}

# Model Selection

In [74]:
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn import ensemble

In [13]:
MLA = [
    #Linear Model
    linear_model.LinearRegression(),
    
    #svm
    svm.SVR(),
    
    #Tree
    tree.DecisionTreeRegressor(),
    
    #Ensemble
    ensemble.RandomForestRegressor()
]

df_columns = ['Name', 'Parameters', 'Train RMSE Mean', 'Test RMSE Mean']
df = pd.DataFrame(columns = df_columns)
from sklearn.model_selection import cross_validate
for i in np.arange(len(MLA)):
    model = MLA[i]
    model_name = model.__class__.__name__
    model_parameters = str(model.get_params())
    cv_results = cross_validate(model, X_prepared, y_prepared, cv = 5, return_train_score=True)
    train_accuracy = cv_results['train_score'].mean()
    test_accuracy = cv_results['test_score'].mean()
    df.loc[i,:] = [model_name, model_parameters, train_accuracy, test_accuracy]

In [14]:
df.sort_values('Test RMSE Mean', ascending = False)

Unnamed: 0,Name,Parameters,Train RMSE Mean,Test RMSE Mean
3,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.98119,0.861618
2,DecisionTreeRegressor,"{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_de...",0.999993,0.733548
1,SVR,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd...",-0.0495002,-0.0513448
0,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.876532,-93669900.0


# Prediction

In [114]:
test = pd.read_csv('test.csv')


In [115]:
def prepare_test(X):
    #########################选择合适的features#########################
    
    cols = [
             'OverallQual','ExterQual','CentralAir','1stFlrSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'KitchenQual','TotRmsAbvGrd','Fireplaces','PavedDrive','TotalBsmtSF','GarageArea','YearRemodAdd',
             'YearBuilt','BsmtFinSF1','2ndFlrSF','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'LotArea',
             'PoolArea','MSZoning','Neighborhood','Exterior1st','Exterior2nd','Foundation','SaleType', 'WoodDeckSF'
    ]
    X_prepared = X[cols] 
    
    #########################处理一下Missingness############################
    
    # Categorical的用mode， 根据 Neighborhood
    # Numerical的用median， 根据 Neighborhood
    mode_col = ['BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'TotalBsmtSF']
    median_col = ['GarageArea', 'BsmtFinSF1']

    for i in mode_col:
        X_prepared[i] = X.groupby(['Neighborhood'])[i].apply(lambda x:x.fillna(x.value_counts().index[0]))
    for k in median_col:
        X_prepared[k] = X.groupby(['Neighborhood'])[i].apply(lambda x: x.fillna(x.median()))
    
    #########################处理一下Column############################
    
    ###Categorical###
    
    Qual_dict = {'Ex':3, 'Gd': 2, 'TA': 1, 'Fa': 0}
    # ExterQual
    X_prepared['ExterQual'] = X_prepared['ExterQual'].replace(Qual_dict)
    # KitchenQual
    X_prepared['KitchenQual'] = X_prepared['KitchenQual'].replace(Qual_dict)
    # ONE HOT
    onehot_col = ['MSZoning', 'Neighborhood','Exterior1st','Exterior2nd','Foundation','CentralAir',
                    'PavedDrive', 'SaleType']
    X_prepared = pd.get_dummies(X_prepared, columns = onehot_col)
    
    ###Numerical###
    
    high_skew_col = ['LotArea', 'WoodDeckSF', '1stFlrSF']
    for col in high_skew_col:
        X_prepared[col] = np.log1p(X_prepared[col])
    
    #########################新feature################################
   
    X_prepared['Bath'] = X_prepared['FullBath'] + 0.5 * X_prepared['HalfBath']
    X_prepared['has_pool'] = (X_prepared['PoolArea'] == 0).astype(np.int)
    X_prepared['has_remodel'] = (X_prepared['YearRemodAdd'] != X_prepared['YearBuilt']).astype(np.int)
    X_prepared['BsmtBath'] = X_prepared['BsmtFullBath'] + 0.5 * X_prepared['BsmtHalfBath']
    
    #########################不要的column扔了############################
    to_drop = ['YearBuilt', 'PoolArea']
    X_prepared.drop(to_drop, axis = 1, inplace = True)
    
    #########################test中没有的类型填一下############################
    
    miss_col = [
        'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other'
    ]
    for c in miss_col:
        X_prepared[c] = 0
    return X_prepared

In [116]:
test_prepared = prepare_test(test)


In [117]:
test_prepared

Unnamed: 0,OverallQual,ExterQual,1stFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,KitchenQual,TotRmsAbvGrd,Fireplaces,TotalBsmtSF,...,SaleType_New,SaleType_Oth,SaleType_WD,Bath,has_pool,has_remodel,BsmtBath,Exterior1st_ImStucc,Exterior1st_Stone,Exterior2nd_Other
0,5,1,6.799056,896,0.0,0.0,1,5,0,882.0,...,0,0,1,1.0,1,0,0.0,0,0,0
1,6,1,7.192934,1329,0.0,0.0,2,6,0,1329.0,...,0,0,1,1.5,1,0,0.0,0,0,0
2,5,1,6.834109,1629,0.0,0.0,1,6,1,928.0,...,0,0,1,2.5,1,1,0.0,0,0,0
3,6,1,6.831954,1604,0.0,0.0,2,7,1,926.0,...,0,0,1,2.5,1,0,0.0,0,0,0
4,8,2,7.155396,1280,0.0,0.0,2,5,0,1280.0,...,0,0,1,2.0,1,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,4,1,6.304449,1092,0.0,0.0,1,5,0,546.0,...,0,0,1,1.5,1,0,0.0,0,0,0
1455,4,1,6.304449,1092,0.0,0.0,1,6,0,546.0,...,0,0,1,1.5,1,0,0.0,0,0,0
1456,5,1,7.110696,1224,1.0,0.0,1,7,1,1224.0,...,0,0,1,1.0,1,1,1.0,0,0,0
1457,5,1,6.878326,970,0.0,1.0,1,6,0,912.0,...,0,0,1,1.0,1,0,0.5,0,0,0


In [118]:
predictions = rf_reg.predict(test_prepared)
result = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})


In [119]:
result.head()


Unnamed: 0,Id,SalePrice
0,1461,128566.87
1,1462,177706.2575
2,1463,176045.62
3,1464,185941.585
4,1465,216514.795


In [21]:
# result.to_csv('result.csv',index = False)


In [22]:
set(X_prepared.columns) - set(test_prepared.columns)

set()