# Data Preparation, Cleaning & Feature Engineering

In [None]:
import math
from math import sqrt

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from scipy.sparse import hstack
from scipy.stats import norm, skew
from sklearn.model_selection import GridSearchCV
# Render floats with three decimal places in pandas output.
pd.set_option('display.float_format', '{:.3f}'.format)

# Load Data 

In [39]:
# Read the raw train and test tables.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Separate the target and the row identifiers from the feature frames.
# `pop` returns the column and removes it from the frame in place.
Y_train = train_df.pop('SalePrice')
train_id = train_df.pop('Id')
test_id = test_df.pop('Id')

# Analyze & Clean Data 

## Check and fill missing data

In [40]:
# Remember how many rows came from each source so the stacked frame can be split again.
ntrain = train_df.shape[0]
ntest = test_df.shape[0]

# Stack train on top of test. ignore_index=True gives every row a unique label;
# without it the labels of both frames repeat and label-based writes would hit
# two rows at once.
all_data = pd.concat((train_df, test_df), ignore_index=True)

print("all_data size is : {}".format(all_data.shape))

# Percentage of missing values per column, keeping only columns that have any,
# sorted from most to least incomplete.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na[all_data_na > 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
# Call head() so the cell displays the table rather than the bound-method repr.
missing_data.head()

In [41]:
# Categorical columns filled with the literal category "None".
fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageFinish',
             'GarageQual', 'GarageCond', 'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual',
             'BsmtFinType2', 'BsmtFinType1', 'MasVnrType']
# Numeric columns filled with 0. MasVnrArea is a numeric area and belongs here:
# filling it with the string "None" would turn the column into mixed-type object data.
fill_zero = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
             'GarageCars', 'GarageArea', 'MasVnrArea']
# Kept for reference; this list is not used below (the two columns are handled explicitly).
fill_neighb = ['MSZoning', 'LotFrontage']
# Categorical columns filled with their most frequent value.
fill_mode = ['Exterior2nd', 'Exterior1st', 'Functional', 'SaleType', 'Electrical', 'KitchenQual']

# Rebind instead of dropping in place, and tolerate an already-dropped column,
# so the cell can be re-run safely.
all_data = all_data.drop(columns=['Utilities'], errors='ignore')

# Assign the filled columns back; an in-place fillna on a column selection is a
# chained write that is not guaranteed to reach the parent frame.
all_data[fill_none] = all_data[fill_none].fillna("None")
all_data[fill_zero] = all_data[fill_zero].fillna(0)

all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
# LotFrontage: use the median of the same Neighborhood.
all_data['LotFrontage'] = all_data.groupby("Neighborhood")['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in fill_mode:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Where GarageYrBlt is missing, fall back to the row's own YearBuilt.
# A boolean mask selects rows by position, so this stays correct even when row
# labels are not unique (as after a plain pd.concat), and it does not rely on
# DataFrame.set_value, which newer pandas no longer provides.
garage_year_missing = all_data['GarageYrBlt'].isnull()
all_data.loc[garage_year_missing, 'GarageYrBlt'] = all_data.loc[garage_year_missing, 'YearBuilt'].values

In [42]:
# Print the name of every column that still contains at least one missing value.
has_null = all_data.isnull().any()
for col in has_null.index[has_null]:
    print(col)


## Convert all data to numerical type

In [43]:
# Apply log1p to every non-object column whose skewness exceeds 0.75.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
feature_skewness = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = feature_skewness[feature_skewness > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

print(all_data.columns)

from sklearn.preprocessing import LabelEncoder
# Replace each object-dtype column with integer codes from its own LabelEncoder.
object_cols = [col for col in all_data.columns if all_data[col].dtype == 'object']
for col in object_cols:
    lbl = LabelEncoder()
    raw_values = list(all_data[col].values)
    all_data[col] = lbl.fit(raw_values).transform(raw_values)

# The target is modelled on the log1p scale.
Y_train = np.log1p(Y_train)

# Split the stacked frame back into train and test rows by position.
split_at = train_id.shape[0]
X_train = all_data[:split_at]
X_test = all_data[split_at:]

# Feature Selection

## Run Recursive Feature Elimination

In [44]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Lasso

# Recursive feature elimination with 10-fold cross-validation, dropping one
# feature per step and ranking feature subsets by negated mean squared error
# of a Lasso estimator.
rfecv = RFECV(
    estimator=Lasso(max_iter=3000),
    step=1,
    cv=10,
    scoring='neg_mean_squared_error',
).fit(X_train, Y_train)

# Keep the column names that the selector marked as supported.
features = X_train.columns[rfecv.support_]
n_selected = len(features)
print("selected {:d} features: ".format(n_selected))
print(features)

In [45]:
# Restrict both design matrices to the selected feature columns.
X_train, X_test = X_train[features], X_test[features]

# Parameter-search function for a single model

In [46]:
def Search_para(model, params, metric, X_data, Y_data=None, n_fold=5):
    """Grid-search ``params`` for one model and report its best cross-validated score.

    Parameters
    ----------
    model : estimator
        The model handed to ``GridSearchCV``.
    params : dict or list of dict
        Parameter grid to search over.
    metric : str or callable
        Scoring passed to ``GridSearchCV``. The best score is negated and
        square-rooted before being returned, so this must be a negated
        squared-error scorer such as ``'neg_mean_squared_error'``.
    X_data : array-like
        Input features.
    Y_data : array-like
        Target values. Required: only supervised search is implemented.
    n_fold : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` where ``best_params`` is the dictionary of
        best parameters and ``best_score`` is ``sqrt(-best_score_)`` of the search.

    Raises
    ------
    ValueError
        If ``Y_data`` is None, or (from ``sqrt``) if the best score is positive.
    """
    # Fail with a clear message instead of reaching the return statement with
    # unbound locals.
    if Y_data is None:
        raise ValueError("Search_para requires Y_data: only supervised search is implemented")

    searcher = GridSearchCV(model, params, cv=n_fold, scoring=metric, verbose=1)
    searcher.fit(X_data, Y_data)

    best_params = searcher.best_params_
    best_score = sqrt(-searcher.best_score_)
    return (best_params, best_score)

# Function to search parameters for each model and find the best model

In [47]:
def Search_Compare(model_list, model_collection, params_collection, metric, X_data, Y_data=None):
    """Run a parameter search for every candidate model and compare the results.

    Parameters
    ----------
    model_list : sequence of str
        Names of the models, used for reporting.
    model_collection : sequence
        Candidate models, in the same order as ``model_list``.
    params_collection : sequence of dict
        Parameter grids, one per model, in the same order.
    metric : str or callable
        Evaluation metric forwarded to ``Search_para``.
    X_data : array-like
        Input features.
    Y_data : array-like
        Target values, forwarded to ``Search_para``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model``, ``scores`` and ``param``,
        sorted by ascending score. The same table is also printed.

    Raises
    ------
    ValueError
        If the three collections do not have the same length.
    """
    # zip() would silently drop the unmatched tail, so check the lengths up front.
    if not (len(model_list) == len(model_collection) == len(params_collection)):
        raise ValueError(
            "model_list, model_collection and params_collection must have the same length"
        )

    # Best score and best parameters of each model, in input order.
    scores_list = []
    best_params_list = []
    for model_name, model, params in zip(model_list, model_collection, params_collection):
        best_params, best_score = Search_para(
            model, params, metric=metric, X_data=X_data, Y_data=Y_data)
        best_params_list.append(best_params)
        scores_list.append(best_score)

        print("model: " + model_name)
        print("Best parameter: {}\n Score: {:5f}".format(best_params, best_score))

    d = {'model': model_list, 'scores': scores_list, 'param': best_params_list}

    # Lowest score first.
    res_df = pd.DataFrame(data=d).sort_values(by=['scores'], ascending=True)
    print(res_df)
    return res_df

# Set the model name list, model collection, and parameter-grid collection

In [48]:
from sklearn import linear_model
from sklearn import ensemble

# Seed shared by the randomized ensemble estimators so repeated runs of the
# search produce the same scores.
RANDOM_STATE = 42

# The three lists below are parallel: entry i of each describes the same model.
model_names = ['Ridge Regression', 'LASSO Regression', 'Bagging', 'Random Forest', 'GradientBoosting']

models = [
    linear_model.Ridge(),
    linear_model.Lasso(),
    ensemble.BaggingRegressor(random_state=RANDOM_STATE),
    ensemble.RandomForestRegressor(random_state=RANDOM_STATE),
    ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE),
]

parameters = [
    # Ridge
    {'alpha': [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50]},
    # Lasso
    {'alpha': [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50]},
    # Bagging
    {'n_estimators': [300, 400, 500, 600, 700]},
    # Random forest
    {'n_estimators': [500, 600, 700], 'max_features': [5, 6, 8, 10, 15, 20, 25]},
    # Gradient boosting
    {'learning_rate': [1e-2, 1e-1], 'min_samples_leaf': [60, 80], 'n_estimators': [5000, 7000],
     'max_depth': list(range(3, 5, 1))},
]

# Guard against the parallel lists drifting apart when a model is added or removed.
assert len(model_names) == len(models) == len(parameters), "model lists are out of sync"

In [49]:
# Grid-search every candidate model on the training data, scored by negated MSE.
Search_Compare(
    model_names,
    models,
    parameters,
    metric='neg_mean_squared_error',
    X_data=X_train,
    Y_data=Y_train,
)

# Predict and Output Result

In [50]:
# Final model: gradient boosting with fixed hyperparameters, seeded so the
# submission file can be reproduced.
final_regressor = ensemble.GradientBoostingRegressor(
    learning_rate=0.01,
    max_depth=3,
    min_samples_leaf=60,
    n_estimators=5000,
    random_state=42,
)
final_regressor.fit(X_train, Y_train)

# Y_train was log1p-transformed, so expm1 maps predictions back to the price scale.
preds = np.expm1(final_regressor.predict(X_test))

# Name the key column "Id", matching the identifier column of the input files,
# and fix the column order explicitly.
solution = pd.DataFrame({"Id": test_id, "SalePrice": preds}, columns=["Id", "SalePrice"])
solution.to_csv("boost_sol.csv", index=False)