# Modules
This file contains helper functions that may be used for data manipulation, transformation, and feature engineering.

In [1]:
def load_data(filepath):
    """Read the CSV at ``filepath`` and return its contents as a pandas DataFrame."""
    # Imported locally, like the other helpers in this file.
    from pandas import read_csv

    return read_csv(filepath)

In [2]:
def split_variable_features(data):
    """Separate the target column from the feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'SalePrice'`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``'SalePrice'`` column and ``X`` is a
        new frame holding every other column. ``data`` itself is not modified.

    Raises
    ------
    KeyError
        If ``data`` has no ``'SalePrice'`` column.
    """
    y = data['SalePrice']
    # Name the axis through ``columns=``: passing it positionally
    # (``drop(label, 1)``) was deprecated in pandas 1.3 and removed in 2.0.
    X = data.drop(columns='SalePrice')
    return (X, y)

In [3]:
def fill_NA(houses):
    """Drop two columns and fill missing values in a fixed set of columns.

    Steps, in order:

    1. Drop the ``'MiscFeature'`` and ``'GarageCars'`` columns.
    2. Fill missing values with the string ``'None'`` in the "none" columns.
    3. Fill missing values with the per-column mode in the "mode" columns.
    4. Fill missing values with ``0.0`` in the "zero" columns.

    Columns not named in any of the lists are left untouched. No rows are
    removed and the ``'Id'`` column is not handled here.

    Parameters
    ----------
    houses : pandas.DataFrame
        Frame containing every column named in the lists below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``houses``.
    """
    feature_fillNA_drop = ['MiscFeature', 'GarageCars']
    # ``columns=`` instead of a positional axis: ``drop(labels, 1)`` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    houses = houses.drop(columns=feature_fillNA_drop)

    # NOTE(review): 'GarageYrBlt' receives the string 'None' and
    # 'Exterior1st'/'Exterior2nd' receive 0.0 below. The groupings are kept
    # exactly as they were; confirm they match the intended column types.
    feature_fillNA_with_none = ['PoolQC','Alley','Fence','FireplaceQu','GarageCond','GarageType','GarageYrBlt','GarageFinish',
                                'GarageQual','BsmtQual','BsmtCond','BsmtFinType1','BsmtFinType2','BsmtExposure','MasVnrType',
                                'Street','LotShape','LandContour','KitchenQual','Functional']
    feature_fillNA_with_mode = ['LotFrontage','MSZoning','Electrical','MSSubClass','Utilities','SaleType']
    feature_fillNA_with_zero = ['MasVnrArea','LotArea','Exterior1st','Exterior2nd','PoolArea','BsmtFullBath','BsmtHalfBath',
                                'GarageArea','BsmtFinSF2','BsmtFinSF1','TotalBsmtSF','BsmtUnfSF']

    houses[feature_fillNA_with_none] = houses[feature_fillNA_with_none].fillna('None')

    # ``mode()`` returns one row per tied mode; row 0 is used as the fill value
    # for each column.
    mode_values = houses[feature_fillNA_with_mode].mode().loc[0]
    houses[feature_fillNA_with_mode] = houses[feature_fillNA_with_mode].fillna(mode_values)

    houses[feature_fillNA_with_zero] = houses[feature_fillNA_with_zero].fillna(0.0)
    return houses

In [4]:
def redefine_category_numeric(data):
    """Recode selected columns between numeric and categorical representations.

    Two replacement passes are applied and a new frame is returned:

    * ``'MSSubClass'``, ``'MoSold'`` and ``'YrSold'`` have their integer codes
      replaced by string labels.
    * The quality/condition style columns listed below have their string
      grades replaced by integer ranks, with ``"None"`` mapped to 0.

    Values that do not appear in a column's mapping are left as they are.
    """
    # --- numeric codes -> string labels ------------------------------------
    subclass_codes = (20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190)
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    sale_years = (2006, 2007, 2008, 2009, 2010)

    to_labels = {
        "MSSubClass": {code: "SC{}".format(code) for code in subclass_codes},
        "MoSold": {number: name for number, name in enumerate(month_names, start=1)},
        "YrSold": {year: str(year) for year in sale_years},
    }
    data = data.replace(to_labels)

    # --- string grades -> ordered integer ranks ----------------------------
    def _ranked(*grades):
        # Rank grades by their position, starting at 0 for the first one.
        return {grade: rank for rank, grade in enumerate(grades)}

    quality_scale = _ranked("None", "Po", "Fa", "TA", "Gd", "Ex")
    finish_type_scale = _ranked("None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ")

    to_ranks = {}
    for column in ("ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
                   "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond"):
        to_ranks[column] = dict(quality_scale)
    for column in ("BsmtFinType1", "BsmtFinType2"):
        to_ranks[column] = dict(finish_type_scale)
    to_ranks["Functional"] = _ranked("None", "Sal", "Sev", "Maj2", "Maj1", "Mod",
                                     "Min2", "Min1", "Typ")
    to_ranks["GarageFinish"] = _ranked("None", "Unf", "RFn", "Fin")
    # PoolQC uses its own scale: it has no "Po" grade.
    to_ranks["PoolQC"] = _ranked("None", "Fa", "TA", "Gd", "Ex")

    # 'Street', 'Alley', 'BsmtExposure', 'PavedDrive' and 'Fence' are
    # deliberately not recoded here.
    return data.replace(to_ranks)

In [5]:
def add_new_features(houses):
    """Build combined features and drop the columns they were derived from.

    New columns: ``Overall``, ``Pool``, ``Exter``, ``Kitchen``, ``Garage``,
    ``Fireplace``, ``Basement``, ``OpenAreaSF``, ``TotBath`` and ``TotSF``.
    The quality/condition inputs must already be numeric, since they are
    multiplied together.

    ``'BsmtUnfSF'``, ``'TotalBsmtSF'`` and ``'GrLivArea'`` are used but kept.

    Parameters
    ----------
    houses : pandas.DataFrame
        Frame containing every source column referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If a required source column is missing.
    """
    # Work on a copy so the caller's frame does not silently gain the new
    # columns while keeping the ones dropped from the returned frame.
    houses = houses.copy()

    houses['Overall'] = houses['OverallQual'] * houses['OverallCond'] * houses['Functional']
    houses['Pool'] = houses['PoolQC'] * houses['PoolArea']
    houses['Exter'] = houses['ExterQual'] * houses['ExterCond']
    houses['Kitchen'] = houses['KitchenAbvGr'] * houses['KitchenQual']
    houses['Garage'] = houses['GarageQual'] * houses['GarageCond'] * houses['GarageFinish'] * houses['GarageArea']
    houses['Fireplace'] = houses['Fireplaces'] * houses['FireplaceQu']

    # Finish-type weighted finished area plus unfinished area, as a share of
    # the total basement area, scaled by quality and condition.
    weighted_area = (houses['BsmtFinType1'] * houses['BsmtFinSF1']
                     + houses['BsmtFinType2'] * houses['BsmtFinSF2']
                     + houses['BsmtUnfSF'])
    houses['Basement'] = houses['BsmtQual'] * houses['BsmtCond'] * weighted_area / houses['TotalBsmtSF']
    # 0 / 0 (a row with zero total basement area and a zero numerator)
    # yields NaN; treat that as a score of 0.
    houses['Basement'] = houses['Basement'].fillna(0)

    houses['OpenAreaSF'] = (houses['WoodDeckSF'] + houses['OpenPorchSF'] + houses['EnclosedPorch']
                            + houses['3SsnPorch'] + houses['ScreenPorch'])
    houses['TotBath'] = (houses["BsmtFullBath"] + (0.5 * houses["BsmtHalfBath"])
                         + houses["FullBath"] + (0.5 * houses["HalfBath"]))
    houses['TotSF'] = houses['GrLivArea'] + houses['TotalBsmtSF']

    consumed_columns = [
        'OverallQual', 'OverallCond', 'Functional',
        'PoolQC', 'PoolArea',
        'ExterQual', 'ExterCond',
        'KitchenAbvGr', 'KitchenQual',
        'GarageQual', 'GarageCond', 'GarageFinish', 'GarageArea',
        'Fireplaces', 'FireplaceQu',
        'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    ]
    # ``columns=`` instead of a positional axis: ``drop(labels, 1)`` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    return houses.drop(columns=consumed_columns)

In [6]:
def log_transform_skew(data):
    """Apply log transforms to reduce skewness, rescale years, drop three columns.

    * ``log1p`` is applied to the columns in ``log_transform_features``.
    * ``log(x + offset)`` is applied to ``BsmtUnfSF``, ``TotalBsmtSF``,
      ``Garage``, ``Basement``, ``OpenAreaSF`` (offsets 500 or 1000) and
      ``Overall`` (offset 300).
    * ``YearBuilt`` and ``YearRemodAdd`` are divided by 1000.
    * ``LowQualFinSF``, ``MiscVal`` and ``Pool`` are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column referenced below.

    Returns
    -------
    pandas.DataFrame
        A new, transformed frame. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    import numpy as np

    # Work on a copy so the caller's frame is not transformed behind its back.
    data = data.copy()

    log_transform_features = ['LotFrontage','LotArea','MasVnrArea','1stFlrSF','2ndFlrSF','GrLivArea',
                              'Exter','Kitchen','Fireplace','Basement','TotSF']
    data[log_transform_features] = np.log1p(data[log_transform_features])

    data['BsmtUnfSF'] = np.log(data['BsmtUnfSF'] + 500)
    data['TotalBsmtSF'] = np.log(data['TotalBsmtSF'] + 1000)
    data['Garage'] = np.log(data['Garage'] + 500)
    # NOTE(review): 'Basement' is also in ``log_transform_features``, so it is
    # transformed twice (log1p, then log(x + 500)). Kept as-is to preserve the
    # fitted models' inputs; confirm whether this is intended.
    data['Basement'] = np.log(data['Basement'] + 500)
    data['OpenAreaSF'] = np.log(data['OpenAreaSF'] + 500)
    data['Overall'] = np.log(data['Overall'] + 300)

    data['YearBuilt'] = data['YearBuilt'] / 1000
    data['YearRemodAdd'] = data['YearRemodAdd'] / 1000

    # ``columns=`` instead of a positional axis: ``drop(labels, 1)`` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    return data.drop(columns=['LowQualFinSF', 'MiscVal', 'Pool'])

In [7]:
def standard_scaling_numeric_features(data):
    """Standardize a fixed set of numeric columns with ``StandardScaler``.

    The scaler is fitted on ``data`` itself and the scaled values are written
    back into the same columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``feature``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place.
    """
    from sklearn.preprocessing import StandardScaler
    # 'Basement' used to appear twice in this list, which made the scaler fit
    # and write the same column twice; each column is listed once now.
    feature = ['LotFrontage','LotArea','1stFlrSF','GrLivArea',
               'Exter','Kitchen','Basement','TotSF','BsmtUnfSF','TotalBsmtSF',
               'Garage','OpenAreaSF']
    stdSc = StandardScaler()
    data.loc[:, feature] = stdSc.fit_transform(data.loc[:, feature])
    return data

In [8]:
def robust_scaling_numeric_features(data):
    """Scale a fixed set of numeric columns with ``RobustScaler``.

    The scaler is fitted on ``data`` itself and the scaled values are written
    back into the same columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``feature``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place.
    """
    from sklearn.preprocessing import RobustScaler
    # 'Basement' used to appear twice in this list, which made the scaler fit
    # and write the same column twice; each column is listed once now.
    feature = ['LotFrontage','LotArea','1stFlrSF','GrLivArea',
               'Exter','Kitchen','Basement','TotSF','BsmtUnfSF','TotalBsmtSF',
               'Garage','OpenAreaSF']
    rSc = RobustScaler()
    data.loc[:, feature] = rSc.fit_transform(data.loc[:, feature])
    return data

In [9]:
def encode_category_features(data):
    """Return ``data`` with its categorical columns expanded by ``pandas.get_dummies``."""
    from pandas import get_dummies

    return get_dummies(data)

In [10]:
from sklearn.model_selection import cross_val_score, ShuffleSplit, cross_val_predict
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Mean squared error wrapped as a scorer; with ``greater_is_better = False``
# the scorer reports the negated MSE.
scorer = make_scorer(mean_squared_error, greater_is_better = False)


def rmse_cv(model):
    """Return the per-split RMSE of ``model`` on the module-level training data.

    Uses 10 shuffle splits with 30% held out each time and a fixed seed, and
    reads ``X_training`` and ``y`` from module scope.
    """
    splitter = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    negated_mse = cross_val_score(model, X_training, y, scoring=scorer, cv=splitter)
    # Undo the scorer's negation before taking the square root.
    return np.sqrt(-negated_mse)

In [11]:
def linear_regression():
    """Fit a plain linear regression and report its cross-validated RMSE.

    The model is fitted on the module-level ``X_train`` / ``y_train``; the
    printed score comes from ``rmse_cv``.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model.
    """
    from sklearn.linear_model import LinearRegression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    # Run the cross-validation once and reuse the scores for both statistics,
    # instead of repeating the full 10-split evaluation for mean and std.
    rmse = rmse_cv(lr)
    print("RMSE for Linear Regression: {:0.4f} (+/- {:0.4f})".format(rmse.mean(), rmse.std() * 2))
    return lr

In [12]:
def ridge_regression():
    """Tune and fit a ridge regression, then report its cross-validated RMSE.

    Alpha is chosen in two passes on the module-level ``X_training`` / ``y``:
    a coarse grid first, then a finer grid of multiples of the coarse winner
    (10-fold CV).

    Returns
    -------
    sklearn.linear_model.RidgeCV
        The model fitted in the second pass.
    """
    from sklearn.linear_model import RidgeCV
    model = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
    model.fit(X_training,y)
    alpha = model.alpha_

    # NOTE(review): the multipliers skip 1.2 (they go 1.15 -> 1.25); kept
    # unchanged so the selected alpha stays the same. Confirm if intentional.
    multipliers = (.6, .65, .7, .75, .8, .85, .9, .95, 1, 1.05, 1.1, 1.15,
                   1.25, 1.3, 1.35, 1.4)
    model = RidgeCV(alphas = [alpha * m for m in multipliers], cv = 10)
    model.fit(X_training,y)
    alpha = model.alpha_

    # Run the cross-validation once and reuse the scores for both statistics.
    rmse = rmse_cv(model)
    print("RMSE for Ridge Regression with alpha {}: {:0.4f} (+/- {:0.4f})".format(alpha, rmse.mean(), rmse.std() * 2))
    return model

In [13]:
def lasso_regression():
    """Tune and fit a lasso regression, then report its cross-validated RMSE.

    Alpha is chosen in two 10-fold CV passes on the module-level
    ``X_training`` / ``y``: a coarse grid first, then a finer grid of
    multiples of the coarse winner.

    Returns
    -------
    sklearn.linear_model.LassoCV
        The model fitted in the second pass.
    """
    from sklearn.linear_model import LassoCV
    model = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                              0.3, 0.6, 1],
                    max_iter = 50000, cv = 10)
    model.fit(X_training,y)
    alpha = model.alpha_

    # NOTE(review): the multipliers skip 1.2 (they go 1.15 -> 1.25); kept
    # unchanged so the selected alpha stays the same. Confirm if intentional.
    multipliers = (.6, .65, .7, .75, .8, .85, .9, .95, 1, 1.05, 1.1, 1.15,
                   1.25, 1.3, 1.35, 1.4)
    model = LassoCV(alphas = [alpha * m for m in multipliers],
                    max_iter = 50000, cv = 10)
    model.fit(X_training,y)
    alpha = model.alpha_

    # Run the cross-validation once and reuse the scores for both statistics.
    rmse = rmse_cv(model)
    print("RMSE for Lasso Regression with alpha {}: {:0.4f} (+/- {:0.4f})".format(alpha, rmse.mean(), rmse.std() * 2))

    return model

In [14]:
def elastic_regression():
    """Tune and fit an elastic-net regression, then report its cross-validated RMSE.

    The first 10-fold CV pass searches a coarse grid of ``l1_ratio`` and
    ``alpha`` on the module-level ``X_training`` / ``y``. The second pass
    searches ``l1_ratio`` values around the first winner, over the same alpha
    grid.

    Returns
    -------
    sklearn.linear_model.ElasticNetCV
        The model fitted in the second pass.
    """
    from sklearn.linear_model import ElasticNetCV
    alpha_grid = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                  0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6]

    model = ElasticNetCV(l1_ratio = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
                         alphas = alpha_grid,
                         max_iter = 50000, cv = 10)
    model.fit(X_training,y)
    ratio = model.l1_ratio_

    # l1_ratio must stay within [0, 1]. When the coarse winner is close to 1,
    # the upward multipliers would leave that range, so cap at 1.0 and drop the
    # duplicates the cap creates (order preserved).
    refined_ratios = [min(ratio * m, 1.0) for m in (.85, .9, .95, 1, 1.05, 1.1, 1.15)]
    refined_ratios = list(dict.fromkeys(refined_ratios))

    model = ElasticNetCV(l1_ratio = refined_ratios,
                         alphas = alpha_grid,
                         max_iter = 50000, cv = 10)
    model.fit(X_training,y)
    alpha = model.alpha_
    ratio = model.l1_ratio_

    # Run the cross-validation once and reuse the scores for both statistics.
    rmse = rmse_cv(model)
    print("RMSE for Elastic Regression with alpha {}, ratio {}: {:0.4f} (+/- {:0.4f})".format(alpha, ratio, rmse.mean(), rmse.std() * 2))

    return model

In [15]:
def partial_ls_regression():
    """Fit a two-component PLS regression and report its cross-validated RMSE.

    The model is fitted on the module-level ``X_train`` / ``y_train``; the
    printed score comes from ``rmse_cv``.

    Returns
    -------
    sklearn.cross_decomposition.PLSRegression
        The fitted model.
    """
    from sklearn.cross_decomposition import PLSRegression
    model = PLSRegression(n_components=2)
    model.fit(X_train, y_train)
    # Run the cross-validation once and reuse the scores for both statistics.
    rmse = rmse_cv(model)
    print("RMSE for PLS: {:0.4f} (+/- {:0.4f})".format(rmse.mean(), rmse.std() * 2))
    return model

In [16]:
def xgboost_model():
    """Fit an XGBoost regressor and report its cross-validated RMSE.

    The model is fitted on the module-level ``X_train`` / ``y_train``; the
    printed score comes from ``rmse_cv``.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    import xgboost as xgb
    # The params were tuned using xgb.cv (max_depth=2, eta=0.1,
    # num_boost_round=500, early_stopping_rounds=100) on a DMatrix built from
    # X_train / y_train.
    model_xgb = xgb.XGBRegressor(n_estimators=400, max_depth=2, learning_rate=0.1)
    model_xgb.fit(X_train, y_train)
    # Run the cross-validation once and reuse the scores for both statistics.
    rmse = rmse_cv(model_xgb)
    print("RMSE for xgboost: {:0.4f} (+/- {:0.4f})".format(rmse.mean(), rmse.std() * 2))
    return model_xgb

In [17]:

import numpy as np
import pandas as pd

# Raw inputs; the test-set ids are kept for the submission file.
training = load_data('./dataset/train.csv')
testing = load_data('./dataset/test.csv')
ID = testing['Id']

# Stack the feature columns of both sets so they go through the same
# preprocessing (the scaler below is therefore fitted on train + test rows).
feature_span = slice('MSSubClass', 'SaleCondition')
houses = pd.concat((training.loc[:, feature_span], testing.loc[:, feature_span]))
houses = add_new_features(redefine_category_numeric(fill_NA(houses)))

X = encode_category_features(
    standard_scaling_numeric_features(log_transform_skew(houses))
)

# Undo the stacking: the first ``n_training`` rows came from the training set.
n_training = training.shape[0]
X_training = X[:n_training]
X_testing = X[n_training:]

# The target is modelled on the log scale.
y = np.log(training['SalePrice'])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_training,
    y,
    random_state=0,
    test_size=0.3,
)
# #standard scaling
# X_train, stdSc_X, numerical_features = standard_scaling_numeric_features(X_train)
# X_valid.loc[:,numerical_features] = stdSc_X.transform(X_valid.loc[:,numerical_features])
# #robust scaling
# X_train, rSc_X, numerical_features = robust_scaling_numeric_features(X_train)
# X_valid.loc[:,numerical_features] = rSc_X.transform(X_valid.loc[:,numerical_features])
# X_train = encode_category_features(X_train)
# X_valid = encode_category_features(X_valid)

In [19]:
# Fit the two models used by the blend in the final cell. The commented-out
# calls are the other model builders defined above, left disabled here.
# lr = linear_regression()
# ridge = ridge_regression()
lasso = lasso_regression()
# elastic = elastic_regression()
# pls = partial_ls_regression()
# NOTE: this global is named ``xgboost``, the same as the library package; the
# library itself is only imported (as ``xgb``) inside ``xgboost_model``.
xgboost = xgboost_model()

RMSE for Lasso Regression with alpha 0.0005099999999999999: 0.1316 (+/- 0.0291)
RMSE for xgboost: 0.1267 (+/- 0.0140)


In [21]:
# Blend the two models on the log scale (``ratio`` is the xgboost weight),
# then map back to prices with exp.
ratio = 0.7
blended_log_price = (
    ratio * xgboost.predict(X_testing)
    + (1 - ratio) * lasso.predict(X_testing)
)
preds = np.exp(blended_log_price)

# Write the submission file: one row per test-set id.
solution = pd.DataFrame({"Id": ID, "SalePrice": preds})
solution.to_csv("lasso_sol.csv", index=False)