In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import norm

from sklearn.preprocessing import StandardScaler

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Read the two splits and remember how many rows each one has.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
n_train, n_test = len(train), len(test)

# Ids of the test rows, kept aside before the column is dropped.
test_id = test['Id']

# Target on the log1p scale; remember to map predictions back (expm1).
y = np.log1p(train['SalePrice'])

# Remove the 'Id' column from both frames.
train = train.drop('Id', axis=1)
test = test.drop('Id', axis=1)

In [3]:
# Stack train and test so that every preprocessing step is applied to both.
# ignore_index=True gives the combined frame a fresh 0..N-1 row index; without it
# the labels 0..n_test-1 occur twice, which makes any label-based selection or
# alignment on all_data ambiguous. Rows stay in train-then-test order, so the
# positional split by n_train further down is unaffected.
all_data = pd.concat([train, test], ignore_index=True)
# The target is not a feature (it is NaN for the test rows anyway).
all_data = all_data.drop('SalePrice', axis=1)
all_data.shape

(2919, 79)

# Missing data

In [4]:
# Per-column missing-value summary: absolute count and fraction of rows.
null_mask = all_data.isnull()
miss_count = null_mask.sum().sort_values(ascending=False)
miss_per = null_mask.mean().sort_values(ascending=False)
miss_data = pd.concat([miss_count, miss_per], axis=1, keys=['count', 'percent'])

# Drop every column in which more than 15% of the values are missing.
drop_index = miss_data.loc[miss_data['percent'] > 0.15].index
all_data = all_data.drop(drop_index, axis=1)

28 columns still need to be filled


# Filling null data

In [5]:
# Impute missing numeric values with the column mean.
# numeric_only=True is required: the frame still contains object (categorical)
# columns, and newer pandas raises a TypeError when asked to average them instead
# of silently skipping them.
# NOTE(review): the original comment said "median", but the code has always used
# the mean; switch to all_data.median(numeric_only=True) if median was intended.
all_data = all_data.fillna(all_data.mean(numeric_only=True))

# Columns that still contain nulls after the numeric imputation.
miss_count = all_data.isnull().sum().sort_values(ascending=False)
miss_index = miss_count[miss_count>0].index

In [19]:
# For the categorical columns, fill nulls with the most frequent value.
for column in miss_index:
    most_common = all_data[column].value_counts().index[0]
    all_data[column] = all_data[column].fillna(most_common)

In [26]:
# Split the remaining columns into categorical (object dtype) and numeric ones.
# The mask is computed from all_data itself. Building it from train.dtypes, as
# before, indexed one frame with a mask belonging to another: train has a
# different column set (it still holds SalePrice and the columns dropped above),
# so the selection only worked through implicit label alignment.
is_categorical = all_data.dtypes == "object"
cat_index = all_data.columns[is_categorical.values]
num_index = all_data.columns[~is_categorical.values]

In [None]:
# half_num_index = []
# count_index = []
# for index in num_index:
#     count = len(train[index].unique())
#     if count < 30:
#         half_num_index.append(index)
#         count_index.append(count)
# half_num_index
# count_index

# from sklearn.preprocessing import LabelEncoder
# for c in half_num_index:
#     lbl = LabelEncoder() 
#     lbl.fit(list(all_data[c].values)) 
#     train[c] = lbl.transform(list(all_data[c].values))
# all_data[half_num_index].head(10)

## Normalize the numeric features


In [34]:
from scipy import stats
from scipy.stats import norm, skew

# Sample skewness of every numeric column (nulls ignored), largest first.
skew_features = (
    all_data[num_index]
    .apply(lambda col: skew(col.dropna()))
    .sort_values(ascending=False)
)
# Columns whose skewness exceeds 0.75 are replaced by their log1p transform.
skew_index = skew_features.loc[skew_features > 0.75].index
all_data[skew_index] = np.log1p(all_data[skew_index])

In [35]:
# Display the (rows, columns) shape of the combined frame at this point.
all_data.shape

(2919, 73)

# Encode the categorical features

In [36]:
# Replace the combined frame with its dummy-encoded (one-hot) version.
all_data = pd.get_dummies(all_data)
# Display the shape after encoding.
all_data.shape

(2919, 269)

In [37]:
# Undo the concatenation by position: the first n_train rows are the training
# set, everything after them is the test set.
train = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]

# Modeling 

In [74]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [41]:
n_fold = 5

def rmse_cv(model):
    """Return the per-fold cross-validated RMSE of `model` on (train, y).

    The folds come from a shuffled KFold with `n_fold` splits and a fixed seed,
    so every model is scored on the same partition of the training rows.

    The KFold object itself is handed to cross_val_score. The previous code
    passed `KFold(...).get_n_splits(train)`, which is just the integer `n_fold`;
    cross_val_score then fell back to its default, unshuffled splitting and the
    `shuffle=True, random_state=0` settings were silently ignored.

    Returns an array with one RMSE value per fold.
    """
    kf = KFold(n_fold, shuffle=True, random_state=0)
    # The scorer returns negated MSE, hence the sign flip before the root.
    neg_mse = cross_val_score(model, train, y, scoring='neg_mean_squared_error', cv=kf)
    return np.sqrt(-neg_mse)


In [75]:
# Regularised linear models, each preceded by a RobustScaler in a pipeline.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
Enet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=1),
)

# scikit-learn gradient boosting with the Huber loss.
gboost_params = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
GBoost = GradientBoostingRegressor(**gboost_params)

# XGBoost regressor.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    nthread=-1,
)
model_xgb = xgb.XGBRegressor(**xgb_params)

# LightGBM regressor.
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)

In [55]:
# Cross-validated RMSE (one array of per-fold values) for each base model,
# evaluated in the order lasso, Enet, GBoost, XGBoost, LightGBM.
r1, r2, r3, r4, r5 = [
    rmse_cv(candidate)
    for candidate in (lasso, Enet, GBoost, model_xgb, model_lgb)
]

In [71]:
class AverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the plain mean of its base models.

    `models` is an iterable of unfitted estimators. They are never fitted
    themselves: `fit` works on clones, stored in `models_`.
    """

    def __init__(self,models):
        self.models = models

    def fit(self,X,y):
        """Clone every base model, fit each clone on (X, y) and return self."""
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X,y)
        return self

    def predict(self,X):
        """Return, for each row of X, the mean of the fitted models' predictions."""
        per_model = [fitted.predict(X) for fitted in self.models_]
        # One column per model, then average across the columns.
        stacked = np.column_stack(per_model)
        return stacked.mean(axis=1)

In [82]:
# Ensemble that averages the predictions of the lasso pipeline, the elastic-net
# pipeline and the gradient-boosting model, scored with the same CV helper as
# the individual models.
ave_model = AverageModels(models=(lasso,Enet,GBoost))
ave_score = rmse_cv(ave_model)

# Final prediction

In [79]:
def rmsle(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    No logarithm is taken here. In this notebook the function is called with
    the log1p-transformed target, which is what makes the result an RMSLE.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [88]:
# For each final model: fit on the full training set, score the fit on that same
# training data (log1p scale), and predict the test set back on the price scale
# via expm1.

# Averaged ensemble
ave_model.fit(train, y)
ave_train_prediction = ave_model.predict(train)
rms_ave = rmsle(y, ave_train_prediction)
ave_rediction = np.expm1(ave_model.predict(test))

# XGBoost
model_xgb.fit(train, y)
xgb_train_prediction = model_xgb.predict(train)
rms_xgb = rmsle(y, xgb_train_prediction)
xgb_rediction = np.expm1(model_xgb.predict(test))

# LightGBM
model_lgb.fit(train, y)
lgb_train_prediction = model_lgb.predict(train)
rms_lgb = rmsle(y, lgb_train_prediction)
lgb_rediction = np.expm1(model_lgb.predict(test))


AverageModels(models=(Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_...           presort='auto', random_state=5, subsample=1.0, verbose=0,
             warm_start=False)))

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4603,
       gamma=0.0468, learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1.7817, missing=None, n_estimators=2200,
       nthread=-1, objective='reg:linear', reg_alpha=0.464,
       reg_lambda=0.8571, scale_pos_weight=1, seed=0, silent=1,
       subsample=0.5213)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
       boosting_type='gbdt', colsample_bytree=1.0, feature_fraction=0.2319,
       feature_fraction_seed=9, learning_rate=0.05, max_bin=55,
       max_depth=-1, min_child_samples=10, min_child_weight=5,
       min_data_in_leaf=6, min_split_gain=0.0, min_sum_hessian_in_leaf=11,
       n_estimators=720, n_jobs=-1, num_leaves=5, objective='regression',
       random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=50000, subsample_freq=1)

In [87]:
# Training-set (in-sample) scores of the averaged ensemble, XGBoost and LightGBM.
# NOTE(review): these are computed on the data the models were fitted on, so they
# are optimistic and are not validation scores.
print(rms_ave,rms_xgb,rms_lgb)

(0.084514189019064462, 0.080097747429496505, 0.076313672913250877)


In [90]:
# Weighted blend of the three test predictions:
# 0.4 averaged ensemble, 0.3 XGBoost, 0.3 LightGBM.
final_prediction = ave_rediction*0.4 + xgb_rediction*0.3 + lgb_rediction*0.3

# Two-column submission file: test Id and blended SalePrice.
submission = pd.DataFrame(
    {"Id": test_id, "SalePrice": final_prediction},
    columns=["Id", "SalePrice"],
)
submission.to_csv('submission.csv', index=False)