In [1]:
# Mount Google Drive at /gdrive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Work from the project folder on the mounted drive; the data path below is built from os.getcwd().
os.chdir('/gdrive/MyDrive/Dacon/House/')

In [6]:
def NMAE(true, pred):
    """Normalized mean absolute error: MAE divided by the mean absolute target.

    Parameters
    ----------
    true : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    float
        ``mean(|true - pred|) / mean(|true|)``.
    """
    # Compare element by element. Converting to plain arrays prevents pandas
    # from aligning the two inputs on their index labels, which would turn a
    # shuffled or re-indexed Series pair into NaNs instead of an error value.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

In [7]:
# Folder holding the competition CSVs, relative to the working directory.
# NOTE: `dir` shadows the Python builtin; the name is kept because the
# submission-writing cell at the end of the notebook reuses it.
dir = f'{os.getcwd()}/Dataset/'

# The 'id' column is removed from both feature tables right after loading.
train = pd.read_csv(f'{dir}train.csv').drop(columns=['id'])
test = pd.read_csv(f'{dir}test.csv').drop(columns=['id'])
sample_submission = pd.read_csv(f'{dir}sample_submission.csv')

In [8]:
# Categorical columns that receive count and median-of-target encodings below and are then dropped.
categorical_feature = ['Exter Qual', 'Kitchen Qual', 'Bsmt Qual']

Frequency Encoding

In [12]:
# Frequency encoding: for every categorical column, attach the number of
# training rows sharing the row's category as a new `count_<column>` feature.
for col in categorical_feature:
    freq_table = train.groupby(col).size().reset_index(name=f'count_{col}')
    # No `on=` is given, so the join key is whichever columns both frames share.
    # Test rows whose category never occurs in train end up with NaN here.
    train = train.merge(freq_table, how='left')
    test = test.merge(freq_table, how='left')

Target Encoding (per-category median of `target`)

In [13]:
# Target encoding: for every categorical column, attach the median `target`
# of the training rows in the same category as a new `ord_<column>` feature.
# NOTE(review): the medians are computed over all training rows, including
# the rows later held out by train_test_split, so hold-out scores are
# computed on features that already contain target information.
for col in categorical_feature:
    median_table = (
        train.groupby(col)['target']
        .median()
        .rename(f'ord_{col}')
        .reset_index()
    )
    # No `on=` is given, so the join key is whichever columns both frames share.
    train = train.merge(median_table, how='left')
    test = test.merge(median_table, how='left')

In [14]:
# The raw categorical columns are no longer needed once their encodings exist.
train = train.drop(columns=categorical_feature)
test = test.drop(columns=categorical_feature)

In [15]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns=['target'])

In [16]:
# Preview the feature matrix after encoding.
X.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Cars,Garage Area,Total Bsmt SF,1st Flr SF,Full Bath,Year Built,Year Remod/Add,Garage Yr Blt,count_Exter Qual,count_Kitchen Qual,count_Bsmt Qual,ord_Exter Qual,ord_Kitchen Qual,ord_Bsmt Qual
0,10,2392,3,968,2392,2392,2,2003,2003,2003,49,107,134,345000.0,323262.0,323631
1,7,1352,2,466,1352,1352,2,2006,2007,2006,485,560,134,219990.0,200912.5,323631
2,5,900,1,288,864,900,1,1967,1967,1967,808,660,605,142312.5,139725.0,137500
3,5,1174,2,576,680,680,1,1900,2006,2000,808,560,605,142312.5,200912.5,137500
4,7,1958,3,936,1026,1026,2,2005,2005,2005,485,560,582,219990.0,200912.5,193000


Outlier

In [17]:
# Columns screened by the IQR-based outlier filter in the next cell.
numeric_feature = ['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Garage Area',
                   'Total Bsmt SF', '1st Flr SF', 'Full Bath', 'Year Built',
                   'Year Remod/Add', 'Garage Yr Blt']

In [18]:
# Sequential IQR filter: for each numeric column, drop the rows lying more
# than 1.5 * IQR outside the quartiles. The quartiles are recomputed on the
# rows that survived the previous columns, and X / y are re-indexed together
# so they stay aligned row for row.
for col in numeric_feature:
    q1, q3 = X[col].quantile(0.25), X[col].quantile(0.75)
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr
    lower_fence = q1 - 1.5 * iqr

    is_outlier = (X[col] > upper_fence) | (X[col] < lower_fence)
    X = X.loc[~is_outlier].reset_index(drop=True)
    y = y.loc[~is_outlier].reset_index(drop=True)

add features

In [19]:
# Stack train and test features so the derived columns are built once for both.
all_data = pd.concat([X, test], axis=0)

# Total Area: sum of the four area columns.
living, garage, basement, first_floor = (
    all_data[name]
    for name in ('Gr Liv Area', 'Garage Area', 'Total Bsmt SF', '1st Flr SF')
)
all_data['Total Area'] = living + garage + basement + first_floor

# Year Built/Remod: sum of the construction year and the remodel year.
built, remodelled = all_data['Year Built'], all_data['Year Remod/Add']
all_data['Year Built/Remod'] = built + remodelled

In [20]:
# Split the stacked frame back by position: the first len(X) rows are the
# training features, the remainder is the test set.
n_train = len(X)
print(X.shape, test.shape)
X = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]
print(X.shape, test.shape)

(1243, 16) (1350, 16)
(1243, 18) (1350, 18)


In [21]:
# Fill missing values in the test features with each column's mean.
# NOTE(review): the means are taken from `test` itself rather than from the training features — confirm this is intended.
test = test.fillna(test.mean())

In [22]:
# Train on log1p(target); predictions are mapped back with np.expm1 when the submission is built.
# Re-running this cell applies the transform a second time, so run it once per session.
y = np.log1p(y)

Model comparison

In [23]:
!pip install -q lightgbm
!pip install -q catboost
!pip install -q xgboost
!pip install -q mlxtend

[K     |████████████████████████████████| 76.1 MB 51 kB/s 
[?25h

In [24]:
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [25]:
# Hold out 15% of the rows for validation; the fixed seed keeps the split the same between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=821)

In [68]:
# Candidate regressors. The linear models and the SVM are wrapped in a
# pipeline with RobustScaler; the tree ensembles take the raw features.

# Linear models with built-in cross-validated choice of the regularisation strength.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=[0.01, 0.1, 1, 5, 10], cv=5))
# max_iter must be an integer: scikit-learn's parameter validation rejects the float 1e5.
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=[0.00005, 0.0001, 0.0003, 0.0005, 0.0007, 0.0009], max_iter=100_000))
bridge = make_pipeline(RobustScaler(), BayesianRidge())
svm = make_pipeline(RobustScaler(), SVR(C=10))

# Tree ensembles, all seeded with the same value so reruns are comparable
# (random_state added to LightGBM for consistency with the others).
forest = RandomForestRegressor(n_estimators=1024, max_depth=10, random_state=821, n_jobs=-1)
lgbm = lgb.LGBMRegressor(objective='regression', learning_rate=0.01, n_estimators=1000, num_leaves=8, random_state=821, verbose=-1)
xgb = XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01, max_depth=8, random_state=821, n_jobs=-1)
catb = CatBoostRegressor(learning_rate=0.01, depth=8, n_estimators=1000, random_state=821, verbose=0)

In [69]:
# Fit every candidate on the training split and report its NMAE on the
# hold-out split. `y` was log1p-transformed earlier in the notebook, so the
# scores printed here are on that log scale.
model_names = ['lasso', 'ridge', 'bridge', 'svm', 'forest', 'lgbm', 'xgb', 'cat']
estimators = [lasso, ridge, bridge, svm, forest, lgbm, xgb, catb]
models = list(zip(model_names, estimators))

for name, model in models:
    fitted = model.fit(X_train, y_train)
    val_pred = fitted.predict(X_val)
    val_score = NMAE(y_val, val_pred)
    print(f'model name: {name} score: {val_score}')

model name: lasso score: 0.008069661691728956
model name: ridge score: 0.008058719313821685
model name: bridge score: 0.008078237133478742
model name: svm score: 0.007960991720813809
model name: forest score: 0.00741087868790449
model name: lgbm score: 0.007451281767276514
model name: xgb score: 0.0074534479321745045
model name: cat score: 0.007433678301640019


model stacking and refit

In [47]:
# Stacked ensemble: the base regressors produce out-of-fold predictions and a
# CatBoost meta-regressor is trained on them together with the original
# features (use_features_in_secondary=True).
# random_state pins the fold shuffling so the stack is reproducible between runs.
# NOTE(review): this cell's recorded execution count (47) is lower than that of
# the model-definition cell (68); re-run it after redefining the base models so
# the stack is built from the current estimators.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, svm, forest, lgbm, xgb, catb),
                                meta_regressor=catb,
                                use_features_in_secondary=True,
                                random_state=821)

In [70]:
def _fit_on_all_rows(label, estimator):
    """Print a progress label, then fit `estimator` on the full X / y and return the result."""
    print(label)
    return estimator.fit(X, y)


# Refit every model on all training rows (no hold-out) for the final blend.
print('START Fit')

# The stacked model is fitted on plain arrays instead of pandas objects.
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

lasso_model_full_data = _fit_on_all_rows('Lasso', lasso)
ridge_model_full_data = _fit_on_all_rows('Ridge', ridge)
svm_model_full_data = _fit_on_all_rows('SVM', svm)
forest_model_full_data = _fit_on_all_rows('RandomForest', forest)
catb_model_full_data = _fit_on_all_rows('catboost', catb)
lgb_model_full_data = _fit_on_all_rows('lightgbm', lgbm)
xgb_model_full_data = _fit_on_all_rows('xgboost', xgb)

START Fit
stack_gen
Lasso
Ridge
SVM
RandomForest
catboost
lightgbm
xgboost


model blending

model name: lasso score: 0.008069661691728956  
model name: ridge score: 0.008058719313821685  
model name: svm score: 0.007960991720813809  
model name: forest score: 0.00741087868790449  
model name: lgbm score: 0.007451281767276514  
model name: xgb score: 0.0074534479321745045  
model name: cat score: 0.007433678301640019  

model name: lasso score: 0.0081230495297185  
model name: ridge score: 0.008100253496371067  
model name: bridge score: 0.008105134396548794  
model name: svm score: 0.007735600776761367  
model name: forest score: 0.00761348877559685  
model name: lgbm score: 0.007536975129334673  
model name: xgb score: 0.007514162585071616  
model name: cat score: 0.0073283111622326645

In [71]:
def blend_models_predict(X):
    """Return the weighted average of the fully-fitted models' predictions for X.

    Weights: SVM 0.1, Lasso 0.05, Ridge 0.05, RandomForest / LightGBM /
    XGBoost / CatBoost 0.125 each, stacked ensemble 0.3 (they sum to 1).
    The stacked model is given a plain array, matching how it was fitted.
    """
    weighted_models = (
        (0.1, svm_model_full_data),
        (0.05, lasso_model_full_data),
        (0.05, ridge_model_full_data),
        (0.125, forest_model_full_data),
        (0.125, lgb_model_full_data),
        (0.125, xgb_model_full_data),
        (0.125, catb_model_full_data),
    )
    base_blend = sum(weight * model.predict(X) for weight, model in weighted_models)
    return base_blend + 0.3 * stack_gen_model.predict(np.array(X))

In [72]:
# NMAE of the blend on the rows it was fitted on (log1p scale); this is a
# sanity check, not an estimate of out-of-sample error.
print('score on train data:')
train_blend_pred = blend_models_predict(X)
train_score = NMAE(y, train_blend_pred)
print(train_score)

score on train data:
0.0053890188576642415


In [73]:
# The models were trained on log1p(target), so np.expm1 undoes that
# transform on the blended test predictions before they are stored.
blended_log_pred = blend_models_predict(test)
sample_submission['target'] = np.expm1(blended_log_pred)
sample_submission.head()

Unnamed: 0,id,target
0,1,334158.873693
1,2,130026.682342
2,3,176219.367158
3,4,250956.639395
4,5,132142.673045


In [74]:
# Save the submission into the Dataset folder (`dir` is the path string built in the data-loading cell).
sample_submission.to_csv(dir + 'submission_4.csv', index=False)