In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import xgboost, lightgbm
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import os
import time
import gc
import warnings

warnings.filterwarnings('ignore')


In [2]:
# Load the imputed feature tables (train / test), the training target,
# and the submission template that predictions are written into later.
train_X, test_X = (
    pd.read_csv(csv_path)
    for csv_path in ('./imputed_train.csv', './imputed_test.csv')
)
train_y = pd.read_csv('./train_y.csv')
sub = pd.read_csv('./data/sample_submission.csv')

In [3]:
# Target standardisation is currently disabled: the models below are fit on the raw `train_y`.
# target = (train_y - np.mean(train_y))/np.std(train_y)

In [4]:
# Keep untouched copies of both feature tables; the label-encoding cell below
# overwrites columns of train_X / test_X in place.
X, X2 = train_X.copy(), test_X.copy()

In [5]:
# Integer-encode every column that is string-typed in either split.
# The encoder is fitted on train and test stacked together so that a category
# present in only one of them still receives a code.
for c in train_X.columns:
    if not (train_X[c].dtype == 'object' or test_X[c].dtype == 'object'):
        continue  # already numeric in both splits
    lbe = LabelEncoder()
    lbe.fit(pd.concat([train_X[c], test_X[c]], axis=0))
    train_X[c], test_X[c] = lbe.transform(train_X[c]), lbe.transform(test_X[c])
        

In [6]:
# Search space handed to BayesianOptimization.
# The key 'reg_lamda' (sic) is kept as-is because a later cell reads
# optimizer.max['params']['reg_lamda']; bayes_xgb translates it to XGBoost's
# actual parameter name, `reg_lambda`.
xgb_bounds = {
    'max_depth': (3, 12),
    'subsample': (0.3, 1),
    'colsample_bytree': (0.3, 1),
    'reg_alpha': (0.1, 3),
    'reg_lamda': (0.1, 3)
}

# One entry per bayes_xgb call, in call order: the fold-averaged number of
# boosting rounds kept by early stopping.
best_ns = []

def bayes_xgb(max_depth, subsample, colsample_bytree, reg_alpha, reg_lamda):
    """Objective for BayesianOptimization: negative mean cross-validated MSE.

    Trains one early-stopped XGBRegressor per fold of the module-level `kfold`
    on the module-level `train_X` / `train_y`. As a side effect, appends the
    fold-averaged number of boosting rounds selected by early stopping to the
    module-level list `best_ns`.

    Args:
        max_depth: Maximum tree depth; truncated to int because the optimizer
            proposes floats.
        subsample: Row subsampling ratio.
        colsample_bytree: Per-tree column subsampling ratio.
        reg_alpha: L1 regularisation strength.
        reg_lamda: L2 regularisation strength. The misspelled name is kept so
            it matches the `xgb_bounds` key; the value is forwarded to XGBoost
            under its real name, `reg_lambda`.

    Returns:
        The negated MSE averaged over all folds (the optimizer maximises, so
        higher is better).
    """
    params = {
        'n_estimators': 1000,  # upper bound; early stopping decides the real count
        'booster': 'gbtree',
        'tree_method': 'hist',
        'random_state': 0,
        'n_jobs': -1,
        'max_depth': int(max_depth),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        # Was 'reg_lamda', which XGBoost does not recognise, so the L2 penalty
        # was silently left at its default and never tuned.
        'reg_lambda': reg_lamda,
    }

    # Average over however many folds the splitter produces instead of a fixed 10.
    n_folds = kfold.get_n_splits()

    acc = 0
    best_n = 0
    for tr_idx, val_idx in kfold.split(train_X):
        tr_X, tr_y = train_X.iloc[tr_idx, :], train_y.iloc[tr_idx]
        val_X, val_y = train_X.iloc[val_idx, :], train_y.iloc[val_idx]
        clf = XGBRegressor(**params, objective='reg:squarederror').fit(
            tr_X, tr_y,
            # 'error' is a binary-classification error rate; a regression
            # metric is required for early stopping to be meaningful.
            eval_metric='rmse',
            eval_set=[(val_X, val_y)],
            early_stopping_rounds=100,
            verbose=0,
        )
        acc -= mean_squared_error(val_y, clf.predict(val_X)) / n_folds
        # best_iteration is a 0-based index, so the tree count is one larger.
        best_n += (clf.best_iteration + 1) / n_folds
    best_ns.append(best_n)
    return acc


In [7]:
# Shared 10-fold shuffled splitter with a fixed seed, used by bayes_xgb for
# every objective evaluation.
kfold = KFold(10, shuffle=True, random_state=0)

In [8]:
%%time
# Search budget: `init_points` initial probes followed by `n_iter` optimisation steps.
init_points, n_iter = 5, 10

# Maximise bayes_xgb (negative CV MSE) over the xgb_bounds search space.
optimizer = BayesianOptimization(bayes_xgb, xgb_bounds, random_state=0)
optimizer.maximize(init_points=init_points, n_iter=n_iter)

|   iter    |  target   | colsam... | max_depth | reg_alpha | reg_lamda | subsample |
-------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
import pickle

# Restore a previously saved parameter dict.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open('./best_params.bin', mode='rb') as params_file:
    best_params = pickle.load(params_file)

In [None]:
# Display the 'params' entry of the optimizer's best result so far.
optimizer.max['params']

In [None]:
# Rebuild the final model from the best trial found by the optimizer.
#
# bayes_xgb appends to `best_ns` once per objective call, i.e. in the same order
# as `optimizer.res`. Taking the last len(optimizer.res) entries keeps the two
# aligned even if `best_ns` accumulated values from an earlier search run, and
# the argmax replaces the previous hard-coded index (12).
trial_targets = [trial['target'] for trial in optimizer.res]
best_idx = int(np.argmax(trial_targets))
best_rounds = best_ns[-len(trial_targets):][best_idx]

tuned = optimizer.max['params']

best_params = {
    # At least one tree, in case early stopping selected the very first round.
    'n_estimators': max(1, int(best_rounds)),
    'booster': 'gbtree',
    'tree_method': 'hist',
    'random_state': 0,
    'n_jobs': -1,
    'max_depth': int(tuned['max_depth']),
    'subsample': tuned['subsample'],
    'colsample_bytree': tuned['colsample_bytree'],
    'reg_alpha': tuned['reg_alpha'],
    # The search key is spelled 'reg_lamda'; XGBoost's parameter is `reg_lambda`.
    'reg_lambda': tuned['reg_lamda'],
    # ('metric': 'error' was removed: it is not an XGBoost parameter, and no
    # evaluation set is used in this final fit.)
}

reg = XGBRegressor(**best_params, objective='reg:squarederror').fit(train_X, train_y)


In [None]:
# Predict on exactly the columns the model was fitted on. The cell that drops 'Id'
# from test_X appears further down the notebook, so on a top-to-bottom run test_X
# may still carry that extra column at this point.
ans = reg.predict(test_X[train_X.columns])

In [None]:
# Echo the prediction array for a quick visual check.
ans

In [None]:
# `reg` was fitted on the raw `train_y` (the target-standardisation cell near the top
# is commented out), so `ans` is already on the original target scale. Multiplying by
# the target's std and adding its mean would corrupt the predictions, so the inverse
# transform is intentionally not applied here.
# If target standardisation is re-enabled, restore the inverse transform, e.g.:
#     ans = ans * train_y.iloc[:, 0].std(ddof=0) + train_y.iloc[:, 0].mean()
ans = np.asarray(ans)

In [None]:
# Re-display `ans` after the preceding cell has reassigned it.
ans

In [None]:
# Put the model's predictions into the submission before writing it; previously `ans`
# was never copied into `sub`, so the file contained the untouched sample submission.
# .assign() returns a new frame, leaving `sub` itself unmodified for later cells.
sub.assign(SalePrice=ans).to_csv('./sub/sub6.csv', index=False)

In [20]:
from sklearn.linear_model import ElasticNet, Lasso

# Linear baseline: Lasso with scikit-learn's default settings, fitted on the full
# training set. Note that this rebinds `reg`.
reg = Lasso()
reg.fit(train_X, train_y)

In [21]:
# Preview the predictions, using exactly the columns the model was fitted on. The cell
# that drops 'Id' from test_X comes after this one in the notebook, so on a
# top-to-bottom run test_X may still contain that extra column here.
reg.predict(test_X[train_X.columns])

array([122753.86324206, 433422.6856666 , 189804.80179817, ...,
       132824.11602104, 122123.85642378, 246691.93893299])

In [13]:
# Remove the identifier column from the test features. errors='ignore' makes the cell
# safe to re-run: once 'Id' is gone, a second drop is a no-op instead of a KeyError.
test_X = test_X.drop(columns='Id', errors='ignore')

In [22]:
# Fill the submission with predictions made on exactly the columns the model was
# fitted on, so the result does not depend on whether the 'Id'-dropping cell has
# already been run in this session.
sub['SalePrice'] = reg.predict(test_X[train_X.columns])

In [23]:
# Write the filled-in submission to disk without the DataFrame index.
sub.to_csv(path_or_buf='./sub/sub8.csv', index=False)

In [18]:
# Display the submission frame as a final sanity check.
sub

Unnamed: 0,Id,SalePrice
0,1461,135911.295646
1,1462,411515.247092
2,1463,197331.338798
3,1464,209858.709534
4,1465,203324.208499
...,...,...
1454,2915,81746.846929
1455,2916,80737.707865
1456,2917,149680.320257
1457,2918,116611.327685
