In [1]:
import xgboost as xgb
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

In [2]:
# xgboost model parameters
eta = 0.1
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 55
num_boost_round = 500

params = {"objective":"reg:linear",
         "booster":"gbtree",
         "eta":eta,
         "max_depth":int(max_depth),
         "subsample":subsample,
         "colsample_bytree":colsample_bytree,
         "min_child_weight":min_child_weight,
         "silent":1
         }

In [3]:
# join two data
train = pd.read_csv("train.csv")
train_label = train['target']
train_id = train['id']
del train['target'], train['id']

test = pd.read_csv("test.csv")
test_id = test['id']
del test['id']

data = train.append(test)
data.reset_index(inplace=True)
train_rows = train.shape[0]

In [4]:
import pickle

In [5]:
# making new derived variables
feature_results = []

for target_g in ['car', 'ind', 'reg']:
    features = [x for x in list(data) if target_g not in x]
    target_list = [x for x in list(data) if target_g in x]
    train_fea = np.array(data[features])
    for target in target_list:
        print(target)
        train_label = data[target]
        kfold = KFold(n_splits=5, random_state=218, shuffle=True)
        kf = kfold.split(data)
        cv_train = np.zeros(shape=(data.shape[0],1))
        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate, label_train, label_validate = \
            train_fea[train_fold, :], train_fea[validate, :], train_label[train_fold], train_label[validate]
        dtrain = xgb.DMatrix(X_train, label_train)
        dvalid = xgb.DMatrix(X_validate, label_validate)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        # train xgboost model
        bst = xgb.train(params, dtrain, num_boost_round, evals=watchlist, verbose_eval=50, early_stopping_rounds=10)
        # save
        cv_train[validate, 0] += bst.predict(xgb.DMatrix(X_validate), ntree_limit=bst.best_ntree_limit)
    feature_results.append(cv_train)
feature_results = np.hstack(feature_results)
train_features = feature_results[:train_rows, :]
test_features = feature_results[train_rows:, :]
pickle.dump([train_features, test_features], open("fea0.pk",'wb'))

ps_car_01_cat


  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:7.44212	valid-rmse:7.44263
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[50]	train-rmse:2.30985	valid-rmse:2.30253
[100]	train-rmse:2.29976	valid-rmse:2.29445
[150]	train-rmse:2.29558	valid-rmse:2.29238
[200]	train-rmse:2.29181	valid-rmse:2.29087
[250]	train-rmse:2.28835	valid-rmse:2.28994
[300]	train-rmse:2.28557	valid-rmse:2.28948
[350]	train-rmse:2.28265	valid-rmse:2.28866
[400]	train-rmse:2.27942	valid-rmse:2.28794
[450]	train-rmse:2.27653	valid-rmse:2.28753
[499]	train-rmse:2.27386	valid-rmse:2.28704
ps_car_02_cat
[0]	train-rmse:0.476072	valid-rmse:0.476008
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[50]	train-rmse:0.352787	valid-rmse:0.352271
[100]	train-rmse:0.351931	valid-rmse:0.351887
[150]	train-rmse:0.351393	valid-rmse:0.351828
[200]	train-rmse:0.350939	valid-rm