# Santander Product Recommendation

* The aim is to
* Modeling: Ensemble LightGBM, XGboost, NN(5 layers)

In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler

from keras import models, layers
from keras.callbacks import EarlyStopping
import xgboost as xgb
import lightgbm as lgbm

Using TensorFlow backend.


In [2]:
with open('../input/meta_data.pkl', 'rb') as fin:
    meta = pickle.load(fin)

features = meta['features']
target = meta['target']
prods = meta['prods']

with open('../input/processed_data.pkl', 'rb') as finn:
    data = pickle.load(finn)

#validation data
tst_vld = data['tst_vld']
tst_all = data['tst_all']

#training data
trn = data['trn_all']

del meta, data

# Evaluation Method MAP@7.
<br>
<br>
actually purchased products

In [3]:
ncodpers_tst_vld = tst_vld['ncodpers'].values

def get_purchased_products():    

    # 검증 데이터에서 신규 구매를 구한다.
    for prod in prods:
        prev = prod + '_prev'
        padd = prod + '_add'
        tst_vld[padd] = tst_vld[prod] - tst_vld[prev]


    # 고객별 신규 구매 정답 값을 add_vld_list에 저장하고, 총 count를 count_vld에 저장한다.
    add_vld = tst_vld[[prod + '_add' for prod in prods]].values
    add_vld_list = [list() for i in range(len(ncodpers_tst_vld))]

    count_vld = 0
    for ncodper in range(len(ncodpers_tst_vld)):
        for prod in range(len(prods)):
            if add_vld[ncodper, prod] > 0:
                add_vld_list[ncodper].append(prod)
                count_vld += 1
                
    return add_vld_list

add_vld_list = get_purchased_products()

In [4]:
def predict_7_products(preds_vld):
    # 검증 데이터 예측 상위 7개를 추출한다.
    result_vld = []
    
    for ncodper, pred in zip(ncodpers_tst_vld, preds_vld):
        y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods[target], target)]
        y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
        result_vld.append([ip for y,p,ip in y_prods])
    
    return result_vld

Evaluation method is MAP @ 7.
It matches to 7

In [5]:
def apk(actual, predicted, k=7, default=0.0):
    # MAP@7 이므로, 최대 7개만 사용한다
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # 점수를 부여하는 조건은 다음과 같다 :
        # 예측값이 정답에 있고 (‘p in actual’)
        # 예측값이 중복이 아니면 (‘p not in predicted[:i]’) 
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # 정답값이 공백일 경우, 무조건 0.0점을 반환한다
    if not actual:
        return default

    # 정답의 개수(len(actual))로 average precision을 구한다
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # list of list인 정답값(actual)과 예측값(predicted)에서 고객별 Average Precision을 구하고, np.mean()을 통해 평균을 계산한다
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)]) 

In [6]:
# 검증 데이터에서 얻을 수 있는 MAP@7 최고점을 미리 구한다. (0.042613)
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

0.04266379915553903


For validation, pull out actual products list

# Train the models with validation data
<br>
trn_vld<br>
eval_vld<br>
tst_vld<br>

In [7]:
vld_date = '2016-05-28'

trn_vld = trn[trn['fecha_dato'] < vld_date]
eval_vld = trn[trn['fecha_dato']==vld_date]

X_trn_vld = trn_vld[features].values
y_trn_vld = trn_vld['target'].values

X_eval_vld = eval_vld[features].values
y_eval_vld = eval_vld['target'].values

In [8]:
X_trn_vld.shape, y_trn_vld.shape, X_eval_vld.shape, y_eval_vld.shape

((10765757, 60), (10765757,), (689132, 60), (689132,))

# 2) XGBoost Model

* validation data
X_trn_vld ,y_trn_vld , X_eval_vld , y_eval_vld

1) XGBoost Model Training with validation data

In [70]:
# XGBoost 모델 parameter를 설정한다.
param_xgb = {
    #'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': 17,
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    }

# 훈련, 검증 데이터를 XGBoost 형태로 변환한다.
dtrn = xgb.DMatrix(X_trn_vld, label=y_trn_vld, feature_names=features)
dvld = xgb.DMatrix(X_eval_vld, label=y_eval_vld, feature_names=features)

# XGBoost 모델을 훈련 데이터로 학습한다!
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model_xgb = xgb.train(param_xgb, dtrn, num_boost_round=100, evals=watch_list, early_stopping_rounds=10)
best_ntree_limit = model_xgb.best_ntree_limit


# 학습한 모델을 저장한다.
pickle.dump(model_xgb, open("../model/xgb.pkl", "wb"))

[0]	train-mlogloss:2.11779	eval-mlogloss:2.09076
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:1.82085	eval-mlogloss:1.75636
[2]	train-mlogloss:1.63405	eval-mlogloss:1.52409
[3]	train-mlogloss:1.44439	eval-mlogloss:1.34769
[4]	train-mlogloss:1.30098	eval-mlogloss:1.2078
[5]	train-mlogloss:1.21081	eval-mlogloss:1.08793
[6]	train-mlogloss:1.11172	eval-mlogloss:0.989476
[7]	train-mlogloss:1.02199	eval-mlogloss:0.903919
[8]	train-mlogloss:0.948211	eval-mlogloss:0.828923
[9]	train-mlogloss:0.892847	eval-mlogloss:0.763686
[10]	train-mlogloss:0.834503	eval-mlogloss:0.705944
[11]	train-mlogloss:0.791048	eval-mlogloss:0.655371
[12]	train-mlogloss:0.748521	eval-mlogloss:0.610169
[13]	train-mlogloss:0.713106	eval-mlogloss:0.569988
[14]	train-mlogloss:0.6793	eval-mlogloss:0.534392
[15]	train-mlogloss:0.642269	eval-mlogloss:0.501605
[16]	train-mlogloss:0.617519	eval-mlogloss:0

In [71]:
# 검증 데이터에 대한 예측 값을 구한다.
X_tst_vld = tst_vld[features].values
X_tst_vld = xgb.DMatrix(X_tst_vld, feature_names=features)
preds_vld_xgb = model_xgb.predict(X_tst_vld, ntree_limit=best_ntree_limit)



ValueError: Unable to coerce to DataFrame, shape must be (696539, 24): given (696539, 17)

In [72]:
preds_vld_xgb_16 = np.delete(preds_vld_xgb, 16, axis=1)


(696539, 16)

In [74]:
# 저번 달에 보유한 제품은 신규 구매가 불가하기 때문에, 확률값에서 미리 1을 빼준다
preds_vld_xgb_16 = preds_vld_xgb_16 - tst_vld[[prod+'_prev' for prod in prods[target]]]

In [75]:
result_xgb = predict_7_products(preds_vld_xgb_16.values)

# 검증 데이터에서의 MAP@7 점수를 구한다. (0.03609679275470783)
print(mapk(add_vld_list, result_xgb, 7, 0.0))

0.03609679275470783


# 3) lightGBM Model Training with validation data

X_trn_vld ,y_trn_vld , X_eval_vld , y_eval_vld

In [9]:
train = lgbm.Dataset(X_trn_vld, label=y_trn_vld, feature_name=features)
validate = lgbm.Dataset(X_eval_vld, label=y_eval_vld, feature_name=features, reference=train)

# 다양한 실험을 통해 얻은 최적의 학습 parameter
params_lgb = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'multiclass',
    'num_class': 17,
    'metric' : {'multi_logloss'},
    'is_training_metric': True,
    'max_bin': 255,
    'num_leaves' : 64,
    'learning_rate' : 0.1,
    'feature_fraction' : 0.8,
    'min_data_in_leaf': 10,
    'min_sum_hessian_in_leaf': 5,
    # 'num_threads': 16,
}

# XGBoost와 동일하게 훈련/검증 데이터를 기반으로 최적의 트리 개수를 계산한다
model_lgb = lgbm.train(params_lgb, train, num_boost_round=1000, valid_sets=validate, early_stopping_rounds=20)
best_iteration = model_lgb.best_iteration
# 학습된 모델과 최적의 트리 개수 정보를 저장한다
model_lgb.save_model("../model/lgbm.model.txt")
pickle.dump(best_iteration, open("../model/lgbm.model.meta", "wb"))

[1]	valid_0's multi_logloss: 0.367338
Training until validation scores don't improve for 20 rounds
[2]	valid_0's multi_logloss: 0.345466
[3]	valid_0's multi_logloss: 0.330157
[4]	valid_0's multi_logloss: 0.316865
[5]	valid_0's multi_logloss: 0.303929
[6]	valid_0's multi_logloss: 0.293882
[7]	valid_0's multi_logloss: 0.285786
[8]	valid_0's multi_logloss: 0.277765
[9]	valid_0's multi_logloss: 0.270836
[10]	valid_0's multi_logloss: 0.264364
[11]	valid_0's multi_logloss: 0.258532
[12]	valid_0's multi_logloss: 0.253337
[13]	valid_0's multi_logloss: 0.248596
[14]	valid_0's multi_logloss: 0.244373
[15]	valid_0's multi_logloss: 0.240536
[16]	valid_0's multi_logloss: 0.237038
[17]	valid_0's multi_logloss: 0.233878
[18]	valid_0's multi_logloss: 0.230963
[19]	valid_0's multi_logloss: 0.228342
[20]	valid_0's multi_logloss: 0.22607
[21]	valid_0's multi_logloss: 0.223855
[22]	valid_0's multi_logloss: 0.221916
[23]	valid_0's multi_logloss: 0.22005
[24]	valid_0's multi_logloss: 0.218437
[25]	valid_0's

[208]	valid_0's multi_logloss: 0.194018
[209]	valid_0's multi_logloss: 0.194009
[210]	valid_0's multi_logloss: 0.193993
[211]	valid_0's multi_logloss: 0.193983
[212]	valid_0's multi_logloss: 0.193968
[213]	valid_0's multi_logloss: 0.193955
[214]	valid_0's multi_logloss: 0.193947
[215]	valid_0's multi_logloss: 0.193938
[216]	valid_0's multi_logloss: 0.193927
[217]	valid_0's multi_logloss: 0.193915
[218]	valid_0's multi_logloss: 0.193901
[219]	valid_0's multi_logloss: 0.193892
[220]	valid_0's multi_logloss: 0.193878
[221]	valid_0's multi_logloss: 0.193866
[222]	valid_0's multi_logloss: 0.193855
[223]	valid_0's multi_logloss: 0.193841
[224]	valid_0's multi_logloss: 0.193835
[225]	valid_0's multi_logloss: 0.193826
[226]	valid_0's multi_logloss: 0.193816
[227]	valid_0's multi_logloss: 0.193806
[228]	valid_0's multi_logloss: 0.193794
[229]	valid_0's multi_logloss: 0.193785
[230]	valid_0's multi_logloss: 0.193776
[231]	valid_0's multi_logloss: 0.193765
[232]	valid_0's multi_logloss: 0.193752


[414]	valid_0's multi_logloss: 0.192209
[415]	valid_0's multi_logloss: 0.192202
[416]	valid_0's multi_logloss: 0.192192
[417]	valid_0's multi_logloss: 0.192183
[418]	valid_0's multi_logloss: 0.192177
[419]	valid_0's multi_logloss: 0.192168
[420]	valid_0's multi_logloss: 0.192161
[421]	valid_0's multi_logloss: 0.19215
[422]	valid_0's multi_logloss: 0.192144
[423]	valid_0's multi_logloss: 0.192135
[424]	valid_0's multi_logloss: 0.192127
[425]	valid_0's multi_logloss: 0.192122
[426]	valid_0's multi_logloss: 0.192109
[427]	valid_0's multi_logloss: 0.192101
[428]	valid_0's multi_logloss: 0.192094
[429]	valid_0's multi_logloss: 0.19209
[430]	valid_0's multi_logloss: 0.192082
[431]	valid_0's multi_logloss: 0.192073
[432]	valid_0's multi_logloss: 0.192067
[433]	valid_0's multi_logloss: 0.192061
[434]	valid_0's multi_logloss: 0.192053
[435]	valid_0's multi_logloss: 0.192034
[436]	valid_0's multi_logloss: 0.192026
[437]	valid_0's multi_logloss: 0.192019
[438]	valid_0's multi_logloss: 0.192012
[4

[620]	valid_0's multi_logloss: 0.190909
[621]	valid_0's multi_logloss: 0.190904
[622]	valid_0's multi_logloss: 0.190897
[623]	valid_0's multi_logloss: 0.19089
[624]	valid_0's multi_logloss: 0.190884
[625]	valid_0's multi_logloss: 0.190879
[626]	valid_0's multi_logloss: 0.190874
[627]	valid_0's multi_logloss: 0.190868
[628]	valid_0's multi_logloss: 0.190864
[629]	valid_0's multi_logloss: 0.190858
[630]	valid_0's multi_logloss: 0.190852
[631]	valid_0's multi_logloss: 0.190843
[632]	valid_0's multi_logloss: 0.190838
[633]	valid_0's multi_logloss: 0.190831
[634]	valid_0's multi_logloss: 0.190824
[635]	valid_0's multi_logloss: 0.190817
[636]	valid_0's multi_logloss: 0.190809
[637]	valid_0's multi_logloss: 0.190802
[638]	valid_0's multi_logloss: 0.190796
[639]	valid_0's multi_logloss: 0.190795
[640]	valid_0's multi_logloss: 0.190792
[641]	valid_0's multi_logloss: 0.190787
[642]	valid_0's multi_logloss: 0.190781
[643]	valid_0's multi_logloss: 0.190778
[644]	valid_0's multi_logloss: 0.190773
[

[826]	valid_0's multi_logloss: 0.189858
[827]	valid_0's multi_logloss: 0.189853
[828]	valid_0's multi_logloss: 0.189841
[829]	valid_0's multi_logloss: 0.189836
[830]	valid_0's multi_logloss: 0.189835
[831]	valid_0's multi_logloss: 0.189833
[832]	valid_0's multi_logloss: 0.189828
[833]	valid_0's multi_logloss: 0.189825
[834]	valid_0's multi_logloss: 0.189819
[835]	valid_0's multi_logloss: 0.189815
[836]	valid_0's multi_logloss: 0.189811
[837]	valid_0's multi_logloss: 0.189805
[838]	valid_0's multi_logloss: 0.189797
[839]	valid_0's multi_logloss: 0.189794
[840]	valid_0's multi_logloss: 0.189791
[841]	valid_0's multi_logloss: 0.189786
[842]	valid_0's multi_logloss: 0.189782
[843]	valid_0's multi_logloss: 0.189779
[844]	valid_0's multi_logloss: 0.189775
[845]	valid_0's multi_logloss: 0.189771
[846]	valid_0's multi_logloss: 0.189767
[847]	valid_0's multi_logloss: 0.18976
[848]	valid_0's multi_logloss: 0.189756
[849]	valid_0's multi_logloss: 0.189752
[850]	valid_0's multi_logloss: 0.189747
[

In [34]:
# 검증 데이터에 대한 예측 값을 구한다.
preds_vld_lgb = model_lgb.predict(tst_vld[features], ntree_limit=best_iteration)

preds_vld_lgb_16 = np.delete(preds_vld_lgb, 16, axis=1)
preds_vld_lgb_16.shape

(696539, 16)

In [35]:
preds_vld_lgb_16 = preds_vld_lgb_16 - tst_vld[[prod+'_prev' for prod in prods[target]]]

In [41]:
result_lgb = predict_7_products(preds_vld_lgb_16.values)

# 검증 데이터에서의 MAP@7 점수를 구한다. (0.03661802042010113)
print(mapk(add_vld_list, result_lgb, 7, 0.0))

0.036521553551041475


# 1) Neural Network Model

All data should be between -1 and 1 to be trained, so we scaled numeric features before training the model.

In [9]:
scaler = StandardScaler().fit(trn[features])
X_trn_vld_norm = scaler.transform(X_trn_vld)

  return self.partial_fit(X, y)


In [13]:
X_eval_vld_norm = scaler.transform(X_eval_vld)

In [10]:
y_trn_vld_matrix = trn_vld[[prod for prod in prods[target]]].values
y_trn_vld_matrix.shape

(10765757, 16)

In [11]:
y_eval_vld_matrix = eval_vld[[prod for prod in prods[target]]].values
y_eval_vld_matrix.shape

(689132, 16)

In [12]:
model_nn = models.Sequential()
model_nn.add(layers.Dense(512, activation='relu', input_shape=(60,)))
model_nn.add(layers.Dropout(0.8))
model_nn.add(layers.Dense(512, activation='relu'))
model_nn.add(layers.Dropout(0.5))
model_nn.add(layers.Dense(16, activation='softmax'))

model_nn.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

model_nn.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               31232     
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                8208      
Total params: 302,096
Trainable params: 302,096
Non-trainable params: 0
___________________________________________________________

In [14]:
callback_list = [EarlyStopping(monitor='val_acc', patience = 10)]
model_nn.fit(X_trn_vld_norm, y_trn_vld_matrix, epochs=100, batch_size=64, callbacks=callback_list, validation_data=(X_eval_vld_norm ,y_eval_vld_matrix))

# 학습한 모델을 저장한다.
pickle.dump(model_nn, open("../model/neuralnetwork.pkl", "wb"))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 10765757 samples, validate on 689132 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [20]:
X_tst_vld_norm = scaler.transform(tst_vld[features])
vld_preds_nn = model_nn.predict(X_tst_vld_norm, batch_size=512)

  """Entry point for launching an IPython kernel.


In [68]:
def get_products_from_nn(preds_prod):
    result_vld = []

    for ncodper, prds in zip(ncodpers_tst_vld, preds_prod):
        r = [(ip,p) for ip, p in zip(target,prds) if p > 0]
        r = sorted(r, key=lambda a:a[1], reverse=True)[:7]
        result_vld.append([ip for ip,p in r])

    return result_vld

In [69]:
result_nn = get_products_from_nn(vld_preds_nn)

# 검증 데이터에서의 MAP@7 점수를 구한다. (0.036466)
mapk(add_vld_list, result_nn, 7, 0.0)

0.00884893083733709

# 4) Ensemble Model

1) lightGBM + XGBoost + NN
1) lightGBM + XGBoost
2) NN + lightGBM
3) NN + XGBoost 
* multiply neural network, lightGBM, xgb calculate sqrt

In [89]:
# 곱셈 후, 제곱근을 구하는 방식으로 앙상블을 수행한다
preds_vld_ensemble = np.sqrt(np.multiply(np.multiply(preds_vld_xgb, preds_vld_lgb),vld_preds_nn))
result_ensemble = predict_7_products(preds_vld_ensemble.values)

# 검증 데이터에서의 MAP@7 점수를 구한다. (0.036466)
print(mapk(add_vld_list, result_ensemble, 7, 0.0))

  


0.018705250713864016


each result, visualisation

# Training with all data

In [33]:
# XGBoost 모델을 전체 훈련 데이터로 재학습한다!
X_all = XY.as_matrix(columns=features)
Y_all = XY.as_matrix(columns=['y'])
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

# XGBoost 모델 재학습!
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list )

# 변수 중요도를 출력해본다. 예상하던 변수가 상위로 올라와 있는가?
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)

  
  This is separate from the ipykernel package so we can avoid doing imports until


[0]	train-mlogloss:2.67857
[1]	train-mlogloss:2.43528
[2]	train-mlogloss:2.26705
[3]	train-mlogloss:2.13088
[4]	train-mlogloss:2.01841
[5]	train-mlogloss:1.93183
[6]	train-mlogloss:1.8522
[7]	train-mlogloss:1.78537
[8]	train-mlogloss:1.72515
[9]	train-mlogloss:1.67057
[10]	train-mlogloss:1.62535
[11]	train-mlogloss:1.5841
[12]	train-mlogloss:1.54635
[13]	train-mlogloss:1.51283
[14]	train-mlogloss:1.48278
[15]	train-mlogloss:1.45422
[16]	train-mlogloss:1.428
[17]	train-mlogloss:1.40507
[18]	train-mlogloss:1.38329
[19]	train-mlogloss:1.36347
[20]	train-mlogloss:1.34513
[21]	train-mlogloss:1.32845
[22]	train-mlogloss:1.31424
[23]	train-mlogloss:1.30034
[24]	train-mlogloss:1.28795
[25]	train-mlogloss:1.2754
[26]	train-mlogloss:1.26436
[27]	train-mlogloss:1.25375
[28]	train-mlogloss:1.24416
[29]	train-mlogloss:1.2353
[30]	train-mlogloss:1.2268
[31]	train-mlogloss:1.21864
[32]	train-mlogloss:1.21123
[33]	train-mlogloss:1.20451
[34]	train-mlogloss:1.19766
[35]	train-mlogloss:1.19173
[36]	trai

[288]	train-mlogloss:1.01417
[289]	train-mlogloss:1.01388
[290]	train-mlogloss:1.01358
[291]	train-mlogloss:1.01327
[292]	train-mlogloss:1.01301
[293]	train-mlogloss:1.01274
[294]	train-mlogloss:1.01243
[295]	train-mlogloss:1.01216
[296]	train-mlogloss:1.01192
[297]	train-mlogloss:1.01167
[298]	train-mlogloss:1.01145
[299]	train-mlogloss:1.01123
[300]	train-mlogloss:1.01094
[301]	train-mlogloss:1.01072
[302]	train-mlogloss:1.01048
[303]	train-mlogloss:1.01024
[304]	train-mlogloss:1.01003
[305]	train-mlogloss:1.00982
[306]	train-mlogloss:1.0096
[307]	train-mlogloss:1.00932
[308]	train-mlogloss:1.00906
[309]	train-mlogloss:1.00873
[310]	train-mlogloss:1.0085
[311]	train-mlogloss:1.00818
[312]	train-mlogloss:1.00798
[313]	train-mlogloss:1.00775
[314]	train-mlogloss:1.00747
[315]	train-mlogloss:1.0072
[316]	train-mlogloss:1.00698
[317]	train-mlogloss:1.00673
[318]	train-mlogloss:1.00642
[319]	train-mlogloss:1.00619
[320]	train-mlogloss:1.00591
[321]	train-mlogloss:1.00566
[322]	train-mlogl

[565]	train-mlogloss:0.956165
[566]	train-mlogloss:0.956041
[567]	train-mlogloss:0.955857
[568]	train-mlogloss:0.955697
[569]	train-mlogloss:0.955584
[570]	train-mlogloss:0.95544
[571]	train-mlogloss:0.955288
[572]	train-mlogloss:0.955145
[573]	train-mlogloss:0.954946
[574]	train-mlogloss:0.954797
[575]	train-mlogloss:0.954673
[576]	train-mlogloss:0.954503
[577]	train-mlogloss:0.954376
[578]	train-mlogloss:0.954198
[579]	train-mlogloss:0.954079
[580]	train-mlogloss:0.953948
[581]	train-mlogloss:0.953766
[582]	train-mlogloss:0.953572
[583]	train-mlogloss:0.953385
[584]	train-mlogloss:0.953281
[585]	train-mlogloss:0.953087
[586]	train-mlogloss:0.952973
[587]	train-mlogloss:0.952797
[588]	train-mlogloss:0.952588
[589]	train-mlogloss:0.952429
[590]	train-mlogloss:0.952232
[591]	train-mlogloss:0.952055
[592]	train-mlogloss:0.951889
[593]	train-mlogloss:0.951658
[594]	train-mlogloss:0.951503
[595]	train-mlogloss:0.951333
[596]	train-mlogloss:0.951209
[597]	train-mlogloss:0.951071
[598]	train

[840]	train-mlogloss:0.914456
[841]	train-mlogloss:0.914306
[842]	train-mlogloss:0.914154
[843]	train-mlogloss:0.914025
[844]	train-mlogloss:0.913873
[845]	train-mlogloss:0.913719
[846]	train-mlogloss:0.913594
[847]	train-mlogloss:0.913463
[848]	train-mlogloss:0.91334
[849]	train-mlogloss:0.913222
[850]	train-mlogloss:0.913061
[851]	train-mlogloss:0.912963
[852]	train-mlogloss:0.912862
[853]	train-mlogloss:0.912664
[854]	train-mlogloss:0.912547
[855]	train-mlogloss:0.91234
[856]	train-mlogloss:0.912205
[857]	train-mlogloss:0.912085
[858]	train-mlogloss:0.911944
[859]	train-mlogloss:0.911806
[860]	train-mlogloss:0.911662
[861]	train-mlogloss:0.911524
[862]	train-mlogloss:0.9114
[863]	train-mlogloss:0.911277
[864]	train-mlogloss:0.911126
[865]	train-mlogloss:0.911021
[866]	train-mlogloss:0.910905
[867]	train-mlogloss:0.910776
[868]	train-mlogloss:0.910671
[869]	train-mlogloss:0.910555
[870]	train-mlogloss:0.910414
[871]	train-mlogloss:0.910296
[872]	train-mlogloss:0.910207
[873]	train-ml

In [35]:
pickle.dump(model, open("../model/xgb.baseline_bestnumtree.pkl", "wb"))

* lightGBM training

In [None]:
# 전체 훈련 데이터에는 늘어난 양만큼 트리 개수를 늘린다
best_iteration = int(best_iteration * len(XY_all) / len(XY_train))

# 전체 훈련 데이터에 대한 LightGBM 전용 데이터를 생성한다
all_train = lgbm.Dataset(XY_all[list(features)], label=XY_all["y"], weight=XY_all["weight"], feature_name=features)

# LightGBM 모델 학습!
model = lgbm.train(params, all_train, num_boost_round=best_iteration)
model.save_model("../model/lgbm.all.model.txt")

# LightGBM 모델이 제공하는 변수 중요도 기능을 통해 변수 중요도를 출력한다
print("Feature importance by split:")
for kv in sorted([(k,v) for k,v in zip(features, model.feature_importance("split"))], key=lambda kv: kv[1], reverse=True):
print(kv)
print("Feature importance by gain:")
for kv in sorted([(k,v) for k,v in zip(features, model.feature_importance("gain"))], key=lambda kv: kv[1], reverse=True):
print(kv)

# 테스트 데이터에 대한 예측 결과물을 return한다
y_lightgbm = model.predict(test_df[list(features)], num_iteration=best_iteration)

# Predict

In [None]:
X_tst = tst[features].values

In [34]:
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst.as_matrix(columns=['ncodpers'])
preds_tst = preds_tst - tst.as_matrix(columns=[prod + '_prev' for prod in prods])

  
  """
  


# Predict Products for customers

In [None]:
# 제출 파일을 생성한다.
submit_file = open('../model/xgb_baseline_0731_1', 'w')
submit_file.write('ncodpers,added_products\n')
Y_ret=[]

for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    data = '{},{}\n'.format(int(ncodper), ' '.join(y_prods))
    Y_ret.append(data)
    submit_file.write(data)
    
submit_file.close()