In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb

print('loading aisles...')
aisles = pd.read_csv('aisles.csv', dtype={
        'aisle_id': np.uint16,
        'aisle': 'category'})

print('loading department...')
department = pd.read_csv('departments.csv', dtype={
            'department_id': np.uint8,
            'department': 'category'})

print('loading products...')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.uint32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

print('loading prior orders...')
prior = pd.read_csv('order_products__prior.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16})

print('loading train orders...')
train = pd.read_csv('order_products__train.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint8})

print('loading orders...')
order = pd.read_csv('orders.csv' , dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint16,
        'days_since_prior_order': np.float32})



loading aisles...
loading department...
loading products...
loading prior orders...
loading train orders...
loading orders...


In [2]:
train_orders = order[order.eval_set == 'train']
test_orders = order[order.eval_set == 'test']
prior_orders = order[order.eval_set == 'prior']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

order.set_index('order_id', inplace=True, drop=False)
prior = prior.join(order, on='order_id', rsuffix='_')
prior.drop('order_id_', inplace=True, axis=1)
prior.set_index('order_id', inplace=True, drop=False)

print('Construct products information...')
prods = pd.DataFrame()
prods['total_nb'] = prior.groupby(prior.product_id).size().astype(np.uint32)
prods['nb_reorder'] = prior.groupby(prior.product_id)['reordered'].sum().astype(np.uint32)
prods['reorder_rate'] = prods.nb_reorder / prods.total_nb.astype(np.float32)
prods['nb_buyers'] = prior.groupby(prior.product_id)['user_id'].apply(lambda x: len(set(x))).astype(np.uint16) # unique buyers
prods['avg_add_to_cart_order'] = prior.groupby(prior.product_id)['add_to_cart_order'].mean().astype(np.uint8)
prods['nb_orders'] = prior.groupby(prior.product_id).size().astype(np.uint16)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
ords_pri = pd.DataFrame()
ords_pri['order_id'] = prior.groupby(prior.order_id)['order_id'].apply(lambda x: x.iloc[0])
ords_pri.set_index('order_id', drop=False, inplace=True)
ords_pri['nb_items'] = prior.groupby(prior.order_id)['product_id'].size().astype(np.uint8)
ords_pri['first_item_id'] = prior.groupby(prior.order_id)['product_id'].apply(lambda x: x.iloc[0])
ords_pri['first_item_reorder'] = prior.groupby(prior.order_id)['reordered'].apply(lambda x: x.iloc[0])
ords_pri['nb_reorder'] = prior.groupby(prior.order_id)['reordered'].sum()
ords_pri['reorder_ratio'] = (ords_pri['nb_reorder'] / ords_pri['nb_items']).astype(np.float32)
users = pd.DataFrame()
users['user_id'] = prior.groupby('user_id')['user_id'].apply(lambda x: x.iloc[0])
users['nb_order'] = order[order.eval_set == 'prior'].groupby('user_id').size().astype(np.uint16)
users['avg_days_between_order'] = prior.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = prior.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = prior.groupby('user_id').size().astype(np.uint16)
users['all_products'] = prior.groupby('user_id')['product_id'].apply(set) # apply 对每个行或者列调用一次函数
users['nb_distinct_items'] = (users['all_products'].map(len)).astype(np.uint16) #map 对每个元素(element-wise)调用一次函数
users['average_basket'] = (users.nb_total_items / users.nb_order).astype(np.float32)
users['min_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(min).astype(np.uint8)
users['max_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(max).astype(np.uint8)
# Query data from ords
users['last_order_id'] = prior_orders.groupby(prior_orders.user_id)['user_id'].apply(lambda x: x.iloc[-1])
users['lo_nb_products'] = users.last_order_id.map(ords_pri.nb_items)
users['lo_first_item_id'] = users.last_order_id.map(ords_pri.first_item_id)
users['lo_first_item_reorder'] = users.last_order_id.map(ords_pri.first_item_reorder)
users['lo_nb_reorder'] = users.last_order_id.map(ords_pri.nb_reorder)
users['lo_reorder_ratio'] = users.last_order_id.map(ords_pri.reorder_ratio)

users['last_2_order_id'] = prior_orders.groupby(prior_orders.user_id)['user_id'].apply(lambda x: x.iloc[-2])
users['lo2_nb_products'] = users.last_2_order_id.map(ords_pri.nb_items)
users['lo2_first_item_id'] = users.last_2_order_id.map(ords_pri.first_item_id)
users['lo2_first_item_reorder'] = users.last_2_order_id.map(ords_pri.first_item_reorder)
users['lo2_nb_reorder'] = users.last_2_order_id.map(ords_pri.nb_reorder)
users['lo2_reorder_ratio'] = users.last_2_order_id.map(ords_pri.reorder_ratio)

users['last_3_order_id'] = prior_orders.groupby(prior_orders.user_id)['user_id'].apply(lambda x: x.iloc[-3])
users['lo3_nb_products'] = users.last_3_order_id.map(ords_pri.nb_items)
users['lo3_first_item_id'] = users.last_3_order_id.map(ords_pri.first_item_id)
users['lo3_first_item_reorder'] = users.last_3_order_id.map(ords_pri.first_item_reorder)
users['lo3_nb_reorder'] = users.last_3_order_id.map(ords_pri.nb_reorder)
users['lo3_reorder_ratio'] = users.last_3_order_id.map(ords_pri.reorder_ratio)

del ords_pri

prior['user_product_index'] = (prior.user_id.astype(np.uint64) * 100000\
                               + prior.product_id).astype(np.uint64)
d = dict()
for row in prior.itertuples():
    k = row.user_product_index
    if k not in d:
        d[k] = (1, \
                row.add_to_cart_order, \
                row.reordered, \
                (row.order_number, row.order_id),\
                row.order_dow, \
                row.order_hour_of_day, \
                row.add_to_cart_order, \
                row.add_to_cart_order)
    else:
        d[k] = (d[k][0]+1, d[k][1]+row.add_to_cart_order, \
                d[k][2]+row.reordered, \
                # find last order with that product
                max(d[k][3], (row.order_number, row.order_id)), \
                d[k][4]+row.order_dow, \
                d[k][5]+row.order_hour_of_day, \
                min(d[k][6], row.add_to_cart_order), \
                max(d[k][7], row.add_to_cart_order))
UserProduct = pd.DataFrame.from_dict(d, orient='index')
del d
UserProduct.columns = ['nb_orders', 'sum_add_to_cart_order', 'nb_reordered', \
                      'last_order_id', 'sum_order_dow', 'sum_order_hour_of_day', \
                      'min_add_to_cart_order', 'max_add_to_cart_order']
UserProduct['nb_orders'] = UserProduct.nb_orders.astype(np.uint16) 
UserProduct['sum_add_to_cart_order'] = UserProduct.sum_add_to_cart_order.astype(np.uint16)
UserProduct['nb_reordered'] = UserProduct.nb_reordered.astype(np.uint16)
UserProduct['last_order_id'] = UserProduct.last_order_id.map(lambda x: x[1]).astype(np.uint32)
UserProduct['sum_order_dow'] = UserProduct.sum_order_dow.astype(np.uint16)
UserProduct['sum_order_hour_of_day'] = UserProduct.sum_order_hour_of_day.astype(np.uint32)
UserProduct['min_add_to_cart_order'] = UserProduct.min_add_to_cart_order.astype(np.uint8)
UserProduct['max_add_to_cart_order'] = UserProduct.max_add_to_cart_order.astype(np.uint8)

Construct products information...


In [3]:
def gen_features(orders, labels_out=False):
    print('generate features and labels(optional) from selected orders')
    count=0
    product_list = []
    order_list = []
    labels = []
    for row in orders.itertuples():
        count+=1
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_out:
            labels += [(order_id, product) in train.index for product in user_products]
        if count%10000 == 0:
            print('order row', count)
            
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print("user related features<prior>")
    df['user_id'] = df.order_id.map(order.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_order)
    df['user_total_items'] = df.user_id.map(users.nb_total_items)
    df['user_distinct_items'] = df.user_id.map(users.nb_distinct_items)
    df['user_avg_days_between_orders'] = df.user_id.map(users.avg_days_between_order)
    df['user_avg_basket'] = df.user_id.map(users.average_basket)
    df['user_min_days_of_week'] = df.user_id.map(users.min_days_of_week)
    df['user_max_days_of_week'] = df.user_id.map(users.max_days_of_week)
    
    df['user_last_order_id'] = df.user_id.map(users.last_order_id)
    df['user_lo_dow'] = df.user_last_order_id.map(order.order_dow)
    df['user_lo_hour_of_day'] = df.user_last_order_id.map(order.order_hour_of_day)
    df['user_lo_day_since_prior'] = df.user_last_order_id.map(order.days_since_prior_order)
    df['user_lo_nb_products'] = df.user_id.map(users.lo_nb_products)
    df['user_lo_first_item_id'] = df.user_id.map(users.lo_first_item_id)
    df['user_lo_first_item_reorder'] = df.user_id.map(users.lo_first_item_reorder)
    df['user_lo_nb_reorder'] = df.user_id.map(users.lo_nb_reorder)
    df['user_lo_reorder_ratio'] = df.user_id.map(users.lo_reorder_ratio)
    
    df['user_last2_order_id'] = df.user_id.map(users.last_2_order_id)
    df['user_lo2_dow'] = df.user_last2_order_id.map(order.order_dow)
    df['user_lo2_hour_of_day'] = df.user_last2_order_id.map(order.order_hour_of_day)
    df['user_lo2_day_since_prior'] = df.user_last2_order_id.map(order.days_since_prior_order)
    df['user_lo2_nb_products'] = df.user_id.map(users.lo2_nb_products)
    df['user_lo2_first_item_id'] = df.user_id.map(users.lo2_first_item_id)
    df['user_lo2_first_item_reorder'] = df.user_id.map(users.lo2_first_item_reorder)
    df['user_lo2_nb_reorder'] = df.user_id.map(users.lo2_nb_reorder)
    df['user_lo2_reorder_ratio'] = df.user_id.map(users.lo2_reorder_ratio)
    
    df['user_last3_order_id'] = df.user_id.map(users.last_3_order_id)
    df['user_lo3_dow'] = df.user_last3_order_id.map(order.order_dow)
    df['user_lo3_hour_of_day'] = df.user_last3_order_id.map(order.order_hour_of_day)
    df['user_lo3_day_since_prior'] = df.user_last3_order_id.map(order.days_since_prior_order)
    df['user_lo3_nb_products'] = df.user_id.map(users.lo3_nb_products)
    df['user_lo3_first_item_id'] = df.user_id.map(users.lo3_first_item_id)
    df['user_lo3_first_item_reorder'] = df.user_id.map(users.lo3_first_item_reorder)
    df['user_lo3_nb_reorder'] = df.user_id.map(users.lo3_nb_reorder)
    df['user_lo3_reorder_ratio'] = df.user_id.map(users.lo3_reorder_ratio)
    
    print("product related features<prior>")
    df['product_aisle_id'] = df.product_id.map(products.aisle_id)
    df['product_department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.total_nb)
    df['product_reorders'] = df.product_id.map(products.nb_reorder)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['product_nb_buyers'] = df.product_id.map(products.nb_buyers)
    df['product_avg_add_to_cart_order'] = df.product_id.map(products.avg_add_to_cart_order)
    df['nb_orders'] = df.product_id.map(products.nb_orders)
    
    print("order related features<train>")
    df['order_hour_of_day'] = df.order_id.map(order.order_hour_of_day)
    df['order_days_since_prior_order'] = df.order_id.map(order.days_since_prior_order)
    df['order_day_of_week'] = df.order_id.map(order.order_dow)
    
    print("userXproduct related features<prior>")
    # 1.nb_orders, 2.sum_add_to_cart_order, 3.nb_reordered, \
    # 4.last_order_id, 5.sum_order_dow, 6.sum_order_hour_of_day
    df['UP'] = df.product_id+df.user_id.astype(np.uint64)*100000
    df['UP_nb_orders'] = df.UP.map(UserProduct.nb_orders)
    df['UP_avg_add_to_cart_order'] = df.UP.map(UserProduct.sum_add_to_cart_order)\
                                    / df.UP_nb_orders
    df['UP_nb_reordered'] = df.UP.map(UserProduct.nb_reordered)
    df['UP_reorder_ratio'] = (df.UP_nb_reordered / df.UP_nb_orders).astype(np.float32)
    df['UP_last_order_id'] = df.UP.map(UserProduct.last_order_id)
    df['UP_avg_order_dow'] = (df.UP.map(UserProduct.sum_order_dow)\
                              / df.UP_nb_orders).astype(np.float32)
    df['UP_avg_order_hour_of_day'] = (df.UP.map(UserProduct.sum_order_hour_of_day) / \
                                    df.UP_nb_orders).astype(np.float32)
    df['UP_order_ratio'] = (df.UP_nb_orders / df.user_total_orders).astype(np.float32)
    df['UP_order_since_last'] = df.user_total_orders - \
                                df.UP_last_order_id.map(order.order_number)
    #最后一次买该产品和该订单-相同产品相隔的时间(没有算日期。。。)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(\
                                       order.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_min_add_to_cart_order'] = df.UP.map(UserProduct.min_add_to_cart_order)
    df['UP_max_add_to_cart_order'] = df.UP.map(UserProduct.max_add_to_cart_order)
    df.drop('UP', axis=1, inplace=True)
    
#     print(df.dtypes)
#     print(df.memory_usage())
    return(df, labels)

In [4]:
print('train order size: ', train_orders.shape)
df_train, labels = gen_features(train_orders, labels_out=True)
df_train['label'] = pd.Series(labels, dtype=np.int8)
print('test order size: ', test_orders.shape)
df_test, _ = gen_features(test_orders)
del order
del UserProduct
del products
df_train['label'] = pd.Series(labels, dtype=np.int8)

user_id_list=users.index.tolist()
nb_user = len(user_id_list)
val_nb_user = nb_user // 10

train order size:  (131209, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>
test order size:  (75000, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>


In [5]:
for nb in range(10):
    print("Doing Cross %d Validation"% nb)
    val_start = nb*val_nb_user
    val_end = (nb+1)*val_nb_user
    if nb==9:
        val_end = nb_user
    print("validation start: %d, end: %d" % (val_start, val_end))
    train_user_ids = np.concatenate((user_id_list[:val_start],\
                                    user_id_list[val_end:]))
    val_user_ids = user_id_list[val_start:val_end]
    print("Split train and valid data/label by user_id")
    sub_df_val = df_train[df_train.user_id.isin(val_user_ids)]
    sub_df_train = df_train[df_train.user_id.isin(train_user_ids)]
    sub_train_label = np.array(sub_df_train['label'])
    sub_val_label = np.array(sub_df_val['label'])
    sub_df_train.drop('label', axis=1, inplace=True)
    sub_df_val.drop('label', axis=1, inplace=True)
    params={
    'booster':'gbtree',
    'objective': 'binary:logistic', 
    'eval_metric': 'logloss',
    'gamma':0.7,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth':10, # 构建树的深度，越大越容易过拟合
    'lambda':10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample':0.76, # 随机采样训练样本
    'colsample_bytree':0.95, # 生成树时进行的列采样
    'min_child_weight':10,  
    'silent':0 ,#设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.1, # 如同学习率
    'seed':77,
    'nthread':8,# cpu 线程数
    }
    train = np.array(sub_df_train)
    valid = np.array(sub_df_val)
    n = 150
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train, label=sub_train_label)
    xgval = xgb.DMatrix(valid, label=sub_val_label)
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(plst, xgtrain, n, watchlist, early_stopping_rounds=100)
    model.save_model('CV_0724_'+str(nb)+'.model')

Doing Cross 0 Validation
validation start: 0, end: 20620
Split train and valid data/label by user_id


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


[0]	train-logloss:0.625364	val-logloss:0.625497
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 100 rounds.
[1]	train-logloss:0.569941	val-logloss:0.570179
[2]	train-logloss:0.523931	val-logloss:0.52427
[3]	train-logloss:0.48533	val-logloss:0.485775
[4]	train-logloss:0.452657	val-logloss:0.453206
[5]	train-logloss:0.424819	val-logloss:0.425455
[6]	train-logloss:0.400972	val-logloss:0.401682
[7]	train-logloss:0.380464	val-logloss:0.381236
[8]	train-logloss:0.362761	val-logloss:0.363603
[9]	train-logloss:0.347494	val-logloss:0.348398
[10]	train-logloss:0.334211	val-logloss:0.335188
[11]	train-logloss:0.32269	val-logloss:0.323743
[12]	train-logloss:0.312648	val-logloss:0.313753
[13]	train-logloss:0.303907	val-logloss:0.305076
[14]	train-logloss:0.296292	val-logloss:0.297543
[15]	train-logloss:0.289642	val-logloss:0.290947
[16]	train-logloss:0.283839	val-logloss:0.285185
[17]	train-logloss:0.278779	val-

[10]	train-logloss:0.334393	val-logloss:0.335535
[11]	train-logloss:0.322825	val-logloss:0.324027
[12]	train-logloss:0.312763	val-logloss:0.314024
[13]	train-logloss:0.304009	val-logloss:0.305331
[14]	train-logloss:0.296376	val-logloss:0.297756
[15]	train-logloss:0.289718	val-logloss:0.291157
[16]	train-logloss:0.283917	val-logloss:0.285417
[17]	train-logloss:0.278849	val-logloss:0.280403
[18]	train-logloss:0.274423	val-logloss:0.276032
[19]	train-logloss:0.270554	val-logloss:0.272221
[20]	train-logloss:0.26717	val-logloss:0.26889
[21]	train-logloss:0.264418	val-logloss:0.266224
[22]	train-logloss:0.261825	val-logloss:0.26368
[23]	train-logloss:0.259559	val-logloss:0.261472
[24]	train-logloss:0.257589	val-logloss:0.25955
[25]	train-logloss:0.25585	val-logloss:0.257872
[26]	train-logloss:0.25434	val-logloss:0.256412
[27]	train-logloss:0.253018	val-logloss:0.255144
[28]	train-logloss:0.251975	val-logloss:0.254177
[29]	train-logloss:0.250951	val-logloss:0.25321
[30]	train-logloss:0.250049

[22]	train-logloss:0.261921	val-logloss:0.262405
[23]	train-logloss:0.259665	val-logloss:0.260198
[24]	train-logloss:0.257702	val-logloss:0.258276
[25]	train-logloss:0.255977	val-logloss:0.256594
[26]	train-logloss:0.254462	val-logloss:0.255134
[27]	train-logloss:0.253138	val-logloss:0.253864
[28]	train-logloss:0.251992	val-logloss:0.252762
[29]	train-logloss:0.25099	val-logloss:0.251806
[30]	train-logloss:0.250109	val-logloss:0.25098
[31]	train-logloss:0.249313	val-logloss:0.250262
[32]	train-logloss:0.248616	val-logloss:0.249628
[33]	train-logloss:0.248017	val-logloss:0.249088
[34]	train-logloss:0.247476	val-logloss:0.248616
[35]	train-logloss:0.246996	val-logloss:0.248204
[36]	train-logloss:0.246595	val-logloss:0.247858
[37]	train-logloss:0.246208	val-logloss:0.247539
[38]	train-logloss:0.245881	val-logloss:0.247281
[39]	train-logloss:0.245588	val-logloss:0.247051
[40]	train-logloss:0.245305	val-logloss:0.246833
[41]	train-logloss:0.245054	val-logloss:0.246647
[42]	train-logloss:0.2

[35]	train-logloss:0.247133	val-logloss:0.247479
[36]	train-logloss:0.246749	val-logloss:0.247167
[37]	train-logloss:0.246371	val-logloss:0.246847
[38]	train-logloss:0.246021	val-logloss:0.246543
[39]	train-logloss:0.245722	val-logloss:0.246292
[40]	train-logloss:0.245457	val-logloss:0.246086
[41]	train-logloss:0.245193	val-logloss:0.245898
[42]	train-logloss:0.244976	val-logloss:0.245733
[43]	train-logloss:0.244751	val-logloss:0.245568
[44]	train-logloss:0.244564	val-logloss:0.245442
[45]	train-logloss:0.244384	val-logloss:0.245319
[46]	train-logloss:0.244224	val-logloss:0.245215
[47]	train-logloss:0.24408	val-logloss:0.245125
[48]	train-logloss:0.243924	val-logloss:0.245047
[49]	train-logloss:0.243797	val-logloss:0.244969
[50]	train-logloss:0.243675	val-logloss:0.24491
[51]	train-logloss:0.243557	val-logloss:0.244847
[52]	train-logloss:0.243423	val-logloss:0.244793
[53]	train-logloss:0.243305	val-logloss:0.244737
[54]	train-logloss:0.243156	val-logloss:0.244686
[55]	train-logloss:0.2

[47]	train-logloss:0.244095	val-logloss:0.245375
[48]	train-logloss:0.243939	val-logloss:0.245286
[49]	train-logloss:0.243767	val-logloss:0.245196
[50]	train-logloss:0.243624	val-logloss:0.245117
[51]	train-logloss:0.243508	val-logloss:0.245063
[52]	train-logloss:0.243382	val-logloss:0.244996
[53]	train-logloss:0.243253	val-logloss:0.244923
[54]	train-logloss:0.243137	val-logloss:0.244876
[55]	train-logloss:0.243005	val-logloss:0.244827
[56]	train-logloss:0.242879	val-logloss:0.244788
[57]	train-logloss:0.24276	val-logloss:0.244748
[58]	train-logloss:0.242678	val-logloss:0.244717
[59]	train-logloss:0.242597	val-logloss:0.244689
[60]	train-logloss:0.242497	val-logloss:0.244673
[61]	train-logloss:0.24242	val-logloss:0.24465
[62]	train-logloss:0.242345	val-logloss:0.244626
[63]	train-logloss:0.24225	val-logloss:0.244595
[64]	train-logloss:0.24215	val-logloss:0.244574
[65]	train-logloss:0.242079	val-logloss:0.244553
[66]	train-logloss:0.242004	val-logloss:0.244536
[67]	train-logloss:0.2419

[59]	train-logloss:0.24232	val-logloss:0.246409
[60]	train-logloss:0.242263	val-logloss:0.246395
[61]	train-logloss:0.242161	val-logloss:0.24638
[62]	train-logloss:0.242071	val-logloss:0.246355
[63]	train-logloss:0.242	val-logloss:0.246324
[64]	train-logloss:0.241897	val-logloss:0.246299
[65]	train-logloss:0.241802	val-logloss:0.246287
[66]	train-logloss:0.241714	val-logloss:0.246267
[67]	train-logloss:0.241634	val-logloss:0.246249
[68]	train-logloss:0.241516	val-logloss:0.246231
[69]	train-logloss:0.241436	val-logloss:0.246211
[70]	train-logloss:0.241343	val-logloss:0.246195
[71]	train-logloss:0.24124	val-logloss:0.246187
[72]	train-logloss:0.241154	val-logloss:0.246179
[73]	train-logloss:0.241091	val-logloss:0.24617
[74]	train-logloss:0.241021	val-logloss:0.246163
[75]	train-logloss:0.240974	val-logloss:0.246155
[76]	train-logloss:0.2409	val-logloss:0.246146
[77]	train-logloss:0.240793	val-logloss:0.246137
[78]	train-logloss:0.240728	val-logloss:0.246125
[79]	train-logloss:0.240667	v

[72]	train-logloss:0.241361	val-logloss:0.245682
[73]	train-logloss:0.241253	val-logloss:0.245687
[74]	train-logloss:0.24114	val-logloss:0.245684
[75]	train-logloss:0.24109	val-logloss:0.245673
[76]	train-logloss:0.241047	val-logloss:0.245669
[77]	train-logloss:0.240938	val-logloss:0.245678
[78]	train-logloss:0.240859	val-logloss:0.245665
[79]	train-logloss:0.240758	val-logloss:0.245655
[80]	train-logloss:0.240676	val-logloss:0.245656
[81]	train-logloss:0.240612	val-logloss:0.245644
[82]	train-logloss:0.240524	val-logloss:0.245639
[83]	train-logloss:0.240456	val-logloss:0.245626
[84]	train-logloss:0.240418	val-logloss:0.245617
[85]	train-logloss:0.240328	val-logloss:0.245613
[86]	train-logloss:0.240231	val-logloss:0.245606
[87]	train-logloss:0.24014	val-logloss:0.2456
[88]	train-logloss:0.24003	val-logloss:0.245578
[89]	train-logloss:0.23997	val-logloss:0.245574
[90]	train-logloss:0.239919	val-logloss:0.245568
[91]	train-logloss:0.239846	val-logloss:0.245578
[92]	train-logloss:0.239767

[85]	train-logloss:0.24035	val-logloss:0.246184
[86]	train-logloss:0.240269	val-logloss:0.246182
[87]	train-logloss:0.240194	val-logloss:0.246177
[88]	train-logloss:0.24012	val-logloss:0.246151
[89]	train-logloss:0.240044	val-logloss:0.246156
[90]	train-logloss:0.23997	val-logloss:0.24615
[91]	train-logloss:0.239929	val-logloss:0.246147
[92]	train-logloss:0.239888	val-logloss:0.246148
[93]	train-logloss:0.23983	val-logloss:0.246142
[94]	train-logloss:0.239728	val-logloss:0.246115
[95]	train-logloss:0.239671	val-logloss:0.246118
[96]	train-logloss:0.239599	val-logloss:0.246114
[97]	train-logloss:0.239543	val-logloss:0.246115
[98]	train-logloss:0.239433	val-logloss:0.246109
[99]	train-logloss:0.23936	val-logloss:0.246088
[100]	train-logloss:0.239269	val-logloss:0.246087
[101]	train-logloss:0.239226	val-logloss:0.246075
[102]	train-logloss:0.239179	val-logloss:0.246077
[103]	train-logloss:0.239097	val-logloss:0.246054
[104]	train-logloss:0.239047	val-logloss:0.246046
[105]	train-logloss:0

[97]	train-logloss:0.239208	val-logloss:0.24799
[98]	train-logloss:0.239149	val-logloss:0.247987
[99]	train-logloss:0.239065	val-logloss:0.247977
[100]	train-logloss:0.239002	val-logloss:0.247981
[101]	train-logloss:0.23895	val-logloss:0.247968
[102]	train-logloss:0.238872	val-logloss:0.247972
[103]	train-logloss:0.238835	val-logloss:0.247967
[104]	train-logloss:0.238776	val-logloss:0.247968
[105]	train-logloss:0.238694	val-logloss:0.247964
[106]	train-logloss:0.238642	val-logloss:0.247963
[107]	train-logloss:0.238558	val-logloss:0.247957
[108]	train-logloss:0.238493	val-logloss:0.247946
[109]	train-logloss:0.238387	val-logloss:0.247951
[110]	train-logloss:0.238311	val-logloss:0.24796
[111]	train-logloss:0.238241	val-logloss:0.247967
[112]	train-logloss:0.238177	val-logloss:0.247949
[113]	train-logloss:0.23814	val-logloss:0.247954
[114]	train-logloss:0.238072	val-logloss:0.247947
[115]	train-logloss:0.237998	val-logloss:0.247946
[116]	train-logloss:0.23795	val-logloss:0.247945
[117]	tr

[109]	train-logloss:0.238607	val-logloss:0.245153
[110]	train-logloss:0.238529	val-logloss:0.245155
[111]	train-logloss:0.238476	val-logloss:0.245165
[112]	train-logloss:0.238436	val-logloss:0.24516
[113]	train-logloss:0.238396	val-logloss:0.245159
[114]	train-logloss:0.238349	val-logloss:0.245158
[115]	train-logloss:0.238284	val-logloss:0.245159
[116]	train-logloss:0.238229	val-logloss:0.245148
[117]	train-logloss:0.238136	val-logloss:0.245127
[118]	train-logloss:0.238077	val-logloss:0.24513
[119]	train-logloss:0.238	val-logloss:0.245135
[120]	train-logloss:0.237933	val-logloss:0.245132
[121]	train-logloss:0.237899	val-logloss:0.245132
[122]	train-logloss:0.237831	val-logloss:0.245135
[123]	train-logloss:0.237769	val-logloss:0.245135
[124]	train-logloss:0.237723	val-logloss:0.245124
[125]	train-logloss:0.237666	val-logloss:0.245129
[126]	train-logloss:0.237592	val-logloss:0.245131
[127]	train-logloss:0.237549	val-logloss:0.245134
[128]	train-logloss:0.237473	val-logloss:0.245174
[129]

In [6]:
df_test_array = np.array(df_test)
xgtest = xgb.DMatrix(df_test_array)
xgbst = xgb.Booster()
for i in range(10):
    print('xgboost predict from model',str(i))
    xgbst.load_model('./CV_0724_'+str(i)+'.model')
    if i == 0:
        preds = xgbst.predict(xgtest)
    else:
        preds += xgbst.predict(xgtest)
preds = preds / 10.
df_test['pred'] = preds
THRESHOLD=0.2
d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

tst = pd.DataFrame.from_dict(d, orient='index')
tst.reset_index(inplace=True)
tst.columns = ['order_id', 'products']
tst.to_csv('submission_60_features_THRESHOLD_0.2_10_fold_CV_0724.csv', index=False)

xgboost predict from model 0
xgboost predict from model 1
xgboost predict from model 2
xgboost predict from model 3
xgboost predict from model 4
xgboost predict from model 5
xgboost predict from model 6
xgboost predict from model 7
xgboost predict from model 8
xgboost predict from model 9
