In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb

# Read every CSV table with explicit, narrow column dtypes.

print('loading aisles...')
aisles = pd.read_csv(
    'aisles.csv',
    dtype=dict(aisle_id=np.uint16, aisle='category'),
)

print('loading department...')
department = pd.read_csv(
    'departments.csv',
    dtype=dict(department_id=np.uint8, department='category'),
)

print('loading products...')
products = pd.read_csv(
    'products.csv',
    dtype=dict(
        product_id=np.uint16,
        order_id=np.uint32,
        aisle_id=np.uint8,
        department_id=np.uint8,
    ),
)

# Item-level rows of the "prior" orders.
print('loading prior orders...')
prior = pd.read_csv(
    'order_products__prior.csv',
    dtype=dict(
        order_id=np.uint32,
        product_id=np.uint16,
        add_to_cart_order=np.uint16,
        reordered=np.uint16,
    ),
)

# Item-level rows of the "train" orders.
print('loading train orders...')
train = pd.read_csv(
    'order_products__train.csv',
    dtype=dict(
        order_id=np.uint32,
        product_id=np.uint16,
        add_to_cart_order=np.uint16,
        reordered=np.uint8,
    ),
)

# Order-level table; read from the NaN-filled copy rather than a raw orders file.
print('loading orders...')
order = pd.read_csv(
    'orders_with_NaN_estimate.csv',
    dtype=dict(
        order_id=np.uint32,
        user_id=np.uint32,
        eval_set='category',
        order_number=np.uint16,
        order_dow=np.uint16,
        order_hour_of_day=np.uint16,
        days_since_prior_order=np.float32,
    ),
)

In [None]:
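# One-off preprocessing, kept commented out for reference: it fills each NaN in
# `days_since_prior_order` with a per-user mean and writes the result to
# 'orders_with_NaN_estimate.csv', the file read by the loading cell.
# NOTE(review): the j-th NaN is paired with the j-th per-user mean, which presumably relies on
# every user having exactly one NaN row and on rows being ordered by user_id - confirm before re-running.
# mean_days_since_prior_order = np.array(order['days_since_prior_order'].groupby(order.user_id).mean().astype(np.float32))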
# j=0
# print('size of mean_days_since_prior_order: ', len(mean_days_since_prior_order))
# for i, item in enumerate(order['days_since_prior_order']):
#     if np.isnan(item):
#         order.loc[i, 'days_since_prior_order'] = mean_days_since_prior_order[j]
#         j+=1
#         if not j%10000:
#             print('j=',j)
# order.to_csv('orders_with_NaN_estimate.csv', index=False)

In [None]:
# Partition the order table by its eval_set value.
train_orders = order.loc[order.eval_set == 'train']
test_orders = order.loc[order.eval_set == 'test']
prior_orders = order.loc[order.eval_set == 'prior']
train_prior_orders = pd.concat([train_orders, prior_orders], ignore_index=True)

# Index the order table by order_id (keeping the column) so item rows can be joined to it.
order.set_index('order_id', inplace=True, drop=False)

# Stack train and prior item rows, attach the order-level columns to each item row,
# then discard the duplicated order_id column that the join suffixes with '_'.
train_prior = (
    pd.concat([train, prior], ignore_index=True)
    .join(order, on='order_id', rsuffix='_')
    .drop('order_id_', axis=1)
)


In [None]:
# Per-user aggregate features, indexed by user_id.
#
# Two groupings are used on purpose:
#   * orders_by_user - built from train_prior_orders (one row per order), for order-level statistics;
#   * items_by_user  - built from train_prior (one row per ordered item), for item-level statistics.
# Order-level means (days between orders, hour of day) must come from the order table:
# averaging them over item rows would weight every order by its basket size.
orders_by_user = train_prior_orders.groupby('user_id')
items_by_user = train_prior.groupby('user_id')

users = pd.DataFrame()
users['user_id'] = orders_by_user['user_id'].first()
users['nb_order'] = orders_by_user.size().astype(np.uint16)
users['avg_days_between_order'] = orders_by_user['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = orders_by_user['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = items_by_user.size().astype(np.uint16)
# apply() calls the function once per group: here it collects each user's products into a set.
users['all_products'] = items_by_user['product_id'].apply(set)
# map() calls the function once per element: here it takes the size of each product set.
users['nb_distinct_items'] = users['all_products'].map(len).astype(np.uint16)
users['average_basket'] = (users.nb_total_items / users.nb_order).astype(np.float32)
# Built-in min/max aggregations instead of apply(min)/apply(max), which call Python per group.
users['min_days_of_week'] = orders_by_user['order_dow'].min().astype(np.uint8)
users['max_days_of_week'] = orders_by_user['order_dow'].max().astype(np.uint8)
users.set_index('user_id', inplace=True, drop=False)

In [None]:
# Order-level lookup table (one row per train/prior order), indexed by order_id.
# The order_id column is kept alongside the index; days_since_prior_order is
# exposed under the name order_days_since_prior.
features = (
    train_prior_orders[[
        'order_id',
        'user_id',
        'order_number',
        'order_dow',
        'order_hour_of_day',
        'days_since_prior_order',
    ]]
    .rename(columns={'days_since_prior_order': 'order_days_since_prior'})
    .set_index('order_id', drop=False)
)


In [None]:
def gen_features(orders, info):
    """Build the per-order feature table, including the binary target column.

    Relies on two module-level lookup tables: ``features`` (indexed by
    order_id) and ``users`` (indexed by user_id).

    Parameters
    ----------
    orders : DataFrame
        Must contain an ``order_id`` column; one output row is produced per
        row of ``orders`` and the output keeps ``orders``' index.
    info : DataFrame
        Item-level rows with ``order_id`` and ``reordered`` columns, used to
        derive the label.

    Returns
    -------
    DataFrame
        Columns: ``user_id``, the order-level columns copied from
        ``features``, the user-level columns copied from ``users`` (prefixed
        with ``user_``), and ``label`` (1 if any item of the order has a
        positive ``reordered`` value, else 0; NaN when the order does not
        appear in ``info``). ``order_id`` itself is not returned.
    """
    order_columns = (
        'user_id',
        'order_number',
        'order_dow',
        'order_hour_of_day',
        'order_days_since_prior',
    )
    user_columns = (
        'nb_order',
        'avg_days_between_order',
        'avg_hour_of_day',
        'nb_total_items',
        'nb_distinct_items',
        'average_basket',
        'min_days_of_week',
        'max_days_of_week',
    )

    g_features = pd.DataFrame({'order_id': orders['order_id']})

    # Order-level values are looked up by order_id.
    for col in order_columns:
        g_features[col] = g_features['order_id'].map(features[col])

    # User-level values are looked up by user_id and stored with a 'user_' prefix.
    for col in user_columns:
        g_features['user_' + col] = g_features['user_id'].map(users[col])

    # Label: does the order contain at least one reordered item?
    nb_reorder = info.groupby('order_id')['reordered'].sum()
    reorder_label = (nb_reorder > 0).astype(np.int8)
    g_features['label'] = g_features['order_id'].map(reorder_label)

    # axis=1 is required: without it pandas searches the row index for the
    # label 'order_id' and raises instead of removing the column.
    return g_features.drop('order_id', axis=1)
# Model-fitting rows come from the prior orders; validation rows come from the train orders.
train_set = gen_features(orders=prior_orders, info=prior)
valid_set = gen_features(orders=train_orders, info=train)
        

In [None]:
# Split each frame into a label vector and a feature matrix (plain NumPy arrays).
# pop() returns the 'label' column and removes it from the frame in one step.
train_label = np.array(train_set.pop('label'))
train_data = np.array(train_set)

valid_label = np.array(valid_set.pop('label'))
valid_data = np.array(valid_set)

# Drop the names of the frames that are no longer needed.
del train_set, valid_set, users, features, order, prior, train, products

In [None]:
# XGBoost settings for a binary classifier evaluated with log-loss.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'gamma': 0.7,              # controls post-pruning; larger is more conservative (typically around 0.1-0.2)
    'max_depth': 10,           # depth of each tree; larger values overfit more easily
    'lambda': 10,              # L2 regularisation weight; larger values make overfitting less likely
    'subsample': 0.76,         # random subsampling of the training rows
    'colsample_bytree': 0.95,  # column subsampling when growing each tree
    'min_child_weight': 10,
    'silent': 0,               # 1 suppresses run-time messages; 0 is recommended
    'eta': 0.07,               # plays the role of a learning rate
    'seed': 15,
    'nthread': 8,              # number of CPU threads
}
n = 150  # number of boosting rounds
plst = list(params.items())

xgtrain = xgb.DMatrix(train_data, label=train_label)
xgval = xgb.DMatrix(valid_data, label=valid_label)

# Both sets are passed as evaluation sets, with the validation set last.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(
    params=plst,
    dtrain=xgtrain,
    num_boost_round=n,
    evals=watchlist,
    early_stopping_rounds=100,
)
#     model.save_model('CV_0724_'+str(nb)+'.model')

In [None]:
# Score the validation matrix, binarise the predictions with a 0.9 cutoff,
# and print the fraction of rows whose prediction equals the label.
xgtest = xgb.DMatrix(valid_data)
preds = (model.predict(xgtest) > 0.9).astype(np.uint8)
acc = np.sum(preds == valid_label) / np.float32(len(preds))
print(acc)

In [None]:
# NOTE(review): display() is called with no arguments, so this cell presumably
# renders nothing - looks like a leftover; confirm and remove if unused.
display()

In [None]:
# IPython magic selecting the inline matplotlib backend for this notebook.
%matplotlib inline
# Plot feature importance for the trained model.
# NOTE(review): the model was trained on plain NumPy arrays, so the plot presumably
# shows generic feature names rather than column names - confirm.
xgb.plot_importance(model)