In [20]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb

In [2]:
print('loading aisles...')
aisles = pd.read_csv('aisles.csv', dtype={
        'aisle_id': np.uint16,
        'aisle': 'category'})

print('loading department...')
department = pd.read_csv('departments.csv', dtype={
            'department_id': np.uint8,
            'department': 'category'})

print('loading products...')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

print('loading prior orders...')
prior = pd.read_csv('order_products__prior.csv', dtype={
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8})

print('loading train orders...')
train = pd.read_csv('order_products__train.csv', dtype={
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8})

print('loading orders...')
order = pd.read_csv('orders.csv' , dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})


loading aisles...
loading department...
loading products...
loading prior orders...
loading train orders...
loading orders...


In [3]:
train_orders = order[order.eval_set == 'train']
test_orders = order[order.eval_set == 'test']
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

In [4]:
prods = pd.DataFrame()
prods['total_nb'] = prior.groupby(prior.product_id).size().astype(np.int32)
prods['nb_reorder'] = prior.groupby(prior.product_id)['reordered'].sum().astype(np.int32)
prods['reorder_rate'] = prods.nb_reorder / prods.total_nb.astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

In [5]:
order.set_index('order_id', inplace=True, drop=False)
prior = prior.join(order, on='order_id', rsuffix='_')
prior.drop('order_id_', inplace=True, axis=1)

In [7]:
users = pd.DataFrame()
users['nb_order'] = prior.groupby('user_id')['order_number'].size().astype(np.int16)
users['avg_days_between_order'] = prior.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = prior.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = prior.groupby('user_id').size().astype(np.int16)
users['all_products'] = prior.groupby('user_id')['product_id'].apply(set)
users['nb_distinct_items'] = (users['all_products'].map(len)).astype(np.int16)
users['average_basket'] = users.nb_total_items / users.nb_order.astype(np.float32)

In [18]:
display(products)

Unnamed: 0_level_0,product_id,product_name,aisle_id,department_id,total_nb,nb_reorder,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,Chocolate Sandwich Cookies,61,19,1852.0,1136.0,0.613391
2,2,All-Seasons Salt,104,13,90.0,12.0,0.133333
3,3,Robust Golden Unsweetened Oolong Tea,94,7,277.0,203.0,0.732852
4,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,329.0,147.0,0.446809
5,5,Green Chile Anytime Sauce,5,13,15.0,9.0,0.600000
6,6,Dry Nose Oil,11,11,8.0,3.0,0.375000
7,7,Pure Coconut Water With Orange,98,7,30.0,12.0,0.400000
8,8,Cut Russet Potatoes Steam N' Mash,116,1,165.0,83.0,0.503030
9,9,Light Strawberry Blueberry Yogurt,120,16,156.0,82.0,0.525641
10,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,2572.0,1304.0,0.506998


In [11]:
def gen_features(orders, labels_out=False):
    print('generate features and labels(optional) from selected orders')
    count=0
    product_list = []
    order_list = []
    labels = []
    for row in orders.itertuples():
        count+=1
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_out:
            labels += [(order_id, product) in train.index for product in user_products]
        if count%10000 == 0:
            print('order row', count)
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print("user related features")
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_order)
    df['user_total_items'] = df.user_id.map(users.nb_total_items)
    df['user_avg_days_between_orders'] = df.user_id.map(users.avg_days_between_order)
    df['user_avg_basket'] = df.user_id.map(users.average_basket)
    
    print("product related features")
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.total_nb)
    df['product_reorders'] = df.product_id.map(products.nb_reorder)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    
    print(df.dtypes)
    print(df.memory_usage())
    return(df, labels)
print('train order size: ', train_orders.shape)
df_train, labels = gen_features(train_orders, labels_out=True)

train order size:  (131209, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
product related features
order_id                          int32
product_id                        int32
user_id                         float64
user_total_orders               float64
user_total_items                float64
user_avg_days_between_orders    float32
user_avg_basket                 float32
aisle_id                          uint8
department_id                     uint8
product_orders                  float64
product_reorders                float64
product_reorder_rate            float64
dtype: object
Index                                 80
order_id                        33898644
product_id                      33898644
user_id                         6

In [42]:
print('test order size: ', test_orders.shape)
df_test, _ = gen_features(test_orders)

test order size:  (75000, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
product related features
order_id                          int32
product_id                        int32
user_id                         float64
user_total_orders               float64
user_total_items                float64
user_avg_days_between_orders    float32
user_avg_basket                 float32
aisle_id                          uint8
department_id                     uint8
product_orders                  float64
product_reorders                float64
product_reorder_rate            float64
dtype: object
Index                                 80
order_id                        19333168
product_id                      19333168
user_id                         38666336
user_total_orders               38666336
user_total_items                38666336
user_avg_day

In [33]:
params={
'booster':'gbtree',
'objective': 'binary:logistic', 
'eval_metric': 'logloss',
'gamma':0.7,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
'max_depth':10, # 构建树的深度，越大越容易过拟合
'lambda':10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
'subsample':0.7, # 随机采样训练样本
'colsample_bytree':0.95, # 生成树时进行的列采样
'min_child_weight':10,  
'silent':0 ,#设置成1则没有运行信息输出，最好是设置为0.
'eta': 0.01, # 如同学习率
'seed':666,
'nthread':4,# cpu 线程数
}
df_train = np.array(df_train)
n = 500
plst = list(params.items())
offset = 7500000
xgtrain = xgb.DMatrix(df_train[:offset, :], label=labels[:offset])
xgval = xgb.DMatrix(df_train[offset:, :], label=labels[offset:])
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(plst, xgtrain, n, watchlist, early_stopping_rounds=100)


[0]	train-logloss:0.686591	val-logloss:0.686568
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 100 rounds.
[1]	train-logloss:0.680165	val-logloss:0.680119
[2]	train-logloss:0.673865	val-logloss:0.673797
[3]	train-logloss:0.66769	val-logloss:0.667599
[4]	train-logloss:0.661631	val-logloss:0.661518
[5]	train-logloss:0.65569	val-logloss:0.655555
[6]	train-logloss:0.649876	val-logloss:0.649719
[7]	train-logloss:0.644163	val-logloss:0.643984
[8]	train-logloss:0.638556	val-logloss:0.638356
[9]	train-logloss:0.633055	val-logloss:0.632834
[10]	train-logloss:0.627656	val-logloss:0.627414
[11]	train-logloss:0.622372	val-logloss:0.622109
[12]	train-logloss:0.617172	val-logloss:0.616888
[13]	train-logloss:0.612069	val-logloss:0.611764
[14]	train-logloss:0.607071	val-logloss:0.606746
[15]	train-logloss:0.60215	val-logloss:0.601805
[16]	train-logloss:0.597319	val-logloss:0.596954
[17]	train-logloss:0.592577	val-

[164]	train-logloss:0.331679	val-logloss:0.329474
[165]	train-logloss:0.331218	val-logloss:0.329006
[166]	train-logloss:0.330764	val-logloss:0.328545
[167]	train-logloss:0.330317	val-logloss:0.328091
[168]	train-logloss:0.329877	val-logloss:0.327644
[169]	train-logloss:0.329443	val-logloss:0.327204
[170]	train-logloss:0.329015	val-logloss:0.32677
[171]	train-logloss:0.328601	val-logloss:0.326348
[172]	train-logloss:0.328191	val-logloss:0.325932
[173]	train-logloss:0.327784	val-logloss:0.325519
[174]	train-logloss:0.327383	val-logloss:0.325111
[175]	train-logloss:0.326987	val-logloss:0.324708
[176]	train-logloss:0.326597	val-logloss:0.324313
[177]	train-logloss:0.326213	val-logloss:0.323922
[178]	train-logloss:0.325835	val-logloss:0.323539
[179]	train-logloss:0.325463	val-logloss:0.32316
[180]	train-logloss:0.325103	val-logloss:0.322793
[181]	train-logloss:0.324746	val-logloss:0.322431
[182]	train-logloss:0.324392	val-logloss:0.32207
[183]	train-logloss:0.324042	val-logloss:0.321714
[18

[329]	train-logloss:0.30417	val-logloss:0.301411
[330]	train-logloss:0.304136	val-logloss:0.301377
[331]	train-logloss:0.304102	val-logloss:0.301343
[332]	train-logloss:0.304069	val-logloss:0.301308
[333]	train-logloss:0.304036	val-logloss:0.301275
[334]	train-logloss:0.304004	val-logloss:0.301242
[335]	train-logloss:0.303972	val-logloss:0.30121
[336]	train-logloss:0.303942	val-logloss:0.301179
[337]	train-logloss:0.303911	val-logloss:0.301148
[338]	train-logloss:0.303881	val-logloss:0.301118
[339]	train-logloss:0.303852	val-logloss:0.301089
[340]	train-logloss:0.303822	val-logloss:0.301059
[341]	train-logloss:0.303794	val-logloss:0.30103
[342]	train-logloss:0.303765	val-logloss:0.301001
[343]	train-logloss:0.303738	val-logloss:0.300973
[344]	train-logloss:0.30371	val-logloss:0.300946
[345]	train-logloss:0.303683	val-logloss:0.300918
[346]	train-logloss:0.303656	val-logloss:0.300891
[347]	train-logloss:0.30363	val-logloss:0.300864
[348]	train-logloss:0.303604	val-logloss:0.300838
[349]

[494]	train-logloss:0.302006	val-logloss:0.299363
[495]	train-logloss:0.302003	val-logloss:0.299361
[496]	train-logloss:0.301999	val-logloss:0.299358
[497]	train-logloss:0.301994	val-logloss:0.299356
[498]	train-logloss:0.30199	val-logloss:0.299354
[499]	train-logloss:0.301985	val-logloss:0.299351


In [34]:
del df_train

In [43]:
print('xgboost predict')
df_test_array = np.array(df_test)
xgtest = xgb.DMatrix(df_test_array)
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
del df_test_array

xgboost predict


In [46]:
df_test['pred'] = preds

In [49]:
THRESHOLD=0.21
d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

tst = pd.DataFrame.from_dict(d, orient='index')
tst.reset_index(inplace=True)
tst.columns = ['order_id', 'products']
tst.to_csv('submission_newbie.csv', index=False)