In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb
from sklearn.model_selection import GroupKFold

In [2]:
print('loading aisles...')
aisles = pd.read_csv('aisles.csv', dtype={
        'aisle_id': np.uint16,
        'aisle': 'category'})

print('loading department...')
department = pd.read_csv('departments.csv', dtype={
            'department_id': np.uint8,
            'department': 'category'})

print('loading products...')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.uint32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

print('loading prior orders...')
prior = pd.read_csv('order_products__prior.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16})

print('loading train orders...')
train = pd.read_csv('order_products__train.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint8})

print('loading orders...')
order = pd.read_csv('orders.csv' , dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint16,
        'days_since_prior_order': np.float32})


loading aisles...
loading department...
loading products...
loading prior orders...
loading train orders...
loading orders...


In [3]:
train_orders = order[order.eval_set == 'train']
test_orders = order[order.eval_set == 'test']
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

In [4]:
order.set_index('order_id', inplace=True, drop=False)
prior = prior.join(order, on='order_id', rsuffix='_')
prior.drop('order_id_', inplace=True, axis=1)

In [5]:
prods = pd.DataFrame()
prods['total_nb'] = prior.groupby(prior.product_id).size().astype(np.uint32)
prods['nb_reorder'] = prior.groupby(prior.product_id)['reordered'].sum().astype(np.uint32)
prods['reorder_rate'] = prods.nb_reorder / prods.total_nb.astype(np.float32)
prods['nb_buyers'] = prior.groupby(prior.product_id)['user_id'].apply(lambda x: len(set(x))).astype(np.uint16) # unique buyers
prods['avg_add_to_cart_order'] = prior.groupby(prior.product_id)['add_to_cart_order'].mean().astype(np.uint8)
prods['nb_orders'] = prior.groupby(prior.product_id).size().astype(np.uint16)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

In [6]:
def find_nb_aisle(set_of_product_id):
    return len(set([products[products.product_id == s].aisle_id.values[0] for s in set_of_product_id]))
def find_nb_department(set_of_product_id):
    return len(set([products[products.product_id == s].department_id.values[0] for s in set_of_product_id]))

In [7]:
users = pd.DataFrame()
users['nb_order'] = order[order.eval_set == 'prior'].groupby('user_id').size().astype(np.uint16)
users['avg_days_between_order'] = prior.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = prior.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = prior.groupby('user_id').size().astype(np.uint16)
users['all_products'] = prior.groupby('user_id')['product_id'].apply(set) # apply 对每个行或者列调用一次函数
users['nb_distinct_items'] = (users['all_products'].map(len)).astype(np.uint16) #map 对每个元素(element-wise)调用一次函数
users['average_basket'] = (users.nb_total_items / users.nb_order).astype(np.float32)
users['min_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(min).astype(np.uint8)
users['max_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(max).astype(np.uint8)
# users['mid_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(np.median).astype(np.uint8)
# users['min_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(min).astype(np.uint8)
# users['max_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(max).astype(np.uint8)
# users['mid_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(np.median).astype(np.uint8)
# users['nb_aisles_purchased_from'] = users.all_products.map(find_nb_aisle)
# users['nb_departments_purchased_from'] = users.all_products.map(find_nb_department)

In [8]:
display(users)

Unnamed: 0_level_0,nb_order,avg_days_between_order,avg_hour_of_day,nb_total_items,all_products,nb_distinct_items,average_basket,min_days_of_week,max_days_of_week
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,10,20.259260,10.542373,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,5.900000,1,4
2,14,15.967033,10.441026,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,13.928572,1,5
3,12,11.487180,16.352272,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,7.333333,0,3
4,5,15.357142,13.111111,18,"{21573, 17769, 35469, 37646, 1200, 19057, 2657...",17,3.600000,4,6
5,4,14.500000,15.729730,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,9.250000,0,3
6,3,7.800000,17.000000,14,"{40992, 27521, 20323, 48679, 8424, 45007, 2190...",12,4.666667,2,5
7,20,13.546391,13.631068,206,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",68,10.300000,0,6
8,3,30.000000,2.448980,49,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...",36,16.333334,1,6
9,3,24.260870,14.263158,76,"{8834, 38277, 5002, 11790, 38159, 7952, 34448,...",58,25.333334,0,5
10,5,20.746376,16.902098,143,"{36865, 20995, 13829, 43014, 11782, 18441, 476...",94,28.600000,2,5


In [9]:
prior['user_product_index'] = (prior.user_id.astype(np.uint64) * 100000\
                               + prior.product_id).astype(np.uint64)
d = dict()
for row in prior.itertuples():
    k = row.user_product_index
    if k not in d:
        d[k] = (1, row.add_to_cart_order, row.reordered, (row.order_number, row.order_id),\
                row.order_dow, row.order_hour_of_day)
    else:
        d[k] = (d[k][0]+1, d[k][1]+row.add_to_cart_order, \
                d[k][2]+row.reordered,
                # find last order with that product
                max(d[k][3], (row.order_number, row.order_id)), \
                d[k][4]+row.order_dow, \
                d[k][5]+row.order_hour_of_day)

In [10]:
UserProduct = pd.DataFrame.from_dict(d, orient='index')
del d
del prior
UserProduct.columns = ['nb_orders', 'sum_add_to_cart_order', 'nb_reordered', \
                      'last_order_id', 'sum_order_dow', 'sum_order_hour_of_day']
UserProduct['nb_orders'] = UserProduct.nb_orders.astype(np.uint16) 
UserProduct['sum_add_to_cart_order'] = UserProduct.sum_add_to_cart_order.astype(np.uint16)
UserProduct['nb_reordered'] = UserProduct.nb_reordered.astype(np.uint16)
UserProduct['last_order_id'] = UserProduct.last_order_id.map(lambda x: x[1]).astype(np.uint32)
UserProduct['sum_order_dow'] = UserProduct.sum_order_dow.astype(np.uint16)
UserProduct['sum_order_hour_of_day'] = UserProduct.sum_order_hour_of_day.astype(np.uint32)

In [11]:
user_id_list=users.index.tolist()
print("user id:", len(user_id_list))

user id: 206209


In [12]:
def gen_features(orders, labels_out=False):
    print('generate features and labels(optional) from selected orders')
    count=0
    product_list = []
    order_list = []
    labels = []
    for row in orders.itertuples():
        count+=1
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_out:
            labels += [(order_id, product) in train.index for product in user_products]
        if count%10000 == 0:
            print('order row', count)
            
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print("user related features<prior>")
    df['user_id'] = df.order_id.map(order.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_order)
    df['user_total_items'] = df.user_id.map(users.nb_total_items)
    df['user_distinct_items'] = df.user_id.map(users.nb_distinct_items)
    df['user_avg_days_between_orders'] = df.user_id.map(users.avg_days_between_order)
    df['user_avg_basket'] = df.user_id.map(users.average_basket)
#     df['user_nb_aisles_purchased_from'] = df.user_id.map(users.nb_aisles_purchased_from)
#     df['user_nb_departments_purchased_from'] = df.user_id.map(users.nb_departments_purchased_from)
    df['user_min_days_of_week'] = df.user_id.map(users.min_days_of_week)
    df['user_max_days_of_week'] = df.user_id.map(users.max_days_of_week)
#     df['user_mid_days_of_week'] = df.user_id.map(users.min_days_of_week)
#     df['user_min_hour_of_day'] = df.user_id.map(users.min_hour_of_day)
#     df['user_max_hour_of_day'] = df.user_id.map(users.max_hour_of_day)
#     df['user_mid_hour_of_day'] = df.user_id.map(users.mid_hour_of_day)
    
    print("product related features<prior>")
    df['product_aisle_id'] = df.product_id.map(products.aisle_id)
    df['product_department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.total_nb)
    df['product_reorders'] = df.product_id.map(products.nb_reorder)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['product_nb_buyers'] = df.product_id.map(products.nb_buyers)
    df['product_avg_add_to_cart_order'] = df.product_id.map(products.avg_add_to_cart_order)
    df['nb_orders'] = df.product_id.map(products.nb_orders)
    
    print("order related features<train>")
    df['order_hour_of_day'] = df.order_id.map(order.order_hour_of_day)
    df['order_days_since_prior_order'] = df.order_id.map(order.days_since_prior_order)
    df['order_day_of_week'] = df.order_id.map(order.order_dow)
    
    print("userXproduct related features<prior>")
    # 1.nb_orders, 2.sum_add_to_cart_order, 3.nb_reordered, \
    # 4.last_order_id, 5.sum_order_dow, 6.sum_order_hour_of_day
    df['UP'] = df.product_id+df.user_id.astype(np.uint64)*100000
    df['UP_nb_orders'] = df.UP.map(UserProduct.nb_orders)
    df['UP_avg_add_to_cart_order'] = df.UP.map(UserProduct.sum_add_to_cart_order)\
                                    / df.UP_nb_orders
    df['UP_nb_reordered'] = df.UP.map(UserProduct.nb_reordered)
    df['UP_reorder_ratio'] = (df.UP_nb_reordered / df.UP_nb_orders).astype(np.float32)
    df['UP_last_order_id'] = df.UP.map(UserProduct.last_order_id)
    df['UP_avg_order_dow'] = (df.UP.map(UserProduct.sum_order_dow)\
                              / df.UP_nb_orders).astype(np.float32)
    df['UP_avg_order_hour_of_day'] = (df.UP.map(UserProduct.sum_order_hour_of_day) / \
                                    df.UP_nb_orders).astype(np.float32)
    df['UP_order_ratio'] = (df.UP_nb_orders / df.user_total_orders).astype(np.float32)
    df['UP_order_since_last'] = df.user_total_orders - \
                                df.UP_last_order_id.map(order.order_number)
    #最后一次买该产品和该订单-相同产品相隔的时间(没有算日期。。。)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(\
                                       order.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df.drop(['UP_last_order_id', 'UP'], axis=1, inplace=True)
    
    print(df.dtypes)
    print(df.memory_usage())
    return(df, labels)
print('train order size: ', train_orders.shape)
df_train, labels = gen_features(train_orders, labels_out=True)

train order size:  (131209, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>
order_id                           int32
product_id                         int32
user_id                           uint32
user_total_orders                 uint16
user_total_items                  uint16
user_distinct_items               uint16
user_avg_days_between_orders     float32
user_avg_basket                  float32
user_min_days_of_week              uint8
user_max_days_of_week              uint8
product_aisle_id                   uint8
product_department_id              uint8
product_orders                   float64
product_reorders                 flo

In [13]:
print('test order size: ', test_orders.shape)
df_test, _ = gen_features(test_orders)

test order size:  (75000, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>
order_id                           int32
product_id                         int32
user_id                           uint32
user_total_orders                 uint16
user_total_items                  uint16
user_distinct_items               uint16
user_avg_days_between_orders     float32
user_avg_basket                  float32
user_min_days_of_week              uint8
user_max_days_of_week              uint8
product_aisle_id                   uint8
product_department_id              uint8
product_orders                   float64
product_reorders                 float64
product_reorder_rate             float64
product_nb_buyers                float64
product_avg_add

In [14]:
del order
del UserProduct
del products

In [15]:
df_train['label'] = pd.Series(labels, dtype=np.int8)

In [None]:
display(df_train)

In [16]:
np.random.shuffle(user_id_list)
train_user_ids = user_id_list[10000:]
val_user_ids = user_id_list[:10000]

In [17]:
print("Split train and valid data/label by user_id")
sub_df_val = df_train[df_train.user_id.isin(val_user_ids)]
sub_df_train = df_train[df_train.user_id.isin(train_user_ids)]
sub_train_label = np.array(sub_df_train['label'])
sub_val_label = np.array(sub_df_val['label'])
sub_df_train.drop('label', axis=1, inplace=True)
sub_df_val.drop('label', axis=1, inplace=True)

Split train and valid data/label by user_id


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [20]:
display(sub_train_label)

array([0, 1, 1, ..., 0, 0, 0], dtype=int8)

In [24]:
del df_train

In [21]:
params={
'booster':'gbtree',
'objective': 'binary:logistic', 
'eval_metric': 'logloss',
'gamma':0.7,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
'max_depth':10, # 构建树的深度，越大越容易过拟合
'lambda':10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
'subsample':0.76, # 随机采样训练样本
'colsample_bytree':0.95, # 生成树时进行的列采样
'min_child_weight':10,  
'silent':0 ,#设置成1则没有运行信息输出，最好是设置为0.
'eta': 0.1, # 如同学习率
'seed':42,
'nthread':8,# cpu 线程数
}
train = np.array(sub_df_train)
valid = np.array(sub_df_val)
n = 100
plst = list(params.items())
xgtrain = xgb.DMatrix(train, label=sub_train_label)
xgval = xgb.DMatrix(valid, label=sub_val_label)
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(plst, xgtrain, n, watchlist, early_stopping_rounds=100)

[0]	train-logloss:0.625342	val-logloss:0.625657
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 100 rounds.
[1]	train-logloss:0.569902	val-logloss:0.570488
[2]	train-logloss:0.523896	val-logloss:0.524717
[3]	train-logloss:0.485294	val-logloss:0.486313
[4]	train-logloss:0.453249	val-logloss:0.454473
[5]	train-logloss:0.425309	val-logloss:0.426703
[6]	train-logloss:0.401394	val-logloss:0.402941
[7]	train-logloss:0.3814	val-logloss:0.383108
[8]	train-logloss:0.363579	val-logloss:0.365428
[9]	train-logloss:0.348166	val-logloss:0.350143
[10]	train-logloss:0.334846	val-logloss:0.336948
[11]	train-logloss:0.323232	val-logloss:0.325448
[12]	train-logloss:0.313124	val-logloss:0.315446
[13]	train-logloss:0.304332	val-logloss:0.30675
[14]	train-logloss:0.297037	val-logloss:0.29956
[15]	train-logloss:0.290707	val-logloss:0.29334
[16]	train-logloss:0.284786	val-logloss:0.28751
[17]	train-logloss:0.279622	val-log

In [None]:
print('xgboost predict')
df_test_array = np.array(df_test)
xgtest = xgb.DMatrix(df_test_array)
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
del df_test_array

xgboost predict


In [1]:
df_test['pred'] = preds

NameError: name 'preds' is not defined

In [None]:
THRESHOLD=0.22
d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

tst = pd.DataFrame.from_dict(d, orient='index')
tst.reset_index(inplace=True)
tst.columns = ['order_id', 'products']
tst.to_csv('submission_newbie_3.csv', index=False)