In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display
import xgboost as xgb

In [2]:
print('loading aisles...')
aisles = pd.read_csv('aisles.csv', dtype={
        'aisle_id': np.uint16,
        'aisle': 'category'})

print('loading department...')
department = pd.read_csv('departments.csv', dtype={
            'department_id': np.uint8,
            'department': 'category'})

print('loading products...')
products = pd.read_csv('products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.uint32,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

print('loading prior orders...')
prior = pd.read_csv('order_products__prior.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint16})

print('loading train orders...')
train = pd.read_csv('order_products__train.csv', dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint16,
        'reordered': np.uint8})

print('loading orders...')
order = pd.read_csv('orders.csv' , dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint16,
        'order_dow': np.uint16,
        'order_hour_of_day': np.uint16,
        'days_since_prior_order': np.float32})


loading aisles...
loading department...
loading products...
loading prior orders...
loading train orders...
loading orders...


In [3]:
train_orders = order[order.eval_set == 'train']
test_orders = order[order.eval_set == 'test']
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

In [4]:
order.set_index('order_id', inplace=True, drop=False)
prior = prior.join(order, on='order_id', rsuffix='_')
prior.drop('order_id_', inplace=True, axis=1)

In [5]:
prods = pd.DataFrame()
prods['total_nb'] = prior.groupby(prior.product_id).size().astype(np.uint32)
prods['nb_reorder'] = prior.groupby(prior.product_id)['reordered'].sum().astype(np.uint32)
prods['reorder_rate'] = prods.nb_reorder / prods.total_nb.astype(np.float32)
prods['nb_buyers'] = prior.groupby(prior.product_id)['user_id'].apply(lambda x: len(set(x))).astype(np.uint16) # unique buyers
prods['avg_add_to_cart_order'] = prior.groupby(prior.product_id)['add_to_cart_order'].mean().astype(np.uint8)
prods['nb_orders'] = prior.groupby(prior.product_id).size().astype(np.uint16)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods

In [6]:
def find_nb_aisle(set_of_product_id):
    return len(set([products[products.product_id == s].aisle_id.values[0] for s in set_of_product_id]))
def find_nb_department(set_of_product_id):
    return len(set([products[products.product_id == s].department_id.values[0] for s in set_of_product_id]))

In [7]:
users = pd.DataFrame()
users['nb_order'] = prior.groupby('user_id')['order_number'].size().astype(np.uint16)
users['avg_days_between_order'] = prior.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
users['avg_hour_of_day'] = prior.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
users['nb_total_items'] = prior.groupby('user_id')['product_id'].size().astype(np.uint16)
users['all_products'] = prior.groupby('user_id')['product_id'].apply(set) # apply 对每个行或者列调用一次函数
users['nb_distinct_items'] = (users['all_products'].map(len)).astype(np.uint16) #map 对每个元素(element-wise)调用一次函数
users['average_basket'] = (users.nb_total_items / users.nb_order).astype(np.float32)
users['min_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(min).astype(np.uint8)
users['max_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(max).astype(np.uint8)
# users['mid_days_of_week'] = prior.groupby(prior.user_id)['order_dow'].apply(np.median).astype(np.uint8)
# users['min_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(min).astype(np.uint8)
# users['max_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(max).astype(np.uint8)
# users['mid_hour_of_day'] = prior.groupby(prior.user_id)['order_hour_of_day'].apply(np.median).astype(np.uint8)
# users['nb_aisles_purchased_from'] = users.all_products.map(find_nb_aisle)
# users['nb_departments_purchased_from'] = users.all_products.map(find_nb_department)

In [8]:
prior['user_product_index'] = (prior.user_id.astype(np.uint64) * 100000\
                               + prior.product_id).astype(np.uint64)
d = dict()
for row in prior.itertuples():
    k = row.user_product_index
    if k not in d:
        d[k] = (1, row.add_to_cart_order, row.reordered, (row.order_number, row.order_id),\
                row.order_dow, row.order_hour_of_day)
    else:
        d[k] = (d[k][0]+1, d[k][1]+row.add_to_cart_order, \
                d[k][2]+row.reordered,
                # find last order with that product
                max(d[k][3], (row.order_number, row.order_id)), \
                d[k][4]+row.order_dow, \
                d[k][5]+row.order_hour_of_day)

In [9]:
UserProduct = pd.DataFrame.from_dict(d, orient='index')
del d
del prior
UserProduct.columns = ['nb_orders', 'sum_add_to_cart_order', 'nb_reordered', \
                      'last_order_id', 'sum_order_dow', 'sum_order_hour_of_day']
UserProduct['nb_orders'] = UserProduct.nb_orders.astype(np.uint16) 
UserProduct['sum_add_to_cart_order'] = UserProduct.sum_add_to_cart_order.astype(np.uint16)
UserProduct['nb_reordered'] = UserProduct.nb_reordered.astype(np.uint16)
UserProduct['last_order_id'] = UserProduct.last_order_id.map(lambda x: x[1]).astype(np.uint32)
UserProduct['sum_order_dow'] = UserProduct.sum_order_dow.astype(np.uint16)
UserProduct['sum_order_hour_of_day'] = UserProduct.sum_order_hour_of_day.astype(np.uint32)

In [10]:
user_id_list=users.index.tolist()
print("user id:", len(user_id_list))

user id: 206209


In [11]:
def gen_features(orders, labels_out=False):
    print('generate features and labels(optional) from selected orders')
    count=0
    product_list = []
    order_list = []
    labels = []
    for row in orders.itertuples():
        count+=1
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_out:
            labels += [(order_id, product) in train.index for product in user_products]
        if count%10000 == 0:
            print('order row', count)
            
    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print("user related features<prior>")
    df['user_id'] = df.order_id.map(order.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_order)
    df['user_total_items'] = df.user_id.map(users.nb_total_items)
    df['user_distinct_items'] = df.user_id.map(users.nb_distinct_items)
    df['user_avg_days_between_orders'] = df.user_id.map(users.avg_days_between_order)
    df['user_avg_basket'] = df.user_id.map(users.average_basket)
#     df['user_nb_aisles_purchased_from'] = df.user_id.map(users.nb_aisles_purchased_from)
#     df['user_nb_departments_purchased_from'] = df.user_id.map(users.nb_departments_purchased_from)
    df['user_min_days_of_week'] = df.user_id.map(users.min_days_of_week)
    df['user_max_days_of_week'] = df.user_id.map(users.max_days_of_week)
#     df['user_mid_days_of_week'] = df.user_id.map(users.min_days_of_week)
#     df['user_min_hour_of_day'] = df.user_id.map(users.min_hour_of_day)
#     df['user_max_hour_of_day'] = df.user_id.map(users.max_hour_of_day)
#     df['user_mid_hour_of_day'] = df.user_id.map(users.mid_hour_of_day)
    
    print("product related features<prior>")
    df['product_aisle_id'] = df.product_id.map(products.aisle_id)
    df['product_department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.total_nb)
    df['product_reorders'] = df.product_id.map(products.nb_reorder)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['product_nb_buyers'] = df.product_id.map(products.nb_buyers)
    df['product_avg_add_to_cart_order'] = df.product_id.map(products.avg_add_to_cart_order)
    df['nb_orders'] = df.product_id.map(products.nb_orders)
    
    print("order related features<train>")
    df['order_hour_of_day'] = df.order_id.map(order.order_hour_of_day)
    df['order_days_since_prior_order'] = df.order_id.map(order.days_since_prior_order)
    df['order_day_of_week'] = df.order_id.map(order.order_dow)
    
    print("userXproduct related features<prior>")
    # 1.nb_orders, 2.sum_add_to_cart_order, 3.nb_reordered, \
    # 4.last_order_id, 5.sum_order_dow, 6.sum_order_hour_of_day
    df['UP'] = df.product_id+df.user_id.astype(np.uint64)*100000
    df['UP_nb_orders'] = df.UP.map(UserProduct.nb_orders)
    df['UP_avg_add_to_cart_order'] = df.UP.map(UserProduct.sum_add_to_cart_order)\
                                    / df.UP_nb_orders
    df['UP_nb_reordered'] = df.UP.map(UserProduct.nb_reordered)
    df['UP_reorder_ratio'] = (df.UP_nb_reordered / df.UP_nb_orders).astype(np.float32)
    df['UP_last_order_id'] = df.UP.map(UserProduct.last_order_id)
    df['UP_avg_order_dow'] = (df.UP.map(UserProduct.sum_order_dow)\
                              / df.UP_nb_orders).astype(np.float32)
    df['UP_avg_order_hour_of_day'] = (df.UP.map(UserProduct.sum_order_hour_of_day) / \
                                    df.UP_nb_orders).astype(np.float32)
    df['UP_order_ratio'] = (df.UP_nb_orders / df.user_total_orders).astype(np.float32)
    df['UP_order_since_last'] = df.user_total_orders - \
                                df.UP_last_order_id.map(order.order_number)
    #最后一次买该产品和该订单-相同产品相隔的时间(没有算日期。。。)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(\
                                       order.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df.drop(['UP_last_order_id', 'UP'], axis=1, inplace=True)
    
    print(df.dtypes)
    print(df.memory_usage())
    return(df, labels)
print('train order size: ', train_orders.shape)
df_train, labels = gen_features(train_orders, labels_out=True)

train order size:  (131209, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>
order_id                           int32
product_id                         int32
user_id                           uint32
user_total_orders                 uint16
user_total_items                  uint16
user_distinct_items               uint16
user_avg_days_between_orders     float32
user_avg_basket                  float32
user_min_days_of_week              uint8
user_max_days_of_week              uint8
product_aisle_id                   uint8
product_department_id              uint8
product_orders                   float64
product_reorders                 flo

In [12]:
print('test order size: ', test_orders.shape)
df_test, _ = gen_features(test_orders)

test order size:  (75000, 7)
generate features and labels(optional) from selected orders
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features<prior>
product related features<prior>
order related features<train>
userXproduct related features<prior>
order_id                           int32
product_id                         int32
user_id                           uint32
user_total_orders                 uint16
user_total_items                  uint16
user_distinct_items               uint16
user_avg_days_between_orders     float32
user_avg_basket                  float32
user_min_days_of_week              uint8
user_max_days_of_week              uint8
product_aisle_id                   uint8
product_department_id              uint8
product_orders                   float64
product_reorders                 float64
product_reorder_rate             float64
product_nb_buyers                float64
product_avg_add

In [13]:
np.random.shuffle(user_id_list)
train_user_ids = user_id_list[20000:]
val_user_ids = user_id_list[:20000]

In [14]:
display(df_train)

Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,user_distinct_items,user_avg_days_between_orders,user_avg_basket,user_min_days_of_week,user_max_days_of_week,...,order_day_of_week,UP_nb_orders,UP_avg_add_to_cart_order,UP_nb_reordered,UP_reorder_ratio,UP_avg_order_dow,UP_avg_order_hour_of_day,UP_order_ratio,UP_order_since_last,UP_delta_hour_vs_last
0,1187899,17122,1,59,59,18,20.259260,1.0,1,4,...,4,1,6.000000,0,0.000000,4.000000,15.000000,0.016949,54,31
1,1187899,196,1,59,59,18,20.259260,1.0,1,4,...,4,10,1.400000,9,0.900000,2.500000,10.300000,0.169492,49,0
2,1187899,26405,1,59,59,18,20.259260,1.0,1,4,...,4,2,5.000000,1,0.500000,3.000000,7.500000,0.033898,55,1
3,1187899,46149,1,59,59,18,20.259260,1.0,1,4,...,4,3,3.000000,2,0.666667,2.000000,12.666667,0.050847,49,0
4,1187899,14084,1,59,59,18,20.259260,1.0,1,4,...,4,1,2.000000,0,0.000000,2.000000,8.000000,0.016949,58,0
5,1187899,13032,1,59,59,18,20.259260,1.0,1,4,...,4,3,6.333333,2,0.666667,2.666667,8.000000,0.050847,49,0
6,1187899,26088,1,59,59,18,20.259260,1.0,1,4,...,4,2,4.500000,1,0.500000,2.500000,7.500000,0.033898,57,1
7,1187899,39657,1,59,59,18,20.259260,1.0,1,4,...,4,1,3.000000,0,0.000000,4.000000,8.000000,0.016949,49,0
8,1187899,12427,1,59,59,18,20.259260,1.0,1,4,...,4,10,3.300000,9,0.900000,2.500000,10.300000,0.169492,49,0
9,1187899,25133,1,59,59,18,20.259260,1.0,1,4,...,4,8,4.000000,7,0.875000,2.500000,11.000000,0.135593,49,0


In [28]:
df_train_user_id = np.array(df_train.user_id)
df_train.groupby('user_id').loc(lambda df:df.user_id in df_train_user_id )
# train_user_id_boolset = [id in train_user_ids for u_id in df_train_user_id]
# val_user_id_boolset = [id in val_user_ids for u_id in df_train_user_id]
# print(train_user_id_boolset)

AttributeError: Cannot access callable attribute 'loc' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [None]:
del order
del UserProduct
del products

In [16]:
params={
'booster':'gbtree',
'objective': 'binary:logistic', 
'eval_metric': 'logloss',
'gamma':0.7,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
'max_depth':10, # 构建树的深度，越大越容易过拟合
'lambda':10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
'subsample':0.76, # 随机采样训练样本
'colsample_bytree':0.95, # 生成树时进行的列采样
'min_child_weight':10,  
'silent':0 ,#设置成1则没有运行信息输出，最好是设置为0.
'eta': 0.1, # 如同学习率
'seed':46,
'nthread':8,# cpu 线程数
}
df_train = np.array(df_train)
n = 80
plst = list(params.items())
offset = 7300000
xgtrain = xgb.DMatrix(df_train[:offset, :], label=labels[:offset])
xgval = xgb.DMatrix(df_train[offset:, :], label=labels[offset:])
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(plst, xgtrain, n, watchlist, early_stopping_rounds=100)

[0]	train-logloss:0.630518	val-logloss:0.630409
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 100 rounds.
[1]	train-logloss:0.579378	val-logloss:0.579171
[2]	train-logloss:0.537092	val-logloss:0.536792
[3]	train-logloss:0.501818	val-logloss:0.501433
[4]	train-logloss:0.472149	val-logloss:0.471686
[5]	train-logloss:0.447093	val-logloss:0.446557
[6]	train-logloss:0.425826	val-logloss:0.425225
[7]	train-logloss:0.407729	val-logloss:0.407068
[8]	train-logloss:0.392291	val-logloss:0.391575
[9]	train-logloss:0.379103	val-logloss:0.378334
[10]	train-logloss:0.367829	val-logloss:0.367012
[11]	train-logloss:0.358172	val-logloss:0.357312
[12]	train-logloss:0.349931	val-logloss:0.349036
[13]	train-logloss:0.342849	val-logloss:0.341917
[14]	train-logloss:0.336791	val-logloss:0.335824
[15]	train-logloss:0.331613	val-logloss:0.330616
[16]	train-logloss:0.327183	val-logloss:0.326157
[17]	train-logloss:0.323386	v

In [17]:
del df_train

In [18]:
print('xgboost predict')
df_test_array = np.array(df_test)
xgtest = xgb.DMatrix(df_test_array)
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
del df_test_array

xgboost predict


In [19]:
df_test['pred'] = preds

In [20]:
THRESHOLD=0.22
d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

tst = pd.DataFrame.from_dict(d, orient='index')
tst.reset_index(inplace=True)
tst.columns = ['order_id', 'products']
tst.to_csv('submission_newbie_2.csv', index=False)