In [1]:
# Author : Paul-Antoine Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import numpy as np
import pandas as pd
import lightgbm as lgb
# Directory prefix prepended to the raw CSV filenames passed to pd.read_csv in the loading cell.
IDIR = '../input/'

In [2]:
# Load the raw tables, passing an explicit narrow dtype for every column read.

# order_products__prior.csv and order_products__train.csv share one schema,
# so the dtype map is declared once and reused for both reads.
ORDER_PRODUCTS_DTYPES = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype=ORDER_PRODUCTS_DTYPES)

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype=ORDER_PRODUCTS_DTYPES)

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
# Only the three id columns are read (usecols), so only those get a dtype;
# the former 'order_id' entry referred to a column that is never loaded here.
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


## Compute Data

In [None]:
print('computing product f')
# Per-product statistics over the prior order lines: number of lines,
# number of lines flagged as reordered, and the ratio of the two.
prods = pd.DataFrame({
    'orders': priors.groupby('product_id').size().astype(np.int32),
    'reorders': priors.groupby('product_id')['reordered'].sum().astype(np.float32),
})
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

# Left join keeps every catalogue product, then product_id becomes the index.
products = (
    products
    .merge(prods.reset_index(), on='product_id', how='left')
    .set_index('product_id')
)

del prods
products.to_csv('./features/products.csv')

In [None]:
print('add order info to priors')
# Left join on order_id: every prior order line gains the user_id and
# order_number of the order it belongs to.
priors = priors.merge(
    orders.loc[:, ['order_id', 'user_id', 'order_number']],
    how='left',
    on='order_id',
)

In [None]:
### user features

print('computing user f')
# Order-level aggregates, taken over every row of `orders` for each user.
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

# Item-level aggregates, taken over the prior order lines only.
users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
# nunique() is the built-in, vectorised equivalent of len(set(x)); product_id is
# an unsigned integer column, so there are no nulls for it to treat differently.
users['total_distinct_items'] = priors.groupby('user_id')['product_id'].nunique().astype(np.int16)

users = users.join(usr)
del usr
# NOTE(review): total_items counts rows of `priors` while nb_orders counts rows of
# `orders`; if `orders` also lists non-prior orders the basket size is slightly
# underestimated. Left unchanged so the stored feature values stay the same.
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

In [None]:
# Persist the per-user feature frame as CSV (to_csv writes the index as well by default).
users.to_csv('./features/users.csv')

### User x Product matrix with the following features:
- Number of orders for that product (by each user)
- The order_id of the last order containing that product. It is needed to compute another feature: how many orders the user has placed since last ordering this product, i.e. how long the product has gone unordered
- Average_pos_in_cart

In [None]:
# One row per (user_id, product_id) pair: the number of prior order lines
# in which that pair appears.
user_product = (
    priors
    .groupby(['user_id', 'product_id'])['order_id']
    .count()
    .astype(np.uint16)
    .to_frame(name='nb_orders')
)

In [None]:
# Sum of add_to_cart_order per (user_id, product_id); the feature builder later
# divides it by the pair's order count to obtain the average cart position.
user_add_to_cart_avg = (
    priors
    .groupby(['user_id', 'product_id'])['add_to_cart_order']
    .sum()
    .astype(np.uint16)
    .to_frame(name='add_to_cart_order_sum')
)
user_product = user_product.join(user_add_to_cart_avg)
del user_add_to_cart_avg

In [None]:
# Largest order_number in which each user bought each product.
last_order_number = priors.groupby(['user_id', 'product_id']).order_number.max()
# The series is named 'order_number', so the joined column carries that name.
user_product = user_product.join(last_order_number.to_frame())

In [None]:
# Look up the order matching each pair's largest order_number to fetch that
# order's id and hour of day, then restore the (user_id, product_id) index and
# give the looked-up columns "last_" names.
user_product = (
    user_product
    .reset_index()
    .merge(
        orders[['user_id', 'order_id', 'order_number', 'order_hour_of_day']],
        on=['user_id', 'order_number'],
        how='left',
    )
    .set_index(['user_id', 'product_id'], drop=True)
    .rename(columns={
        'order_number': 'last_order_number',
        'order_id': 'last_order_id',
        'order_hour_of_day': 'last_order_hour_of_day',
    })
)
#user_product.to_csv('./features/user_product.csv')

## Load precomputed data

In [3]:
# Reload the cached user x product features (the file written by the
# currently commented-out to_csv call in the compute section).
user_product = pd.read_csv(
    './features/user_product.csv',
    dtype={
        # join keys
        'user_id': np.int32,
        'product_id': np.uint16,
        # per-pair aggregates
        'nb_orders': np.uint16,
        'add_to_cart_order_sum': np.uint16,
        # details of the pair's last order
        'last_order_number': np.int16,
        'last_order_id': np.uint32,
        'last_order_hour_of_day': np.int8,
    },
)


In [4]:
# Reload the cached per-user features written by the compute section.
users = pd.read_csv(
    './features/users.csv',
    dtype={
        'user_id': np.int32,
        # integer counts
        'total_items': np.uint16,
        'total_distinct_items': np.uint16,
        'nb_orders': np.uint16,
        # averages
        'average_days_between_orders': np.float32,
        'average_basket': np.float32,
    },
)

In [5]:
# Reload the cached per-product features written by the compute section.
# The former 'order_id' dtype entry was dropped: the compute cell never writes
# such a column to this file.
# 'orders' is read as float: the compute cell left-joins the counts onto the
# catalogue, which presumably leaves NaN for products absent from priors.
products = pd.read_csv('./features/products.csv', dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
        'orders': np.float32,
        'reorders': np.float32,
        'reorder_rate': np.float32
})

In [6]:
# Attach user_id to every prior order line. Guarded so the cell is idempotent:
# if priors already carries user_id (e.g. the "Compute Data" section ran in this
# kernel, or this cell is re-run), a second merge would create user_id_x /
# user_id_y and break every later `priors.user_id` access.
if 'user_id' not in priors.columns:
    priors = pd.merge(priors, orders[['order_id', 'user_id']], on='order_id', how='left')

In [8]:
### build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    """Build the candidate feature frame for the users found in `selected_orders`.

    Reads the notebook-level frames `priors`, `users`, `orders`, `products`,
    `user_product` and, when `labels_given` is true, `train`. None of them is
    modified.

    Parameters
    ----------
    selected_orders : DataFrame
        Must expose a `user_id` column. Every prior order line belonging to
        one of these users becomes one candidate row.
    labels_given : bool, default False
        When true, a `label` column is added: 1 where the row's
        (user_id, product_id) pair appears in `train`, NaN otherwise.

    Returns
    -------
    DataFrame
        One row per selected prior order line, carrying the user, order,
        product and user x product feature columns (plus `label` on request).
    """
    print('build candidate list')

    # all (order_id, product_id) from priors by all the users that are in selected_orders
    selected_users = selected_orders.user_id.unique()
    df = priors.loc[priors.user_id.isin(selected_users), ['order_id', 'product_id', 'user_id']]

    print('user related features')
    df = pd.merge(df, users, on='user_id', how='left').rename(columns={
        'nb_orders': 'user_total_orders',
        'total_items': 'user_total_items',
        'average_days_between_orders': 'user_average_days_between_orders',
        'average_basket': 'user_average_basket',
    })

    print('order related features')
    df = pd.merge(df, orders[['order_id', 'order_hour_of_day', 'days_since_prior_order']],
                  on='order_id', how='left')
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    print('product related features')
    df = pd.merge(df, products, on='product_id', how='left').rename(columns={
        'orders': 'product_orders',
        'reorders': 'product_reorders',
        'reorder_rate': 'product_reorder_rate',
    })

    print('user_X_product related features')
    # users' nb_orders was renamed above, so the nb_orders arriving here is the per-pair count.
    df = pd.merge(df, user_product, on=['user_id', 'product_id'], how='left')
    df = df.rename(columns={'nb_orders': 'UP_orders'})
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_average_pos_in_cart'] = (df.add_to_cart_order_sum / df.UP_orders).astype(np.float32)
    # NOTE(review): same expression as UP_orders_ratio; kept so the output columns stay unchanged.
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = (df.user_total_orders - df.last_order_number)
    # Circular distance on a 24h clock: the shorter way round between the two hours.
    hour_gap = (df.order_hour_of_day - df.last_order_hour_of_day).abs()
    df['UP_delta_hour_vs_last'] = np.minimum(hour_gap, 24 - hour_gap).astype(np.int8)
    df = df.drop(['last_order_number', 'last_order_hour_of_day', 'last_order_id'], axis=1)

    if labels_given:
        # Build the labels in a local frame. Assigning to `train` inside this
        # function would make the name local and raise UnboundLocalError on its
        # first use, and writing a column into it would mutate the shared frame.
        labelled = pd.merge(train[['order_id', 'product_id']], orders[['order_id', 'user_id']],
                            on='order_id', how='left')
        labelled['label'] = np.int8(1)
        df = pd.merge(df, labelled[['user_id', 'product_id', 'label']],
                      on=['user_id', 'product_id'], how='left')

    #print(df.dtypes)
    print("features memory in Mb", df.memory_usage().sum()/1000000)
    return df

In [9]:
### train / test orders ##
print('split orders : train, test')
# Rows of `orders` whose eval_set is 'test' and 'train' respectively.
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

split orders : train, test


In [10]:
# labels_given=True so the frame carries the `label` column that is read
# from `df.label` further down; without it that attribute does not exist.
df = features(train_orders, labels_given=True)

build candidate list
user related features
order related features
product related features
user_X_product related features
features memory in Mb 1568.791316


In [20]:
#df.to_csv("./features/df_train_2.csv")
#df.dtypes.to_csv('./features/df_train_dtypes.csv')

In [16]:
# Preview the first rows of the feature frame.
df.head()

Unnamed: 0,order_id,product_id,user_id,user_total_items,total_distinct_items,user_average_days_between_orders,user_total_orders,user_average_basket,order_hour_of_day,days_since_prior_order,...,product_reorders,product_reorder_rate,UP_orders,add_to_cart_order_sum,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,label
0,2,33120,202279,90,47,22.625,9,10.0,9,8.0,...,13744.0,0.708454,5,10,0.555556,2.0,0.555556,1,1,1.0
1,2,28985,202279,90,47,22.625,9,10.0,9,8.0,...,46841.0,0.694465,5,16,0.555556,3.2,0.555556,3,0,
2,2,9327,202279,90,47,22.625,9,10.0,9,8.0,...,995.0,0.157962,1,3,0.111111,3.0,0.111111,6,0,
3,2,45918,202279,90,47,22.625,9,10.0,9,8.0,...,203.0,0.272849,5,24,0.555556,4.8,0.555556,2,5,
4,2,30035,202279,90,47,22.625,9,10.0,9,8.0,...,289.0,0.507909,3,14,0.333333,4.666667,0.333333,2,5,


In [17]:
# Target column of the feature frame, kept under its own name.
labels = df['label']

## Model