In [None]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import gc
import pickle

import numpy as np
import pandas as pd

# Presumably the raw input directory; not referenced in the cells shown here.
IDIR = '../input/'
# Directory the feature files (CSV / HDF5 / pickle) are read from and written to.
FEATURES_PATH = './features3/'

In [None]:
# Force a garbage-collection pass; as the last expression of the cell, the
# call's return value is what the notebook displays.
gc.collect()

## Data Construction

In [None]:
# Column name -> numpy dtype, used when reading the feature CSV files.
# Insertion order matches the original literal.
dtype_dict = dict(
    # join keys
    user_id=np.int32,
    product_id=np.int32,

    # user x product columns
    up_orders=np.int16,
    up_first_order=np.int16,
    up_last_order=np.int16,
    up_add_to_cart_order_mean=np.float32,
    up_order_rate=np.float32,
    up_order_rate_since_first_order=np.float32,
    up_orders_since_last_order=np.float32,
    up_days_since_last_order=np.int16,
    up_in_same_day_previous_order=np.int8,

    # user columns
    user_total_order=np.int16,
    user_order_size_mean=np.float32,
    user_reorder_rate=np.float32,
    user_days_since_last_order=np.float32,

    up_reordered=np.float32,

    # product columns
    product_reorder_ratio=np.float32,

    # aisle / department columns
    aisle_id=np.int16,
    department_id=np.int16,
    aisle_reorder_rate=np.float32,
    dep_reorder_rate=np.float32,
    user_aisle_reorder_rate=np.float32,
    user_dep_reorder_rate=np.float32,
)

In [None]:
# Persist the column -> dtype mapping as dtypes.pickle under FEATURES_PATH
# (presumably so other scripts can reuse the same dtypes -- confirm).
# `pickle` is imported with the other modules in the notebook's import cell
# rather than in the middle of the notebook.
with open(FEATURES_PATH + 'dtypes.pickle', 'wb') as f:
    pickle.dump(dtype_dict, f)

### Preload if necessary

In [None]:
# Optional shortcut: load data.csv with the explicit column dtypes.
# Note: the "Append features" cells that follow reassign `data` from
# features.h5, so running them discards whatever is loaded here.
data = pd.read_csv(FEATURES_PATH + 'data.csv', dtype= dtype_dict)
# memory_usage() reports bytes, so dividing by 1,000,000 gives megabytes;
# the label previously claimed "Gb".
print("Memory (MB):", data.memory_usage().sum()/1000000)

### Append features

In [None]:
# Start the feature table from the "up_order_rates" entry of features.h5 and
# turn its index back into ordinary columns.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "up_order_rates.csv", dtype= dtype_dict)
data = pd.read_hdf(FEATURES_PATH + "features.h5", "up_order_rates").reset_index()

In [None]:
# Load the "up_days_since_last_order" entry of features.h5 and left-join it
# onto `data` by (user_id, product_id).
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "up_days_since_last_order.csv", dtype= dtype_dict)
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_days_since_last_order")
tmp = tmp.reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])

In [None]:
# Load the "up_add_to_cart_order_mean" entry of features.h5 and left-join it
# onto `data` by (user_id, product_id).
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "up_add_to_cart_order_mean.csv", dtype= dtype_dict)
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_add_to_cart_order_mean")
tmp = tmp.reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])

In [None]:
# Load the "up_reordered" entry of features.h5 and left-join it onto `data`
# by (user_id, product_id).
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "up_reordered.csv", dtype= dtype_dict)
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_reordered")
tmp = tmp.reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])

In [None]:
# Report the size of `data` (bytes / 1,000,000), then drop the scratch frame
# used by the merges above and run a garbage-collection pass.
print("data memory", data.memory_usage().sum() / 10**6)
del tmp
gc.collect()

In [None]:
# Preview the first rows of the table after the user x product merges.
data.head()

### User info

In [None]:
# Load the "user_info" entry of features.h5 with its index as regular columns.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "user_info.csv", dtype= dtype_dict)
user_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_info")
user_info = user_info.reset_index()

In [None]:
# Preview the first rows of the user_info frame before merging it.
user_info.head()

In [None]:
# Left-join the user_info columns onto every row of `data` by user_id.
data = data.merge(user_info, how='left', on=["user_id"])

### Product info

In [None]:
# Load the "product_info" entry of features.h5 with its index as regular
# columns, then show the first rows.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "product_info.csv", dtype= dtype_dict)
product_info = pd.read_hdf(FEATURES_PATH + "features.h5", "product_info")
product_info = product_info.reset_index()
product_info.head()

In [None]:
# Left-join the product_info columns onto every row of `data` by product_id.
data = data.merge(product_info, how='left', on=["product_id"])

In [None]:
# Report the size of `data` (bytes / 1,000,000), then run a
# garbage-collection pass.
print("data memory", data.memory_usage().sum() / 10**6)
gc.collect()

### Aisle & Department

In [None]:
# Load the "aisle_info" entry of features.h5 and left-join it onto `data`
# by aisle_id.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "aisle_info.csv", dtype= dtype_dict)
aisle_info = pd.read_hdf(FEATURES_PATH + "features.h5", "aisle_info")
aisle_info = aisle_info.reset_index()
data = data.merge(aisle_info, how='left', on=["aisle_id"])

In [None]:
# Load the "department_info" entry of features.h5 and left-join it onto
# `data` by department_id.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "department_info.csv", dtype= dtype_dict)
department_info = pd.read_hdf(FEATURES_PATH + "features.h5", "department_info")
department_info = department_info.reset_index()
data = data.merge(department_info, how='left', on=["department_id"])

In [None]:
# Load the "user_aisle_info" entry of features.h5 and left-join it onto
# `data` by (user_id, aisle_id); print a marker once the merge is done.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "user_aisle_info.csv", dtype= dtype_dict)
user_aisle_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_aisle_info")
user_aisle_info = user_aisle_info.reset_index()
data = data.merge(user_aisle_info, how='left', on=["user_id", "aisle_id"])
print("merge user_aisle_info")

In [None]:
# Load the "user_dep_info" entry of features.h5 and left-join it onto `data`
# by (user_id, department_id); print a marker once the merge is done.
# CSV alternative:
#   pd.read_csv(FEATURES_PATH + "user_dep_info.csv", dtype= dtype_dict)
user_dep_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_dep_info")
user_dep_info = user_dep_info.reset_index()
data = data.merge(user_dep_info, how='left', on=["user_id", "department_id"])
print("merge user_dep_info")

In [None]:
# Display the dtype of every column after all merges.
data.dtypes

### Computed features

#### Flag: the order is on the same day as the previous order and the product was in that previous order

In [None]:
# 1 where both user_days_since_last_order and up_orders_since_last_order are
# exactly 0, otherwise 0 (rows with missing values compare unequal, so they
# get 0). Stored as int8.
data['up_in_same_day_previous_order'] = (
    data['user_days_since_last_order'].eq(0)
    & data['up_orders_since_last_order'].eq(0)
).astype(np.int8)

## SAVE

In [None]:
# Disabled CSV export, kept for reference only; nothing in this cell runs.
#data.set_index(["user_id", "product_id"], inplace = True)
#data.to_csv(FEATURES_PATH + "./data4.csv")

In [None]:
# Index the assembled table by (user_id, product_id) and write it to HDF5
# under the key "data".
# Because set_index mutates `data` in place, re-running this cell on the same
# kernel raises a KeyError: the two columns have already moved into the index.
data.set_index(["user_id", "product_id"], inplace = True)
# `key` is passed by keyword: recent pandas releases deprecate positional
# arguments to to_hdf other than the path.
data.to_hdf(FEATURES_PATH + "./data.h5", key="data")