In [1]:
# Author : Trong Canh Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import gc
import pickle

import numpy as np
import pandas as pd

# Presumably the directory of the raw input data; it is not referenced in
# this part of the notebook.
IDIR = '../input/'
# Directory of the feature files (features.h5, data.h5, dtypes.pickle, ...)
# that the cells below read from and write to.
FEATURES_PATH = './features3/'

In [None]:
# Force a full garbage-collection pass; the value it returns is shown as the
# cell output.
gc.collect()

## Data Construction

In [None]:
# Column name -> NumPy dtype for the feature tables.
# Used as the `dtype=` argument when a table is read back from CSV, and
# pickled to disk by the following cell.
dtype_dict = dict(
    # --- identifiers ---
    user_id=np.int32,
    product_id=np.int32,
    # --- user x product columns ---
    up_orders=np.int16,
    up_first_order=np.int16,
    up_last_order=np.int16,
    up_add_to_cart_order_mean=np.float32,
    up_order_rate=np.float32,
    up_order_rate_since_first_order=np.float32,
    up_orders_since_last_order=np.float32,
    up_days_since_last_order=np.int16,
    up_in_same_day_previous_order=np.int8,
    # --- user columns ---
    user_total_order=np.int16,
    user_order_size_mean=np.float32,
    user_reorder_rate=np.float32,
    user_days_since_last_order=np.float32,
    # --- up_reordered (declared as float here; cast to uint8 after the merges) ---
    up_reordered=np.float32,
    # --- product columns ---
    product_reorder_ratio=np.float32,
    # --- aisle / department columns ---
    aisle_id=np.int16,
    department_id=np.int16,
    aisle_reorder_rate=np.float32,
    dep_reorder_rate=np.float32,
    user_aisle_reorder_rate=np.float32,
    user_dep_reorder_rate=np.float32,
)

In [None]:
# Persist the dtype mapping next to the feature files (presumably so other
# notebooks can reload it instead of redefining it).
# `pickle` is imported in the notebook's top import cell rather than here, so
# all dependencies are visible in one place.
with open(FEATURES_PATH + 'dtypes.pickle', 'wb') as f:
    pickle.dump(dtype_dict, f)

### Preload if necessary

In [None]:
# Optional shortcut: reload the feature table from its CSV export, applying
# the compact dtypes declared in `dtype_dict`.
data = pd.read_csv(FEATURES_PATH + 'data.csv', dtype=dtype_dict)
# memory_usage() reports bytes, so dividing by 1,000,000 yields megabytes;
# the label previously claimed "Gb", overstating the unit by a factor of 1000.
print("Memory (MB):", data.memory_usage().sum()/1000000)

### Append features

In [2]:
# Seed the feature table with the "up_info" frame stored in features.h5.
# reset_index() turns the stored index back into ordinary columns
# (presumably user_id / product_id, which the later merges join on).
# An earlier revision read a CSV export here with dtype=dtype_dict.
data = pd.read_hdf(FEATURES_PATH + "features.h5", "up_info").reset_index()
print('done')

done


In [3]:
# Attach the "up_order_rates" table on (user_id, product_id).
# A left join keeps every row already present in `data`.
# CSV alternative (earlier revision): "up_order_rates.csv" read with dtype=dtype_dict.
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_order_rates").reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])
print('done')

done


In [4]:
# Attach the "up_days_since_last_order" table on (user_id, product_id).
# A left join keeps every row already present in `data`.
# CSV alternative (earlier revision): "up_days_since_last_order.csv" read with dtype=dtype_dict.
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_days_since_last_order").reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])
print('done')

done


In [5]:
# Attach the "up_add_to_cart_order_mean" table on (user_id, product_id).
# A left join keeps every row already present in `data`.
# CSV alternative (earlier revision): "up_add_to_cart_order_mean.csv" read with dtype=dtype_dict.
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_add_to_cart_order_mean").reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])
print('done')

done


In [6]:
# Attach the "up_reordered" table on (user_id, product_id).
# A left join keeps every row already present in `data`.
# `tmp` is left defined on purpose: the next cell deletes it explicitly.
# CSV alternative (earlier revision): "up_reordered.csv" read with dtype=dtype_dict.
tmp = pd.read_hdf(FEATURES_PATH + "features.h5", "up_reordered").reset_index()
data = data.merge(tmp, how='left', on=["user_id", "product_id"])
print('done')

done


In [7]:
# Drop the scratch frame left over from the last merge so it can be reclaimed.
del tmp
# memory_usage() reports bytes, so the printed figure is in megabytes.
print("data memory", data.memory_usage().sum()/1000000)
# Force a collection pass; its return value is displayed as the cell output.
gc.collect()

data memory 958.172616


168

In [None]:
# Preview the first rows of the merged user x product table.
data.head()

### User info

In [8]:
# Load the "user_info" table from features.h5; reset_index() exposes the
# stored index as regular columns so it can be joined on below.
# CSV alternative (earlier revision): "user_info.csv" read with dtype=dtype_dict.
user_info = (
    pd.read_hdf(FEATURES_PATH + "features.h5", "user_info")
    .reset_index()
)

In [None]:
# Preview the first rows of the per-user table before merging it.
user_info.head()

In [9]:
# Broadcast the user_info columns onto every row of that user (left join on user_id).
data = data.merge(user_info, how='left', on=["user_id"])

### Product info

In [10]:
# Load the "product_info" table from features.h5 and show its first rows.
# CSV alternative (earlier revision): "product_info.csv" read with dtype=dtype_dict.
product_info = (
    pd.read_hdf(FEATURES_PATH + "features.h5", "product_info")
    .reset_index()
)
product_info.head()

Unnamed: 0,product_id,product_reorder_ratio,aisle_id,department_id,product_reorder_probability,is_organic
0,1,0.6136,61,19,0.385475,0
1,2,0.142857,104,13,0.102564,0
2,3,0.733813,94,7,0.486486,0
3,4,0.448485,38,1,0.351648,0
4,5,0.625,5,13,0.666667,0


In [11]:
# Broadcast the product_info columns onto every row of that product
# (left join on product_id); this also brings in aisle_id / department_id,
# which the aisle and department merges below join on.
data = data.merge(product_info, how='left', on=["product_id"])

In [12]:
# memory_usage() reports bytes, so the printed figure is in megabytes.
print("data memory", data.memory_usage().sum()/1000000)
# Force a collection pass; its return value is displayed as the cell output.
gc.collect()

data memory 1583.646407


95

### Aisle & Department

In [13]:
#aisle_info = pd.read_csv(FEATURES_PATH + "aisle_info.csv", dtype= dtype_dict)
aisle_info = pd.read_hdf(FEATURES_PATH + "features.h5", "aisle_info").reset_index()
data = pd.merge(data, aisle_info, on=["aisle_id"], how = 'left')

In [14]:
#department_info = pd.read_csv(FEATURES_PATH + "department_info.csv", dtype= dtype_dict)
department_info = pd.read_hdf(FEATURES_PATH + "features.h5", "department_info").reset_index()
data = pd.merge(data, department_info, on=["department_id"], how = 'left')

In [15]:
#user_aisle_info = pd.read_csv(FEATURES_PATH + "user_aisle_info.csv", dtype= dtype_dict)
user_aisle_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_aisle_info").reset_index()
data = pd.merge(data, user_aisle_info, on=["user_id", "aisle_id"], how = 'left')
print("merge user_aisle_info")

merge user_aisle_info


In [16]:
#user_dep_info = pd.read_csv(FEATURES_PATH + "user_dep_info.csv", dtype= dtype_dict)
user_dep_info = pd.read_hdf(FEATURES_PATH + "features.h5", "user_dep_info").reset_index()
data = pd.merge(data, user_dep_info, on=["user_id", "department_id"], how = 'left')
print("merge user_dep_info")

merge user_dep_info


In [17]:
# Sanity check: the left joins above must neither drop nor duplicate rows, so
# the frame should still hold the expected 13,307,953 rows.
print(data.shape[0] == 13307953)

True


In [18]:
# Store up_reordered as an unsigned 8-bit integer column.
data["up_reordered"] = data["up_reordered"].astype(np.uint8)

### Computed features

In [19]:
# up_recency = up_last_order / user_total_order, stored as float32.
data['up_recency'] = (
    data['up_last_order']
    .div(data['user_total_order'])
    .astype(np.float32)
)
# Disabled feature, kept for reference:
#data['up_in_same_day_previous_order'] = ((data.user_days_since_last_order == 0) &  (data.up_orders_since_last_order == 0)).astype(np.int8)

## SAVE

In [20]:
# Force a collection pass before writing the table to disk; the return value
# is displayed as the cell output.
gc.collect()

140

In [21]:
# Index the final table by (user_id, product_id) and persist it to HDF5.
# NOTE: the in-place set_index consumes the two key columns, so this cell
# cannot be re-run without rebuilding `data` first.
data.set_index(["user_id", "product_id"], inplace = True)
# Same target file as before: the redundant "./" segment is dropped so the
# path is spelled exactly like the one used when data.h5 is read back.
# `key` is passed by keyword because recent pandas releases deprecate
# positional arguments to to_hdf() after the path.
data.to_hdf(FEATURES_PATH + "data.h5", key="data")

In [22]:
# Count missing values per column; a non-zero count would mean a left join
# above found no match for some rows.
data.isnull().sum()

up_days_since_prior_order_mean        0
up_order_dow_mean                     0
up_order_hour_of_day_mean             0
up_add_to_cart_order_relative_mean    0
up_orders                             0
up_first_order                        0
up_last_order                         0
up_order_rate                         0
up_orders_since_last_order            0
up_order_rate_since_first_order       0
up_days_since_last_order              0
up_add_to_cart_order_mean             0
up_reordered                          0
user_total_order                      0
user_reorder_rate                     0
user_order_size_mean                  0
user_days_since_prior_mean            0
user_order_dow                        0
user_order_hour_of_day                0
user_eval_set                         0
user_days_since_prior_order           0
user_order_dow_mean                   0
user_order_hour_of_day_mean           0
product_reorder_ratio                 0
aisle_id                              0


# FEATURE ENGINEERING

In [None]:
# Reload the merged feature table written to data.h5 by the save step.
data = pd.read_hdf(FEATURES_PATH + "data.h5", key="data")

In [None]:
# Show every column name of the reloaded table as a plain Python list.
data.columns.tolist()

### Merge