### Feature Engineering (capstone2_preprocess.ipynb)
 
#### Product features
 
* number of appearances across all historical orders
* total reorder number
* reorder ratio (product level)
* number of users who purchased this product
* average and standard deviation of add_to_cart_order
* average and standard deviation of purchase day_of_week (Monday, Tuesday, ...)
* average and standard deviation of purchase hour_of_day (8 am, 9am, ...)
* recency (captures whether the product is generally bought more in users' earlier orders or later orders)
* number of orders of user who bought this product 
* number of users who purchased this product only once / more than once

#### User features

* number of Aisles/Departments a user purchased products from
* number of total history orders of a user
* reorder ratio (user level)
* average and standard deviation of days between history orders
* average and standard deviation of number of products purchased in the same order
* number of total / distinct products purchased
* average and standard deviation of add_to_cart_order (user level)
* average and standard deviation of interval between two orders which contained the same product

In [1]:
import numpy as np
import time
from scipy import stats
import pandas as pd
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
class time_it:
    """Context manager that reports the wall-clock duration of a block.

    Parameters
    ----------
    process_name : str
        Label used in the printed begin/end messages.
    verbose : int or bool, default 1
        When falsy, nothing is printed (the start time is still recorded).

    Attributes
    ----------
    begin_time : float or None
        ``time.time()`` captured on entry; ``None`` before the block is entered.
    """

    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        self.begin_time = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + ' begin ...')
        # Record the start time unconditionally so the attribute always exists.
        self.begin_time = time.time()
        # Return the instance so `with time_it(...) as t` binds something useful.
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + ' end ...')
            print('duration {0} s \n'.format(end_time - self.begin_time))
        # Never suppress an exception raised inside the timed block.
        return False

#### Create statistical columns, group by [N columns] and compute stats on [N column]
       
*Parameters*

df: pandas dataframe

    Features matrix

group_columns_list: list like

    List of columns to group with

agg_dict: python dictionary

*Return*

new pandas dataframe with original columns and new added columns

*Example*
```python
{real_column_name:{your_specified_new_column_name:method}}
agg_dict = {'user_id':{'prod_tot_cnts':'count'}, 
            'reordered':{'reordered_tot_cnts_of_this_prod':'sum'}, 
            'user_buy_product_times':{'prod_order_once':lambda x: sum(x==1), 
                                      'prod_order_more_than_once':lambda x: sum(x==2)}}
ka_add_groupby_features_1_vs_n(train, ['product_id'], agg_dict)
```

In [3]:
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    """Group ``df`` and compute named statistics on one or more columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; it is not modified.
    group_columns_list : list
        Columns to group by.
    agg_dict : dict
        ``{source_column: {new_column_name: method}}`` where ``method`` is
        anything accepted by ``SeriesGroupBy.agg`` (a name such as ``'mean'``
        or a callable).
    only_new_feature : bool, default True
        If True, return only the group columns plus the new statistics (one row
        per group). If False, left-merge the statistics back onto ``df``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``group_columns_list`` is not a list.
    """
    with time_it('add stats features'):
        if not isinstance(group_columns_list, list):
            # The original message concatenated an undefined name, so this path
            # raised NameError instead of the intended TypeError.
            error = TypeError('group_columns_list should be a list')
            print(error)
            raise error

        grouped = df.groupby(group_columns_list)

        # Aggregate one output column at a time and name it explicitly. This
        # avoids passing a nested renaming dict to a single agg() call.
        stat_columns = [
            grouped[source_column].agg(method).rename(new_column)
            for source_column, renames in agg_dict.items()
            for new_column, method in renames.items()
        ]
        the_stats = pd.concat(stat_columns, axis=1).reset_index()

        if only_new_feature:
            return the_stats
        return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')

#### LOAD DATA

In [4]:
print('loading prior')
# Read the prior order/product lines, requesting a narrow dtype for each listed column.
priors = pd.read_csv(
    'order_products__prior.csv',
    dtype={
        'order_id': 'int32',
        'product_id': 'uint16',
        'add_to_cart_order': 'int16',
        'reordered': 'int8',
    },
)

loading prior


In [5]:
print('loading train')
# Read the train order/product lines with the same column dtypes as the prior file.
train = pd.read_csv(
    'order_products__train.csv',
    dtype={
        'order_id': 'int32',
        'product_id': 'uint16',
        'add_to_cart_order': 'int16',
        'reordered': 'int8',
    },
)

loading train


In [6]:
print('loading orders')
# Read order-level metadata; eval_set is loaded as a categorical column.
orders = pd.read_csv(
    'orders.csv',
    dtype={
        'order_id': 'int32',
        'user_id': 'int32',
        'eval_set': 'category',
        'order_number': 'int16',
        'order_dow': 'int8',
        'order_hour_of_day': 'int8',
        'days_since_prior_order': 'float32',
    },
)

loading orders


In [7]:
print('loading products')
# Read the product catalogue; aisle_id and department_id are loaded as categoricals.
# NOTE(review): an 'order_id' dtype is requested here as well — confirm that column exists in products.csv.
products = pd.read_csv(
    'products.csv',
    dtype={
        'product_id': 'uint16',
        'order_id': 'int32',
        'aisle_id': 'category',
        'department_id': 'category',
    },
)
# The product name is not used as a feature below, so it is dropped right away.
products = products.drop('product_name', axis=1)

loading products


In [8]:
print('loading aisles.csv')
# Aisle lookup table: numeric id plus a categorical aisle name.
aisles = pd.read_csv(
    'aisles.csv',
    dtype={'aisle_id': 'uint8', 'aisle': 'category'},
)

loading aisles.csv


In [9]:
print('loading departments.csv')
# Department lookup table: numeric id plus a categorical department name.
departments = pd.read_csv(
    'departments.csv',
    dtype={'department_id': 'uint8', 'department': 'category'},
)

loading departments.csv


#### PRODUCT FEATURES

In [10]:
# Announce the product-feature stage and start from an empty frame.
# The following cells add one column per feature to this frame.
print('computing product f')
prod_f = pd.DataFrame()

computing product f


Appear in how many orders

In [11]:
# Number of prior order lines per product.
# Stored as uint32: uint16 tops out at 65,535 and astype() wraps larger counts
# silently, which would also corrupt any ratio derived from this column.
prod_f['appearOrderCount'] = priors.groupby('product_id')['order_id'].count().astype(np.uint32)

Total reorder times

In [12]:
# Sum of the `reordered` flag per product, i.e. how many prior order lines were reorders.
prod_f['reorderCount'] = priors.groupby('product_id')['reordered'].sum().astype(np.float32)

Reorder ratio

In [13]:
# Share of a product's prior order lines that were reorders.
prod_f['reorderRatio'] = prod_f['reorderCount'].div(prod_f['appearOrderCount']).astype(np.float32)

Mean and Std Dev of add_to_cart_order

In [14]:
# Per-product mean and standard deviation of the position at which the product was added to the cart.
prod_f['add_to_cart_orderMean'] = priors.groupby('product_id')['add_to_cart_order'].mean().astype(np.float32)
prod_f['add_to_cart_orderStd'] = priors.groupby('product_id')['add_to_cart_order'].std().astype(np.float32)

In [15]:
# Attach order-level columns (user, order number, day, hour, ...) to every prior order line.
priorsXorders = pd.merge(priors, orders, how='inner', on='order_id')

Bought by how many users

In [16]:
# Number of distinct users who bought each product.
# count() would count purchase rows (one per order line), which duplicates
# appearOrderCount rather than counting users; nunique() counts each user once.
# uint32 avoids the silent wrap-around that uint16 has above 65,535.
prod_f['boughtUserCount'] = priorsXorders.groupby('product_id')['user_id'].nunique().astype(np.uint32)

Bought day of week

In [17]:
# Per-product mean and standard deviation of the day of week on which it was ordered.
prod_f['order_dowMean'] = priorsXorders.groupby('product_id')['order_dow'].mean().astype(np.float32)
prod_f['order_dowStd'] = priorsXorders.groupby('product_id')['order_dow'].std().astype(np.float32)

Bought time

In [18]:
# Per-product mean and standard deviation of the hour of day at which it was ordered.
prod_f['order_hour_of_dayMean'] = priorsXorders.groupby('product_id')['order_hour_of_day'].mean().astype(np.float32)
prod_f['order_hour_of_dayStd'] = priorsXorders.groupby('product_id')['order_hour_of_day'].std().astype(np.float32)

Fill NA Std Dev as 0; if there is only one sample, Std Dev = NA

In [19]:
# Replace missing values (e.g. the Std Dev of a single-sample group) with 0.
prod_f = prod_f.fillna(0)

##### Recently purchased

This feature captures whether the product is generally bought more in users' earlier orders or in their later orders.

In [20]:
# Highest prior order number per user, as a one-column frame indexed by user_id.
maxPriorOrder = (
    priorsXorders.groupby('user_id')['order_number']
    .max()
    .astype(np.float32)
    .rename('userMaxPriorOrderNumber')
    .to_frame()
)
# Broadcast that maximum onto every order line of the user.
priorsXorders = pd.merge(priorsXorders, maxPriorOrder, how='left', left_on='user_id', right_index=True)
# recency = position of the order within the user's history (1.0 for the latest prior order).
priorsXorders['recency'] = priorsXorders['order_number'].div(priorsXorders['userMaxPriorOrderNumber']).astype(np.float32)
prod_f['recencyMean'] = priorsXorders.groupby('product_id')['recency'].mean().astype(np.float32)
prod_f['recencyStd'] = priorsXorders.groupby('product_id')['recency'].std().astype(np.float32)

User who bought this product had how many orders -- userMaxPriorOrderNumber

In [21]:
# Per-product mean and standard deviation of the buyers' maximum prior order number,
# taken over every order line of the product.
prod_f['userMaxPriorOrderNumberMean'] = (
    priorsXorders.groupby('product_id')['userMaxPriorOrderNumber'].mean().astype(np.float32)
)
prod_f['userMaxPriorOrderNumberStd'] = (
    priorsXorders.groupby('product_id')['userMaxPriorOrderNumber'].std().astype(np.float32)
)

Fill NA Std Dev as 0; if there is only one sample, Std Dev = NA

In [22]:
# Replace missing values (e.g. the Std Dev of a single-sample group) with 0.
prod_f = prod_f.fillna(0)

In [23]:
# 1-based running count of each (user, product) pair, in row order.
priorsXorders['_user_buy_product_times'] = priorsXorders.groupby(['user_id', 'product_id']).cumcount().add(1)
# Group those running counts by product for the aggregations in the next cells.
tmp_group = priorsXorders.groupby('product_id')['_user_buy_product_times']

Product ordered once (counts first-time purchases, i.e. rows where `_user_buy_product_times == 1`)

In [24]:
# `_user_buy_product_times` is a 1-based running count per (user, product), so a value
# of 1 marks the first time a user bought the product.
# NOTE(review): this therefore counts every user who ever bought the product, not only
# users who bought it exactly once — confirm the intended meaning.
prod_f['_prod_order_once'] = tmp_group.aggregate(lambda times: (times == 1).sum())

Product order, more than one

In [25]:
# `_user_buy_product_times` is a 1-based running count per (user, product), so a value
# of 2 appears once for every user who bought the product at least twice.
prod_f['_prod_order_more_than_once'] = tmp_group.aggregate(lambda times: (times == 2).sum())

Product in last order = 0; previous order = 1; order before that = 2

In [26]:
# How many orders before the user's latest prior order this line was placed (0 = latest).
priorsXorders['lastNOrder'] = priorsXorders['userMaxPriorOrderNumber'].sub(priorsXorders['order_number'])
# Per-product summary statistics of that distance.
prod_f['apperInLastNOrderMean'] = priorsXorders.groupby('product_id')['lastNOrder'].mean().astype(np.float32)
prod_f['apperInLastNOrderStd'] = priorsXorders.groupby('product_id')['lastNOrder'].std().astype(np.float32)
prod_f['apperInLastNOrderMax'] = priorsXorders.groupby('product_id')['lastNOrder'].max().astype(np.float32)
prod_f['apperInLastNOrderMin'] = priorsXorders.groupby('product_id')['lastNOrder'].min().astype(np.float32)

Fill NA Std Dev as 0; if there is only one sample, Std Dev = NA

In [27]:
# Replace missing values (e.g. the Std Dev of a single-sample group) with 0.
prod_f = prod_f.fillna(0)

In [28]:
# Join the product catalogue with the product features (prod_f is indexed by product_id).
prd = pd.merge(products, prod_f, how='inner', left_on='product_id', right_index=True)
# The standalone feature frame is no longer needed once merged; release it.
del prod_f
gc.collect()

160

#### USER FEATURES

In [29]:
print('building user features...')
# User features are added column by column to this initially empty frame.
user_f = pd.DataFrame()
# Add aisle/department ids to every prior order line.
priorsXordersXproducts = pd.merge(priorsXorders, products, how='inner', on='product_id')

building user features...


Aisle purchases and Department purchases

In [30]:
# Aisle of every purchased product line, grouped per user.
tmp_group = priorsXordersXproducts['aisle_id'].groupby(priorsXordersXproducts.user_id)
# Number of distinct aisles per user. nunique() replaces a Python loop that
# overwrote each row of the unique() result with its length.
# NOTE(review): nunique() ignores missing values — assumes aisle_id has none.
user_f['uniqueAisleCount'] = tmp_group.nunique().astype(np.int16)
# Most frequent aisle per user. Series.mode() is used instead of indexing
# scipy.stats.mode(x)[0][0], whose return shape depends on the SciPy version;
# when several aisles tie, the first value returned by mode() is kept.
user_f['freqAisle'] = tmp_group.agg(lambda x: x.mode().iloc[0]).astype(np.int16)

In [31]:
# Department of every purchased product line, grouped per user.
tmp_group = priorsXordersXproducts['department_id'].groupby(priorsXordersXproducts.user_id)
# Number of distinct departments per user. nunique() replaces a Python loop
# that overwrote each row of the unique() result with its length.
# NOTE(review): nunique() ignores missing values — assumes department_id has none.
user_f['uniqueDepartmentCount'] = tmp_group.nunique().astype(np.int16)
# Most frequent department per user. Series.mode() is used instead of indexing
# scipy.stats.mode(x)[0][0], whose return shape depends on the SciPy version;
# when several departments tie, the first value returned by mode() is kept.
user_f['freqDepartment'] = tmp_group.agg(lambda x: x.mode().iloc[0]).astype(np.int16)

User total orders

In [32]:
# Highest prior order number per user, used as the user's total number of prior orders.
user_f['_user_total_orders'] = priorsXorders.groupby('user_id')['order_number'].max().astype(np.float32)

User mean and std dev of days since prior order

In [33]:
# Mean / Std Dev of the gap between a user's consecutive prior orders.
# Computed on one row per order: the product-level frame repeats each order's
# gap once per product in the basket, which would weight large orders more.
_prior_orders = orders.loc[orders.eval_set == 'prior']
user_f['days_since_prior_orderMean'] = _prior_orders.groupby('user_id')['days_since_prior_order']. \
                                       mean().astype(np.float32)
user_f['days_since_prior_orderStd'] = _prior_orders.groupby('user_id')['days_since_prior_order']. \
                                      std().astype(np.float32)

_user_reorder_ratio, _user_total_products, _user_distinct_products

In [34]:
# Products on every prior order line, grouped per user.
tmp_group = priorsXorders.groupby('user_id')['product_id']
# Total purchased product lines and number of different products per user.
user_f['_user_total_products'] = tmp_group.count()
user_f['_user_distinct_products'] = tmp_group.nunique()

In [35]:
# `reordered` flags grouped per user.
tmp_group = priorsXorders.groupby('user_id')['reordered']
# Reordered lines divided by the lines placed after the user's first order
# (lines in order_number 1 are excluded from the denominator).
user_f['_user_reorder_ratio'] = (
    tmp_group.sum()
    / priorsXorders.loc[priorsXorders['order_number'] > 1].groupby('user_id')['order_number'].count()
).rename('_user_reorder_ratio')

Basket size (number of products per order)

In [36]:
# Number of product lines per order, as a one-column frame indexed by order_id.
# NOTE(review): uint8 assumes no order has more than 255 products — confirm.
order_size = (
    priorsXordersXproducts.groupby('order_id')['product_id']
    .count()
    .astype(np.uint8)
    .rename('order_size')
    .to_frame()
)
# Attach the basket size to every order line.
priorsXordersXproducts = pd.merge(priorsXordersXproducts, order_size,
                                  how='left', left_on='order_id', right_index=True)

In [37]:
# Keep one row per order before aggregating: the product-level frame repeats an
# order's size once per product, so mean/std over it would weight large baskets
# more heavily than small ones. max/min are unaffected by the de-duplication.
_order_level = priorsXordersXproducts.drop_duplicates('order_id')
tmp_group = _order_level['order_size'].groupby(_order_level.user_id)
user_f['order_size_mean'] = tmp_group.mean()
user_f['order_size_std'] = tmp_group.std()
user_f['order_size_max'] = tmp_group.max()
user_f['order_size_min'] = tmp_group.min()

In [38]:
# The non-prior (train/test) order of each user; its gap to the previous order is
# exposed under the name `time_since_last_order`. The index is converted to strings.
us = (
    orders.loc[orders['eval_set'] != "prior",
               ['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
    .rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'})
)

In [39]:
# Attach each user's train/test order to the user features (user_f is indexed by user_id).
user_f = pd.merge(user_f, us, how='inner', left_index=True, right_on='user_id')

#### USER AND PRODUCT

_up_order_count, _up_first_order_number, _up_last_order_number, _up_average_cart_position

In [40]:
# Aggregation spec in the form {source_column: {new_column_name: method}}.
agg_dict_4 = {
    # Statistics over the order numbers in which the user bought the product.
    'order_number': dict(
        _up_order_count='count',
        _up_first_order_number='min',
        _up_last_order_number='max',
        _up_order_past_appears_mean='mean',
        _up_order_past_appears_std='std',
    ),
    # Statistics over the product's position in the user's cart.
    'add_to_cart_order': dict(
        _up_average_cart_position='mean',
        _up_std_cart_position='std',
    ),
}

In [41]:
# One row per (user, product) pair with the statistics described by agg_dict_4.
data = ka_add_groupby_features_1_vs_n(priorsXorders, ['user_id', 'product_id'], agg_dict_4)

add stats features begin ...
add stats features end ...
duration 94.15769910812378 s 



In [42]:
# Std Dev is missing for pairs with a single purchase; treat it as 0.
# Assign the filled columns back to the frame: calling fillna(inplace=True) on
# a column selected with data[...] is a chained update that is not guaranteed
# to write through to `data`.
_std_columns = ['_up_order_past_appears_std', '_up_std_cart_position']
data[_std_columns] = data[_std_columns].fillna(0)
# Add the product-level and user-level features to every (user, product) pair.
data = data.merge(prd, how='inner', on='product_id').merge(user_f, how='inner', on='user_id')

In [43]:
# Share of the user's orders that contained the product.
data['_up_order_rate'] = data['_up_order_count'] / data['_user_total_orders']
# Number of orders placed since the product was last bought, raw and relative to total orders.
data['_up_order_since_last_order'] = data['_user_total_orders'] - data['_up_last_order_number']
data['_up_order_since_last_order_normalize'] = data['_up_order_since_last_order'] / data['_user_total_orders']
# Mean / Std Dev of the order numbers containing the product, scaled by the last such order number.
data['_up_order_past_appears_mean_normalize'] = data['_up_order_past_appears_mean'] / data['_up_last_order_number']
data['_up_order_past_appears_std_normalize'] = data['_up_order_past_appears_std'] / data['_up_last_order_number']
# Order rate counted only from the order in which the product was first bought.
data['_up_order_rate_since_first_order'] = (
    data['_up_order_count'] / (data['_user_total_orders'] - data['_up_first_order_number'] + 1)
)

Intervals (in number of orders) between the past orders in which the user purchased this product

In [44]:
# Order numbers grouped per (user, product) pair.
tmp_group = priorsXorders.groupby(['user_id', 'product_id'])['order_number']
# Mean gap between consecutive purchases; -1 is a placeholder for pairs bought only once.
_up_order_past_appears_interval_mean = tmp_group.agg(
    lambda numbers: np.mean(np.diff(np.sort(numbers))) if len(numbers) > 1 else -1
)
# Std Dev of those gaps (np.std's default, population form); 0 for pairs bought only once.
_up_order_past_appears_interval_std = tmp_group.agg(
    lambda numbers: np.std(np.diff(np.sort(numbers))) if len(numbers) > 1 else 0
)
# Combine both statistics into one frame indexed by (user_id, product_id).
tmp_df = pd.concat(
    {'_up_order_past_appears_interval_mean': _up_order_past_appears_interval_mean,
     '_up_order_past_appears_interval_std': _up_order_past_appears_interval_std},
    axis=1,
)

In [45]:
# Join the interval statistics (indexed by user_id, product_id) using the
# matching column labels of `data` as keys.
data = data.merge(right=tmp_df, left_on=['user_id', 'product_id'], right_index=True, how='left')
# -1 marks pairs bought only once; use the user's total order count as their
# interval instead. Series.mask() writes the result back as a whole column,
# unlike assigning through data[col].loc[...], which is chained assignment and
# is not guaranteed to update `data`.
_interval_mean = data['_up_order_past_appears_interval_mean']
data['_up_order_past_appears_interval_mean'] = _interval_mean.mask(_interval_mean == -1,
                                                                   data['_user_total_orders'])
# Interval statistics relative to the user's total number of orders.
data['_up_order_past_appears_interval_mean_normalize'] = data['_up_order_past_appears_interval_mean'] / data['_user_total_orders']
data['_up_order_past_appears_interval_std_normalize'] = data['_up_order_past_appears_interval_std'] / data['_user_total_orders']
# Mean interval minus the number of orders since the last purchase (measured in orders).
data['_up_order_expect_days_to_order'] = data['_up_order_past_appears_interval_mean'] - data['_up_order_since_last_order']
data['_up_order_expect_days_to_order_normalize'] = data['_up_order_expect_days_to_order'] / data._user_total_orders

Add user_id to train set

In [46]:
# Look up the user behind every train order line.
train = pd.merge(train, orders[['order_id', 'user_id']], how='left', on='order_id')
# Bring the `reordered` label onto the matching (user, product) pairs; pairs
# without a train order line get a missing value.
data = pd.merge(data, train[['user_id', 'product_id', 'reordered']],
                how='left', on=['user_id', 'product_id'])

In [47]:
# Rows belonging to users whose latest order is in the train set.
train = data[data['eval_set'] == 'train']
# A missing label means the pair was not matched in the train order lines: label it 0.
train.loc[:, 'reordered'] = train['reordered'].fillna(0)
# Rows belonging to users whose latest order is in the test set.
# NOTE(review): X_test keeps the `reordered` column while X_train drops it — confirm downstream handling.
X_test = data[data['eval_set'] == 'test']
# Split the train rows into features and target.
X_train = train.drop(labels=['reordered'], axis=1)
y_train = train['reordered']

In [48]:
# Persist the three datasets as HDF5 tables, one file each (existing files are overwritten).
X_train.to_hdf('X_train_w32.h5', key='X_train', mode='w', format='table')
y_train.to_hdf('y_train_w32.h5', key='y_train', mode='w', format='table')
X_test.to_hdf('X_test_w32.h5', key='X_test', mode='w', format='table')