In [3]:
import gc
import time
import numpy as np
import pandas as pd

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
# Show what is inside ./input so the reader can confirm the data files are present.
# NOTE(review): the data is loaded from '../input/' further down, while this lists
# './input' -- confirm which directory is the intended one.
input_listing = check_output(["ls", "./input"])
print(input_listing.decode("utf8"))

* Load Data Function

In [6]:
def load_data(path_data):
    '''
    Read all CSV inputs that live under `path_data` into pandas DataFrames.

    Parameters
    ----------
    path_data: str
       Prefix that is concatenated verbatim in front of every file name, so it
       must already end with a path separator (e.g. '../input/').

    Return
    ------
    tuple (priors, train, orders, products, aisles, departments, sample_submission)
    '''
    def read(file_name, **read_options):
        # Every file is addressed as <path_data><file_name>, with no separator added.
        return pd.read_csv(path_data + file_name, **read_options)

    # ----------------------------- order_product -----------------------------
    # Author's note: rows are unique in order_id + product_id.
    # Both order-product files are read with the same compact dtypes.
    order_product_dtypes = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    }
    priors = read('order_products_prior.csv', dtype=order_product_dtypes)
    train = read('order_products_train.csv', dtype=order_product_dtypes)

    # --------------------------------- order ---------------------------------
    # Author's notes:
    # * this file tells us which set (prior, train, test) an order belongs to
    # * rows are unique in order_id
    # * order_id in train, prior, test has no intersection
    # * order_number is the sequence number of the order for its user
    order_dtypes = {
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    }
    orders = read('orders.csv', dtype=order_dtypes)
    # Sanity check kept from the author (order ids of the three sets are disjoint):
    # order_ids_pri = priors.order_id.unique()
    # order_ids_trn = train.order_id.unique()
    # order_ids_tst = orders[orders.eval_set == 'test']['order_id'].unique()
    # print(set(order_ids_pri).intersection(set(order_ids_trn)))
    # print(set(order_ids_pri).intersection(set(order_ids_tst)))
    # print(set(order_ids_trn).intersection(set(order_ids_tst)))

    # -------------------------------- product --------------------------------
    # Author's note: rows are unique in product_id.
    # These files are read with pandas' default dtype inference.
    products = read('products.csv')
    aisles = read("aisles.csv")
    departments = read("departments.csv")
    sample_submission = read("sample_submission.csv")

    return priors, train, orders, products, aisles, departments, sample_submission

class tick_tock:
    '''Context manager that announces and times the block it wraps.

       Parameters
       ----------
       process_name: str
          Label printed in the begin/end messages.
       verbose: int or bool
          When falsy nothing is printed; timing is still recorded.

       Attributes
       ----------
       begin_time: float or None
          time.time() value taken on entry (None before the block is entered).
       elapsed: float or None
          Seconds spent inside the block, set on exit (None until then).

       Example
       -------
       with tick_tock("build features") as timer:
           ...
       timer.elapsed
    '''
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        self.begin_time = None
        self.elapsed = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
        # Always record the start so the timer also works when verbose is off.
        self.begin_time = time.time()
        # Return the instance so `with tick_tock(...) as t` gives access to the timing.
        return self

    def __exit__(self, type, value, traceback):
        self.elapsed = time.time() - self.begin_time
        if self.verbose:
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(self.elapsed))
        # Implicitly returns None: exceptions raised inside the block propagate.
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True, verbose=1):
    '''Create statistical columns: group by [N columns] and compute stats on [N columns]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix (it is not modified)
       group_columns_list: list
          List of columns you want to group with, could be multiple columns
       agg_dict: python dictionary
          {real_column_name: {your_specified_new_column_name: method}}
          A list of methods ({real_column_name: [method, ...]}) is accepted as
          well; the new columns are then named after the methods.
       only_new_feature: bool
          True  -> return only the group columns plus the new stats columns
          False -> return df with the stats columns left-merged onto it
       verbose: int or bool
          Forwarded to tick_tock; falsy silences the timing messages

       Return
       ------
       new pandas dataframe (see only_new_feature)

       Raises
       ------
       TypeError if group_columns_list is not a list
       ValueError if agg_dict is empty

       Example
       -------
       agg_dict = {'user_id':{'prod_tot_cnts':'count'},
                   'reordered':{'reorder_tot_cnts_of_this_prod':'sum'},
                   'user_buy_product_times': {'prod_order_once':lambda x: sum(x==1),
                                              'prod_order_twice':lambda x: sum(x==2)}}
       ka_add_groupby_features_1_vs_n(train, ['product_id'], agg_dict)
    '''
    if not isinstance(group_columns_list, list):
        raise TypeError("group_columns_list should be a list")
    if not agg_dict:
        raise ValueError("agg_dict should name at least one column to aggregate")

    with tick_tock("add stats features", verbose):
        grouped = df.groupby(group_columns_list)

        # Each statistic is computed on its own and renamed explicitly. The nested
        # dict is not handed to GroupBy.agg directly because pandas removed the
        # "nested renamer" form (it raises SpecificationError since pandas 1.0).
        stat_pieces = []
        for column_name, spec in agg_dict.items():
            if isinstance(spec, dict):
                for new_name, method in spec.items():
                    stat_pieces.append(grouped[column_name].agg(method).rename(new_name))
            else:
                # A list (or a single method): columns are named after the methods.
                methods = list(spec) if isinstance(spec, (list, tuple)) else [spec]
                stat_pieces.append(grouped[column_name].agg(methods))

        # All pieces come from the same groupby, so they share one index of group keys.
        the_stats = pd.concat(stat_pieces, axis=1).reset_index()

        if only_new_feature:
            return the_stats
        return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    '''Create statistical columns: group by [N columns] and compute stats on [1 column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix (it is not modified)
       group_columns_list: list
          List of columns you want to group with, could be multiple columns
       target_columns_list: list
          column you want to compute stats on, must be a list with exactly one element
       methods_list: list
          methods that you want to use, anything accepted by groupby(...).agg in Pandas
       keep_only_stats: bool
          True  -> return only the group columns plus the new stats columns
          False -> return df with the stats columns left-merged onto it
       verbose: int or bool
          Forwarded to tick_tock; falsy silences the timing messages

       Return
       ------
       new pandas dataframe (see keep_only_stats). Each stats column is named
       '_<joined group columns>_<method>_by_<target column>'.

       Raises
       ------
       TypeError if any of the three list arguments is not a list
       ValueError if target_columns_list does not hold exactly one column,
       or methods_list is empty

       Example
       -------
       ka_add_groupby_features_n_vs_1(train, group_columns_list=['x0'],
                                      target_columns_list=['x10'], methods_list=['mean'])
    '''
    list_arguments = {
        "group_columns_list": group_columns_list,
        "target_columns_list": target_columns_list,
        "methods_list": methods_list,
    }
    for argument_name, argument_value in list_arguments.items():
        if not isinstance(argument_value, list):
            raise TypeError(argument_name + " should be a list")
    if len(target_columns_list) != 1:
        raise ValueError("target_columns_list should contain exactly one column")
    if not methods_list:
        raise ValueError("methods_list should contain at least one method")

    with tick_tock("add stats features", verbose):
        grouped_name = ''.join(str(column) for column in group_columns_list)
        target_name = target_columns_list[0]

        grouped = df.groupby(group_columns_list)
        the_stats = grouped[target_name].agg(methods_list).reset_index()

        # Callables are labelled by their __name__; plain strings are used as they are.
        stat_columns = ['_%s_%s_by_%s' % (grouped_name, getattr(method, '__name__', method), target_name)
                        for method in methods_list]
        # Keep every group column under its own name so the merge below can still
        # join on group_columns_list, also when there is more than one group column.
        the_stats.columns = list(group_columns_list) + stat_columns

        if keep_only_stats:
            return the_stats
        return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')
    

In [11]:
# Directory prefix of the CSV files; load_data prepends it verbatim to each file
# name, hence the trailing slash.
path_data = '../input/'
(priors, train, orders, products,
 aisles, departments, sample_submission) = load_data(path_data)

In [None]:
# Products information ----------------------------------------------------------------
# add order information to priors set: inner join on order_id, so only orders that
# have rows in `priors` are kept
priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')

# create new variables
# _user_buy_product_times: 1-based running count of the rows seen so far for each
# (user_id, product_id) pair, following the current row order of the frame.
# NOTE(review): this equals "the n-th time the user bought the product" only if the
# rows of each user are ordered chronologically -- confirm against orders.csv.
# The comma in .loc[:, <column>] selects all rows of that column; without it the
# string would be interpreted as a row-label slice.
priors_orders_detail.loc[:, '_user_buy_product_times'] = priors_orders_detail.groupby(
    ['user_id', 'product_id']).cumcount() + 1