In [1]:
import numpy as np
import pandas as pd
import gc
IDIR = '../input/'
FEATURES_PATH = './features3/'

In [None]:
gc.collect()

In [2]:
# Load every raw table used by the feature sections of this notebook.
#
# `priors` and `trains` are read from the HDF5 cache `input.h5` when it is
# available. If the cache file (or the requested key) is missing they are
# parsed from the original CSV exports instead, so this cell also runs on a
# fresh checkout where the cache has not been written yet.
ORDER_PRODUCTS_DTYPES = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8}


def load_order_products(key, csv_name):
    """Return one order/product table, preferring the HDF5 cache over the CSV.

    Parameters
    ----------
    key : str
        Name of the table inside ``IDIR + "input.h5"``.
    csv_name : str
        File name (relative to ``IDIR``) of the CSV parsed when the cache
        cannot be read.

    Returns
    -------
    pandas.DataFrame
        The cached table, or the CSV parsed with ``ORDER_PRODUCTS_DTYPES``.
    """
    try:
        return pd.read_hdf(IDIR + "input.h5", key=key)
    except (FileNotFoundError, KeyError):
        # FileNotFoundError: the cache file does not exist yet.
        # KeyError: the file exists but this table was never stored in it.
        return pd.read_csv(IDIR + csv_name, dtype=ORDER_PRODUCTS_DTYPES)


print('loading prior')
priors = load_order_products("priors", 'order_products__prior.csv')

print('loading train')
trains = load_order_products("trains", 'order_products__train.csv')

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
# Only the columns named in `usecols` are read, so only those need a dtype.
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('loading aisle')
aisles = pd.read_csv(IDIR + 'aisles.csv', dtype={
        'aisle_id': np.int16 })

print('loading department')
departments = pd.read_csv(IDIR + 'departments.csv', dtype={
        'department_id': np.int16 })


# Quick sanity summary: shape and column names of the three largest tables.
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('trains {}: {}'.format(trains.shape, ', '.join(trains.columns)))

loading prior
loading train
loading orders
loading products
loading aisle
loading department
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
trains (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [None]:
# Cache the two order/product tables in a single HDF5 file so later sessions
# can skip CSV parsing. The first write uses mode="w" (recreates the file),
# the second appends to it. `key` is passed by keyword because newer pandas
# releases no longer accept it positionally.
priors.to_hdf(IDIR + "input.h5", key="priors", mode="w")
trains.to_hdf(IDIR + "input.h5", key="trains", mode="a")

# USER PRODUCT

## Days since the user last ordered each product

In [None]:
# Running total of `days_since_prior_order` within each user.
# Assumes `orders` rows are already in chronological order per user - TODO confirm.
orders["days_since_first_order"] = orders.groupby('user_id')['days_since_prior_order'].cumsum()

# Attach the order-level columns to every prior line item. Only columns that
# `priors` does not carry yet are merged, so re-running this cell (or running it
# after another section has already merged some order columns) does not create
# `_x` / `_y` suffixed duplicates.
order_cols_to_add = [col for col in orders.columns if col not in priors.columns]
if order_cols_to_add:
    priors = pd.merge(priors, orders[['order_id'] + order_cols_to_add],
                      on='order_id', how='left')

In [None]:
priors.head()

In [None]:
# Largest cumulative-day stamp at which each (user, product) pair occurs in priors.
latest_purchase = (
    priors.groupby(['user_id', 'product_id'])[['days_since_first_order']]
    .max()
    .reset_index()
)

# Cumulative-day stamp of every non-"prior" order.
# Presumably one train/test order per user - TODO confirm.
user_info = orders.loc[
    orders.eval_set != "prior", ['user_id', 'days_since_first_order']
].rename(columns={"days_since_first_order": "last_order_days_since_first_order"})

up_info = latest_purchase.merge(user_info, on="user_id", how="left")

# A missing stamp is treated as day 0 before taking the difference.
product_day = up_info['days_since_first_order'].fillna(0)
day_gap = up_info['last_order_days_since_first_order'] - product_day
# int16 cast assumes no NaN is left and the gap fits in 16 bits - TODO confirm.
up_info['up_days_since_last_order'] = day_gap.astype(np.int16)

# Keep only the keys and the finished feature.
up_info = up_info.drop(['days_since_first_order', 'last_order_days_since_first_order'], axis=1)



In [None]:
#up_info.set_index(['user_id', 'product_id']).to_csv(FEATURES_PATH +  "up_days_since_last_order.csv")

In [None]:
# Store the feature indexed by (user_id, product_id) in the shared feature file.
# `key` is passed by keyword because newer pandas releases no longer accept it
# positionally.
up_info.set_index(['user_id', 'product_id']).to_hdf(
    FEATURES_PATH + "features.h5", key="up_days_since_last_order", mode='a')

In [None]:
up_info.head()

## Product order rate and number of orders since the product was last ordered

In [None]:
#priors = pd.merge(priors, orders, on="order_id", how="inner")

In [None]:
# Highest `order_number` among each user's "prior" orders, exposed as `user_orders`.
prior_orders = orders.loc[orders.eval_set == "prior"]
user_info = (
    prior_orders.groupby('user_id')['order_number']
    .max()
    .rename('user_orders')
    .reset_index()
)

In [None]:
user_info.head()

In [None]:
# One row per (user, product): how many prior line items there are, and the
# first / last `order_number` in which the pair occurs.
pair_groups = priors.groupby(['user_id', 'product_id'])

up_info = pair_groups.size().astype(np.int16).to_frame('up_orders')

order_span = pair_groups['order_number'].agg(['min', 'max'])
up_info['up_first_order'] = order_span['min']
up_info['up_last_order'] = order_span['max']

# Move the keys back into columns and store both as int32.
up_info = up_info.reset_index()
up_info['user_id'] = up_info['user_id'].astype(np.int32)
up_info['product_id'] = up_info['product_id'].astype(np.int32)

In [None]:
# Bring in each user's order count (`user_orders`) to normalise the pair counts.
up_info = up_info.merge(user_info, on='user_id', how='left')

pair_orders = up_info['up_orders']
total_orders = up_info['user_orders']

# Share of the user's orders that contain the product.
up_info['up_order_rate'] = (pair_orders / total_orders).astype(np.float32)

# Number of orders between the pair's last occurrence and the user's last prior order.
up_info['up_orders_since_last_order'] = (total_orders - up_info['up_last_order']).astype(np.float32)

# Same rate, but counted only from the order in which the pair first occurs.
orders_since_first = total_orders - up_info['up_first_order'] + 1
up_info['up_order_rate_since_first_order'] = (pair_orders / orders_since_first).astype(np.float32)

# `user_orders` was only a helper column; it is not part of this feature set.
up_info = up_info.drop(['user_orders'], axis=1)

In [None]:
up_info.head()

In [None]:
# Index by (user_id, product_id) and append to the shared feature file.
# `key` is passed by keyword because newer pandas releases no longer accept it
# positionally.
up_info.set_index(['user_id', 'product_id'], inplace = True)
up_info.to_hdf(FEATURES_PATH + "features.h5", key="up_order_rates", mode='a')
#up_info.to_csv(FEATURES_PATH +  "up_order_rates.csv")

## Average add to cart order

In [None]:
# Mean `add_to_cart_order` of each (user, product) pair, stored as float32 in
# a one-column frame indexed by (user_id, product_id).
up_info = (
    priors.groupby(['user_id', 'product_id'])['add_to_cart_order']
    .mean()
    .astype(np.float32)
    .to_frame('up_add_to_cart_order_mean')
)

In [None]:
up_info.head()

In [None]:
# Append the feature to the shared feature file. `key` is passed by keyword
# because newer pandas releases no longer accept it positionally.
up_info.to_hdf(FEATURES_PATH + "features.h5", key="up_add_to_cart_order_mean", mode='a')
#up_info.to_csv(FEATURES_PATH +  "up_add_to_cart_order_mean.csv")

## Reordered in the train set

In [None]:
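# Reset note: this section expects `priors` / `trains` that do not yet carry a `user_id` column; merging it a second time would yield `user_id_x` / `user_id_y`.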

In [None]:
# Label every (user, product) pair seen in priors with the `reordered` value of
# the matching train line item; pairs with no train line item get 0.

# Attach `user_id` only where it is missing. Merging it into a frame that
# already has the column would produce `user_id_x` / `user_id_y` and break the
# group-by below, so the guard lets this cell run after earlier sections and
# be re-run safely.
if 'user_id' not in trains.columns:
    trains = pd.merge(trains, orders[['order_id', 'user_id']], on='order_id', how='left')
if 'user_id' not in priors.columns:
    priors = pd.merge(priors, orders[['order_id', 'user_id']], on="order_id", how="inner")

# Distinct (user, product) pairs from priors; the group sizes are not needed.
pair_sizes = priors.groupby(['user_id', 'product_id']).size().reset_index()
up_info = pair_sizes[['user_id', 'product_id']]

up_info = pd.merge(up_info, trains[['user_id', 'product_id', 'reordered']],
                   on=['user_id', 'product_id'], how='left')

# Assign the filled column back explicitly: an in-place fillna on the
# attribute-accessed column is a chained operation that does not update the
# frame under pandas copy-on-write.
up_info['reordered'] = up_info['reordered'].fillna(0)
up_info = up_info.rename(columns={"reordered": "up_reordered"})

In [None]:
up_info.head()

In [None]:
# Index by (user_id, product_id) and append to the shared feature file.
# `key` is passed by keyword because newer pandas releases no longer accept it
# positionally.
up_info.set_index(['user_id','product_id'], inplace=True)
up_info.to_hdf(FEATURES_PATH + "features.h5", key="up_reordered", mode='a')
#up_info.to_csv(FEATURES_PATH + "up_reordered.csv")

# USER

In [None]:
#priors = pd.merge(priors, orders[['order_id', 'user_id']], on="order_id", how="inner")

In [None]:
# Per-user base table: highest prior `order_number`, plus the eval set and the
# `days_since_prior_order` of the user's non-"prior" order(s).
is_prior = orders.eval_set == "prior"

user_total_order = (
    orders.loc[is_prior]
    .groupby('user_id')['order_number']
    .max()
    .rename('user_total_order')
    .reset_index()
)

# Rows that are not "prior"; presumably one train/test order per user - TODO confirm.
final_orders = orders.loc[~is_prior, ['user_id', 'eval_set', 'days_since_prior_order']].rename(
    columns={'days_since_prior_order': 'user_days_since_last_order',
             'eval_set': 'user_eval_set'})

# Inner join: users without a non-"prior" order are dropped.
user_info = pd.merge(user_total_order, final_orders, on='user_id', how='inner')

In [None]:
# Fraction of each user's prior line items flagged `reordered`.
# Requires `priors` to already carry a `user_id` column.
reordered_by_user = priors.groupby(['user_id'])['reordered']
user_reorder_rate = (
    (reordered_by_user.sum() / reordered_by_user.size())
    .astype(np.float32)
    .rename('user_reorder_rate')
    .reset_index()
)
user_info = pd.merge(user_info, user_reorder_rate, on="user_id", how='left')
# The intermediate frame is no longer needed once merged.
del user_reorder_rate

In [None]:
# Number of line items in every prior order.
orders_count = priors.groupby('order_id')['product_id'].count()

# Look up the basket size of each "prior" order, then average per user.
prior_orders = orders.loc[orders.eval_set == "prior", ['user_id', 'order_id']]
basket_sizes = prior_orders['order_id'].map(orders_count)
user_basket_avg = (
    basket_sizes.groupby(prior_orders['user_id'])
    .mean()
    .rename('user_order_size_mean')
    .reset_index()
)

user_info = pd.merge(user_info, user_basket_avg, on="user_id", how='left')
# The intermediate frame is no longer needed once merged.
del user_basket_avg

In [None]:
user_info.head()

In [None]:
user_info.dtypes

In [None]:
# Store the per-user features indexed by user_id in the shared feature file.
# `key` is passed by keyword because newer pandas releases no longer accept it
# positionally.
user_info.set_index('user_id').to_hdf(
    FEATURES_PATH + "features.h5", key="user_info", mode='a')
#user_info.set_index('user_id').to_csv(FEATURES_PATH + "user_info.csv")

# PRODUCT

In [4]:
# Smoothed reorder ratio per product: (reordered line items + 1) / (line items + 1).
reordered_by_product = priors.groupby('product_id')['reordered']
product_reorder_sum = reordered_by_product.sum()
product_reorder_size = reordered_by_product.size()
product_reorder_ratio = (product_reorder_sum + 1) / (product_reorder_size + 1)

# Turn the ratio into a frame keyed by product_id and attach aisle / department ids.
product_info = (
    product_reorder_ratio.to_frame()
    .rename(columns={"reordered": "product_reorder_ratio"})
    .reset_index()
)
product_info = product_info.merge(products, on="product_id", how="inner")

In [5]:
product_info.head()

Unnamed: 0,product_id,product_reorder_ratio,aisle_id,department_id
0,1,0.6136,61,19
1,2,0.142857,104,13
2,3,0.733813,94,7
3,4,0.448485,38,1
4,5,0.625,5,13


In [6]:
# Show only the ten products with the lowest smoothed reorder ratio instead of
# rendering the whole sorted frame.
product_info.sort_values(by="product_reorder_ratio").head(10)

Unnamed: 0,product_id,product_reorder_ratio,aisle_id,department_id
11668,11672,0.009804,104,13
32562,32568,0.011236,11,11
36897,36904,0.014706,25,11
10372,10376,0.014815,104,13
37666,37673,0.015625,104,13
24360,24364,0.017647,104,13
22335,22339,0.018519,114,17
28110,28116,0.018868,104,13
13231,13235,0.018868,104,13
24318,24322,0.019231,10,17


In [7]:
# Store the per-product features indexed by product_id in the shared feature
# file. `key` is passed by keyword because newer pandas releases no longer
# accept it positionally.
product_info.set_index('product_id').to_hdf(
    FEATURES_PATH + "features.h5", key="product_info", mode='a')
#product_info.set_index('product_id').to_csv(FEATURES_PATH +  "product_info.csv")

# AISLE DEPARTMENT

In [None]:
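# Reset note: this section expects `priors` without `user_id`, `order_number`, `aisle_id` or `department_id` columns yet; merging them a second time would yield `_x` / `_y` suffixed duplicates.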

In [3]:
# Give every prior line item its user / order number and its aisle / department.
# Only columns that `priors` does not have yet are merged, so the cell works on
# freshly loaded data as well as after earlier sections have already attached
# some of them, and can be re-run without creating `_x` / `_y` duplicates.
order_cols_missing = [col for col in ('user_id', 'order_number') if col not in priors.columns]
if order_cols_missing:
    priors = pd.merge(priors, orders[['order_id'] + order_cols_missing],
                      on='order_id', how='left')

product_cols_missing = [col for col in ('aisle_id', 'department_id') if col not in priors.columns]
if product_cols_missing:
    priors = pd.merge(priors, products[['product_id'] + product_cols_missing],
                      on='product_id', how='left')

In [4]:
# Smoothed reorder ratio per aisle: (reordered line items + 1) / (line items + 1),
# stored as float32 in a one-column frame indexed by aisle_id.
reordered_by_aisle = priors.groupby('aisle_id')['reordered']
aisle_reordered_sum = reordered_by_aisle.sum()
aisle_reordered_size = reordered_by_aisle.size()
aisle_reordered_ratio = ((aisle_reordered_sum + 1) / (aisle_reordered_size + 1)).astype(np.float32)
aisle_info = aisle_reordered_ratio.to_frame().rename(columns={"reordered": "aisle_reorder_ratio"})

In [5]:
aisle_info.head()

Unnamed: 0_level_0,aisle_reorder_ratio
aisle_id,Unnamed: 1_level_1
1,0.596602
2,0.489332
3,0.598008
4,0.489536
5,0.280639


In [6]:
# Show only the ten aisles with the lowest smoothed reorder ratio instead of
# rendering the whole sorted frame.
aisle_info.sort_values(by="aisle_reorder_ratio").head(10)

Unnamed: 0_level_0,aisle_reorder_ratio
aisle_id,Unnamed: 1_level_1
104,0.152395
97,0.167265
118,0.194886
10,0.195465
132,0.212190
44,0.217382
11,0.236244
80,0.236906
109,0.242105
22,0.245861


In [7]:
# Append the per-aisle feature to the shared feature file. `key` is passed by
# keyword because newer pandas releases no longer accept it positionally.
aisle_info.to_hdf(FEATURES_PATH + "features.h5", key="aisle_info", mode='a')
#aisle_info.to_csv(FEATURES_PATH + "aisle_info.csv")

In [8]:
# Smoothed reorder ratio per department: (reordered line items + 1) / (line items + 1),
# stored as float32 in a one-column frame indexed by department_id.
reordered_by_department = priors.groupby('department_id')['reordered']
department_reordered_sum = reordered_by_department.sum()
department_reordered_size = reordered_by_department.size()
department_reordered_ratio = (
    (department_reordered_sum + 1) / (department_reordered_size + 1)
).astype(np.float32)
department_info = department_reordered_ratio.to_frame().rename(
    columns={"reordered": "dep_reorder_ratio"})

In [9]:
department_info.head()

Unnamed: 0_level_0,dep_reorder_ratio
department_id,Unnamed: 1_level_1
1,0.541886
2,0.407996
3,0.628141
4,0.649913
5,0.569927


In [10]:
# Append the per-department feature to the shared feature file. `key` is passed
# by keyword because newer pandas releases no longer accept it positionally.
department_info.to_hdf(FEATURES_PATH + "features.h5", key="department_info", mode='a')
#department_info.to_csv(FEATURES_PATH + "department_info.csv")

## User Aisle

In [11]:
# Smoothed reorder ratio per (user, aisle): (reordered line items + 1) / (line items + 1),
# stored as float32 in a one-column frame indexed by (user_id, aisle_id).
reordered_by_user_aisle = priors.groupby(['user_id', 'aisle_id'])['reordered']
user_aisle_reordered_sum = reordered_by_user_aisle.sum()
user_aisle_reordered_size = reordered_by_user_aisle.size()
user_aisle_reordered_ratio = (
    (user_aisle_reordered_sum + 1) / (user_aisle_reordered_size + 1)
).astype(np.float32)
user_aisle_info = user_aisle_reordered_ratio.to_frame().rename(
    columns={"reordered": "user_aisle_reordered_ratio"})

In [12]:
user_aisle_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_aisle_reordered_ratio
user_id,aisle_id,Unnamed: 2_level_1
1,21,0.888889
1,23,0.846154
1,24,0.333333
1,45,0.5
1,53,0.666667


In [13]:
user_aisle_info.sort_values(by="user_aisle_reordered_ratio").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_aisle_reordered_ratio
user_id,aisle_id,Unnamed: 2_level_1
12182,107,0.026316
40333,124,0.027027
53808,92,0.03125
89332,3,0.035714
98655,92,0.037037


In [14]:
print(user_aisle_info.isnull().sum())

user_aisle_reordered_ratio    0
dtype: int64


In [15]:
# Append the per-(user, aisle) feature to the shared feature file. `key` is
# passed by keyword because newer pandas releases no longer accept it positionally.
user_aisle_info.to_hdf(FEATURES_PATH + "features.h5", key="user_aisle_info", mode='a')
#user_aisle_info.to_csv(FEATURES_PATH + "user_aisle_info.csv")

## User Department

In [16]:
# Smoothed reorder ratio per (user, department): (reordered line items + 1) / (line items + 1),
# stored as float32 in a one-column frame indexed by (user_id, department_id).
reordered_by_user_dep = priors.groupby(['user_id', 'department_id'])['reordered']
user_dep_reordered_sum = reordered_by_user_dep.sum()
user_dep_reordered_size = reordered_by_user_dep.size()
user_dep_reordered_ratio = (
    (user_dep_reordered_sum + 1) / (user_dep_reordered_size + 1)
).astype(np.float32)
user_dep_info = user_dep_reordered_ratio.to_frame().rename(
    columns={"reordered": "user_dep_reordered_ratio"})

In [17]:
user_dep_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_dep_reordered_ratio
user_id,department_id,Unnamed: 2_level_1
1,4,0.333333
1,7,0.857143
1,13,0.5
1,14,0.75
1,16,0.642857


In [18]:
user_dep_info.sort_values(by="user_dep_reordered_ratio").head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_dep_reordered_ratio
user_id,department_id,Unnamed: 2_level_1
91866,19,0.016129
52922,13,0.020833
53808,18,0.02381
80655,4,0.027778
4350,4,0.028571


In [19]:
user_dep_info.isnull().sum()

user_dep_reordered_ratio    0
dtype: int64

In [20]:
# Append the per-(user, department) feature to the shared feature file. `key` is
# passed by keyword because newer pandas releases no longer accept it positionally.
user_dep_info.to_hdf(FEATURES_PATH + "features.h5", key="user_dep_info", mode='a')
#user_dep_info.to_csv(FEATURES_PATH + "user_dep_info.csv")