In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global random number generator so any NumPy-based randomness is repeatable.
np.random.seed(42)

In [17]:
# All tables are stored under separate keys of one HDF5 file.
store_path = '../data/online_retail.h5'
all_data = pd.read_hdf(store_path, 'clean')
orders = pd.read_hdf(store_path, 'orders')
products = pd.read_hdf(store_path, 'products')
customers = pd.read_hdf(store_path, 'customers')
order_products = pd.read_hdf(store_path, 'order_products')
order_products_compact = pd.read_hdf(store_path, 'order_products_compact')

# Remove the test orders from `orders`, then split the order lines by evaluation set.
# Each slice is an independent copy, so adding columns to it leaves `order_products` untouched.
orders = orders.loc[orders['eval_set'] != 'test']
priors, train, test = (
    order_products.loc[order_products['eval_set'] == split].copy()
    for split in ('prior', 'train', 'test')
)

In [18]:
# Preview the first rows of `orders` (test orders already removed).
orders.head()

Unnamed: 0,user_id,order_id,order_date,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval_set
0,12347,537626,2010-12-07 14:57:00,1,1,14,,prior
1,12347,542237,2011-01-26 14:30:00,2,2,14,49.0,prior
2,12347,549222,2011-04-07 10:43:00,3,3,10,70.0,prior
3,12347,556201,2011-06-09 13:01:00,4,3,13,63.0,prior
4,12347,562032,2011-08-02 08:48:00,5,1,8,53.0,prior


### Product level features

In [19]:
# Number of order lines in every prior order, attached to the order attributes.
order_size = priors.groupby('order_id').size().reset_index(name='order_size')
order_size = orders.merge(order_size, on='order_id')

# Revenue per order line, then one row per prior order line together with its order attributes.
priors['revenue'] = priors['unit_price'] * priors['quantity']
priors_orders = orders.merge(priors, on='order_id')

# Running count per (user, product): 1 on the first row where the user has the product,
# 2 on the second, ... following the row order of `priors_orders`.
priors_orders['_user_buy_product_times'] = priors_orders.groupby(['user_id', 'product_id']).cumcount() + 1

# Group once and reuse the groupby for all per-product aggregates.
by_product = priors_orders.groupby('product_id')
product_key = priors_orders['product_id']
buy_rank = priors_orders['_user_buy_product_times']

prods = pd.DataFrame()
prods['orders'] = by_product.size()
prods['reorders'] = by_product['reordered'].sum()
prods['reorder_rate'] = prods['reorders'] / prods['orders']
prods['total_quantity'] = by_product['quantity'].sum()
prods['total_revenue'] = by_product['revenue'].sum()
prods['avg_price'] = prods['total_revenue'] / prods['total_quantity']
# Count first / second purchases per product with a vectorised integer sum
# (instead of calling Python's builtin sum() on every group).
prods['prod_first_buy'] = (buy_rank == 1).astype(np.int64).groupby(product_key).sum()
prods['prod_second_buy'] = (buy_rank == 2).astype(np.int64).groupby(product_key).sum()
prods['prod_1reorder_ratio'] = prods['prod_second_buy'] / prods['prod_first_buy']
prods['prod_nreorder_ratio'] = prods['reorders'] / prods['prod_first_buy']

# Attach the features to the product table; products without prior purchases get 0.
# NOTE: `products` is overwritten here, so re-running this cell without reloading
# `products` fails on the join because the feature columns already exist.
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
products.fillna(0, inplace=True)

In [20]:
# Number of rows of `order_size` (orders that have prior order lines) per user, first five users.
order_size.groupby('user_id')['order_size'].count().head()

user_id
12347    6
12348    3
12352    4
12359    3
12362    9
Name: order_size, dtype: int64

In [21]:
# Spot check: the `order_size` rows of user 12347.
order_size[order_size.user_id == 12347]

Unnamed: 0,user_id,order_id,order_date,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval_set,order_size
0,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,31
1,12347,542237,2011-01-26 14:30:00,2,2,14,49.0,prior,29
2,12347,549222,2011-04-07 10:43:00,3,3,10,70.0,prior,24
3,12347,556201,2011-06-09 13:01:00,4,3,13,63.0,prior,18
4,12347,562032,2011-08-02 08:48:00,5,1,8,53.0,prior,22
5,12347,573511,2011-10-31 12:25:00,6,0,12,90.0,prior,47


In [22]:
# Spot check: all rows of `orders` for the same user, to compare with `order_size` above.
orders[orders.user_id == 12347]

Unnamed: 0,user_id,order_id,order_date,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval_set
0,12347,537626,2010-12-07 14:57:00,1,1,14,,prior
1,12347,542237,2011-01-26 14:30:00,2,2,14,49.0,prior
2,12347,549222,2011-04-07 10:43:00,3,3,10,70.0,prior
3,12347,556201,2011-06-09 13:01:00,4,3,13,63.0,prior
4,12347,562032,2011-08-02 08:48:00,5,1,8,53.0,prior
5,12347,573511,2011-10-31 12:25:00,6,0,12,90.0,prior
6,12347,581180,2011-12-07 15:52:00,7,2,15,37.0,train


In [23]:
# Spot check: the merged prior order lines of user 12347, including the derived columns.
priors_orders[priors_orders.user_id == 12347]

Unnamed: 0,user_id,order_id,order_date,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval_set_x,product_id,quantity,unit_price,add_to_cart_order,eval_set_y,reordered,revenue,_user_buy_product_times
0,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,205,12,4.65,1,prior,0,55.80,1
1,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,207,6,5.49,2,prior,0,32.94,1
2,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,331,6,5.95,3,prior,0,35.70,1
3,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,393,12,1.45,4,prior,0,17.40,1
4,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,750,12,1.65,5,prior,0,19.80,1
5,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,1084,12,1.65,6,prior,0,19.80,1
6,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,1099,6,2.10,7,prior,0,12.60,1
7,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,1244,4,4.25,8,prior,0,17.00,1
8,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,1350,36,0.65,9,prior,0,23.40,1
9,12347,537626,2010-12-07 14:57:00,1,1,14,,prior,1352,12,1.25,10,prior,0,15.00,1


### User level features

In [10]:
# Order-cadence features per user, computed from `orders`.
# NOTE(review): `orders` only had its 'test' rows removed, so a user's 'train' order is
# still counted in these aggregates, while the basket features below are built from
# prior order lines only. `average_basket` therefore divides prior items by a count that
# can include the train order -- confirm this is intended.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders_by_user['days_since_prior_order'].mean()
usr['sum_days_between_orders'] = orders_by_user['days_since_prior_order'].sum()
usr['nb_orders'] = orders_by_user['order_number'].max()

# Basket and spending features per user, computed from the prior order lines.
lines_by_user = priors_orders.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = lines_by_user.size()
users['all_products'] = lines_by_user['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len)
users['total_item_quantity'] = lines_by_user['quantity'].sum()
users['total_money_spent'] = lines_by_user['revenue'].sum()
users['avg_money_spent_per_item'] = users['total_money_spent'] / users['total_item_quantity']

# Reordered lines divided by the number of lines that belong to a user's second or later
# order. The count is a vectorised integer sum rather than Python's builtin sum() per group.
lines_after_first_order = (
    (priors_orders['order_number'] > 1).astype(np.int64).groupby(priors_orders['user_id']).sum()
)
users['user_reorder_ratio'] = lines_by_user['reordered'].sum().divide(lines_after_first_order)

users = users.join(usr)
users['average_basket'] = (users['total_items'] / users['nb_orders']).astype(np.float32)

In [10]:
# Preview the per-user feature table.
users.head()

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,total_item_quantity,total_money_spent,avg_money_spent_per_item,user_reorder_ratio,average_days_between_orders,sum_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12347,171,"{1026, 1029, 1545, 1546, 1547, 25, 2587, 2591,...",100,2266,4085.18,1.802816,0.507143,60.333333,362.0,7,24.428572
12348,21,"{1792, 902, 908, 909, 910, 911, 912, 914, 1300...",21,2116,1229.64,0.581115,0.0,94.0,282.0,4,5.25
12352,48,"{1540, 773, 1286, 1033, 530, 1687, 2071, 2073,...",38,263,893.55,3.397529,0.30303,64.5,258.0,5,9.6
12359,142,"{1540, 1545, 2058, 1547, 1557, 2589, 1568, 156...",134,958,3449.08,3.600292,0.063492,91.0,273.0,4,35.5
12362,224,"{1033, 1034, 1547, 18, 1044, 2071, 1560, 2073,...",179,1943,4153.49,2.137669,0.227273,31.888889,287.0,10,22.4


### userXproduct features

In [11]:
%%time
priors_orders['user_product'] = priors_orders.product_id + priors_orders.user_id * 100000
d= dict()
for row in priors_orders.itertuples():
    z = row.user_product
    if z not in d:
        d[z] = (1,
                (row.order_number, row.order_id),
                row.add_to_cart_order)
    else:
        d[z] = (d[z][0] + 1,
                max(d[z][1], (row.order_number, row.order_id)),
                d[z][2] + row.add_to_cart_order)

userXproduct = pd.DataFrame.from_dict(d, orient='index')
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct.nb_orders = userXproduct.nb_orders.astype(np.int16)
userXproduct.last_order_id = userXproduct.last_order_id.map(lambda x: x[1])
# userXproduct.sum_pos_in_cart = userXproduct.sum_pos_in_cart
up_temp = pd.DataFrame()
up_temp['up_total_quantity'] = priors_orders.groupby('user_product')['quantity'].sum()
up_temp['up_total_spent'] = priors_orders.groupby('user_product')['revenue'].sum()
up_temp['up_first_order_number'] = priors_orders.groupby('user_product')['order_number'].min()
up_temp['up_last_order_number'] = priors_orders.groupby('user_product')['order_number'].max()
userXproduct = userXproduct.join(up_temp)

CPU times: user 1.53 s, sys: 12 ms, total: 1.54 s
Wall time: 1.54 s


In [12]:
# Preview the per-(user, product) feature table; the index is the combined user_product key.
userXproduct.head()

Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart,up_total_quantity,up_total_spent,up_first_order_number,up_last_order_number
1234700205,1,537626,1,12,55.8,1,1
1234700207,1,537626,2,6,32.94,1,1
1234700331,1,537626,3,6,35.7,1,1
1234700393,1,537626,4,12,17.4,1,1
1234700750,4,573511,19,48,79.2,1,6


### split train / test orders

In [13]:
# When the notebook is run top to bottom, `orders` no longer contains any 'test' rows
# (they are filtered out in the loading cell), so selecting eval_set == 'test' from it
# always yields an empty frame. Read the stored orders table again to get the test orders.
test_orders = pd.read_hdf('../data/online_retail.h5', 'orders')
test_orders = test_orders[test_orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

# Index the train order lines by (order_id, product_id); both remain available as columns.
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

### Transform train data into a table that can be fed into machine learning models

In [14]:
%%time
# construct the framework
order_list = []
product_list = []
labels = []
last_orders = set(zip(train.order_id.values,train.product_id.values))
for row in train_orders.itertuples():
    order_id = row.order_id
    user_id = row.user_id
    user_products = list(products.product_id.values)
    product_list += user_products
    order_list += [order_id] * len(user_products)
    labels += [(order_id, product) in last_orders for product in user_products]
df = pd.DataFrame({'order_id':order_list, 'product_id':product_list, 'labels':labels}, dtype=np.int32)
# labels = np.array(labels, dtype=np.int8)

CPU times: user 1.39 s, sys: 20 ms, total: 1.41 s
Wall time: 1.41 s


In [15]:
# Preview `orders` again before it is used to look up order attributes.
orders.head()

Unnamed: 0,user_id,order_id,order_date,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval_set
0,12347,537626,2010-12-07 14:57:00,1,1,14,,prior
1,12347,542237,2011-01-26 14:30:00,2,2,14,49.0,prior
2,12347,549222,2011-04-07 10:43:00,3,3,10,70.0,prior
3,12347,556201,2011-06-09 13:01:00,4,3,13,63.0,prior
4,12347,562032,2011-08-02 08:48:00,5,1,8,53.0,prior


In [16]:
# Preview the candidate table: labels, order_id and product_id only at this point.
df.head()

Unnamed: 0,labels,order_id,product_id
0,0,581180,0
1,0,581180,1
2,0,581180,2
3,0,581180,3
4,0,581180,4


In [17]:
# add features
# Attribute lookups below go through order_id. The guard keeps the cell re-runnable:
# after the first run `order_id` is already the index and no longer a column, so an
# unconditional set_index would raise a KeyError.
if orders.index.name != 'order_id':
    orders.set_index('order_id', inplace=True)

print('user related features')
df['user_id'] = df.order_id.map(orders.user_id)
# (column in df, column in users), in the order the columns are added to df.
for feature, source in [
    ('user_total_orders', 'nb_orders'),
    ('user_total_items', 'total_items'),
    ('total_distinct_items', 'total_distinct_items'),
    ('user_average_days_between_orders', 'average_days_between_orders'),
    ('user_average_basket', 'average_basket'),
    ('user_total_item_quantity', 'total_item_quantity'),
    ('user_total_spent', 'total_money_spent'),
    ('user_sum_days_between_orders', 'sum_days_between_orders'),
    ('user_reorder_ratio', 'user_reorder_ratio'),
]:
    df[feature] = df.user_id.map(users[source])

print('order related features')
for column in ['order_hour_of_day', 'order_dow', 'days_since_prior_order']:
    df[column] = df.order_id.map(orders[column])
# The small constant presumably guards against a zero average in the denominator.
df['days_since_ratio'] = df.days_since_prior_order / (df.user_average_days_between_orders + .01)

print('product related features')
# (column in df, column in products); `products` is indexed by product_id.
for feature, source in [
    ('product_orders', 'orders'),
    ('product_reorders', 'reorders'),
    ('product_reorder_rate', 'reorder_rate'),
    ('product_total_quantity_sold', 'total_quantity'),
    ('product_avg_price', 'avg_price'),
    ('prod_first_buy', 'prod_first_buy'),
    ('prod_second_buy', 'prod_second_buy'),
    ('prod_1reorder_ratio', 'prod_1reorder_ratio'),
    ('prod_nreorder_ratio', 'prod_nreorder_ratio'),
]:
    df[feature] = df.product_id.map(products[source])

print('user_X_product related features')
# Same key encoding as the index of `userXproduct`; kept as a local Series so no
# helper column has to be added to df and dropped again.
up_key = df.user_id * 100000 + df.product_id
df['UP_orders'] = up_key.map(userXproduct.nb_orders)
df['UP_orders_ratio'] = df.UP_orders / df.user_total_orders
# NOTE(review): same formula as UP_orders_ratio, so the two columns are identical.
# Kept as is because consumers of the saved table may expect both columns.
df['UP_reorder_rate'] = df.UP_orders / df.user_total_orders
df['UP_total_quantity'] = up_key.map(userXproduct.up_total_quantity)
up_first_order_number = up_key.map(userXproduct.up_first_order_number)
df['UP_order_rate_since_first_order'] = df.UP_orders / (df.user_total_orders - up_first_order_number + 1)

# Pairs that do not appear in the lookup tables map to NaN; store them as 0.
df.fillna(0, inplace=True)

user related features
order related features
product related features
user_X_product related features


In [18]:
# Preview the candidate table with the feature columns attached.
df.head()

Unnamed: 0,labels,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,user_total_item_quantity,...,product_avg_price,prod_first_buy,prod_second_buy,prod_1reorder_ratio,prod_nreorder_ratio,UP_orders,UP_orders_ratio,UP_reorder_rate,UP_total_quantity,UP_order_rate_since_first_order
0,0,581180,0,12347,7,171,100,60.333333,24.428572,2266,...,0.85,33.0,6.0,0.181818,0.272727,0.0,0.0,0.0,0.0,0.0
1,0,581180,1,12347,7,171,100,60.333333,24.428572,2266,...,0.394126,12.0,2.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0
2,0,581180,2,12347,7,171,100,60.333333,24.428572,2266,...,0.21,12.0,2.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0
3,0,581180,3,12347,7,171,100,60.333333,24.428572,2266,...,0.65,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,581180,4,12347,7,171,100,60.333333,24.428572,2266,...,0.42,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Preview `all_data`, the table read from the 'clean' key of the HDF5 file.
all_data.head()

Unnamed: 0,order_id,product_id,product_id_original,description,order_date,unit_price,user_id,country,quantity
0,536365,749,21730,GLASS STAR FROSTED T-LIGHT HOLDER,2010-12-01 08:26:00,4.25,17850,United Kingdom,10
2,536365,1599,22752,SET 7 BABUSHKA NESTING BOXES,2010-12-01 08:26:00,7.65,17850,United Kingdom,2
3,536365,2639,71053,WHITE METAL LANTERN,2010-12-01 08:26:00,3.39,17850,United Kingdom,6
4,536365,2789,84029E,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01 08:26:00,3.39,17850,United Kingdom,6
5,536365,2790,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01 08:26:00,3.39,17850,United Kingdom,6


In [20]:
# df.to_hdf('../data/online_retail_transformed.h5','train')

In [26]:
# Write the training table and the product / user feature tables as pickle files under ../data.
df.to_pickle('../data/train_transformed.p')
products.to_pickle('../data/product_features.p')
users.to_pickle('../data/user_features.p')

In [22]:
# Preview the training table once more.
df.head()

Unnamed: 0,labels,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,user_total_item_quantity,...,product_avg_price,prod_first_buy,prod_second_buy,prod_1reorder_ratio,prod_nreorder_ratio,UP_orders,UP_orders_ratio,UP_reorder_rate,UP_total_quantity,UP_order_rate_since_first_order
0,0,581180,0,12347,7,171,100,60.333333,24.428572,2266,...,0.85,33.0,6.0,0.181818,0.272727,0.0,0.0,0.0,0.0,0.0
1,0,581180,1,12347,7,171,100,60.333333,24.428572,2266,...,0.394126,12.0,2.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0
2,0,581180,2,12347,7,171,100,60.333333,24.428572,2266,...,0.21,12.0,2.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0
3,0,581180,3,12347,7,171,100,60.333333,24.428572,2266,...,0.65,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,581180,4,12347,7,171,100,60.333333,24.428572,2266,...,0.42,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Number of entries equal to 0 in each column of `df`. Missing values were filled with 0
# when the features were added, so they are included in these counts.
(df==0).sum()

labels                              4171246
order_id                                  0
product_id                             1184
user_id                                   0
user_total_orders                         0
user_total_items                          0
total_distinct_items                      0
user_average_days_between_orders      10632
user_average_basket                       0
user_total_item_quantity                  0
user_total_spent                          0
user_sum_days_between_orders          10632
user_reorder_ratio                   106320
order_hour_of_day                         0
order_dow                            623744
days_since_prior_order               450088
days_since_ratio                     450088
product_orders                        15392
product_reorders                     903392
product_reorder_rate                 903392
product_total_quantity_sold           15392
product_avg_price                     15392
prod_first_buy                  

In [24]:
# Preview the product feature table (indexed by product_id, which is also kept as a column).
products.head()

Unnamed: 0_level_0,product_id,description,orders,reorders,reorder_rate,total_quantity,total_revenue,avg_price,prod_first_buy,prod_second_buy,prod_1reorder_ratio,prod_nreorder_ratio
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,INFLATABLE POLITICAL GLOBE,42.0,9.0,0.214286,774.0,657.9,0.85,33.0,6.0,0.181818,0.272727
1,1,GROOVY CACTUS INFLATABLE,14.0,2.0,0.142857,223.0,87.89,0.394126,12.0,2.0,0.166667,0.166667
2,2,DOGGY RUBBER,14.0,2.0,0.142857,92.0,19.32,0.21,12.0,2.0,0.166667,0.166667
3,3,HEARTS WRAPPING TAPE,1.0,0.0,0.0,1.0,0.65,0.65,1.0,0.0,0.0,0.0
4,4,SPOTS ON RED BOOKCOVER TAPE,3.0,0.0,0.0,9.0,3.78,0.42,3.0,0.0,0.0,0.0


In [25]:
# Preview the per-user feature table again.
users.head()

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,total_item_quantity,total_money_spent,avg_money_spent_per_item,user_reorder_ratio,average_days_between_orders,sum_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12347,171,"{1026, 1029, 1545, 1546, 1547, 25, 2587, 2591,...",100,2266,4085.18,1.802816,0.507143,60.333333,362.0,7,24.428572
12348,21,"{1792, 902, 908, 909, 910, 911, 912, 914, 1300...",21,2116,1229.64,0.581115,0.0,94.0,282.0,4,5.25
12352,48,"{1540, 773, 1286, 1033, 530, 1687, 2071, 2073,...",38,263,893.55,3.397529,0.30303,64.5,258.0,5,9.6
12359,142,"{1540, 1545, 2058, 1547, 1557, 2589, 1568, 156...",134,958,3449.08,3.600292,0.063492,91.0,273.0,4,35.5
12362,224,"{1033, 1034, 1547, 18, 1044, 2071, 1560, 2073,...",179,1943,4153.49,2.137669,0.227273,31.888889,287.0,10,22.4
