In [1]:
import pandas as pd
from file_utils import load_dataframes

In [2]:
items, categories, alley_inventory, transaction_items__prior, transaction_items__train, transactions = load_dataframes("data")

In [3]:
df_prior = pd.merge(transaction_items__prior, transactions, on="transaction_id")
df_train = pd.merge(transaction_items__train, transactions, on="transaction_id")

In [4]:
N_ITEMS = items.shape[0]
N_CUSTOMERS = transactions.loc[transactions.eval_set == "prior"].shape[0]
N_TRANSACTIONS_PRIOR = transactions.loc[transactions.eval_set == "prior"].shape[0]
N_TRANSACTIONS_TRAIN = transactions.loc[transactions.eval_set == "train"].shape[0]

### Item features

- category
- aisle
- order rate
- reorder rate

In [6]:
# Category and aisle
item_features = items[["category", "alley_id"]]

In [7]:
# Order rate

item_features["order_rate"] = df_prior.groupby("item_id").agg({"transaction_number": "count"}) / N_TRANSACTIONS_PRIOR
item_features["order_rate"]

item_id
1        0.000533
2        0.000026
3        0.000077
4        0.000095
5        0.000004
           ...   
49684    0.000002
49685    0.000013
49686    0.000034
49687    0.000004
49688    0.000026
Name: order_rate, Length: 49147, dtype: float64

In [8]:
# Reorder rate
item_features["reorder_rate"] = df_prior.groupby("item_id").agg({"previous_bought": "mean"})
item_features["reorder_rate"]

item_id
1        0.619683
2        0.113924
3        0.716157
4        0.433099
5        0.571429
           ...   
49684    0.142857
49685    0.128205
49686    0.682692
49687    0.454545
49688       0.175
Name: reorder_rate, Length: 49147, dtype: Float64

In [10]:
item_features.to_parquet("features/item_features.parquet")
item_features

Unnamed: 0_level_0,category,alley_id,order_rate,reorder_rate
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,19,61,0.000533,0.619683
2,13,104,0.000026,0.113924
3,7,94,0.000077,0.716157
4,1,38,0.000095,0.433099
5,13,5,0.000004,0.571429
...,...,...,...,...
49684,5,124,0.000002,0.142857
49685,1,42,0.000013,0.128205
49686,3,112,0.000034,0.682692
49687,8,41,0.000004,0.454545


### Client features

- number of prior transactions
- average number of items in transaction
- reorder rate

In [24]:
pd.DataFrame(index=transactions.loc[transactions.eval_set == "prior"].customer_id.unique())

1
1
1
1
1
...
206209
206209
206209
206209
206209


In [25]:
client_features = pd.DataFrame(index=transactions.loc[transactions.eval_set == "prior"].customer_id.unique())
client_features.index.name = "customer_id"

In [26]:
# Number of prior transactions per client

client_features["n_transactions"] = df_prior.groupby("customer_id").agg({"transaction_number": "max"})
client_features["n_transactions"]

customer_id
1            9
<NA>      <NA>
2           14
3           12
4            5
          ... 
206205       2
206206      67
206207      16
206208      49
206209      13
Name: n_transactions, Length: 206188, dtype: Int64

In [27]:
# Average items in transaction per client

client_features["avg_items_in_transaction"] = df_prior.groupby("customer_id").agg({"item_id": "count"})
client_features["avg_items_in_transaction"] /= client_features["n_transactions"]
client_features["avg_items_in_transaction"]

customer_id
1          5.333333
<NA>           <NA>
2              10.5
3          6.583333
4               2.6
            ...    
206205         12.0
206206     3.865672
206207         12.0
206208    11.979592
206209     8.307692
Name: avg_items_in_transaction, Length: 206188, dtype: Float64

In [28]:
client_features.to_parquet("features/client_features.parquet")
client_features

Unnamed: 0_level_0,n_transactions,avg_items_in_transaction
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9,5.333333
,,
2,14,10.5
3,12,6.583333
4,5,2.6
...,...,...
206205,2,12.0
206206,67,3.865672
206207,16,12.0
206208,49,11.979592


### Client-item features

- number of transactions of client with that item
- how many orders since last order of that item
- reorder rate

In [29]:
# How many orders since last order of that item

client_item_features = df_prior.groupby(["customer_id", "item_id"]).agg({"transaction_number": "max"}).rename(columns={"transaction_number": "last_transaction"})

client_item_features["last_transaction"]

customer_id  item_id
1            196         9
             10258       9
             10326       5
             12427       9
             13032       7
                        ..
206209       43961      12
             44325       7
             48370      11
             48697       7
             48742       7
Name: last_transaction, Length: 12140563, dtype: Int64

In [35]:
client_item_features.join(client_features["n_transactions"])

Unnamed: 0_level_0,Unnamed: 1_level_0,last_transaction,n_transactions
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,196,9,9
1,10258,9,9
1,10326,5,9
1,12427,9,9
1,13032,7,9
...,...,...,...
206209,43961,12,13
206209,44325,7,13
206209,48370,11,13
206209,48697,7,13


In [43]:
client_item_features["n_transactions_since_last"] = \
    client_item_features.join(client_features["n_transactions"])["n_transactions"] - client_item_features["last_transaction"]
client_item_features["n_transactions_since_last"]

customer_id  item_id
1            196        0
             10258      0
             10326      4
             12427      0
             13032      2
                       ..
206209       43961      1
             44325      6
             48370      2
             48697      6
             48742      6
Name: n_transactions_since_last, Length: 12140563, dtype: Int64

In [44]:
# Number of transactions of client with that item
client_item_features["n_transactions"] = df_prior.groupby(["customer_id", "item_id"]).size()
client_item_features["n_transactions"]

customer_id  item_id
1            196        9
             10258      8
             10326      1
             12427      9
             13032      2
                       ..
206209       43961      3
             44325      1
             48370      1
             48697      1
             48742      1
Name: n_transactions, Length: 12140563, dtype: int64

In [46]:
# Reorder rate
client_item_features["reorder_rate"] = df_prior.groupby(["customer_id", "item_id"]).agg({"previous_bought": "mean"})
client_item_features["reorder_rate"]

customer_id  item_id
1            196        0.888889
             10258         0.875
             10326           0.0
             12427         0.875
             13032           0.5
                          ...   
206209       43961      0.666667
             44325           0.0
             48370           0.0
             48697           0.0
             48742           0.0
Name: reorder_rate, Length: 12140563, dtype: Float64

In [47]:
# Number of transactions with that item in last 30 days - NOT CREDIBLE

# client_item_features["n_transactions_last_30_days"] = df_prior.loc[df_prior.days_since_prior_order <= 30].groupby(["customer_id", "item_id"]).size()
# client_item_features["n_transactions_last_30_days"]

In [48]:
client_item_features.to_parquet("features/client_item_features.parquet")
client_item_features

Unnamed: 0_level_0,Unnamed: 1_level_0,last_transaction,n_transactions_since_last,n_transactions,reorder_rate
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,196,9,0,9,0.888889
1,10258,9,0,8,0.875
1,10326,5,4,1,0.0
1,12427,9,0,9,0.875
1,13032,7,2,2,0.5
...,...,...,...,...,...
206209,43961,12,1,3,0.666667
206209,44325,7,6,1,0.0
206209,48370,11,2,1,0.0
206209,48697,7,6,1,0.0


### Transaction features

- day of week
- hour of day

In [49]:
transaction_features = transactions.loc[transactions.eval_set == "train"][["day_of_week", "time_of_day"]]

In [50]:
transaction_features.to_parquet("features/transaction_features.parquet")
transaction_features

Unnamed: 0_level_0,day_of_week,time_of_day
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1187899,4,8
1492625,1,11
2196797,0,11
525192,2,
880375,1,14
...,...,...
2585586,2,16
943915,6,19
2371631,4,
1716008,1,16
