In [20]:
import os
import pandas as pd
import sys
import warnings

In [2]:
# Let pandas render up to 100 columns and 100 rows before truncating.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Make the parent and grandparent directories importable (needed for the
# `src` package imports that follow).
sys.path.extend([os.pardir, '../..'])

from src.dataloader import DataLoader
from src.utils import mkdir, load_joblib, load_yaml, save_joblib

# Load the project configuration and prefix every entry of config["path"]
# with "." (the displayed result shows the entries as "../..." paths).
# NOTE(review): presumably this re-bases the configured paths so they resolve
# from this notebook's directory — confirm against config.yml.
config = load_yaml("../config.yml")
path_map = config["path"]
path_map.update({key: "." + rel_path for key, rel_path in path_map.items()})
config["path"]

{'aisles': '../input/instacart-market-basket-analysis/aisles.csv',
 'departments': '../input/instacart-market-basket-analysis/departments.csv',
 'order_products__prior': '../input/instacart-market-basket-analysis/order_products__prior.csv',
 'order_products__train': '../input/instacart-market-basket-analysis/order_products__train.csv',
 'orders': '../input/instacart-market-basket-analysis/orders.csv',
 'products': '../input/instacart-market-basket-analysis/products.csv',
 'sample_submission': '../input/instacart-market-basket-analysis/sample_submission.csv',
 'column_series': '../input/column_series',
 'feature': '../feature',
 'importance': '../importance',
 'logs': '../logs',
 'model': '../model',
 'preprocess': '../input/preprocess',
 'submit': '../submit'}

## Load

In [3]:
def load(config):
    """Read every raw CSV listed in ``config["path"]`` through a ``DataLoader``.

    Parameters
    ----------
    config : mapping
        Configuration whose ``"path"`` entry maps table keys to CSV locations.
        The same object is also handed to ``DataLoader``.

    Returns
    -------
    tuple
        ``(aisles, departments, order_products_prior, order_products_train,
        orders, products, sample_submission)`` — one ``load_csv`` result per
        table, in exactly this order.
    """
    # Order matters: callers unpack the result positionally.
    table_keys = (
        "aisles",
        "departments",
        "order_products__prior",
        "order_products__train",
        "orders",
        "products",
        "sample_submission",
    )
    reader = DataLoader(config)
    csv_paths = config["path"]
    return tuple(reader.load_csv(csv_paths[key]) for key in table_keys)

# Unpack the seven raw tables returned by load() into notebook-level names.
(
    aisles,
    departments,
    order_products_prior,
    order_products_train,
    orders,
    products,
    sample_submission,
) = load(config)

In [4]:
# Print each raw table's name and shape, then show its first three rows.
raw_tables = {
    "aisles": aisles,
    "departments": departments,
    "order_products_prior": order_products_prior,
    "order_products_train": order_products_train,
    "orders": orders,
    "products": products,
    "sample_submission": sample_submission,
}
for name, df in raw_tables.items():
    print("\n", name, "\n", df.shape)
    display(df.head(3))


 aisles 
 (134, 2)


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars



 departments 
 (21, 2)


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery



 order_products_prior 
 (32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0



 order_products_train 
 (1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0



 orders 
 (3421083, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0



 products 
 (49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7



 sample_submission 
 (75000, 2)


Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259


In [None]:
# Build the merged prior / train / test frames once and cache them with joblib;
# later runs load the cached copies instead of repeating the merges.
# NOTE(review): the config output above lists 'preprocess' as
# '../input/preprocess', but this cell caches under './preprocess' — confirm
# which location is intended.
cache_files = {
    "prior": "./preprocess/prior.jbl",
    "train": "./preprocess/train.jbl",
    "test": "./preprocess/test.jbl",
}

# Only trust the cache when all three files are present. Checking prior.jbl
# alone would make the load branch fail if an earlier run was interrupted
# before train.jbl / test.jbl were written.
if all(os.path.exists(path) for path in cache_files.values()):
    prior = load_joblib(cache_files["prior"])
    train = load_joblib(cache_files["train"])
    test = load_joblib(cache_files["test"])

else:
    # Order lines of the "prior" eval set, enriched with order- and
    # product-level columns and indexed by order_id.
    prior = (
        order_products_prior
        .merge(orders[orders["eval_set"] == "prior"], how="left", on="order_id")
        .merge(products, how="left", on="product_id")
        .drop("eval_set", axis=1)
        .set_index("order_id")
    )

    # Same construction for the "train" eval set.
    train = (
        order_products_train
        .merge(orders[orders["eval_set"] == "train"], how="left", on="order_id")
        .merge(products, how="left", on="product_id")
        .drop("eval_set", axis=1)
        .set_index("order_id")
    )

    # Test orders come from the sample submission; its placeholder "products"
    # column is dropped so only order-level columns remain.
    test = (
        sample_submission
        .merge(orders[orders["eval_set"] == "test"], how="left", on="order_id")
        .drop(["eval_set", "products"], axis=1)
        .set_index("order_id")
    )

    mkdir("./preprocess")
    save_joblib(prior, cache_files["prior"])
    save_joblib(train, cache_files["train"])
    save_joblib(test, cache_files["test"])

In [13]:
# Preview the first three rows of the `prior` frame.
prior.head(3)

Unnamed: 0_level_0,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16
2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4
2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13


In [14]:
# Preview the first three rows of the `train` frame.
train.head(3)

Unnamed: 0_level_0,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,49302,1,1,112108,4,4,10,9.0,Bulgarian Yogurt,120,16
1,11109,2,1,112108,4,4,10,9.0,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
1,10246,3,0,112108,4,4,10,9.0,Organic Celery Hearts,83,4


In [15]:
# Preview the first three rows of the `test` frame.
test.head(3)

Unnamed: 0_level_0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,36855,5,6,15,1.0
34,35220,20,3,11,8.0
137,187107,9,2,19,30.0


In [16]:
# Compare the row count of `test` with its number of distinct user_ids.
test.shape, test["user_id"].unique().shape

((75000, 5), (75000,))

In [17]:
# Distinct users per split: (prior, train + test, train, test).
n_prior_users = prior["user_id"].nunique()
n_train_users = train["user_id"].nunique()
n_test_users = test["user_id"].nunique()
n_prior_users, n_train_users + n_test_users, n_train_users, n_test_users

(206209, 206209, 131209, 75000)

## Subset

In [18]:
# All prior order lines of user 202279, ordered by order and cart position.
is_sample_user = prior["user_id"] == 202279
prior.loc[is_sample_user].sort_values(by=["order_number", "add_to_cart_order"])

Unnamed: 0_level_0,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2894949,5812,1,0,202279,1,5,9,,Natural Calm Magnesium Supplement,47,11
2894949,24742,2,0,202279,1,5,9,,My Community Immune Support Dietary Supplement,47,11
2894949,17794,3,0,202279,1,5,9,,Carrots,83,4
2894949,28985,4,0,202279,1,5,9,,Michigan Organic Kale,83,4
2894949,33120,5,0,202279,1,5,9,,Organic Egg Whites,86,16
2894949,1003,6,0,202279,1,5,9,,Organic Turkey Bone Broth,5,13
2894949,26504,7,0,202279,1,5,9,,Creamy & Raw Almond Butter,88,13
2894949,34040,8,0,202279,1,5,9,,Organic Gut Shot Ginger Beet Probiotic Drink,81,15
2894949,7206,9,0,202279,1,5,9,,Cilantro Avocado Yogurt Dressing,89,13
2894949,5593,10,0,202279,1,5,9,,Renew Life Total Body 7-Day Rapid Cleanse,6,2


In [19]:
# All train order lines of user 202279, ordered by order and cart position.
is_sample_user = train["user_id"] == 202279
train.loc[is_sample_user].sort_values(by=["order_number", "add_to_cart_order"])

Unnamed: 0_level_0,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1050357,33120,1,1,202279,9,1,14,30.0,Organic Egg Whites,86,16
1050357,1003,2,1,202279,9,1,14,30.0,Organic Turkey Bone Broth,5,13
1050357,21543,3,1,202279,9,1,14,30.0,Organic Quick Oats,130,14
1050357,20995,4,1,202279,9,1,14,30.0,Organic Broccoli Florets,116,1
1050357,46064,5,0,202279,9,1,14,30.0,Lightly Salted Brown Rice Cakes,78,19
1050357,20168,6,0,202279,9,1,14,30.0,Organic Sunflower Seed Spread,88,13
1050357,42824,7,1,202279,9,1,14,30.0,Black Eyed Peas,116,1
1050357,4461,8,1,202279,9,1,14,30.0,Organic Raw Unfiltered Apple Cider Vinegar,19,13
1050357,45002,9,1,202279,9,1,14,30.0,Organic Balsamic Vinegar Of Modena,19,13
1050357,8693,10,0,202279,9,1,14,30.0,Dairy Free French Vanilla Creamer Almond Milk,100,21


In [26]:
# Look up a single product by id.
# NOTE(review): the saved output for this cell shows product_id 17794
# (Carrots), not 28985 — the cell appears to have been edited after it was
# last run; re-run it or restore the intended id.
products.loc[products["product_id"] == 28985]

Unnamed: 0,product_id,product_name,aisle_id,department_id
17793,17794,Carrots,83,4


In [29]:
# Ten products user 202279 bought most often across prior orders.
prior.loc[prior["user_id"] == 202279, "product_id"].value_counts().head(10)

17794    7
1003     6
28985    5
33120    5
40141    5
45918    5
35106    3
10960    3
30035    3
31275    3
Name: product_id, dtype: int64

In [30]:
# Per-product purchase counts for user 202279 in the train rows.
train.loc[train["user_id"] == 202279, "product_id"].value_counts()

46064    1
4461     1
1003     1
45002    1
20168    1
21543    1
8693     1
20995    1
42824    1
33120    1
Name: product_id, dtype: int64