### Load data

In [1]:
from trane.utils.data_parser import denormalize

relationships = [
    ("data/orders.csv", "order_id", "data/order_products.csv", "order_id"),
    ("data/order_products.csv", "product_id", "data/products.csv", "product_id"),
    ("data/products.csv", "aisle_id", "data/aisles.csv", "aisle_id"),
    ("data/products.csv", "department_id", "data/departments.csv", "department_id"),
]
df = denormalize(relationships)

Generate artificial date and remove order no

In [18]:
import pandas as pd

df["order_date"] = pd.to_datetime("2023-01-01") + pd.to_timedelta(df["order_number"], unit="d")
df = df.drop("order_number", axis=1)
df = df.sort_values(["user_id", "order_date"])

## Trane

In [6]:
import trane

entity_col = "user_id"
time_col = "order_date"
meta = {
    "order_id": ("Integer", {"numeric"}),
    "user_id": ("Integer", {"numeric", "index"}),
    "eval_set": ("Categorical", {"category"}),
    "order_date": ("DateTime", {}),
    "order_dow": ("Categorical", {"category"}),
    "order_hour_of_day": ("Categorical", {"category"}),
    "days_since_prior_order": ("Integer", {"numeric"}),
    "product_id": ("Categorical", {"category"}),
    "add_to_cart_order": ("Integer", {"numeric"}),
    "reordered": ("Categorical", {"category"}),
    "product_name": ("Categorical", {"category"}),
    "aisle_id": ("Categorical", {"category"}),
    "department_id": ("Categorical", {"category"}),
    "aisle": ("Categorical", {"category"}),
    "department": ("Categorical", {"category"}),
}

cutoff_strategy = trane.CutoffStrategy(
    entity_col="user_id",
    window_size="1m",
    minimum_data="2023-01-01",
    maximum_data="2023-02-01",
)


Fix NaNs

In [8]:
df = df.fillna(0)

Shorten data

In [12]:
df = df.query("user_id < 1000")

#### Generate prediction problems

In [13]:
problem_generator = trane.PredictionProblemGenerator(
    df=df,
    table_meta=meta,
    entity_col=entity_col,
    cutoff_strategy=cutoff_strategy,
    time_col=time_col,
)

In [14]:
problems = problem_generator.generate(df, generate_thresholds=True)

  0%|          | 0/4187 [00:00<?, ?it/s]

Find a relevant problem

In [34]:
ex = problems[1886].execute(df, num_examples_per_instance=5)
problems[1886]

Elapsed: 00:03 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 4995/4995 


For each <user_id> predict if there exists a record with with <product_name> equal to Bag of Organic Bananas in next 1m days

#### Generate features

In [35]:
import featuretools as ft
es = ft.EntitySet('instacart')

es.add_dataframe(
    dataframe=df.reset_index(),
    dataframe_name='order_products',
    time_index='order_date',
    index='__id__',
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='orders',
    index='order_id',
    additional_columns=['user_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='orders',
    new_dataframe_name='customers',
    index='user_id',
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='products',
    index='product_id',
    additional_columns=['aisle_id', 'department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='products',
    new_dataframe_name='aisles',
    index='aisle_id',
    additional_columns=['department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='aisles',
    new_dataframe_name='departments',
    index='department_id',
    make_time_index=False,
)

fm, fd = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    cutoff_time=ex,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=False,
)

fm.head()


index __id__ not found in dataframe, creating new integer column


Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT(orders),COUNT(order_products),MAX(order_products.add_to_cart_order),MAX(order_products.days_since_prior_order),MAX(order_products.index),MEAN(order_products.add_to_cart_order),MEAN(order_products.days_since_prior_order),MEAN(order_products.index),MIN(order_products.add_to_cart_order),MIN(order_products.days_since_prior_order),...,SUM(orders.NUM_UNIQUE(order_products.product_id)),SUM(orders.NUM_UNIQUE(order_products.product_name)),SUM(orders.NUM_UNIQUE(order_products.reordered)),SUM(orders.SKEW(order_products.add_to_cart_order)),SUM(orders.SKEW(order_products.days_since_prior_order)),SUM(orders.SKEW(order_products.index)),SUM(orders.STD(order_products.add_to_cart_order)),SUM(orders.STD(order_products.days_since_prior_order)),SUM(orders.STD(order_products.index)),_execute_operations_on_df
user_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2023-01-01,10,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,2023-01-01,14,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,2023-01-01,12,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,2023-01-01,5,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,2023-01-01,4,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


X and y

In [36]:
fm.reset_index(drop=True, inplace=True)
y = fm.ww.pop('_execute_operations_on_df')

#### AutoML

In [37]:
import evalml
splits = evalml.preprocessing.split_data(
    X=fm,
    y=y,
    test_size=0.2,
    random_seed=0,
    problem_type='binary',
)

X_train, X_holdout, y_train, y_holdout = splits

automl = evalml.AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
    objective='f1',
    random_seed=0,
    allowed_model_families=['extra_trees', 'random_forest'],
    max_iterations=2,
)

automl.search()

automl.best_pipeline.describe()


*************************************************************************
* Random Forest Classifier w/ Label Encoder + Imputer + One Hot Encoder *
*************************************************************************

Problem Type: binary
Model Family: Random Forest
Number of features: 296

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
4. Random Forest Classifier
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1


In [40]:
automl.best_pipeline.score(X_train, y_train, objectives=["auc", "F1"])

OrderedDict([('AUC', 0.7313375240810892), ('F1', 0.5328836424957841)])

In [41]:
automl.best_pipeline.score(X_holdout, y_holdout, objectives=["auc", "F1"])

OrderedDict([('AUC', 0.6100510107689401), ('F1', 0.45283018867924535)])