### Load data

In [4]:
from trane.parsing import denormalize
import pandas as pd

dataframes = {
    "orders": (pd.read_csv("data/orders.csv"), "id"),
    "order_products": (pd.read_csv("data/order_products.csv"), "id"),
    "products": (pd.read_csv("data/products.csv"), "id"),
    "aisles": (pd.read_csv("data/aisles.csv"), "id"),
    "departments": (pd.read_csv("data/departments.csv"), "id"),
}
relationships = [
    ("aisles", "aisle_id", "products", "aisle_id"),
    ("departments", "department_id", "products", "department_id"),
    ("products", "product_id", "order_products", "product_id"),
    ("order_products", "order_id", "orders", "order_id"),
]

df = denormalize(
    dataframes,
    relationships,
    "orders"
)

Merging     departments.department_id departments.department
0                           1                 frozen
1                           2                  other
2                           3                 bakery
3                           4                produce
4                           5                alcohol
5                           6          international
6                           7              beverages
7                           8                   pets
8                           9        dry goods pasta
9                          10                   bulk
10                         11          personal care
11                         12           meat seafood
12                         13                 pantry
13                         14              breakfast
14                         15           canned goods
15                         16             dairy eggs
16                         17              household
17                         18         

MergeError: Merge keys are not unique in right dataset; not a many-to-one merge

In [4]:
dataframes["order_products"].head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


Generate artificial date and remove order no

In [2]:
df["order_date"] = pd.to_datetime("2023-01-01") + pd.to_timedelta(df["order_number"], unit="d")
df = df.drop("order_number", axis=1)
df = df.sort_values(["user_id", "order_date"])

## Trane

In [3]:
import trane

entity_col = "user_id"
time_col = "order_date"
meta = {
    "order_id": ("Integer", {"numeric"}),
    "user_id": ("Integer", {"numeric", "index"}),
    "eval_set": ("Categorical", {"category"}),
    "order_date": ("DateTime", {}),
    "order_dow": ("Categorical", {"category"}),
    "order_hour_of_day": ("Categorical", {"category"}),
    "days_since_prior_order": ("Integer", {"numeric"}),
    "product_id": ("Categorical", {"category"}),
    "add_to_cart_order": ("Integer", {"numeric"}),
    "reordered": ("Categorical", {"category"}),
    "product_name": ("Categorical", {"category"}),
    "aisle_id": ("Categorical", {"category"}),
    "department_id": ("Categorical", {"category"}),
    "aisle": ("Categorical", {"category"}),
    "department": ("Categorical", {"category"}),
}

cutoff_strategy = trane.CutoffStrategy(
    entity_col="user_id",
    window_size="1m",
    minimum_data="2023-01-01",
    maximum_data="2023-02-01",
)


Fix NaNs

In [4]:
df = df.fillna(0)

Shorten data

In [5]:
df = df.query("user_id < 1000")

#### Generate prediction problems

In [6]:
problem_generator = trane.PredictionProblemGenerator(
    df=df,
    table_meta=meta,
    entity_col=entity_col,
    cutoff_strategy=cutoff_strategy,
    time_col=time_col,
)

In [7]:
problems = problem_generator.generate(df, generate_thresholds=True)

  0%|          | 0/4187 [00:00<?, ?it/s]

Find a relevant problem

In [8]:
ex = problems[1984].execute(df, num_examples_per_instance=5)
problems[1984]

Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | user_id: 0/4995 

Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 4995/4995 


For each <user_id> predict if there exists a record with with <product_name> equal to Banana in next 1m days

#### Generate features

In [9]:
import featuretools as ft
es = ft.EntitySet('instacart')

es.add_dataframe(
    dataframe=df.reset_index(),
    dataframe_name='order_products',
    time_index='order_date',
    index='__id__',
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='orders',
    index='order_id',
    additional_columns=['user_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='orders',
    new_dataframe_name='customers',
    index='user_id',
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='products',
    index='product_id',
    additional_columns=['aisle_id', 'department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='products',
    new_dataframe_name='aisles',
    index='aisle_id',
    additional_columns=['department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='aisles',
    new_dataframe_name='departments',
    index='department_id',
    make_time_index=False,
)

fm, fd = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    cutoff_time=ex,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=False,
)

fm.head()




Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT(orders),COUNT(order_products),MAX(order_products.add_to_cart_order),MAX(order_products.days_since_prior_order),MAX(order_products.index),MEAN(order_products.add_to_cart_order),MEAN(order_products.days_since_prior_order),MEAN(order_products.index),MIN(order_products.add_to_cart_order),MIN(order_products.days_since_prior_order),...,SUM(orders.NUM_UNIQUE(order_products.product_id)),SUM(orders.NUM_UNIQUE(order_products.product_name)),SUM(orders.NUM_UNIQUE(order_products.reordered)),SUM(orders.SKEW(order_products.add_to_cart_order)),SUM(orders.SKEW(order_products.days_since_prior_order)),SUM(orders.SKEW(order_products.index)),SUM(orders.STD(order_products.add_to_cart_order)),SUM(orders.STD(order_products.days_since_prior_order)),SUM(orders.STD(order_products.index)),_execute_operations_on_df
user_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2023-01-01,11,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,2023-01-01,15,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,2023-01-01,13,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,2023-01-01,6,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,2023-01-01,5,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


X and y

In [10]:
fm.reset_index(drop=True, inplace=True)
y = fm.ww.pop('_execute_operations_on_df')

### AutoML

#### BTB

In [24]:
from sklearn.datasets import load_wine
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

dataset = load_wine()

models = {
    'LGB': lgb.LGBMClassifier,
    'XGB': xgb.XGBClassifier,
    'DTC': DecisionTreeClassifier,
    'SGDC': SGDClassifier,
    'RF': RandomForestClassifier,
}

def scoring_function(model_name, hyperparameter_values):
    model_class = models[model_name]
    model_instance = model_class(**hyperparameter_values)
    scores = cross_val_score(
        estimator=model_instance,
        X=fm,
        y=y,
        scoring=make_scorer(roc_auc_score)
    )
    return scores.mean()

from btb.tuning import Tunable
from btb.tuning import hyperparams as hp

tunables = {
    'LGB': Tunable({
        'num_leaves': hp.IntHyperParam(min=2, max=100),
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'learning_rate': hp.FloatHyperParam(min=0.01, max=1),
        'n_estimators': hp.IntHyperParam(min=10, max=1000),
    }),
    'XGB': Tunable({
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'learning_rate': hp.FloatHyperParam(min=0.01, max=1),
        'n_estimators': hp.IntHyperParam(min=10, max=1000),
    }),
    'DTC': Tunable({
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'min_samples_split': hp.FloatHyperParam(min=0.01, max=1)
    }),
    'SGDC': Tunable({
        'max_iter': hp.IntHyperParam(min=1, max=5000, default=1000),
        'tol': hp.FloatHyperParam(min=1e-3, max=1, default=1e-3),
    }),
    'RF': Tunable({
        'n_estimators': hp.IntHyperParam(min=10, max=1000),
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'min_samples_split': hp.FloatHyperParam(min=0.01, max=1),
    }),
}

from btb import BTBSession

session = BTBSession(
    tunables=tunables,
    scorer=scoring_function,
    verbose=True
)

best_proposal = session.run(20)

best_proposal

  0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 290, number of negative: 618
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4447
[LightGBM] [Info] Number of data points in the train set: 908, number of used features: 190
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319383 -> initscore=-0.756608
[LightGBM] [Info] Start training from score -0.756608
[LightGBM] [Info] Number of positive: 291, number of negative: 618
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4267
[LightGBM] [Info] Number of data points in the train set: 909, number of used features: 190
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.320132 -> initscore=-0.753165
[LightGBM] [Info] Start training from score -0.753165
[LightGBM] [Info] Number of positive: 291, number of negative: 618
You can set `force_row_wise=true` t

The optimal value found for dimension 0 of parameter length_scale is close to the specified lower bound 1e-05. Decreasing the bound and calling fit again may find a better value.


{'id': '5a676477e70bddc4971e39f8f972a13b',
 'name': 'LGB',
 'config': {'num_leaves': 48,
  'max_depth': 27,
  'learning_rate': 0.8256672346026949,
  'n_estimators': 918},
 'score': 0.5390087303819076}

#### EvalML

In [21]:
import evalml
splits = evalml.preprocessing.split_data(
    X=fm,
    y=y,
    test_size=0.2,
    random_seed=0,
    problem_type='binary',
)

X_train, X_holdout, y_train, y_holdout = splits

automl = evalml.AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
    objective='f1',
    random_seed=0,
    allowed_model_families=['lightgbm', 'xgboost', 'decision_tree'],
    max_iterations=2,
)

automl.search()

automl.best_pipeline.describe()


*************************************************************************
* Random Forest Classifier w/ Label Encoder + Imputer + One Hot Encoder *
*************************************************************************

Problem Type: binary
Model Family: Random Forest
Number of features: 298

Pipeline Steps
1. Label Encoder
	 * positive_label : None
2. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * boolean_impute_strategy : most_frequent
	 * categorical_fill_value : None
	 * numeric_fill_value : None
	 * boolean_fill_value : None
3. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
4. Random Forest Classifier
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1


In [22]:
automl.best_pipeline.score(X_train, y_train, objectives=["auc", "F1"])

OrderedDict([('AUC', 0.6931648253543132), ('F1', 0.5224880382775119)])

In [23]:
automl.best_pipeline.score(X_holdout, y_holdout, objectives=["auc", "F1"])

OrderedDict([('AUC', 0.6031816173221388), ('F1', 0.5093632958801498)])