### Load data

In [1]:
from trane.utils.data_parser import denormalize
import pandas as pd

dataframes = {
    "orders": pd.read_csv("data/orders.csv"),
    "order_products": pd.read_csv("data/order_products.csv"),
    "products": pd.read_csv("data/products.csv"),
    "aisles": pd.read_csv("data/aisles.csv"),
    "departments": pd.read_csv("data/departments.csv"),
}
relationships = [
    ("products", "aisle_id", "aisles", "aisle_id"),
    ("products", "department_id", "departments", "department_id"),
    ("order_products", "product_id", "products", "product_id"),
    ("orders", "order_id", "order_products", "order_id"),
]
df = denormalize(
    dataframes,
    relationships
)

Generate artificial date and remove order no

In [2]:
df["order_date"] = pd.to_datetime("2023-01-01") + pd.to_timedelta(df["order_number"], unit="d")
df = df.drop("order_number", axis=1)
df = df.sort_values(["user_id", "order_date"])

## Trane

In [3]:
import trane

entity_col = "user_id"
time_col = "order_date"
meta = {
    "order_id": ("Integer", {"numeric"}),
    "user_id": ("Integer", {"numeric", "index"}),
    "eval_set": ("Categorical", {"category"}),
    "order_date": ("DateTime", {}),
    "order_dow": ("Categorical", {"category"}),
    "order_hour_of_day": ("Categorical", {"category"}),
    "days_since_prior_order": ("Integer", {"numeric"}),
    "product_id": ("Categorical", {"category"}),
    "add_to_cart_order": ("Integer", {"numeric"}),
    "reordered": ("Categorical", {"category"}),
    "product_name": ("Categorical", {"category"}),
    "aisle_id": ("Categorical", {"category"}),
    "department_id": ("Categorical", {"category"}),
    "aisle": ("Categorical", {"category"}),
    "department": ("Categorical", {"category"}),
}

cutoff_strategy = trane.CutoffStrategy(
    entity_col="user_id",
    window_size="1m",
    minimum_data="2023-01-01",
    maximum_data="2023-02-01",
)


Fix NaNs

In [4]:
df = df.fillna(0)

Shorten data

In [5]:
df = df.query("user_id < 1000")

#### Generate prediction problems

In [6]:
problem_generator = trane.PredictionProblemGenerator(
    df=df,
    table_meta=meta,
    entity_col=entity_col,
    cutoff_strategy=cutoff_strategy,
    time_col=time_col,
)

In [7]:
problems = problem_generator.generate(df, generate_thresholds=True)

  0%|          | 0/4187 [00:00<?, ?it/s]

Find a relevant problem

In [16]:
ex = problems[1984].execute(df, num_examples_per_instance=5)
problems[1984]

Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████| user_id: 4995/4995 


For each <user_id> predict if there exists a record with with <product_name> equal to Banana in next 1m days

#### Generate features

In [17]:
import featuretools as ft
es = ft.EntitySet('instacart')

es.add_dataframe(
    dataframe=df.reset_index(),
    dataframe_name='order_products',
    time_index='order_date',
    index='__id__',
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='orders',
    index='order_id',
    additional_columns=['user_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='orders',
    new_dataframe_name='customers',
    index='user_id',
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='order_products',
    new_dataframe_name='products',
    index='product_id',
    additional_columns=['aisle_id', 'department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='products',
    new_dataframe_name='aisles',
    index='aisle_id',
    additional_columns=['department_id'],
    make_time_index=False,
)

es.normalize_dataframe(
    base_dataframe_name='aisles',
    new_dataframe_name='departments',
    index='department_id',
    make_time_index=False,
)

fm, fd = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    cutoff_time=ex,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=False,
)

fm.head()




Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT(orders),COUNT(order_products),MAX(order_products.add_to_cart_order),MAX(order_products.days_since_prior_order),MAX(order_products.index),MEAN(order_products.add_to_cart_order),MEAN(order_products.days_since_prior_order),MEAN(order_products.index),MIN(order_products.add_to_cart_order),MIN(order_products.days_since_prior_order),...,SUM(orders.NUM_UNIQUE(order_products.product_id)),SUM(orders.NUM_UNIQUE(order_products.product_name)),SUM(orders.NUM_UNIQUE(order_products.reordered)),SUM(orders.SKEW(order_products.add_to_cart_order)),SUM(orders.SKEW(order_products.days_since_prior_order)),SUM(orders.SKEW(order_products.index)),SUM(orders.STD(order_products.add_to_cart_order)),SUM(orders.STD(order_products.days_since_prior_order)),SUM(orders.STD(order_products.index)),_execute_operations_on_df
user_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2023-01-01,11,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,2023-01-01,15,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,2023-01-01,13,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,2023-01-01,6,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,2023-01-01,5,0,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


X and y

In [18]:
fm.reset_index(drop=True, inplace=True)
y = fm.ww.pop('_execute_operations_on_df')

### AutoML

#### BTB

In [19]:
from sklearn.datasets import load_wine
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
import xgboost as xgb

dataset = load_wine()

models = {
    'LGB': lgb.LGBMClassifier,
    'XGB': xgb.XGBClassifier,
    'DTC': DecisionTreeClassifier,
    'SGDC': SGDClassifier,
}

def scoring_function(model_name, hyperparameter_values):
    model_class = models[model_name]
    model_instance = model_class(**hyperparameter_values)
    scores = cross_val_score(
        estimator=model_instance,
        X=fm,
        y=y,
        scoring=make_scorer(roc_auc_score)
    )
    return scores.mean()

from btb.tuning import Tunable
from btb.tuning import hyperparams as hp

tunables = {
    'LGB': Tunable({
        'num_leaves': hp.IntHyperParam(min=2, max=100),
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'learning_rate': hp.FloatHyperParam(min=0.01, max=1),
        'n_estimators': hp.IntHyperParam(min=10, max=1000),
    }),
    'XGB': Tunable({
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'learning_rate': hp.FloatHyperParam(min=0.01, max=1),
        'n_estimators': hp.IntHyperParam(min=10, max=1000),
    }),
    'DTC': Tunable({
        'max_depth': hp.IntHyperParam(min=3, max=200),
        'min_samples_split': hp.FloatHyperParam(min=0.01, max=1)
    }),
    'SGDC': Tunable({
        'max_iter': hp.IntHyperParam(min=1, max=5000, default=1000),
        'tol': hp.FloatHyperParam(min=1e-3, max=1, default=1e-3),
    })
}

from btb import BTBSession

session = BTBSession(
    tunables=tunables,
    scorer=scoring_function,
    verbose=True
)

best_proposal = session.run(20)

best_proposal

  0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 290, number of negative: 618
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4447
[LightGBM] [Info] Number of data points in the train set: 908, number of used features: 190
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.319383 -> initscore=-0.756608
[LightGBM] [Info] Start training from score -0.756608
[LightGBM] [Info] Number of positive: 291, number of negative: 618
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4267
[LightGBM] [Info] Number of data points in the train set: 909, number of used features: 190
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.320132 -> initscore=-0.753165
[LightGBM] [Info] Start training from score -0.753165
[LightGBM] [Info] Number of positive: 291, number of negative: 618
You can set `force_row_wise=true` t

{'id': '41a7f6b4a62622ae176080841db60acc',
 'name': 'LGB',
 'config': {'num_leaves': 83,
  'max_depth': 60,
  'learning_rate': 0.8687779771507773,
  'n_estimators': 967},
 'score': 0.5391347012833977}