In [1]:
# Project-local loader; its implementation is not shown in this notebook.
from util import load_data

# Dataset label; forwarded to get_features() as `name` in the feature cell.
name = 'instacart'

# load_data() yields a pair that is handed unchanged to denormalize() and,
# for `relationships`, to get_features() further down.
dataframes, relationships = load_data()
# Table passed to denormalize() as the target and to get_features() as target_entity.
target_entity = "order_products"

# Column identifying each entity instance; consumed by get_meta(),
# CutoffStrategy, PredictionProblemGenerator and get_features().
entity_col = "orders.user_id"
# Column used as the generator's time_col and as get_features()' time_index.
time_col = "orders.order_date"

# CutoffStrategy settings, passed through verbatim.
# NOTE(review): "2w" presumably denotes a two-week window and the two dates
# presumably bound the cutoff range — confirm against the Trane documentation.
window_size = "2w"
minimum_data = "2023-01-01"
maximum_data = "2023-11-01"

### Load data

In [2]:
from trane.parsing import denormalize
import pandas as pd

# Join the loaded tables around the target entity and, in the same expression,
# drop incomplete rows from the result (dropna() with its default arguments).
# NOTE(review): assumes denormalize() returns a single pandas DataFrame — confirm.
df = denormalize(dataframes, relationships, target_entity).dropna()

## Trane

#### Generate prediction problems

In [3]:
from trane import CutoffStrategy, PredictionProblemGenerator
from util import get_meta

# Cutoff settings come straight from the configuration cell.
cutoff_strategy = CutoffStrategy(
    entity_col=entity_col,
    window_size=window_size,
    minimum_data=minimum_data,
    maximum_data=maximum_data,
)

# Table metadata produced by the project helper from the denormalized frame.
meta = get_meta(df, entity_col)

problem_generator = PredictionProblemGenerator(
    df=df,
    table_meta=meta,
    entity_col=entity_col,
    cutoff_strategy=cutoff_strategy,
    time_col=time_col,
)

# The resulting sequence is indexed by position in the cells that follow.
problems = problem_generator.generate(df, generate_thresholds=True)

  0%|          | 0/4104 [00:00<?, ?it/s]

#### Find a relevant problem

In [4]:
# Write every generated problem next to its list index so that a problem can
# later be selected by number (the index is what `problems[...]` is subscripted
# with). The encoding is pinned to UTF-8 so the write does not depend on the
# platform's locale encoding, which may be unable to represent non-ASCII text
# that ends up in a problem description.
with open("generated_problems.txt", "w", encoding="utf-8") as text_file:
    for idx, p in enumerate(problems):
        print(idx, p, file=text_file)

In [6]:
# Look the chosen problem up once, then reuse that object both for execution
# and as the cell's displayed value.
selected_problem = problems[2278]
ex = selected_problem.execute(df, num_examples_per_instance=-1)
selected_problem

Elapsed: 00:45 | Remaining: 00:00 | Progress: 100%|██████████| orders.user_id: 10000/10000 


For each <orders.user_id> predict if there exists a record with <products.product_name> equal to Banana in next 2w days

#### Generate features

In [7]:
# Project-local helper; its implementation is not visible in this notebook.
from featuretools_util import get_features

# The helper returns a pair bound to X and y, which the AutoML cell consumes.
# `ex` (the result of executing the selected problem) is supplied as cutoff_time.
# NOTE(review): presumably wraps Featuretools deep feature synthesis — confirm
# in featuretools_util.
X, y = get_features(
    name=name,
    df=df,
    target_entity=target_entity,
    entity_col=entity_col,
    time_index=time_col,
    relationships=relationships,
    cutoff_time=ex
)

# Preview the first rows of X as the cell output.
X.head()



Built 170 features
Elapsed: 02:32 | Progress: 100%|██████████


Unnamed: 0,COUNT(orders),MAX(orders.orders.days_since_prior_order),MAX(orders.orders.order_dow),MAX(orders.orders.order_hour_of_day),MAX(orders.orders.order_number),MEAN(orders.orders.days_since_prior_order),MEAN(orders.orders.order_dow),MEAN(orders.orders.order_hour_of_day),MEAN(orders.orders.order_number),MIN(orders.orders.days_since_prior_order),...,SKEW(order_products.orders.orders.order_hour_of_day),SKEW(order_products.orders.orders.order_number),STD(order_products.orders.orders.days_since_prior_order),STD(order_products.orders.orders.order_dow),STD(order_products.orders.orders.order_hour_of_day),STD(order_products.orders.orders.order_number),SUM(order_products.orders.orders.days_since_prior_order),SUM(order_products.orders.orders.order_dow),SUM(order_products.orders.orders.order_hour_of_day),SUM(order_products.orders.orders.order_number)
0,9,30.0,4.0,16.0,10.0,19.555556,2.555556,10.555556,6.0,0.0,...,,,,,,,0.0,0.0,0.0,0.0
1,9,30.0,4.0,16.0,10.0,19.555556,2.555556,10.555556,6.0,0.0,...,-0.349857,0.0,3.853951,1.278275,2.572479,1.53393,429.0,52.0,225.0,90.0
2,9,30.0,4.0,16.0,10.0,19.555556,2.555556,10.555556,6.0,0.0,...,-0.405025,0.460805,10.890295,1.391388,3.463035,2.142693,574.0,78.0,356.0,164.0
3,9,30.0,4.0,16.0,10.0,19.555556,2.555556,10.555556,6.0,0.0,...,-0.004286,0.454978,10.051297,1.268228,3.734464,2.394672,664.0,96.0,398.0,176.0
4,13,30.0,5.0,15.0,14.0,15.230769,2.153846,10.538462,8.0,3.0,...,,,,,,,0.0,0.0,0.0,0.0


### AutoML

#### BTB

In [8]:
# Project-local helper; its implementation is not visible in this notebook.
from baytune_util import automl

# Called as the last expression, so its return value is shown as the cell output.
# NOTE(review): presumably runs BTB (BayTune) model tuning on X/y — confirm
# in baytune_util.
automl(X, y)

  0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 11322, number of negative: 52846
[LightGBM] [Info] Number of positive: 11321, number of negative: 52846
[LightGBM] [Info] Number of positive: 11322, number of negative: 52845
[LightGBM] [Info] Number of positive: 11322, number of negative: 52845
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of positive: 11321, number of negative: 52846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 28872
[LightGBM] [Info] Total Bins 28919
[LightGBM] [Info] Number of data points in the train set: 64168, number of used features: 161
[LightGBM] [Info] Number of data points in the train set: 64167, number of used features: 161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.176430 -> initscore=-1.540723
[LightGBM] [Info] Start training from score -1.540723
[LightGBM

{'id': 'cb477bb0b2182648415f12d09a70816f',
 'name': 'LGB',
 'config': {'num_leaves': 26,
  'max_depth': 171,
  'learning_rate': 0.9300524809471027,
  'n_estimators': 793},
 'score': 0.5182155659914052}