### Set up parameters

In [1]:
# Name given to the Featuretools EntitySet built later in the notebook.
name = "instacart"

# Table the prediction problems are generated for; also used to look up the
# matching frame in `dataframes`.
target_table = "orders"
# NOTE(review): time_col is not referenced by any other visible cell — confirm
# whether it is still needed.
time_col = "order_time"

# Prediction window handed to the problem generator.
window_size = "2w"

### Dataset

##### Load data

In [2]:
from utils import load_instacart_data

# `dataframes` is indexed by table name further down; `metadata` is handed to
# the problem generator.
# nrows presumably caps how many rows are read — confirm in utils.load_instacart_data.
dataframes, metadata = load_instacart_data(nrows=10000)

In [4]:
from trane import ProblemGenerator

# Collect the generator configuration in one place, then enumerate every
# candidate prediction problem for the target table.
generator_settings = {
    "metadata": metadata,
    "window_size": window_size,
    "target_table": target_table,
}
problem_generator = ProblemGenerator(**generator_settings)
problems = problem_generator.generate()

In [8]:
# Report how many problems were generated relative to the width of the
# target table.
target_frame = dataframes[target_table]
num_columns = target_frame.shape[1]
print(f"generated {len(problems)} problems from {num_columns} columns")

generated 54 problems from 3 columns


In [9]:
# Print the text description of every generated problem, one per line.
for generated_problem in problems:
    print(generated_problem)

For each <order_id> predict if there exists a record in next 2w days
For each <order_id> predict if there exists a record with <user_id> greater than <None> in next 2w days
For each <order_id> predict if there exists a record with <user_id> less than <None> in next 2w days
For each <order_id> predict the average <user_id> in all related records in next 2w days
For each <order_id> predict the average <user_id> in all related records with <user_id> greater than <None> in next 2w days
For each <order_id> predict the average <user_id> in all related records with <user_id> less than <None> in next 2w days
For each <order_id> predict the maximum <user_id> in all related records in next 2w days
For each <order_id> predict the maximum <user_id> in all related records with <user_id> greater than <None> in next 2w days
For each <order_id> predict the maximum <user_id> in all related records with <user_id> less than <None> in next 2w days
For each <order_id> predict the minimum <user_id> in all r

1. **For each `<order_id>` predict the number of records in next 2w days**: Knowing the predicted number of records related to an order can help in various aspects like inventory management, customer service, and other operational efficiencies.

2. **For each `<user_id>` predict the number of records in next 2w days**: Similar to the above, but focusing on user behavior rather than orders. This could be crucial for understanding customer engagement and could feed into personalized marketing efforts.

3. **Predict the number of records in next 2w days**: This can provide a high-level view of expected system load, required manpower, and other resources. Good for strategic planning.

4. **For each `<order_id>` predict if there exists a record in next 2w days**: This could be useful for triggering other processes like follow-up customer communications, review solicitations, or even automated systems like re-order reminders.

5. **For each `<user_id>` predict if there exists a record in next 2w days**: Understanding which users are likely to be active in the short term can be useful for targeted marketing campaigns, customer retention efforts, or fraud detection.

In [None]:
# Write one "<index> <problem>" line per generated problem so that a problem
# can later be selected by its position in `problems`.
# The encoding is pinned so the file's bytes do not depend on the platform's
# default locale encoding.
with open("generated_problems.txt", "w", encoding="utf-8") as text_file:
    for idx, p in enumerate(problems):
        print(idx, p, file=text_file)

### Find a relevant problem
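Look through `generated_problems.txt` and note the index (the first number on each line) of a problem that looks interesting; that index selects the problem from `problems` in the next cell.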

In [None]:
# Generate labelled examples for the chosen problem, then display the problem
# itself as the cell output.
# NOTE(review): `df` is not defined in any visible cell (the loaded data is
# bound to `dataframes`) — confirm which frame should be passed here.
# NOTE(review): the recorded output above reports 54 generated problems, so
# index 273 would raise IndexError on that run — pick the index from
# generated_problems.txt.
ex = problems[273].execute(df, num_examples_per_instance=-1)
problems[273]

### Feature engineering
Build an EntitySet from the data and generate features with Featuretools.

In [None]:
import featuretools as ft

# NOTE(review): `df` is not defined in any visible cell — confirm it is the
# flat order/product frame this cell expects.
es = ft.EntitySet(name)

# Register the flat frame as the base dataframe; reset_index() moves the
# existing index into ordinary column(s) first.
es.add_dataframe(
    dataframe=df.reset_index(),
    dataframe_name="order_products",
    time_index="orders.order_date",
    index="__id__",
)

# Columns that move out of the base dataframe together with each new key.
product_columns = [
    "products.aisle_id",
    "products.aisles.aisle",
    "products.department_id",
    "products.departments.department",
    "products.product_name",
]
order_columns = [
    "orders.user_id",
    "orders.eval_set",
    "orders.order_number",
    "orders.order_dow",
    "orders.order_hour_of_day",
    "orders.days_since_prior_order",
]

# One entry per normalize_dataframe call, applied in order: "products" and
# "orders" must exist before further dataframes are split out of them.
normalization_steps = [
    {
        "base_dataframe_name": "order_products",
        "new_dataframe_name": "products",
        "index": "product_id",
        "additional_columns": product_columns,
    },
    {
        "base_dataframe_name": "order_products",
        "new_dataframe_name": "orders",
        "index": "order_id",
        "additional_columns": order_columns,
    },
    {
        "base_dataframe_name": "products",
        "new_dataframe_name": "aisles",
        "index": "products.aisle_id",
    },
    {
        "base_dataframe_name": "products",
        "new_dataframe_name": "departments",
        "index": "products.department_id",
    },
    {
        "base_dataframe_name": "orders",
        "new_dataframe_name": "orders.user_id",
        "index": "orders.user_id",
    },
]
for step in normalization_steps:
    es.normalize_dataframe(make_time_index=False, **step)

# Last expression of the cell: renders the resulting schema diagram.
es.plot()

In [None]:
# Run Deep Feature Synthesis over the EntitySet, passing the labelled examples
# in `ex` as the cutoff times.
# NOTE(review): `entity_col` is not defined in any visible cell — confirm which
# dataframe name in `es` is the intended target.
fm, fd = ft.dfs(
    entityset=es,
    target_dataframe_name=entity_col,
    cutoff_time=ex,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)

# The index is dropped in place, so this cell is not idempotent on re-run.
fm.reset_index(drop=True, inplace=True)
# Pop the column used as the prediction target out of the feature matrix
# (through the `.ww` accessor); what remains in `fm` is the feature set.
y = fm.ww.pop('_execute_operations_on_df')
# X is an alias of fm, not a copy.
X = fm

### Machine Learning

##### Train/Test split by time cutoff

In [None]:
# No earlier visible cell imports pandas, so import it here; otherwise
# pd.to_datetime raises NameError on a fresh kernel.
import pandas as pd

# Examples with a cutoff time on or before this date form the training set;
# later examples form the test set.
train_cutoff = pd.to_datetime("2023-09-01")

# Build each mask once instead of recomputing the comparison for X and for y.
# NOTE(review): boolean-Series indexing aligns on index labels, and X/y were
# re-indexed with reset_index(drop=True) — this assumes `ex` has the same
# 0..n-1 index and row order; confirm.
is_train = ex["time"] <= train_cutoff
is_test = ex["time"] > train_cutoff

X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]

##### Train a model

In [None]:
import lightgbm as lgb
# Gradient-boosted classifier with the library's default hyperparameters.
model = lgb.LGBMClassifier()

# Fit on the training split; as the cell's last expression, the value returned
# by fit() is displayed as the cell output.
model.fit(X_train, y_train)

##### Run inference on the test set

In [None]:
# Predictions for the held-out rows; compared against y_test in the
# evaluation cells below.
y_hat = model.predict(X_test)

##### Evaluate model on the test set

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    heatmap_options = {"annot": True, "fmt": "d", "cmap": "Blues", "cbar": False}
    counts = confusion_matrix(y_true, y_pred)
    axes = sns.heatmap(counts, **heatmap_options)
    # Columns are labelled as predictions, rows as actual values.
    axes.set(xlabel="Predicted", ylabel="Actual")
    plt.show()

In [None]:
# Render figures inline in the notebook, then plot the test-set confusion matrix.
%matplotlib inline
plot_confusion_matrix(y_test, y_hat)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

# ROC AUC is a ranking metric: score it with the predicted probability of the
# positive class rather than the thresholded labels in `y_hat`, which collapse
# the ROC curve to a single operating point.
# Assumes a binary target (f1_score's default average="binary" requires that
# as well); column 1 of predict_proba corresponds to model.classes_[1].
y_score = model.predict_proba(X_test)[:, 1]

print(f"AUC: {round(roc_auc_score(y_test, y_score), 2)}")
# F1 and accuracy are defined on hard class labels, so they keep using y_hat.
print(f"F1 Score: {round(f1_score(y_test, y_hat), 2)}")
print(f"Accuracy: {round(accuracy_score(y_test, y_hat), 2)}")