In [1]:
from utils import load_instacart_data

# Cap the number of rows read so the notebook stays quick to re-run.
N_ROWS = 10_000
dataframes, metadata = load_instacart_data(nrows=N_ROWS)

In [None]:
# Display the metadata object returned by load_instacart_data for inspection.
metadata

In [2]:
from trane import ProblemGenerator

# Settings for problem enumeration: predict over a "2w" window, with the
# "orders" table as the target.
generator_settings = {
    "metadata": metadata,
    "window_size": "2w",
    "target_table": "orders",
}
problem_generator = ProblemGenerator(**generator_settings)

# Enumerate every candidate prediction problem for the settings above.
problems = problem_generator.generate()

Generated 54 total problems
--------------------------------------------------
Classification problems: 9
Regression problems: 45


In [4]:
from trane.parsing.denormalize import (
    denormalize,
)

In [5]:
# Flatten the relational metadata around the "orders" table. Only the second
# element of the returned pair (the single-table metadata) is kept.
_, single_metadata = denormalize(metadata=metadata, target_table="orders")

In [7]:
# Show the column -> ML type mapping of the denormalized metadata.
single_metadata.ml_types

defaultdict(dict,
            {'order_id': Integer, 'order_time': Datetime, 'user_id': Integer})

In [3]:
from trane.llm import analyze

# Chat model used to rank the generated problems.
LLM_MODEL = "gpt-3.5-turbo-16k"

# Free-text description of the dataset and of what the model should select.
context = "a relational set of files describing customers' orders over time"
instructions = "determine 5 most relevant problems about consumers future product preferences"

# Ask the LLM to pick the most relevant problems out of everything generated.
relevant_problems = analyze(
    model=LLM_MODEL,
    context=context,
    instructions=instructions,
    problems=problems,
)

# Display the selection as the cell output.
relevant_problems

ID: 1 Problem: For each order_id predict if there exists a record in the next 2 weeks.
Reasoning: This problem can help determine if customers are likely to make another purchase within a specific time frame and can inform marketing strategies to target these customers.

ID: 13 Problem: For each order_id predict the number of records in the next 2 weeks.
Reasoning: This problem can provide insights into the frequency of customer purchases and can help identify customers who make multiple purchases, indicating higher engagement and potential loyalty.

ID: 28 Problem: For each user_id predict the minimum user_id in all related records in the next 2 weeks.
Reasoning: This problem can identify the earliest user_id associated with a customer's purchase history and can be used to understand the customer's initial interaction with the product or service.

ID: 30 Problem: For each user_id predict the minimum user_id in all related records with user_id less than a float value in the next 2 weeks.
Reasoning: This problem can help identify the earliest user_id associated with a specific segment of customers, such as those with a lower user_id, allowing for targeted analysis of different customer groups.

ID: 41 Problem: Predict the average user_id in all related records with user_id greater than a float value in the next 2 weeks.
Reasoning: This problem can provide insights into the average user_id of customers who have higher user_ids, potentially indicating patterns or preferences among specific customer segments.

[1, 13, 28, 30, 41]


[For each <order_id> predict if there exists a record in next 2w days,
 For each <order_id> predict the number of records in next 2w days,
 For each <user_id> predict the minimum <user_id> in all related records in next 2w days,
 For each <user_id> predict the minimum <user_id> in all related records with <user_id> less than <float> in next 2w days,
 Predict the average <user_id> in all related records with <user_id> greater than <float> in next 2w days]

In [None]:
# Execute one prediction problem to produce the labelled examples (`ex`) that
# later cells use as cutoff times and for the train/test split.
# NOTE(review): the generation cell above reports 54 problems in total, so
# index 273 looks out of range for `problems` — confirm which problem is meant.
# NOTE(review): `df` is not defined in any visible cell; it presumably is the
# denormalized dataframe — confirm and define it before this cell.
ex = problems[273].execute(df, num_examples_per_instance=-1)
# Display the chosen problem as the cell output.
problems[273]

### Feature engineering
Build an EntitySet with Featuretools, then run Deep Feature Synthesis to create the feature matrix.

In [None]:
import featuretools as ft

# NOTE(review): `name` and `df` are not defined in any cell visible in this
# notebook — they must be set before this cell can run on a fresh kernel.
es = ft.EntitySet(name)

# Register the flat order/product table as the base dataframe.
es.add_dataframe(
    dataframe_name="order_products",
    dataframe=df.reset_index(),
    index="__id__",
    time_index="orders.order_date",
)

# Dataframes to split out of the flat table, in execution order. Order matters:
# "products" and "orders" must exist before dataframes are split off them.
normalization_steps = [
    {
        "base_dataframe_name": "order_products",
        "new_dataframe_name": "products",
        "index": "product_id",
        "additional_columns": [
            "products.aisle_id",
            "products.aisles.aisle",
            "products.department_id",
            "products.departments.department",
            "products.product_name",
        ],
    },
    {
        "base_dataframe_name": "order_products",
        "new_dataframe_name": "orders",
        "index": "order_id",
        "additional_columns": [
            "orders.user_id",
            "orders.eval_set",
            "orders.order_number",
            "orders.order_dow",
            "orders.order_hour_of_day",
            "orders.days_since_prior_order",
        ],
    },
    {
        "base_dataframe_name": "products",
        "new_dataframe_name": "aisles",
        "index": "products.aisle_id",
    },
    {
        "base_dataframe_name": "products",
        "new_dataframe_name": "departments",
        "index": "products.department_id",
    },
    {
        "base_dataframe_name": "orders",
        "new_dataframe_name": "orders.user_id",
        "index": "orders.user_id",
    },
]

# None of the derived dataframes gets its own time index.
for step in normalization_steps:
    es.normalize_dataframe(make_time_index=False, **step)

# Render the resulting relationship graph as the cell output.
es.plot()

In [None]:
# Run Deep Feature Synthesis on the EntitySet, using the labelled examples in
# `ex` as cutoff times. `fm` is the feature matrix, `fd` the feature definitions.
# NOTE(review): `entity_col` is not defined in any visible cell — it must name
# the target dataframe in `es`; confirm and define it before this cell.
fm, fd = ft.dfs(
    entityset=es,
    target_dataframe_name=entity_col,
    cutoff_time=ex,
    cutoff_time_in_index=True,
    include_cutoff_time=False,
    verbose=True,
)

# Replace the index returned by dfs with a plain 0..n-1 range (in place).
# NOTE(review): the later train/test split masks X and y with `ex["time"]`, so
# it relies on `fm` rows being in the same order as `ex` — confirm.
fm.reset_index(drop=True, inplace=True)
# Remove the label column from the feature matrix via the woodwork accessor;
# presumably '_execute_operations_on_df' is the target produced by
# `problem.execute` — verify.
y = fm.ww.pop('_execute_operations_on_df')
# Whatever is left in `fm` is the model input.
X = fm

### Machine Learning

##### Train/Test split by time cutoff

In [None]:
# pandas is not imported by any earlier cell in this notebook, so bring it into
# scope here; without it this cell fails on a fresh kernel.
import pandas as pd

# Examples whose cutoff time is on or before this date are used for training;
# strictly later examples form the test set.
train_cutoff = pd.to_datetime("2023-09-01")

# Compute each mask once and reuse it for both X and y so the two stay in sync.
# The test mask is spelled out (rather than `~is_train`) so rows with a missing
# time stay excluded from both splits.
# NOTE(review): the masks are built from `ex` but applied to `X`/`y`; this
# assumes `ex` and the feature matrix share the same index and row order — confirm.
is_train = ex["time"] <= train_cutoff
is_test = ex["time"] > train_cutoff

X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[is_test], y[is_test]

##### Train a model

In [None]:
import lightgbm as lgb
# Gradient-boosted classifier with library-default hyperparameters.
# NOTE(review): no random_state is passed; consider pinning one for reproducibility.
model = lgb.LGBMClassifier()

# Fit on the training split; the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)

##### Run inference on the test set

In [None]:
# Predictions of the fitted model for the held-out test rows.
y_hat = model.predict(X_test)

##### Evaluate model on the test set

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with `y_true`.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    counts = confusion_matrix(y_true, y_pred)
    # Integer cell annotations, no colour bar.
    heatmap_ax = sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", cbar=False)
    heatmap_ax.set(xlabel="Predicted", ylabel="Actual")
    plt.show()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Confusion matrix of the test-set predictions against the true labels.
plot_confusion_matrix(y_test, y_hat)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

# ROC AUC measures how well the model ranks positives above negatives, so it
# needs continuous scores. Feeding it the hard labels in `y_hat` reduces the
# curve to a single operating point and misreports the model's AUC.
# Column 1 of predict_proba is the probability of the positive class
# (`model.classes_[1]`); this assumes a binary target, as the default
# `f1_score` averaging below already does.
y_score = model.predict_proba(X_test)[:, 1]

print(f"AUC: {round(roc_auc_score(y_test, y_score), 2)}")
# F1 and accuracy are threshold metrics and are computed on the hard labels.
print(f"F1 Score: {round(f1_score(y_test, y_hat), 2)}")
print(f"Accuracy: {round(accuracy_score(y_test, y_hat), 2)}")