In [1]:
import pandas as pd
# Load the three input tables from files in the working directory.
# `orders`: read from parquet; the preview below shows order_id, user_id, order_date.
orders = pd.read_parquet("orders.parquet")  
# `order_products_denormalized`: one row per product line of an order, with
# product / aisle / department attributes already joined in (see the preview below).
order_products_denormalized = pd.read_csv("order_products_denormalized.csv")  
# `tips_public`: order_id plus the boolean `tip` label. The "Unnamed: 0" column is
# dropped right after reading (presumably a saved index column -- TODO confirm).
tips_public = pd.read_csv("tips_public.csv").drop(columns=["Unnamed: 0"])  

In [2]:
# Preview the first row of `orders` to check its columns.
orders.head(1) 

Unnamed: 0,order_id,user_id,order_date
0,1374495,3,2024-03-31 14:05:18


In [3]:
# Preview the first row of `tips_public` to check its columns.
tips_public.head(1)

Unnamed: 0,order_id,tip
0,1374495,True


In [4]:
# Small subset (order_id < 10000) of the order-product rows.
# NOTE(review): the sample is not referenced by any other visible cell -- the feature
# functions all read the full `order_products_denormalized` frame.
order_products_denormalized_sample = order_products_denormalized[order_products_denormalized["order_id"] < 10000]
# Shows the last row of the FULL frame (not of the sample defined above).
order_products_denormalized.tail(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,aisle_id,department_id,department,aisle
14857352,3421083,5020,10,Organic Sweet & Salty Peanut Pretzel Granola ...,3,19,snacks,energy granola bars


### Feature engineering **USER**: new table per feature with columns: user_id, feature_name


In [5]:
def add_feature_alcohol_count():
    """
    Count, for every user, how many alcohol product lines they have ordered.

    Reads the module-level frames ``order_products_denormalized`` (uses its
    ``order_id`` and ``department`` columns) and ``orders`` (uses ``order_id``
    and ``user_id``).

    Returns:
        pd.DataFrame: indexed by ``user_id`` with a single integer column
        ``alcohol_purchases``. Every user present in ``orders`` gets a row
        (0 when the user never bought alcohol); rows follow the order in which
        users first appear in ``orders``.
    """
    # 1. Keep only the order_id of rows in the "alcohol" department; the other
    #    columns are not needed and would only be copied through the merge.
    is_alcohol = order_products_denormalized["department"] == "alcohol"
    alcohol_order_ids = order_products_denormalized.loc[is_alcohol, ["order_id"]]

    # 2. Join with 'orders' to attach the user_id of each alcohol row
    alcohol_with_users = alcohol_order_ids.merge(
        orders[["order_id", "user_id"]], on="order_id"
    )

    # 3. Number of alcohol rows per user (integer counts, indexed by user_id)
    alcohol_counts = alcohol_with_users.groupby("user_id").size()

    # 4. Expand to all users. Deduplicating the user ids *before* aligning avoids
    #    building one row per order only to drop the duplicates afterwards, and
    #    fill_value=0 keeps the counts integer instead of turning them into floats.
    all_users = orders["user_id"].drop_duplicates()
    result = (
        alcohol_counts
        .reindex(all_users, fill_value=0)
        .rename("alcohol_purchases")
        .rename_axis("user_id")
        .to_frame()
    )

    return result
   
#add_feature_alcohol_count().head(1)

In [None]:
def total_products_per_user():
    """
    Count how many product lines each user has ordered in total.

    Repeated purchases of the same product are counted every time. Reads the
    module-level frames ``order_products_denormalized`` and ``orders``.

    Returns:
        pd.DataFrame: columns ``user_id`` and ``total_products_bought``, one
        row per user that has at least one order-product row.
    """
    # Only order_id and product_id are needed; projecting before the merge avoids
    # copying the wide (string-heavy) product columns for every row.
    order_products = order_products_denormalized[["order_id", "product_id"]]

    # Join with orders to obtain the user_id of every order-product row
    merged = order_products.merge(orders[["order_id", "user_id"]], on="order_id")

    # Count the product rows per user (duplicates included)
    total_products = (
        merged.groupby("user_id")["product_id"]
        .count()
        .rename("total_products_bought")
        .reset_index()
    )

    return total_products
#total_products_per_user().head(1)

Unnamed: 0,user_id,total_products_bought
0,3,88


In [None]:
def total_unique_products_per_user():
    """
    Count the distinct products each user has ordered across all their orders.

    Reads the module-level frames ``order_products_denormalized`` and ``orders``.

    Returns:
        pd.DataFrame: columns ``user_id`` and ``unique_products_bought``.
    """
    # Attach the user_id to every order-product row
    user_products = order_products_denormalized.merge(
        orders[["order_id", "user_id"]], on="order_id"
    )

    # Distinct product_id values per user, as a flat two-column frame
    distinct_counts = user_products.groupby("user_id", as_index=False).agg(
        unique_products_bought=("product_id", "nunique")
    )
    return distinct_counts
#total_unique_products_per_user().head(1)

Unnamed: 0,user_id,unique_products_bought
0,3,33


In [None]:
def unique_to_total_product_ratio_per_user():
    """
    Ratio of distinct products to total product lines bought, per user.

    Values close to 1 mean the user rarely re-orders the same product; small
    values mean the purchases are dominated by repeats. Built from
    ``total_products_per_user()`` and ``total_unique_products_per_user()``.

    Returns:
        pd.DataFrame: columns ``user_id`` and ``unique_to_total_product_ratio``.
    """
    # Line the two per-user counts up on user_id
    per_user = total_products_per_user().merge(
        total_unique_products_per_user(), on="user_id"
    )

    diversity = per_user["unique_products_bought"].div(per_user["total_products_bought"])

    return per_user[["user_id"]].assign(unique_to_total_product_ratio=diversity)
#unique_to_total_product_ratio_per_user().head(1)

Unnamed: 0,user_id,unique_to_total_product_ratio
0,3,0.375


### Feature engineering **ORDER**: new table per feature with columns: order_id, feature_name


In [9]:
def add_feature_order_contains_alcohol():
    """
    Flag every order that contains at least one product from the alcohol department.

    Reads the module-level frames ``order_products_denormalized`` (uses
    ``order_id`` and ``department``) and ``orders`` (uses ``order_id``).

    Returns:
        pd.DataFrame: indexed by ``order_id`` with a single integer column
        ``contains_alcohol`` (1 = the order has an alcohol product, 0 = it has
        none). One row per distinct ``order_id`` in ``orders``, in order of
        first appearance.
    """
    # Step 1: ids of all orders that include an alcohol product
    is_alcohol = order_products_denormalized["department"] == "alcohol"
    alcohol_order_ids = order_products_denormalized.loc[is_alcohol, "order_id"].unique()

    # Step 2: one row per order
    all_orders = orders[["order_id"]].drop_duplicates()

    # Step 3: membership test instead of a left merge followed by fillna(False).
    # The merge produced an object column of True/NaN, and filling that relies on
    # silent object-dtype downcasting, which newer pandas versions deprecate.
    result = all_orders.assign(
        contains_alcohol=all_orders["order_id"].isin(alcohol_order_ids).astype(int)
    )

    return result.set_index("order_id")

#add_feature_order_contains_alcohol().head(1)

In [10]:
def add_feature_order_item_count():
    """
    Number of product lines in each order.

    Reads the module-level frame ``order_products_denormalized``.

    Returns:
        pd.DataFrame: columns ``order_id`` and ``item_count``.
    """
    per_order = order_products_denormalized.groupby("order_id", as_index=False)
    # "count" tallies the non-null product_id entries of each order
    return per_order.agg(item_count=("product_id", "count"))

#add_feature_order_item_count().head(1)

In [11]:
def add_feature_order_unique_departments_count():
    """
    Number of distinct departments represented in each order.

    Reads the module-level frame ``order_products_denormalized``.

    Returns:
        pd.DataFrame: columns ``order_id`` and ``unique_departments_count``.
    """
    return (
        order_products_denormalized
        .groupby("order_id", as_index=False)
        .agg(unique_departments_count=("department", "nunique"))
    )

#add_feature_order_unique_departments_count().head(1)

In [None]:
def add_feature_order_unique_aisles_count():
    """
    Number of distinct aisles represented in each order.

    Reads the module-level frame ``order_products_denormalized``.

    Returns:
        pd.DataFrame: columns ``order_id`` and ``unique_aisles_count``.
    """
    aisles_by_order = order_products_denormalized.groupby("order_id")["aisle"]
    distinct_aisles = aisles_by_order.nunique().rename("unique_aisles_count")
    # Move order_id from the index back into a regular column
    return distinct_aisles.reset_index()
#add_feature_order_unique_aisles_count().head(1)

Unnamed: 0,order_id,unique_aisles_count
0,1,6


In [None]:
def add_feature_order_unique_departments_ratio():
    """
    Distinct departments divided by the number of items, per order.

    The ratio expresses how varied an order is relative to its size. Built
    from ``add_feature_order_item_count()`` and
    ``add_feature_order_unique_departments_count()``.

    Returns:
        pd.DataFrame: columns ``order_id`` and ``unique_departments_ratio``.
    """
    # Align item counts and department counts on order_id
    per_order = add_feature_order_item_count().merge(
        add_feature_order_unique_departments_count(), on="order_id"
    )

    dept_share = per_order["unique_departments_count"].div(per_order["item_count"])

    return per_order[["order_id"]].assign(unique_departments_ratio=dept_share)
#add_feature_order_unique_departments_ratio().head(1)

Unnamed: 0,order_id,unique_departments_ratio
0,1,0.375


In [None]:
def add_feature_order_unique_aisles_ratio():
    """
    Distinct aisles divided by the number of items, per order.

    The ratio expresses the variety of aisles in an order normalized by the
    order size. Built from ``add_feature_order_item_count()`` and
    ``add_feature_order_unique_aisles_count()``.

    Returns:
        pd.DataFrame: columns ``order_id`` and ``unique_aisles_ratio``.
    """
    # Align item counts and aisle counts on order_id
    per_order = add_feature_order_item_count().merge(
        add_feature_order_unique_aisles_count(), on="order_id"
    )

    aisle_share = per_order["unique_aisles_count"].div(per_order["item_count"])

    return per_order[["order_id"]].assign(unique_aisles_ratio=aisle_share)
#add_feature_order_unique_aisles_ratio().head(1)

Unnamed: 0,order_id,unique_aisles_ratio
0,1,0.75


### Feature engineering **ORDER** - **USER**: new table per feature, keyed by user_id and product_id (plus order_id for per-order-row features), with columns: key columns, feature_name

In [8]:
def count_products_per_user():
    """
    Count how often each user bought each product.

    Reads the module-level frames ``order_products_denormalized`` and ``orders``.

    Returns:
        pd.DataFrame: one row per (user_id, product_id) pair with columns
        ``user_id``, ``product_id`` and ``times_bought``.
    """
    # Attach the user_id to every order-product row
    user_products = order_products_denormalized.merge(
        orders[["order_id", "user_id"]], on="order_id"
    )

    # Rows per (user, product) pair == number of purchases of that product
    pair_sizes = user_products.groupby(["user_id", "product_id"]).size()

    return pair_sizes.rename("times_bought").reset_index()
# Materialize the per-(user, product) purchase counts once so later cells can inspect them.
t1 = count_products_per_user()

In [15]:
# Spot check: purchase counts of user 5, re-indexed from 0 for display.
t1[t1["user_id"] == 5].reset_index(drop=True)

Unnamed: 0,user_id,product_id,times_bought
0,5,3376,1
1,5,5999,1
2,5,6808,1
3,5,8518,2
4,5,11777,4
5,5,13870,1
6,5,13988,2
7,5,15349,2
8,5,16168,1
9,5,16185,1


In [None]:
def latest_tip_rate_per_user_product():
    """
    Tip rate on *prior* purchases, taken at the latest purchase of each
    (user, product) pair.

    For every (user_id, product_id) combination the most recent purchase (by
    ``order_date``) is selected and the share of the user's earlier purchases
    of that product that came with a tip is reported. Reads the module-level
    frames ``order_products_denormalized``, ``orders`` and ``tips_public``.

    Orders without a row in ``tips_public`` are counted as "no tip".
    NOTE(review): if those orders are ones whose tip is simply unknown, this
    biases the rate downwards -- confirm the intended treatment.

    Returns:
        pd.DataFrame: columns ``user_id``, ``product_id`` and
        ``tip_rate_before``, one row per pair, sorted by the two keys.
        ``tip_rate_before`` is NaN when the pair has only one purchase.
    """
    # Project to the needed columns first so the wide product attributes are not
    # copied through two merges.
    merged = order_products_denormalized[["order_id", "product_id"]].merge(
        orders[["order_id", "user_id", "order_date"]], on="order_id"
    ).merge(
        tips_public[["order_id", "tip"]], on="order_id", how="left"
    )

    # Chronological order inside every (user, product) group
    merged = merged.sort_values(by=["user_id", "product_id", "order_date"])

    # Convert to float *before* filling: after the left merge the column can be
    # object dtype (True/False/NaN), and fillna on object dtype relies on a
    # downcasting behaviour that newer pandas versions deprecate.
    merged["tip"] = merged["tip"].astype(float).fillna(0.0)

    # Count how often the user bought the product before this row
    merged["times_product_bought_before"] = merged.groupby(
        ["user_id", "product_id"]
    ).cumcount()

    # How many of those earlier purchases included a tip (current row excluded)
    merged["tip_cumsum"] = merged.groupby(
        ["user_id", "product_id"]
    )["tip"].cumsum() - merged["tip"]

    # 0 / 0 for a first purchase yields NaN, which is the documented value
    merged["tip_rate_before"] = merged["tip_cumsum"] / merged["times_product_bought_before"]

    # The frame is already sorted chronologically within each pair, so the last
    # row of each pair is its most recent purchase. This avoids a second full
    # sort and takes the actual last row rather than the last non-null value.
    latest = merged.drop_duplicates(
        subset=["user_id", "product_id"], keep="last"
    ).reset_index(drop=True)

    return latest[["user_id", "product_id", "tip_rate_before"]]
# Materialize the feature table once for inspection in the next cell.
# NOTE(review): the single-letter name `l` is easy to confuse with `1`/`I`; a
# descriptive name would be safer, but the following cell depends on this one.
l = latest_tip_rate_per_user_product()

In [34]:
# Spot check: latest prior-tip rates of user 5 (NaN = product bought only once).
l[l["user_id"] == 5].reset_index(drop=True)

Unnamed: 0,user_id,product_id,tip_rate_before
0,5,3376,
1,5,5999,
2,5,6808,
3,5,8518,0.0
4,5,11777,0.333333
5,5,13870,
6,5,13988,1.0
7,5,15349,0.0
8,5,16168,
9,5,16185,


In [None]:
# def add_avg_tip_rate_before_per_product_row():
#     """
#     Add a column to each order-product row that contains the average historical tip rate
#     the user had given for this specific product in the past. If the product is new, it's NaN.
#     Useful as a contextual feature for tipping behavior on a product level.
#     """
#     # 1. Merge necessary data
#     merged = order_products_denormalized.merge(
#         orders[["order_id", "user_id", "order_date"]], on="order_id"
#     ).merge(
#         tips_public[["order_id", "tip"]], on="order_id", how="left"
#     )

#     # 2. Sort to ensure chronological order for cumsum logic
#     merged = merged.sort_values(by=["user_id", "product_id", "order_date"])

#     # 3. Ensure tip is numeric
#     merged["tip"] = merged["tip"].fillna(0.0).astype(float)

#     # 4. Count how many times the user has bought this product before this order
#     merged["times_product_bought_before"] = merged.groupby(
#         ["user_id", "product_id"]
#     ).cumcount()

#     # 5. Cumulative sum of tips before this point
#     merged["tip_cumsum"] = merged.groupby(["user_id", "product_id"])["tip"].cumsum() - merged["tip"]

#     # 6. Compute historical tip rate before for this product
#     merged["avg_tip_rate_before_product"] = (
#         merged["tip_cumsum"] / merged["times_product_bought_before"]
#     )

#     # 7. Handle division by 0 (new products)
#     merged["avg_tip_rate_before_product"] = merged["avg_tip_rate_before_product"].replace(
#         [float("inf"), -float("inf")], pd.NA
#     )

#     return merged[["order_id", "product_id", "user_id", "tip", "times_product_bought_before", "avg_tip_rate_before_product", "tip_cumsum", "order_date"]]
# avg_tip_rate_before_per_product_row = add_avg_tip_rate_before_per_product_row()

In [None]:
# avg_tip_rate_before_per_product_row[avg_tip_rate_before_per_product_row["user_id"] == 5].sort_values("order_date").sort_values("order_id").sort_values("order_date").reset_index(drop=True)

In [32]:
def add_avg_tip_rate_per_user_product():
    """
    Average tip rate per (user_id, product_id) pair across all orders.

    Shows how often a user tipped on orders that contained a specific product.
    Reads the module-level frames ``order_products_denormalized``, ``orders``
    and ``tips_public``. Orders without a row in ``tips_public`` are counted
    as "no tip".

    NOTE(review): the average covers every order of the pair, so joining it
    back as a feature for predicting the tip of one of those same orders would
    leak the target -- confirm how this table is used.

    Returns:
        pd.DataFrame: columns ``user_id``, ``product_id`` and ``avg_tip_rate``
        (float in [0, 1]), one row per pair.
    """
    # Join tip info to the order-product rows. Only the key columns are carried
    # through the merges; the wide product attributes are not needed here.
    merged = order_products_denormalized[["order_id", "product_id"]].merge(
        orders[["order_id", "user_id"]], on="order_id"
    ).merge(
        tips_public[["order_id", "tip"]], on="order_id", how="left"
    )

    # Missing tips count as 0 (no tip). Convert to float before filling: after
    # the left merge the column can be object dtype (True/False/NaN), and fillna
    # on object dtype relies on downcasting that newer pandas versions deprecate.
    merged["tip"] = merged["tip"].astype(float).fillna(0.0)

    # Mean of the 0/1 tip flag per (user, product) pair
    tip_rate = (
        merged.groupby(["user_id", "product_id"])["tip"]
        .mean()
        .reset_index()
        .rename(columns={"tip": "avg_tip_rate"})
    )

    return tip_rate
# Build the per-(user, product) average tip rate table ...
avg_tip_rate_per_user_product = add_avg_tip_rate_per_user_product()
# ... and spot check the rows of user 5.
avg_tip_rate_per_user_product[avg_tip_rate_per_user_product["user_id"] == 5].reset_index(drop=True)

Unnamed: 0,user_id,product_id,avg_tip_rate
0,5,3376,1.0
1,5,5999,1.0
2,5,6808,1.0
3,5,8518,0.5
4,5,11777,0.5
5,5,13870,1.0
6,5,13988,1.0
7,5,15349,0.0
8,5,16168,1.0
9,5,16185,0.0


In [35]:
def add_cumulative_avg_tip_rate_per_user_product():
    """
    Cumulative average tip rate per (user_id, product_id) up to each order.

    For every order-product row the value is the share of the user's *earlier*
    purchases of that product that came with a tip; the current order's own tip
    is excluded. The first purchase of a product by a user gets NaN. Reads the
    module-level frames ``order_products_denormalized``, ``orders`` and
    ``tips_public``. Orders without a row in ``tips_public`` are counted as
    "no tip".

    Returns:
        pd.DataFrame: columns ``order_id``, ``user_id``, ``product_id`` and
        ``avg_tip_rate_before`` (float, NaN for first purchases), one row per
        order-product row, sorted by user_id, product_id and order_date.
    """
    # Merge orders and tips into the order-product rows. Only the key columns are
    # carried through; the wide product attributes are not needed here.
    merged = order_products_denormalized[["order_id", "product_id"]].merge(
        orders[["order_id", "user_id", "order_date"]], on="order_id"
    ).merge(
        tips_public[["order_id", "tip"]], on="order_id", how="left"
    )

    # Chronological order inside every (user, product) group
    merged = merged.sort_values(by=["user_id", "product_id", "order_date"])

    # Missing tips count as 0 (no tip). Convert to float before filling: after
    # the left merge the column can be object dtype (True/False/NaN), and fillna
    # on object dtype relies on downcasting that newer pandas versions deprecate.
    merged["tip"] = merged["tip"].astype(float).fillna(0.0)

    # Count how many times the product was bought before the current row
    merged["times_bought_before"] = merged.groupby(["user_id", "product_id"]).cumcount()

    # Cumulative sum of tips *before* the current row (exclude the current tip)
    merged["tip_cumsum_before"] = merged.groupby(["user_id", "product_id"])["tip"].cumsum() - merged["tip"]

    # Rate over the earlier purchases; rows without any earlier purchase are
    # masked to NaN explicitly instead of relying on the result of 0 / 0.
    prior = merged["times_bought_before"]
    merged["avg_tip_rate_before"] = (merged["tip_cumsum_before"] / prior).where(prior > 0)

    return merged[["order_id", "user_id", "product_id", "avg_tip_rate_before"]]

# Materialize the per-row cumulative tip rate table once for inspection in the next cell.
cumulative_avg_tip_rate_per_user_product = add_cumulative_avg_tip_rate_per_user_product()

In [49]:
# Spot check: rows of user 5, ordered by order_id (not by order_date) for display.
cumulative_avg_tip_rate_per_user_product[cumulative_avg_tip_rate_per_user_product["user_id"]==5].sort_values(["user_id","order_id"]).reset_index(drop=True)

Unnamed: 0,order_id,user_id,product_id,avg_tip_rate_before
0,157374,5,3376,
1,157374,5,5999,
2,157374,5,8518,0.0
3,157374,5,11777,0.333333
4,157374,5,13988,1.0
5,157374,5,16168,
6,157374,5,21413,0.0
7,157374,5,24535,0.5
8,157374,5,26604,0.333333
9,157374,5,27344,
