In [1]:
import pandas as pd
import numpy as np

# Load datasets
orders = pd.read_parquet("orders.parquet")
order_products_denormalized = pd.read_csv("order_products_denormalized.csv")
tips_public = pd.read_csv("tips_public.csv").drop(columns=["Unnamed: 0"])

# Optimize memory usage by converting to categorical types
order_products_denormalized['department'] = order_products_denormalized['department'].astype('category')
order_products_denormalized['aisle'] = order_products_denormalized['aisle'].astype('category')

# Ensure order_date is datetime
orders['order_date'] = pd.to_datetime(orders['order_date'])

# Validate data
if any(df.empty for df in [orders, order_products_denormalized, tips_public]):
    raise ValueError("One or more input DataFrames are empty.")
if orders['order_date'].isna().any():
    raise ValueError("order_date contains missing values.")


In [9]:
order_products_denormalized

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,aisle_id,department_id,department,aisle
0,1,49302,1,Bulgarian Yogurt,120,16,dairy eggs,yogurt
1,1,11109,2,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,dairy eggs,other creams cheeses
2,1,10246,3,Organic Celery Hearts,83,4,produce,fresh vegetables
3,1,49683,4,Cucumber Kirby,83,4,produce,fresh vegetables
4,1,43633,5,Lightly Smoked Sardines in Olive Oil,95,15,canned goods,canned meat seafood
...,...,...,...,...,...,...,...,...
14857348,3421083,39678,6,Free & Clear Natural Dishwasher Detergent,74,17,household,dish detergents
14857349,3421083,11352,7,Organic Mini Sandwich Crackers Peanut Butter,78,19,snacks,crackers
14857350,3421083,4600,8,All Natural French Toast Sticks,52,1,frozen,frozen breakfast
14857351,3421083,24852,9,Banana,24,4,produce,fresh fruits


## Feature Overview

The following table lists all features engineered in this notebook, including their level, output columns, data types, and descriptions.

| **Feature Name** | **Level** | **Output Columns** | **Data Type** | **Description** |
|------------------|-----------|--------------------|---------------|-----------------|
| `alcohol_purchases` | User | `[user_id, alcohol_purchases]` | Integer | Counts the total number of alcohol products purchased by each user across all orders. |
| `total_products_bought` | User | `[user_id, total_products_bought]` | Integer | Counts the total number of products purchased by each user across all orders. |
| `unique_products_bought` | User | `[user_id, unique_products_bought]` | Integer | Counts the number of unique products purchased by each user. |
| `unique_to_total_product_ratio` | User | `[user_id, unique_to_total_product_ratio]` | Float | Calculates the ratio of unique products to total products purchased by each user (unique_products_bought / total_products_bought). |
| `most_frequent_hour` | User | `[user_id, most_frequent_hour]` | Integer (0–23) | Identifies the hour of the day when the user places the most orders, defaulting to 12 (noon) if missing. |
| `most_frequent_dow` | User | `[user_id, most_frequent_dow]` | Integer (0–6) | Identifies the day of the week (0=Monday, 6=Sunday) when the user places the most orders, defaulting to 0 (Monday). |
| `avg_time_between_orders_hours` | User | `[user_id, avg_time_between_orders_hours]` | Float | Calculates the average time (in hours) between consecutive orders for each user, using the dataset median for users with one order. |
| `purchase_hour_sin`, `purchase_hour_cos` | User | `[user_id, purchase_hour_sin, purchase_hour_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the most frequent purchase hour to capture its cyclical nature. |
| `purchase_season_sin`, `purchase_season_cos` | User | `[user_id, purchase_season_sin, purchase_season_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the most frequent purchase month to capture seasonal cyclicality, defaulting to January. |
| `contains_alcohol` | Order | `[order_id, contains_alcohol]` | Integer (0 or 1) | Flags whether an order contains any alcohol products (1 if yes, 0 if no). |
| `item_count` | Order | `[order_id, item_count]` | Integer | Counts the total number of items (products) in each order. |
| `unique_departments_count` | Order | `[order_id, unique_departments_count]` | Integer | Counts the number of unique departments in each order. |
| `unique_aisles_count` | Order | `[order_id, unique_aisles_count]` | Integer | Counts the number of unique aisles in each order. |
| `unique_departments_ratio` | Order | `[order_id, unique_departments_ratio]` | Float | Calculates the ratio of unique departments to total items in each order (unique_departments_count / item_count). |
| `unique_aisles_ratio` | Order | `[order_id, unique_aisles_ratio]` | Float | Calculates the ratio of unique aisles to total items in each order (unique_aisles_count / item_count). |
| `avg_tip_rate_department` | Order | `[order_id, avg_tip_rate_department]` | Float (0 to 1) | Computes the average tip rate for the departments in an order based on prior orders, defaulting to 0.5 for no history. |
| `avg_tip_rate_aisle` | Order | `[order_id, avg_tip_rate_aisle]` | Float (0 to 1) | Computes the average tip rate for the aisles in an order based on prior orders, defaulting to 0.5 for no history. |
| `order_hour` | Order | `[order_id, order_hour]` | Integer (0–23) | Extracts the hour of the day when the order was placed. |
| `order_dow` | Order | `[order_id, order_dow]` | Integer (0–6) | Extracts the day of the week (0=Monday, 6=Sunday) when the order was placed. |
| `is_weekend` | Order | `[order_id, is_weekend]` | Integer (0 or 1) | Flags whether the order was placed on a weekend (Saturday or Sunday). |
| `order_hour_sin`, `order_hour_cos` | Order | `[order_id, order_hour_sin, order_hour_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the order’s hour to capture its cyclical nature. |
| `order_season_sin`, `order_season_cos` | Order | `[order_id, order_season_sin, order_season_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the order’s month to capture seasonal cyclicality. |
| `time_since_last_order_hours` | Order | `[order_id, time_since_last_order_hours]` | Float | Calculates the time (in hours) since the user’s previous order, using the dataset median for first orders. |
| `times_bought` | User-Product | `[user_id, product_id, times_bought]` | Integer | Counts how many times each user has purchased each product. |
| `tip_probability` | User-Product | `[user_id, product_id, tip_probability]` | Float (0 to 1) | Calculates the average tip probability for each user-product pair based on prior orders, defaulting to 0.5 for no history. |


## Helper Functions

Common operations used across feature engineering functions.

In [2]:
def merge_with_orders(df, columns=['order_id', 'user_id']):
    """Merge a DataFrame with orders to include user_id and/or other columns.
    
    Args:
        df (pd.DataFrame): DataFrame containing order_id.
        columns (list): Columns from orders to include.
    
    Returns:
        pd.DataFrame: Merged DataFrame.
    """
    return df.merge(orders[columns], on='order_id', how='left')

def validate_dataframe(df, required_columns):
    """Check if DataFrame has required columns and is not empty.
    
    Args:
        df (pd.DataFrame): DataFrame to validate.
        required_columns (list): List of required column names.
    
    Raises:
        ValueError: If validation fails.
    """
    if df.empty:
        raise ValueError("DataFrame is empty.")
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

def cyclic_transform(value, max_value):
    """Apply sine-cosine transformation to a cyclical value.
    
    Args:
        value (pd.Series): Values to transform (e.g., hour or month).
        max_value (float): Maximum value of the cycle (e.g., 24 for hours, 12 for months).
    
    Returns:
        tuple: (sin_values, cos_values) as pd.Series.
    """
    radians = 2 * np.pi * value / max_value
    return np.sin(radians), np.cos(radians)

def compute_median_time_diff():
    """Compute median time difference between consecutive orders across all users.
    
    Returns:
        float: Median time difference in hours.
    """
    sorted_orders = orders[['user_id', 'order_date']].sort_values(['user_id', 'order_date'])
    time_diffs = sorted_orders.groupby('user_id')['order_date'].diff().dt.total_seconds() / 3600
    return time_diffs.median() if not time_diffs.empty else 24.0  # Default to 24 hours if empty

def downcast_dtypes(df):
    """Downcast numeric columns to reduce memory usage.
    
    Args:
        df (pd.DataFrame): DataFrame to downcast.
    
    Returns:
        pd.DataFrame: Downcasted DataFrame.
    """
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df


## Feature Engineering: User-Level Features

Each feature returns a DataFrame with columns: user_id, feature_name.

In [3]:
def add_feature_alcohol_count():
    """Count how many alcohol products each user has purchased.
    
    Returns:
        pd.DataFrame: Columns [user_id, alcohol_purchases].
    """
    alcohol_df = order_products_denormalized[order_products_denormalized['department'] == 'alcohol']
    if alcohol_df.empty:
        return pd.DataFrame({'user_id': orders['user_id'].unique(), 'alcohol_purchases': 0})
    
    alcohol_with_users = merge_with_orders(alcohol_df)
    alcohol_counts = alcohol_with_users.groupby('user_id').size().reset_index(name='alcohol_purchases')
    
    all_users = pd.DataFrame({'user_id': orders['user_id'].unique()})
    result = all_users.merge(alcohol_counts, on='user_id', how='left').fillna({'alcohol_purchases': 0})
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'alcohol_purchases'])
    return result

def total_products_per_user():
    """Count total products purchased by each user.
    
    Returns:
        pd.DataFrame: Columns [user_id, total_products_bought].
    """
    merged = merge_with_orders(order_products_denormalized)
    total_products = merged.groupby('user_id')['product_id'].count().reset_index(name='total_products_bought')
    total_products = downcast_dtypes(total_products)
    
    validate_dataframe(total_products, ['user_id', 'total_products_bought'])
    return total_products

def total_unique_products_per_user():
    """Count unique products purchased by each user.
    
    Returns:
        pd.DataFrame: Columns [user_id, unique_products_bought].
    """
    merged = merge_with_orders(order_products_denormalized)
    unique_products = merged.groupby('user_id')['product_id'].nunique().reset_index(name='unique_products_bought')
    unique_products = downcast_dtypes(unique_products)
    
    validate_dataframe(unique_products, ['user_id', 'unique_products_bought'])
    return unique_products

def unique_to_total_product_ratio_per_user():
    """Calculate ratio of unique to total products purchased by each user.
    
    Returns:
        pd.DataFrame: Columns [user_id, unique_to_total_product_ratio].
    """
    total = total_products_per_user()
    unique = total_unique_products_per_user()
    merged = total.merge(unique, on='user_id')
    merged['unique_to_total_product_ratio'] = merged['unique_products_bought'] / merged['total_products_bought']
    merged = downcast_dtypes(merged)
    
    validate_dataframe(merged, ['user_id', 'unique_to_total_product_ratio'])
    return merged[['user_id', 'unique_to_total_product_ratio']]

def most_frequent_purchase_hour():
    """Identify the most frequent hour of day for each user's orders.
    
    Returns:
        pd.DataFrame: Columns [user_id, most_frequent_hour].
    """
    orders_with_hour = orders[['user_id', 'order_date']].copy()
    orders_with_hour['hour'] = orders_with_hour['order_date'].dt.hour
    
    hour_counts = orders_with_hour.groupby(['user_id', 'hour']).size().reset_index(name='count')
    idx = hour_counts.groupby('user_id')['count'].idxmax()
    result = hour_counts.loc[idx, ['user_id', 'hour']].rename(columns={'hour': 'most_frequent_hour'})
    
    all_users = pd.DataFrame({'user_id': orders['user_id'].unique()})
    result = all_users.merge(result, on='user_id', how='left').fillna({'most_frequent_hour': 12})  # Default to noon
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'most_frequent_hour'])
    return result

def most_frequent_purchase_dow():
    """Identify the most frequent day of week for each user's orders (0=Monday, 6=Sunday).
    
    Returns:
        pd.DataFrame: Columns [user_id, most_frequent_dow].
    """
    orders_with_dow = orders[['user_id', 'order_date']].copy()
    orders_with_dow['dow'] = orders_with_dow['order_date'].dt.dayofweek
    
    dow_counts = orders_with_dow.groupby(['user_id', 'dow']).size().reset_index(name='count')
    idx = dow_counts.groupby('user_id')['count'].idxmax()
    result = dow_counts.loc[idx, ['user_id', 'dow']].rename(columns={'dow': 'most_frequent_dow'})
    
    all_users = pd.DataFrame({'user_id': orders['user_id'].unique()})
    result = all_users.merge(result, on='user_id', how='left').fillna({'most_frequent_dow': 0})  # Default to Monday
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'most_frequent_dow'])
    return result

def avg_time_between_orders():
    """Calculate average time between consecutive orders for each user in hours.
    
    Returns:
        pd.DataFrame: Columns [user_id, avg_time_between_orders_hours].
    """
    sorted_orders = orders[['user_id', 'order_date']].sort_values(['user_id', 'order_date'])
    time_diffs = sorted_orders.groupby('user_id')['order_date'].diff().dt.total_seconds() / 3600
    
    avg_diffs = time_diffs.groupby(sorted_orders['user_id']).mean().reset_index(name='avg_time_between_orders_hours')
    all_users = pd.DataFrame({'user_id': orders['user_id'].unique()})
    result = all_users.merge(avg_diffs, on='user_id', how='left')
    result['avg_time_between_orders_hours'] = result['avg_time_between_orders_hours'].fillna(compute_median_time_diff())
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'avg_time_between_orders_hours'])
    return result

def purchase_hour_cyclic():
    """Apply sine-cosine transformation to the most frequent purchase hour.
    
    Returns:
        pd.DataFrame: Columns [user_id, purchase_hour_sin, purchase_hour_cos].
    """
    hours = most_frequent_purchase_hour()
    sin_vals, cos_vals = cyclic_transform(hours['most_frequent_hour'], 24)
    
    result = hours[['user_id']].copy()
    result['purchase_hour_sin'] = sin_vals
    result['purchase_hour_cos'] = cos_vals
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'purchase_hour_sin', 'purchase_hour_cos'])
    return result

def purchase_season_cyclic():
    """Apply sine-cosine transformation to the most frequent purchase month.
    
    Returns:
        pd.DataFrame: Columns [user_id, purchase_season_sin, purchase_season_cos].
    """
    orders_with_month = orders[['user_id', 'order_date']].copy()
    orders_with_month['month'] = orders_with_month['order_date'].dt.month
    
    month_counts = orders_with_month.groupby(['user_id', 'month']).size().reset_index(name='count')
    idx = month_counts.groupby('user_id')['count'].idxmax()
    most_frequent_month = month_counts.loc[idx, ['user_id', 'month']].rename(columns={'month': 'most_frequent_month'})
    
    all_users = pd.DataFrame({'user_id': orders['user_id'].unique()})
    most_frequent_month = all_users.merge(most_frequent_month, on='user_id', how='left').fillna({'most_frequent_month': 1})  # Default to January
    
    sin_vals, cos_vals = cyclic_transform(most_frequent_month['most_frequent_month'], 12)
    result = most_frequent_month[['user_id']].copy()
    result['purchase_season_sin'] = sin_vals
    result['purchase_season_cos'] = cos_vals
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['user_id', 'purchase_season_sin', 'purchase_season_cos'])
    return result


## Feature Engineering: Order-Level Features

Each feature returns a DataFrame with columns: order_id, feature_name.

In [4]:
def add_feature_order_contains_alcohol():
    """Flag orders containing alcohol products.
    
    Returns:
        pd.DataFrame: Columns [order_id, contains_alcohol].
    """
    alcohol_orders = order_products_denormalized[order_products_denormalized['department'] == 'alcohol'][['order_id']].drop_duplicates()
    alcohol_orders['contains_alcohol'] = 1
    
    all_orders = pd.DataFrame({'order_id': orders['order_id'].unique()})
    result = all_orders.merge(alcohol_orders, on='order_id', how='left').fillna({'contains_alcohol': 0})
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'contains_alcohol'])
    return result

def add_feature_order_item_count():
    """Count total items in each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, item_count].
    """
    item_counts = order_products_denormalized.groupby('order_id')['product_id'].count().reset_index(name='item_count')
    item_counts = downcast_dtypes(item_counts)
    
    validate_dataframe(item_counts, ['order_id', 'item_count'])
    return item_counts

def add_feature_order_unique_departments_count():
    """Count unique departments in each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, unique_departments_count].
    """
    dept_counts = order_products_denormalized.groupby('order_id')['department'].nunique().reset_index(name='unique_departments_count')
    dept_counts = downcast_dtypes(dept_counts)
    
    validate_dataframe(dept_counts, ['order_id', 'unique_departments_count'])
    return dept_counts

def add_feature_order_unique_aisles_count():
    """Count unique aisles in each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, unique_aisles_count].
    """
    aisle_counts = order_products_denormalized.groupby('order_id')['aisle'].nunique().reset_index(name='unique_aisles_count')
    aisle_counts = downcast_dtypes(aisle_counts)
    
    validate_dataframe(aisle_counts, ['order_id', 'unique_aisles_count'])
    return aisle_counts

def add_feature_order_unique_departments_ratio():
    """Calculate ratio of unique departments to total items in each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, unique_departments_ratio].
    """
    total_items = add_feature_order_item_count()
    unique_depts = add_feature_order_unique_departments_count()
    merged = total_items.merge(unique_depts, on='order_id')
    merged['unique_departments_ratio'] = merged['unique_departments_count'] / merged['item_count']
    merged = downcast_dtypes(merged)
    
    validate_dataframe(merged, ['order_id', 'unique_departments_ratio'])
    return merged[['order_id', 'unique_departments_ratio']]

def add_feature_order_unique_aisles_ratio():
    """Calculate ratio of unique aisles to total items in each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, unique_aisles_ratio].
    """
    total_items = add_feature_order_item_count()
    unique_aisles = add_feature_order_unique_aisles_count()
    merged = total_items.merge(unique_aisles, on='order_id')
    merged['unique_aisles_ratio'] = merged['unique_aisles_count'] / merged['item_count']
    merged = downcast_dtypes(merged)
    
    validate_dataframe(merged, ['order_id', 'unique_aisles_ratio'])
    return merged[['order_id', 'unique_aisles_ratio']]

def add_feature_avg_tip_rate_department(default_tip_rate=0.5):
    """Calculate the average tip rate for each department up to the order date, aggregated per order.
    
    Args:
        default_tip_rate (float): Default tip rate for departments with no prior orders.
    
    Returns:
        pd.DataFrame: Columns [order_id, avg_tip_rate_department].
    """
    merged = order_products_denormalized.merge(
        orders[['order_id', 'order_date']], on='order_id'
    ).merge(
        tips_public[['order_id', 'tip']], on='order_id', how='left'
    )
    
    merged['tip'] = merged['tip'].fillna(0).astype('float32')
    merged = merged.sort_values(by=['department', 'order_date'])
    
    merged['order_count'] = merged.groupby('department').cumcount().astype('int32')
    merged['tip_cumsum_before'] = merged.groupby('department')['tip'].cumsum() - merged['tip']
    merged['avg_tip_rate_before'] = merged['tip_cumsum_before'] / merged['order_count']
    merged.loc[merged['order_count'] == 0, 'avg_tip_rate_before'] = pd.NA
    
    order_tip_rate = merged.groupby('order_id')['avg_tip_rate_before'].mean().reset_index(name='avg_tip_rate_department')
    order_tip_rate['avg_tip_rate_department'] = order_tip_rate['avg_tip_rate_department'].fillna(default_tip_rate).astype('float32')
    order_tip_rate = downcast_dtypes(order_tip_rate)
    
    validate_dataframe(order_tip_rate, ['order_id', 'avg_tip_rate_department'])
    return order_tip_rate

def add_feature_avg_tip_rate_aisle(default_tip_rate=0.5):
    """Calculate the average tip rate for each aisle up to the order date, aggregated per order.
    
    Args:
        default_tip_rate (float): Default tip rate for aisles with no prior orders.
    
    Returns:
        pd.DataFrame: Columns [order_id, avg_tip_rate_aisle].
    """
    merged = order_products_denormalized.merge(
        orders[['order_id', 'order_date']], on='order_id'
    ).merge(
        tips_public[['order_id', 'tip']], on='order_id', how='left'
    )
    
    merged['tip'] = merged['tip'].fillna(0).astype('float32')
    merged = merged.sort_values(by=['aisle', 'order_date'])
    
    merged['order_count'] = merged.groupby('aisle').cumcount().astype('int32')
    merged['tip_cumsum_before'] = merged.groupby('aisle')['tip'].cumsum() - merged['tip']
    merged['avg_tip_rate_before'] = merged['tip_cumsum_before'] / merged['order_count']
    merged.loc[merged['order_count'] == 0, 'avg_tip_rate_before'] = pd.NA
    
    order_tip_rate = merged.groupby('order_id')['avg_tip_rate_before'].mean().reset_index(name='avg_tip_rate_aisle')
    order_tip_rate['avg_tip_rate_aisle'] = order_tip_rate['avg_tip_rate_aisle'].fillna(default_tip_rate).astype('float32')
    order_tip_rate = downcast_dtypes(order_tip_rate)
    
    validate_dataframe(order_tip_rate, ['order_id', 'avg_tip_rate_aisle'])
    return order_tip_rate

def order_hour():
    """Extract the hour of the day for each order.
    
    Returns:
        pd.DataFrame: Columns [order_id, order_hour].
    """
    result = orders[['order_id', 'order_date']].copy()
    result['order_hour'] = result['order_date'].dt.hour
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'order_hour'])
    return result[['order_id', 'order_hour']]

def order_dow():
    """Extract the day of week for each order (0=Monday, 6=Sunday).
    
    Returns:
        pd.DataFrame: Columns [order_id, 'order_dow'].
    """
    result = orders[['order_id', 'order_date']].copy()
    result['order_dow'] = result['order_date'].dt.dayofweek
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'order_dow'])
    return result[['order_id', 'order_dow']]

def is_weekend_order():
    """Flag orders placed on weekends (Saturday or Sunday).
    
    Returns:
        pd.DataFrame: Columns [order_id, is_weekend].
    """
    dow = order_dow()
    result = dow[['order_id']].copy()
    result['is_weekend'] = dow['order_dow'].isin([5, 6]).astype('int8')
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'is_weekend'])
    return result

def order_hour_cyclic():
    """Apply sine-cosine transformation to the order hour.
    
    Returns:
        pd.DataFrame: Columns [order_id, order_hour_sin, order_hour_cos].
    """
    hours = order_hour()
    sin_vals, cos_vals = cyclic_transform(hours['order_hour'], 24)
    
    result = hours[['order_id']].copy()
    result['order_hour_sin'] = sin_vals
    result['order_hour_cos'] = cos_vals
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'order_hour_sin', 'order_hour_cos'])
    return result

def order_season_cyclic():
    """Apply sine-cosine transformation to the order month.
    
    Returns:
        pd.DataFrame: Columns [order_id, order_season_sin, order_season_cos].
    """
    result = orders[['order_id', 'order_date']].copy()
    result['month'] = result['order_date'].dt.month
    
    sin_vals, cos_vals = cyclic_transform(result['month'], 12)
    result['order_season_sin'] = sin_vals
    result['order_season_cos'] = cos_vals
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'order_season_sin', 'order_season_cos'])
    return result[['order_id', 'order_season_sin', 'order_season_cos']]

def time_since_last_order():
    """Calculate time since the user's last order in hours.
    
    Returns:
        pd.DataFrame: Columns [order_id, time_since_last_order_hours].
    """
    sorted_orders = orders[['order_id', 'user_id', 'order_date']].sort_values(['user_id', 'order_date'])
    time_diffs = sorted_orders.groupby('user_id')['order_date'].diff().shift(-1).dt.total_seconds() / 3600
    time_diffs = time_diffs.reindex(sorted_orders.index).shift(1)  # Shift back to align with current order
    
    result = sorted_orders[['order_id']].copy()
    result['time_since_last_order_hours'] = time_diffs.fillna(compute_median_time_diff())
    result = downcast_dtypes(result)
    
    validate_dataframe(result, ['order_id', 'time_since_last_order_hours'])
    return result


## Feature Engineering: User-Product-Level Features

Each feature returns a DataFrame with columns: user_id, product_id, feature_name.

In [5]:
def count_products_per_user():
    """Count how many times each user purchased each product.
    
    Returns:
        pd.DataFrame: Columns [user_id, product_id, times_bought].
    """
    merged = merge_with_orders(order_products_denormalized)
    counts = merged.groupby(['user_id', 'product_id']).size().reset_index(name='times_bought')
    counts = downcast_dtypes(counts)
    
    validate_dataframe(counts, ['user_id', 'product_id', 'times_bought'])
    return counts

def create_user_product_tip_probability(default_tip_rate=0.500111):
    """Berechnet die durchschnittliche Trinkgeldwahrscheinlichkeit pro (user_id, product_id)-Paar
    und verknüpft sie mit order_products_denormalized, ohne die Zeilenanzahl zu ändern.
    
    Args:
        default_tip_rate (float): Standard-Trinkgeldwahrscheinlichkeit für Paare ohne Historie.
    
    Returns:
        pd.DataFrame: DataFrame mit order_id, user_id, product_id und tip_probability.
    """
    # Schritt 1: Validierung der Eingabedaten
    if 'user_id' not in orders.columns:
        raise ValueError("Spalte 'user_id' fehlt in orders DataFrame")
    if 'order_id' not in order_products_denormalized.columns:
        raise ValueError("Spalte 'order_id' fehlt in order_products_denormalized DataFrame")
    
    # Schritt 2: Merge der DataFrames, um user_id hinzuzufügen
    merged = order_products_denormalized.merge(
        orders[['order_id', 'user_id', 'order_date']], 
        on='order_id', 
        how='left'
    )
    
    # Validierung nach erstem Merge
    if 'user_id' not in merged.columns:
        raise ValueError("user_id fehlt nach Merge mit orders. Überprüfe order_id Übereinstimmungen.")
    if merged['user_id'].isna().any():
        print(f"Warnung: {merged['user_id'].isna().sum()} Zeilen mit fehlendem user_id nach Merge mit orders")
        merged = merged.dropna(subset=['user_id'])  # Entferne Zeilen mit fehlendem user_id
    
    merged = merged.merge(
        tips_public[['order_id', 'tip']], 
        on='order_id', 
        how='left'
    )
    
    # Schritt 3: Optimierung und Berechnung
    merged['tip'] = merged['tip'].fillna(0).astype('float32')
    merged = merged.sort_values(by=['user_id', 'product_id', 'order_date'])
    
    merged['times_bought_before'] = merged.groupby(['user_id', 'product_id']).cumcount().astype('int32')
    merged['tip_cumsum_before'] = merged.groupby(['user_id', 'product_id'])['tip'].cumsum() - merged['tip']
    merged['avg_tip_rate_before'] = merged['tip_cumsum_before'] / merged['times_bought_before']
    merged.loc[merged['times_bought_before'] == 0, 'avg_tip_rate_before'] = pd.NA
    
    # Schritt 4: Aggregation zu user-product Ebene
    user_product_tip = merged.groupby(['user_id', 'product_id'])['avg_tip_rate_before'].mean().reset_index(
        name='tip_probability'
    )
    user_product_tip['tip_probability'] = user_product_tip['tip_probability'].fillna(default_tip_rate).astype('float32')
    user_product_tip = downcast_dtypes(user_product_tip, exclude_columns=['user_id', 'product_id'])
    
    # Schritt 5: Verknüpfung mit order_products_denormalized, inklusive user_id
    result = order_products_denormalized.merge(
        orders[['order_id', 'user_id']], 
        on='order_id', 
        how='left'
    ).merge(
        user_product_tip[['user_id', 'product_id', 'tip_probability']],
        on=['user_id', 'product_id'],
        how='left'
    )
    
    # Fülle fehlende Werte
    result['tip_probability'] = result['tip_probability'].fillna(default_tip_rate).astype('float32')
    result = downcast_dtypes(result, exclude_columns=['order_id', 'user_id', 'product_id'])
    
    # Schritt 6: Validierung
    if len(result) != len(order_products_denormalized):
        raise ValueError(f"Zeilenanzahl stimmt nicht: {len(result)} statt {len(order_products_denormalized)}")
    expected_columns = list(order_products_denormalized.columns) + ['user_id', 'tip_probability']
    validate_dataframe(result, expected_columns)
    
    return result


## Feature Consolidation

Combine all features into a single DataFrame for a comprehensive view.

In [7]:
# In execution_count 2 (Helper Functions), aktualisiere downcast_dtypes
def downcast_dtypes(df, exclude_columns=['order_id']):
    """Downcast numeric columns to reduce memory usage, excluding specified columns.
    
    Args:
        df (pd.DataFrame): DataFrame to downcast.
        exclude_columns (list): Columns to exclude from downcasting.
    
    Returns:
        pd.DataFrame: Downcasted DataFrame.
    """
    for col in df.select_dtypes(include=['int64']).columns:
        if col not in exclude_columns:
            df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

from IPython.display import display

def combine_all_features(default_tip_rate=0.5):
    """Combine all engineered features into a single DataFrame.
    
    Args:
        default_tip_rate (float): Standard-Trinkgeldwahrscheinlichkeit für fehlende Werte.
    
    Returns:
        pd.DataFrame: DataFrame with all features, keyed by order_id, user_id, and including the tip column.
    """
    # Base DataFrame with order_id and user_id
    base_df = orders[['order_id', 'user_id']].copy()
    base_df['order_id'] = base_df['order_id'].astype('int64')
    base_df = downcast_dtypes(base_df, exclude_columns=['order_id'])
    
    # Merge order-level features
    order_features = [
        add_feature_order_contains_alcohol(),
        add_feature_order_item_count(),
        add_feature_order_unique_departments_count(),
        add_feature_order_unique_aisles_count(),
        add_feature_order_unique_departments_ratio(),
        add_feature_order_unique_aisles_ratio(),
        add_feature_avg_tip_rate_department(),
        add_feature_avg_tip_rate_aisle(),
        order_hour(),
        order_dow(),
        is_weekend_order(),
        order_hour_cyclic(),
        order_season_cyclic(),
        time_since_last_order()
    ]
    
    result = base_df
    for feature_df in order_features:
        if 'order_id' in feature_df.columns:
            feature_df['order_id'] = feature_df['order_id'].astype('int64')
            print(f"Merging feature with columns {feature_df.columns.tolist()}, order_id dtype: {feature_df['order_id'].dtype}")
        else:
            raise ValueError(f"feature_df missing order_id: {feature_df.columns.tolist()}")
        
        merge_cols = ['order_id']
        feature_cols = [col for col in feature_df.columns if col not in merge_cols]
        result_cols = [col for col in result.columns if col in feature_cols]
        if result_cols:
            result = result.drop(columns=result_cols)
        result = result.merge(feature_df, on='order_id', how='left')
        result = downcast_dtypes(result, exclude_columns=['order_id'])
    
    # Merge user-level features
    user_features = [
        add_feature_alcohol_count(),
        total_products_per_user(),
        total_unique_products_per_user(),
        unique_to_total_product_ratio_per_user(),
        most_frequent_purchase_hour(),
        most_frequent_purchase_dow(),
        avg_time_between_orders(),
        purchase_hour_cyclic(),
        purchase_season_cyclic()
    ]
    
    for feature_df in user_features:
        merge_cols = ['user_id']
        feature_cols = [col for col in feature_df.columns if col not in merge_cols]
        result_cols = [col for col in result.columns if col in feature_cols]
        if result_cols:
            result = result.drop(columns=result_cols)
        result = result.merge(feature_df, on='user_id', how='left')
        result = downcast_dtypes(result, exclude_columns=['order_id'])
    
    # Aggregate user-product-level features to user level
    user_product = count_products_per_user()
    user_product_sum = user_product.groupby('user_id')['times_bought'].sum().reset_index(name='total_times_bought')
    user_product_agg = orders[['order_id', 'user_id']].merge(user_product_sum, on='user_id', how='left')
    user_product_agg['order_id'] = user_product_agg['order_id'].astype('int64')
    user_product_agg['total_times_bought'] = user_product_agg['total_times_bought'].fillna(0)
    user_product_agg = downcast_dtypes(user_product_agg, exclude_columns=['order_id'])
    result = result.merge(user_product_agg[['order_id', 'user_id', 'total_times_bought']], on=['order_id', 'user_id'], how='left')
    
    # Add user-product tip probability
    user_product_tip_df = create_user_product_tip_probability(default_tip_rate=default_tip_rate)
    user_product_tip_df['order_id'] = user_product_tip_df['order_id'].astype('int64')
    # Aggregiere tip_probability auf order_id-Ebene (Mittelwert)
    tip_prob_agg = user_product_tip_df.groupby('order_id')['tip_probability'].mean().reset_index()
    result = result.merge(
        tip_prob_agg[['order_id', 'tip_probability']],
        on='order_id',
        how='left'
    )
    result['tip_probability'] = result['tip_probability'].fillna(default_tip_rate).astype('float32')
    result = downcast_dtypes(result, exclude_columns=['order_id'])
    
    # Add target variable (tip)
    result = result.merge(tips_public[['order_id', 'tip']], on='order_id', how='left')
    result = downcast_dtypes(result, exclude_columns=['order_id'])
    
    # Validate
    expected_columns = [
        'order_id', 'user_id', 'contains_alcohol', 'item_count', 'unique_departments_count',
        'unique_aisles_count', 'unique_departments_ratio', 'unique_aisles_ratio',
        'avg_tip_rate_department', 'avg_tip_rate_aisle', 'order_hour', 'order_dow',
        'is_weekend', 'order_hour_sin', 'order_hour_cos', 'order_season_sin',
        'order_season_cos', 'time_since_last_order_hours',
        'alcohol_purchases', 'total_products_bought', 'unique_products_bought',
        'unique_to_total_product_ratio', 'most_frequent_hour', 'most_frequent_dow',
        'avg_time_between_orders_hours', 'purchase_hour_sin', 'purchase_hour_cos',
        'purchase_season_sin', 'purchase_season_cos', 'total_times_bought',
        'tip_probability', 'tip'
    ]
    validate_dataframe(result, expected_columns)
    
    return result

# Generate and display the combined feature DataFrame
all_features_df = combine_all_features()
display(all_features_df)

  merged['order_count'] = merged.groupby('department').cumcount().astype('int32')
  merged['tip_cumsum_before'] = merged.groupby('department')['tip'].cumsum() - merged['tip']
  merged['order_count'] = merged.groupby('aisle').cumcount().astype('int32')
  merged['tip_cumsum_before'] = merged.groupby('aisle')['tip'].cumsum() - merged['tip']


Merging feature with columns ['order_id', 'contains_alcohol'], order_id dtype: int64
Merging feature with columns ['order_id', 'item_count'], order_id dtype: int64
Merging feature with columns ['order_id', 'unique_departments_count'], order_id dtype: int64
Merging feature with columns ['order_id', 'unique_aisles_count'], order_id dtype: int64
Merging feature with columns ['order_id', 'unique_departments_ratio'], order_id dtype: int64
Merging feature with columns ['order_id', 'unique_aisles_ratio'], order_id dtype: int64
Merging feature with columns ['order_id', 'avg_tip_rate_department'], order_id dtype: int64
Merging feature with columns ['order_id', 'avg_tip_rate_aisle'], order_id dtype: int64
Merging feature with columns ['order_id', 'order_hour'], order_id dtype: int64
Merging feature with columns ['order_id', 'order_dow'], order_id dtype: int64
Merging feature with columns ['order_id', 'is_weekend'], order_id dtype: int64
Merging feature with columns ['order_id', 'order_hour_sin',

Unnamed: 0,order_id,user_id,contains_alcohol,item_count,unique_departments_count,unique_aisles_count,unique_departments_ratio,unique_aisles_ratio,avg_tip_rate_department,avg_tip_rate_aisle,...,most_frequent_hour,most_frequent_dow,avg_time_between_orders_hours,purchase_hour_sin,purchase_hour_cos,purchase_season_sin,purchase_season_cos,total_times_bought,tip_probability,tip
0,1374495,3,0.0,10.0,3.0,4.0,0.300000,0.400000,0.516237,0.523879,...,16,5,288.102417,-0.866025,-0.500000,1.224647e-16,-1.000000,88,0.849794,True
1,444309,3,0.0,9.0,5.0,9.0,0.555556,1.000000,0.502990,0.499790,...,16,5,288.102417,-0.866025,-0.500000,1.224647e-16,-1.000000,88,0.714135,True
2,3002854,3,0.0,6.0,4.0,6.0,0.666667,1.000000,0.510314,0.507816,...,16,5,288.102417,-0.866025,-0.500000,1.224647e-16,-1.000000,88,0.791323,True
3,2037211,3,0.0,5.0,4.0,5.0,0.800000,1.000000,0.442459,0.457083,...,16,5,288.102417,-0.866025,-0.500000,1.224647e-16,-1.000000,88,0.900000,True
4,2710558,3,0.0,11.0,4.0,7.0,0.363636,0.636364,0.518061,0.508472,...,16,5,288.102417,-0.866025,-0.500000,1.224647e-16,-1.000000,88,0.886176,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463627,3059777,206208,0.0,7.0,4.0,6.0,0.571429,0.857143,0.542463,0.544971,...,15,0,176.729568,-0.707107,-0.707107,5.000000e-01,0.866025,677,0.553786,False
1463628,2239861,206208,0.0,23.0,6.0,14.0,0.260870,0.608696,0.528861,0.534091,...,15,0,176.729568,-0.707107,-0.707107,5.000000e-01,0.866025,677,0.325015,True
1463629,1285346,206208,0.0,8.0,4.0,6.0,0.500000,0.750000,0.520542,0.527629,...,15,0,176.729568,-0.707107,-0.707107,5.000000e-01,0.866025,677,0.373723,True
1463630,1882108,206208,0.0,17.0,6.0,12.0,0.352941,0.705882,0.530163,0.523381,...,15,0,176.729568,-0.707107,-0.707107,5.000000e-01,0.866025,677,0.355284,True


## Feature Validation

Basic tests to ensure feature correctness.

In [10]:
def run_feature_tests():
    """Run basic validation tests on features."""
    # Test alcohol count
    alcohol_count = add_feature_alcohol_count()
    assert alcohol_count['alcohol_purchases'].ge(0).all(), "Alcohol purchases cannot be negative"
    
    # Test unique to total product ratio
    ratio = unique_to_total_product_ratio_per_user()
    assert ratio['unique_to_total_product_ratio'].between(0, 1).all(), "Ratio must be between 0 and 1"
    
    # Test order item count
    item_count = add_feature_order_item_count()
    assert item_count['item_count'].ge(1).all(), "Item count must be at least 1"
    
    # Test department tip rate
    dept_tip_rate = add_feature_avg_tip_rate_department()
    assert dept_tip_rate['avg_tip_rate_department'].between(0, 1).all(), "Department tip rate must be between 0 and 1"
    
    # Test aisle tip rate
    aisle_tip_rate = add_feature_avg_tip_rate_aisle()
    assert aisle_tip_rate['avg_tip_rate_aisle'].between(0, 1).all(), "Aisle tip rate must be between 0 and 1"
    
    # Test most frequent hour
    freq_hour = most_frequent_purchase_hour()
    assert freq_hour['most_frequent_hour'].between(0, 23).all(), "Hour must be between 0 and 23"
    
    # Test cyclic transformations
    hour_cyclic = purchase_hour_cyclic()
    assert ((hour_cyclic['purchase_hour_sin'] >= -1) & (hour_cyclic['purchase_hour_sin'] <= 1) &
            (hour_cyclic['purchase_hour_cos'] >= -1) & (hour_cyclic['purchase_hour_cos'] <= 1)).all(), \
            "Cyclic values must be between -1 and 1"
    
    # Test time since last order
    time_since = time_since_last_order()
    assert time_since['time_since_last_order_hours'].ge(0).all(), "Time since last order cannot be negative"
    
    # Test user-product tip probability
    user_product_tip_df = create_user_product_tip_probability()
    assert user_product_tip_df['tip_probability'].between(0, 1).all(), "Tip probability must be between 0 and 1"
    assert len(user_product_tip_df) == len(order_products_denormalized), "Row count must match order_products_denormalized"
    
    # Test combined features
    combined_df = combine_all_features()
    assert combined_df['order_id'].nunique() == orders['order_id'].nunique(), "Combined DataFrame must contain all orders"
    assert combined_df['tip'].notna().sum() == tips_public['tip'].notna().sum(), "Combined DataFrame must preserve all tip values"
    
    print("All tests passed!")

#run_feature_tests()