# Main Program

In [1]:
import numpy as np
import pandas as pd
import features

# Read the three input tables. For the two CSV sources, order_id is parsed
# directly as int64.
orders = pd.read_parquet("orders.parquet")
order_products_denormalized = pd.read_csv(
    "order_products_denormalized.csv",
    dtype={'order_id': 'int64'},
)
tips_public = pd.read_csv("tips_public.csv", dtype={'order_id': 'int64'})
# "Unnamed: 0" is dropped right after loading; it is not used afterwards.
tips_public = tips_public.drop(columns=["Unnamed: 0"])

# Convert department/aisle to the categorical dtype (intent: lower memory use).
for text_column in ('department', 'aisle'):
    order_products_denormalized[text_column] = (
        order_products_denormalized[text_column].astype('category')
    )

# Cast the id columns to int64 so all tables use a consistent dtype.
for frame, id_column in (
    (orders, 'order_id'),
    (orders, 'user_id'),
    (order_products_denormalized, 'product_id'),
    (tips_public, 'order_id'),
):
    frame[id_column] = frame[id_column].astype('int64')

# Feature Engineering

## Feature Overview

The following table lists all features used in this notebook (they are computed by the `features` module), including their level, output columns, data types, and descriptions. All features are brought to the `order_id` level in the final DataFrame; user-level features are merged in via `user_id`.

| **Feature Name** | **Level** | **Output Columns** | **Data Type** | **Description** |
|------------------|-----------|--------------------|---------------|-----------------|
| `user_alcohol_purchase_count` | User | `[user_id, user_alcohol_purchase_count]` | Integer | Counts the total number of alcohol products purchased by each user across all orders, merged via user_id. |
| `user_total_purchase_count` | User | `[user_id, user_total_purchase_count]` | Integer | Counts the total number of products purchased by each user across all orders, merged via user_id. |
| `user_unique_product_count` | User | `[user_id, user_unique_product_count]` | Integer | Counts the number of unique products purchased by each user, merged via user_id. |
| `user_unique_to_total_ratio` | User | `[user_id, user_unique_to_total_ratio]` | Float | Calculates the ratio of unique products to total products purchased by each user, merged via user_id. |
| `user_frequent_purchase_hour` | User | `[user_id, user_frequent_purchase_hour]` | Integer (0–23) | Identifies the hour of the day when the user places the most orders, defaulting to 12 (noon) if missing, merged via user_id. |
| `user_frequent_purchase_dow` | User | `[user_id, user_frequent_purchase_dow]` | Integer (0–6) | Identifies the day of the week (0=Monday, 6=Sunday) when the user places the most orders, defaulting to 0 (Monday), merged via user_id. |
| `user_avg_order_interval_hours` | User | `[user_id, user_avg_order_interval_hours]` | Float | Calculates the average time (in hours) between consecutive orders for each user, using the dataset median for users with one order, merged via user_id. |
| `user_frequent_hour_sin`, `user_frequent_hour_cos` | User | `[user_id, user_frequent_hour_sin, user_frequent_hour_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the most frequent purchase hour to capture its cyclical nature, merged via user_id. |
| `user_frequent_season_sin`, `user_frequent_season_cos` | User | `[user_id, user_frequent_season_sin, user_frequent_season_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the most frequent purchase month to capture seasonal cyclicality, defaulting to January, merged via user_id. |
| `order_has_alcohol` | Order | `[order_id, order_has_alcohol]` | Integer (0 or 1) | Flags whether an order contains any alcohol products (1 if yes, 0 if no). |
| `order_product_count` | Order | `[order_id, order_product_count]` | Integer | Counts the total number of items (products) in each order. |
| `order_unique_dept_count` | Order | `[order_id, order_unique_dept_count]` | Integer | Counts the number of unique departments in each order. |
| `order_unique_aisle_count` | Order | `[order_id, order_unique_aisle_count]` | Integer | Counts the number of unique aisles in each order. |
| `order_unique_dept_ratio` | Order | `[order_id, order_unique_dept_ratio]` | Float | Calculates the ratio of unique departments to total items in each order. |
| `order_unique_aisle_ratio` | Order | `[order_id, order_unique_aisle_ratio]` | Float | Calculates the ratio of unique aisles to total items in each order. |
| `order_dept_tip_rate` | Order | `[order_id, order_dept_tip_rate]` | Float (0 to 1) | Computes the average tip rate for the departments in an order based on prior orders, defaulting to 0.500111 for no history. |
| `order_aisle_tip_rate` | Order | `[order_id, order_aisle_tip_rate]` | Float (0 to 1) | Computes the average tip rate for the aisles in an order based on prior orders, defaulting to 0.500111 for no history. |
| `order_placed_hour` | Order | `[order_id, order_placed_hour]` | Integer (0–23) | Extracts the hour of the day when the order was placed. |
| `order_placed_dow` | Order | `[order_id, order_placed_dow]` | Integer (0–6) | Extracts the day of the week (0=Monday, 6=Sunday) when the order was placed. |
| `order_is_weekend` | Order | `[order_id, order_is_weekend]` | Integer (0 or 1) | Flags whether the order was placed on a weekend (Saturday or Sunday). |
| `order_placed_hour_sin`, `order_placed_hour_cos` | Order | `[order_id, order_placed_hour_sin, order_placed_hour_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the order’s hour to capture its cyclical nature. |
| `order_placed_season_sin`, `order_placed_season_cos` | Order | `[order_id, order_placed_season_sin, order_placed_season_cos]` | Float (-1 to 1) | Applies sine-cosine transformation to the order’s month to capture seasonal cyclicality. |
| `order_time_since_last_hours` | Order | `[order_id, order_time_since_last_hours]` | Float | Calculates the time (in hours) since the user’s previous order, using the dataset median for first orders. |
| `user_total_product_purchase_count` | User | `[user_id, user_total_product_purchase_count]` | Integer | Total count of products purchased by each user, aggregated from user-product level, merged via user_id. |
| `user_product_tip_prob` | Order | `[order_id, user_product_tip_prob]` | Float (0 to 1) | Average tip probability for user-product pairs in an order, aggregated to order_id, defaulting to 0.500111 for no history. |


In [None]:
# Generate and display the combined feature DataFrame.
# The three loaded tables are passed to the project-local `features` module;
# `display` is the notebook's rich-output helper.
# NOTE(review): the modeling cell further down reads an `order_id` column, a
# `tip` column (NaN marks rows to predict) and the listed feature columns from
# this frame -- confirm that features.combine_all_features provides them.
all_features_df = features.combine_all_features(orders, order_products_denormalized, tips_public)
display(all_features_df)

In [None]:
# Persist the combined features to 'all_features.csv'; index=False leaves the
# DataFrame index out of the file.
all_features_df.to_csv('all_features.csv', index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Split the data: rows with a known tip are used for training/evaluation,
#    rows with a missing tip are the ones to predict.
train_df = all_features_df[~all_features_df['tip'].isna()]
# Explicit copy: a 'tip' column is assigned on this frame in step 6, and
# writing to a bare boolean-mask slice of all_features_df raises pandas'
# SettingWithCopyWarning.
predict_df = all_features_df[all_features_df['tip'].isna()].copy()

print(f"Trainingsdaten: {train_df.shape}")
print(f"Vorhersagedaten: {predict_df.shape}")

# 2. Define the feature columns and the target variable
feature_columns = [
    'order_has_alcohol', 'order_product_count', 'order_unique_dept_count',
    'order_unique_aisle_count', 'order_unique_dept_ratio', 'order_unique_aisle_ratio',
    'order_dept_tip_rate', 'order_aisle_tip_rate', 'order_placed_hour',
    'order_placed_dow', 'order_is_weekend', 'order_placed_hour_sin',
    'order_placed_hour_cos', 'order_placed_season_sin', 'order_placed_season_cos',
    'order_time_since_last_hours', 'user_alcohol_purchase_count',
    'user_total_purchase_count', 'user_unique_product_count',
    'user_unique_to_total_ratio', 'user_frequent_purchase_hour',
    'user_frequent_purchase_dow', 'user_avg_order_interval_hours',
    'user_frequent_hour_sin', 'user_frequent_hour_cos',
    'user_frequent_season_sin', 'user_frequent_season_cos',
    'user_total_product_purchase_count', 'user_product_tip_prob'
]
X = train_df[feature_columns]
y = train_df['tip'].astype('int')  # assumption: tip is binary (0 or 1)

# 3. Hold out 20% of the labelled rows for evaluation (fixed seed for
#    reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# 5. Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print("Genauigkeit:", accuracy_score(y_test, y_pred))
print("\nKlassifikationsbericht:\n", classification_report(y_test, y_pred))

# 6. Predict the tip for the rows where it is missing.
# NOTE(review): the model used here was fitted on the 80% training split only,
# not on all labelled rows.
if not predict_df.empty:
    X_predict = predict_df[feature_columns]
    predict_df['tip'] = model.predict(X_predict).astype('int')
else:
    print("Keine Zeilen mit fehlendem tip zum Vorhersagen.")

# 7. Build the result DataFrame (order id plus predicted tip)
result_df = predict_df[['order_id', 'tip']].copy()

# 8. Show and save the result
print("\nErgebnis-DataFrame:")
print(result_df.head())
print(f"Form des Ergebnis-DataFrames: {result_df.shape}")

# Save the result as CSV
result_df.to_csv('predicted_tips.csv', index=False)

In [34]:
# Read back the predictions CSV written by the modeling cell, plus the
# template file (presumably the expected submission layout -- confirm).
prediction = pd.read_csv("predicted_tips.csv")
template = pd.read_csv("tip_testdaten_template_V2.csv")

In [41]:
# Keep only the template's "Unnamed: 0" and order_id columns, join the
# predictions onto them (pandas' default merge: inner join on the columns the
# two frames share), and overwrite 'predicted_tips.csv' with the result.
template_keys = template[["Unnamed: 0", "order_id"]]
submission = template_keys.merge(prediction)
submission.to_csv('predicted_tips.csv', index=False)