In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display truncation so frames render with every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)


In [4]:
# Load the cleaned retail data, then show how many rows are flagged as
# returns (IsReturn == 1) versus not (IsReturn == 0).
df = pd.read_csv('online_retail_cleaned.csv')
df["IsReturn"].value_counts()

IsReturn
0    789533
1     16087
Name: count, dtype: int64

In [21]:
# Parse the invoice timestamps once; values that fail to parse become NaT
# (errors="coerce") instead of raising.
invoice_timestamps = pd.to_datetime(df["InvoiceDate"], errors="coerce")
df["InvoiceDate"] = invoice_timestamps
# 'YYYY-MM' label used as the monthly grouping key.
df["Year_Month"] = invoice_timestamps.dt.strftime('%Y-%m')

In [22]:
# Average monthly purchases: count the non-null Invoice entries in each month,
# then average those monthly counts. Note that .count() tallies rows, not
# distinct invoice numbers.
monthly_invoice_rows = df.groupby('Year_Month')['Invoice'].count()
monthly_invoice_rows.mean()

np.float64(32224.8)

In [23]:
# Typical value of a returned line. Despite the variable name, the statistic
# computed here is the median, not the mean.
avg_return_value = df.loc[df["IsReturn"] == 1, "TotalLineValue"].median()
avg_return_value

np.float64(19.799999999999997)

In [24]:
# 75th percentile of returned-line value: a return at or above this amount
# sits in the top 25% of returns by value.
top25percent_return_value = df.loc[df["IsReturn"] == 1, "TotalLineValue"].quantile(0.75)
top25percent_return_value

np.float64(40.0)

Simulate outreach ROI from Model 9's Precision@k (top 5%, 3% and 2% of the ranked list)

In [57]:
def simulate_roi_for_high_value_segment(
    total_monthly_orders=32224,
    high_value_percent=0.25,  # You only care about the top 25% of orders
    outreach_cost=1,
    savings_per_return=40,
    model_precisions=None
):
    """Estimate the ROI of contacting the highest-risk, high-value orders.

    The monthly order volume is first narrowed to its high-value share. For
    each top-k cut-off, the top k% of that segment is targeted, the model's
    precision at that cut-off determines how many targeted orders are real
    returns, and savings are compared against the outreach spend.

    Parameters
    ----------
    total_monthly_orders : int or float
        Number of orders in a month.
    high_value_percent : float
        Fraction (0-1) of monthly orders treated as high-value.
    outreach_cost : int or float
        Cost of contacting one targeted order.
    savings_per_return : int or float
        Amount saved for each return that is correctly targeted.
    model_precisions : dict, optional
        Maps a top-k label to the model's precision (0-1) at that cut-off.
        Labels may be numbers (``5``) or strings (``"5"``, ``"5%"``,
        ``"Top 5%"``). When omitted, the precisions at the top 5%, 3% and 2%
        are used: ``{"5": 0.1853, "3": 0.2475, "2": 0.3097}``.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off, in the iteration order of ``model_precisions``,
        with order counts, savings, cost, net ROI and ROI %.
    """
    # Build the default per call rather than sharing one mutable dict between calls.
    if model_precisions is None:
        model_precisions = {"5": 0.1853, "3": 0.2475, "2": 0.3097}

    def _whole_units(quantity):
        # Truncate to a whole count, but first remove binary floating-point
        # noise: 100 * 0.29 evaluates to 28.999999999999996 and a bare int()
        # would silently lose a unit.
        return int(round(quantity, 9))

    # Step 1: Only keep the high-value share of the monthly orders
    high_value_orders = _whole_units(total_monthly_orders * high_value_percent)

    results = []
    for top_k_label, precision in model_precisions.items():
        # Get the % to target within the high-value segment; str() lets
        # numeric labels such as 5 or 2.5 work alongside "5" / "Top 5%".
        top_k_percent = float(str(top_k_label).replace("Top ", "").replace("%", "")) / 100
        targeted_orders = _whole_units(high_value_orders * top_k_percent)
        true_positives = _whole_units(targeted_orders * precision)

        estimated_savings = round(true_positives * savings_per_return, 2)
        outreach_cost_total = targeted_orders * outreach_cost
        net_roi = round(estimated_savings - outreach_cost_total, 2)
        # With nothing spent, ROI % is reported as 0 instead of dividing by zero.
        roi_percent = round((net_roi / outreach_cost_total) * 100, 2) if outreach_cost_total > 0 else 0

        results.append({
            "Top %": top_k_label,
            "Total Monthly Orders": total_monthly_orders,
            # Column label is fixed for compatibility; it names 25% even if
            # high_value_percent is overridden.
            "High-Value Orders (Top 25% of Total Monthly Orders)": high_value_orders,
            "Targeted (High-Risk)": targeted_orders,
            "Returners Caught": true_positives,
            "Estimated Savings ($)": estimated_savings,
            "Outreach Cost ($)": outreach_cost_total,
            "Net ROI ($)": net_roi,
            "ROI (%)": roi_percent
        })

    return pd.DataFrame(results)


In [58]:
# Run the simulation with every parameter left at its default value.
roi_df = simulate_roi_for_high_value_segment()
# Bare last expression so the notebook renders the resulting table.
roi_df

Unnamed: 0,Top %,Total Monthly Orders,High-Value Orders (Top 25% of Total Monthly Orders),Targeted (High-Risk),Returners Caught,Estimated Savings ($),Outreach Cost ($),Net ROI ($),ROI (%)
0,5,32224,8056,402,74,2960,402,2558,636.32
1,3,32224,8056,241,59,2360,241,2119,879.25
2,2,32224,8056,161,49,1960,161,1799,1117.39


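As you target a smaller slice at the top of the ranked list (top 5% → 3% → 2%), the model's precision increases (18.5% → 24.8% → 31.0%), so each outreach dollar catches more returners and ROI % rises (636% → 879% → 1,117%). The trade-off is volume: the narrower the slice, the fewer returners are caught overall (74 → 59 → 49), so net savings in dollars fall ($2,558 → $2,119 → $1,799) even though the spend is more efficient.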