<a href="https://colab.research.google.com/github/tiffchow214/churn_app_huggingface/blob/main/sample_customers_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

def make_synthetic_customers(n=50, seed=42):
    """Build a reproducible table of ``n`` synthetic customer records.

    All random values come from a single ``np.random.default_rng(seed)``
    generator. The draws are made in a fixed order, so changing that order
    would change the values produced for a given seed.

    Parameters
    ----------
    n : int, default 50
        Number of rows to generate.
    seed : int, default 42
        Seed passed to ``np.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        One row per customer, with columns in this order:
        ``tenure_months``, ``delivery_distance_km``, ``num_devices``,
        ``satisfaction_score``, ``num_addresses``, ``days_since_last_order``,
        ``cashback_amount``, ``complaint``, ``preferred_category``,
        ``marital_status``.
    """
    gen = np.random.default_rng(seed)

    # --- Account basics ---
    # Whole months in [0, 60]; the upper bound passed to integers() is exclusive.
    tenure = gen.integers(0, 61, size=n)
    # Skewed gamma draw, rounded to one decimal and limited to [0, 60].
    distance = gen.gamma(2.0, 5.0, size=n).round(1).clip(0, 60)
    devices = gen.choice([1, 2, 3, 4, 5], size=n, p=[0.25, 0.35, 0.25, 0.10, 0.05])
    addresses = gen.choice([1, 2, 3], size=n, p=[0.70, 0.25, 0.05])

    # --- Complaint flag and satisfaction ---
    # A complaint lowers the underlying satisfaction by 1.2 before it is
    # rounded and limited to the 1-5 scale.
    has_complaint = gen.choice([0, 1], size=n, p=[0.85, 0.15]).astype(int)
    raw_satisfaction = gen.normal(loc=3.7, scale=0.9, size=n) - has_complaint * 1.2
    satisfaction = raw_satisfaction.round().clip(1, 5).astype(int)

    # --- Recency ---
    # Exponential draw truncated to whole days, then limited to [0, 120].
    idle_days = gen.exponential(scale=20, size=n).astype(int).clip(0, 120)

    # --- Cashback ---
    # Shifted by 15 per satisfaction point away from 3 and by -30 on a
    # complaint; limited to [0, 400] and rounded to two decimals.
    raw_cashback = (
        gen.normal(120, 60, size=n) + (satisfaction - 3) * 15 - has_complaint * 30
    )
    cashback = raw_cashback.clip(0, 400).round(2)

    # --- Categorical attributes ---
    category = gen.choice(
        ["Laptop & Accessory", "Mobile Phone", "Others"],
        size=n,
        p=[0.35, 0.35, 0.30],
    )
    marital = gen.choice(["Married", "Single"], size=n, p=[0.45, 0.55])

    # (column name, values) pairs listed in the order the columns must appear.
    fields = (
        ("tenure_months", tenure),
        ("delivery_distance_km", distance),
        ("num_devices", devices),
        ("satisfaction_score", satisfaction),
        ("num_addresses", addresses),
        ("days_since_last_order", idle_days),
        ("cashback_amount", cashback),
        ("complaint", has_complaint),
        ("preferred_category", category),
        ("marital_status", marital),
    )
    column_order = [name for name, _ in fields]

    # Select by the explicit list so the returned column order is guaranteed.
    return pd.DataFrame(dict(fields))[column_order]

# ---- Build the sample, preview it, then write it to CSV ----
SEED = 2025   # forwarded to the generator so re-running yields the same rows
N_ROWS = 50   # number of synthetic customers to create

df = make_synthetic_customers(seed=SEED, n=N_ROWS)

# Plain-text look at the first ten rows before anything is written to disk.
print(df.head(n=10))

# Absolute path under /content, so this cell needs that directory to exist.
# index=False keeps the row index out of the file.
save_path = "/content/sample_customers.csv"
df.to_csv(path_or_buf=save_path, index=False)
print(f"\nSaved → {save_path}  shape={df.shape}")

   tenure_months  delivery_distance_km  num_devices  satisfaction_score  \
0             27                   3.8            2                   3   
1             60                  14.4            2                   2   
2             60                   7.0            2                   3   
3             23                   9.2            2                   4   
4             58                   9.4            3                   3   
5             50                   7.5            2                   1   
6             39                  10.5            1                   3   
7             51                   3.7            1                   5   
8             46                  11.9            2                   3   
9             59                   0.6            2                   4   

   num_addresses  days_since_last_order  cashback_amount  complaint  \
0              1                      0           112.25          0   
1              1                

In [2]:
# Hand the generated CSV to the browser as a file download.
# `google.colab` is imported here, so this cell only runs inside a Colab runtime.
# NOTE(review): the path below is a literal copy of `save_path` from the
# generation cell; the two must be kept in sync by hand.
from google.colab import files
files.download("/content/sample_customers.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>