#### What is the causal effect of receiving a discount on a first purchase on the likelihood of making a repeat purchase within one month?

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

In [9]:
# 'skus' table
skus = pd.read_csv(
    "https://www.dropbox.com/scl/fi/iloh5dvxa5up1uypw1mfx/JD_sku_data.csv?rlkey=3hqpuoyet5wpfj0xxooqj1jyk&st=8kburfsp&dl=1"
)
# 'users' table
users = pd.read_csv(
    "https://www.dropbox.com/scl/fi/fch92f5vrytxlgqzjhupg/JD_user_data.csv?rlkey=fsyhdeguzfj7x4wf3peiog0gq&st=57mhf8on&dl=1"
)
# 'clicks' table
clicks = pd.read_csv(
    "https://www.dropbox.com/scl/fi/19uqfqcv2rb0z2ribgibm/JD_click_data.csv?rlkey=hvony9w8ixfgmh2330b0xmozz&dl=1"
)
# 'orders' table
orders = pd.read_csv(
    "https://www.dropbox.com/scl/fi/aous0qee7le68ar8pe5ol/JD_order_data.csv?rlkey=7zu1gc23v228yosjl1tpmc2xg&st=b8io8lhi&dl=1"
)
# 'delivery' table
delivery = pd.read_csv(
    "https://www.dropbox.com/scl/fi/y2g5wtg4dw6y0u37fugo9/JD_delivery_data.csv?rlkey=wxxi0v919lmttamw55thwui0g&st=m19md1z7&dl=1"
)
# 'inventory' table
inventory = pd.read_csv(
    "https://www.dropbox.com/scl/fi/dk8yj9bjtpk6k0260irgt/JD_inventory_data.csv?rlkey=7zlceazzdptmu2bvwwbnmy7q9&st=yibm2hfp&dl=1"
)
# 'network' table
network = pd.read_csv(
    "https://www.dropbox.com/scl/fi/3yq9ne4plp8sy9ja5afm0/JD_network_data.csv?rlkey=vwtx0lom21txudqyr88j2b52r&st=887g0ewq&dl=1"
)

In [24]:
users

Unnamed: 0,user_ID,user_level,first_order_month,plus,gender,age,marital_status,education,city_level,purchase_power
0,000089d6a6,1,2017-08,0,F,26-35,S,3,4,3
1,0000babd1f,1,2018-03,0,U,U,U,-1,-1,-1
2,0000bc018b,3,2016-06,0,F,>=56,M,3,2,3
3,0000d0e5ab,3,2014-06,0,M,26-35,M,3,2,2
4,0000dce472,3,2012-08,1,U,U,U,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...
457293,ffff38690b,1,2018-03,0,U,U,U,-1,-1,-1
457294,ffffa1a495,4,2011-09,1,M,26-35,S,3,1,2
457295,ffffb20ef7,3,2017-11,0,M,36-45,M,2,4,2
457296,ffffc45330,1,2016-04,0,F,26-35,M,-1,-1,-1


#### Treatment variable (X):

- treatment = 1 if user received any discount on their first purchase (direct_discount_per_unit > 0 or final_unit_price < original_unit_price)

- treatment = 0 otherwise

#### Outcome variable (Y):

- repeat_purchase = 1 if the user made another purchase within 30 days of their first purchase date

- repeat_purchase = 0 otherwise


**Identify First-Time Buyers**

In [None]:
first_time_buyers = users[users["first_order_month"] == "2018-03"]
first_time_buyers

Unnamed: 0,user_ID,user_level,first_order_month,plus,gender,age,marital_status,education,city_level,purchase_power
1,0000babd1f,1,2018-03,0,U,U,U,-1,-1,-1
14,00026e5698,1,2018-03,0,U,U,U,-1,-1,-1
19,0003bc5bec,2,2018-03,0,F,26-35,U,-1,4,-1
21,000479b177,1,2018-03,0,U,U,U,-1,-1,-1
34,00074309ac,1,2018-03,0,F,36-45,S,3,1,2
...,...,...,...,...,...,...,...,...,...,...
457226,fff683e3b6,1,2018-03,0,F,26-35,U,-1,4,-1
457256,fffaeaf576,1,2018-03,0,F,36-45,U,-1,4,-1
457285,fffece9f0b,1,2018-03,0,F,36-45,M,3,4,2
457288,fffeefb9f9,1,2018-03,0,U,U,U,-1,-1,-1


**Assign Treatment Status**

Join with orders and compute for each user whether they received a discount on their first order.

In [None]:
# Determining discount status
orders["any_discount"] = (
    (orders["direct_discount_per_unit"] > 0)
    | (orders["quantity_discount_per_unit"] > 0)
    | (orders["coupon_discount_per_unit"] > 0)
    | (orders["bundle_discount_per_unit"] > 0)
)

# First order per user
first_orders = (
    orders.sort_values(by="order_time").groupby("user_ID").first().reset_index()
)

# Merge with user info
first_orders = first_orders.merge(users, on="user_ID")
first_orders

Unnamed: 0,user_ID,order_ID,sku_ID,order_date,order_time,quantity,type,promise,original_unit_price,final_unit_price,...,any_discount,user_level,first_order_month,plus,gender,age,marital_status,education,city_level,purchase_power
0,000089d6a6,6fb419a6de,e99eb7d131,2018-03-14,2018-03-14 14:50:39.0,1,2,-,330.0,215.0,...,True,1,2017-08,0,F,26-35,S,3,4,3
1,0000babd1f,6f20820bed,7185ef8e8c,2018-03-22,2018-03-22 14:40:10.0,1,2,-,39.0,39.0,...,False,1,2018-03,0,U,U,U,-1,-1,-1
2,0000bc018b,ebbf0f8a69,fa823767ca,2018-03-30,2018-03-30 15:58:03.0,1,1,1,79.0,79.0,...,False,3,2016-06,0,F,>=56,M,3,2,3
3,0000d0e5ab,e8081938a6,2523d051fd,2018-03-28,2018-03-28 11:48:21.0,1,1,1,298.0,228.0,...,True,3,2014-06,0,M,26-35,M,3,2,2
4,0000dce472,3f68275300,f0e625dda4,2018-03-18,2018-03-18 23:27:45.0,1,1,2,166.0,111.5,...,True,3,2012-08,1,U,U,U,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454892,ffff38690b,20aeac8b56,068f4481b3,2018-03-16,2018-03-16 08:24:47.0,1,1,2,298.0,237.0,...,True,1,2018-03,0,U,U,U,-1,-1,-1
454893,ffffa1a495,474639b6fd,bb43802626,2018-03-09,2018-03-09 17:10:03.0,1,2,-,0.0,0.0,...,False,4,2011-09,1,M,26-35,S,3,1,2
454894,ffffb20ef7,b4747d4058,4e5e5c04a2,2018-03-24,2018-03-24 14:05:39.0,1,2,-,99.0,59.0,...,True,3,2017-11,0,M,36-45,M,2,4,2
454895,ffffc45330,ca94671f8c,5c4440c9d2,2018-03-17,2018-03-17 15:29:37.0,1,1,3,49.0,49.0,...,False,1,2016-04,0,F,26-35,M,-1,-1,-1


**Compute the Outcome (Repeat Purchase)**

In [None]:
from datetime import timedelta

#  Ensure 'order_time' is datetime
orders["order_time"] = pd.to_datetime(
    orders["order_time"], format="mixed", errors="coerce"
)

# Identify the first order time for each user
first_orders_time = (
    orders.sort_values(by="order_time")
    .groupby("user_ID", as_index=False)
    .first()[["user_ID", "order_time"]]
    .rename(columns={"order_time": "first_order_time"})
)

#  Merge first order time into full orders table
orders = orders.drop(columns=["first_order_time"], errors="ignore")  # drop if exists
orders = orders.merge(first_orders_time, on="user_ID", how="left")

# Compute number of days since first order
orders["days_since_first"] = (orders["order_time"] - orders["first_order_time"]).dt.days

# Identify users with at least one repeat order within 30 days
repeat_orders = (
    orders[(orders["days_since_first"] > 0) & (orders["days_since_first"] <= 30)]
    .groupby("user_ID")
    .size()
    .reset_index(name="repeat_purchase")
)

# Convert count to binary: 1 = repeat purchase within 30 days
repeat_orders["repeat_purchase"] = 1
repeat_orders

Unnamed: 0,user_ID,repeat_purchase
0,000517abd3,1
1,001282c65e,1
2,0013804aa0,1
3,00182fd9ea,1
4,001917a08e,1
...,...,...
11906,ffea8dfcda,1
11907,ffeee86eec,1
11908,fff7767f6a,1
11909,fffe8dd91f,1


In [None]:
# Merge back repeat outcome and fill in missing
first_orders = first_orders.merge(
    repeat_orders[["user_ID", "repeat_purchase"]], on="user_ID", how="left"
)
first_orders["repeat_purchase"] = first_orders["repeat_purchase"].fillna(0)

**Logistic Regression**

In [None]:
# Logistic Regression (baseline)
# Estimate the effect of treatment (received a discount) on repeat purchase:
import statsmodels.api as sm
import statsmodels.formula.api as smf

model = smf.logit(
    "repeat_purchase ~ any_discount + age + gender + purchase_power + city_level",
    data=first_orders,
).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.120999
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:        repeat_purchase   No. Observations:               454897
Model:                          Logit   Df Residuals:                   454885
Method:                           MLE   Df Model:                           11
Date:                Sat, 19 Apr 2025   Pseudo R-squ.:                0.001790
Time:                        12:17:25   Log-Likelihood:                -55042.
converged:                       True   LL-Null:                       -55141.
Covariance Type:            nonrobust   LLR p-value:                 2.555e-36
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -3.7940      0.035   -107.462      0.000      -3.863      -3.725

## Key Takeaways from the Output:

 **Treatment Variable: `any_discount[T.True]`**
- **Coefficient:** `-0.0306`
- **p-value:** `0.147` → **Not statistically significant** at conventional levels (e.g. p < 0.05)
- **Interpretation:** After controlling for other variables (like age, gender, purchase power, etc.), receiving a discount **does not significantly increase or decrease** the likelihood of a repeat purchase within 30 days.

This supports our **exploratory finding**:  
Discounts drive **initial purchases**, but **do not foster loyalty** or repeat behavior.

---

### Other Notable Results (control variables):

| Variable               | Coef | Interpretation                                                                 |
|------------------------|------|---------------------------------------------------------------------------------|
| `gender[T.M]`          | -0.1665 | Men are **less likely** to repeat-purchase than women (stat. significant).       |
| `age[T.26-35]`, `36-45`| +0.0625 +0.0843    | These age groups are **slightly more likely** to repurchase than the reference group. |
| `purchase_power`       | +0.0765 | Higher purchasing power = **higher likelihood** of repeat purchase.            |
| `city_level`           | +0.0294 | Higher city level = slightly more likely to repeat purchase.                  |
| `age[T.U]`             | +1.47 | Unknown ages are **way more likely** to repeat purchase — likely a data quirk. |
| `gender[T.U]`          | -1.2697 | Unknown gender much **less likely** to repeat purchase.                        |

---

We've **empirically tested** the causal question using a valid model and **found no evidence** that offering a discount on the first purchase improves repeat purchasing behavior — even when accounting for key demographic and behavioral factors.

---

> Our logistic regression analysis (n = 454,897) revealed that receiving a discount on a first-time purchase had **no statistically significant effect** on whether a customer made a repeat purchase within 30 days (p = 0.147). This suggests that, while discounts may increase short-term conversions, they do not appear to build customer loyalty or drive repeat purchases.

> In contrast, demographic and behavioral features — such as age, gender, and purchasing power — were significantly associated with repeat behavior. These results indicate that JD.com should consider shifting away from broad, site-wide discounts toward loyalty strategies tailored to high-potential customer segments.


**Interpret the coefficient on any_discount as the log odds of repeat purchase for marginal effects:**

In [None]:
model.get_margeff().summary()

0,1
Dep. Variable:,repeat_purchase
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[0.025,0.975]
any_discount[T.True],-0.0008,0.001,-1.452,0.147,-0.002,0.0
age[T.26-35],0.0016,0.001,2.549,0.011,0.0,0.003
age[T.36-45],0.0021,0.001,2.878,0.004,0.001,0.004
age[T.46-55],-0.001,0.001,-0.73,0.466,-0.004,0.002
age[T.<=15],0.0139,0.026,0.535,0.593,-0.037,0.065
age[T.>=56],-0.0007,0.001,-0.48,0.631,-0.004,0.002
age[T.U],0.0375,0.011,3.566,0.0,0.017,0.058
gender[T.M],-0.0042,0.001,-7.177,0.0,-0.005,-0.003
gender[T.U],-0.0324,0.01,-3.095,0.002,-0.053,-0.012
purchase_power,0.0019,0.0,8.451,0.0,0.001,0.002


## What are marginal effects?

Marginal effects tell  the **change in probability** of the outcome (here: **repeat purchase within 30 days**) when a predictor variable increases by one unit — **holding other variables constant**.

So instead of log-odds (hard to interpret), this gives a practical, intuitive interpretation.

---

## Key Findings:

`any_discount[T.True]`: **-0.0008**
- Not statistically significant (p = 0.147).
- **Interpretation:** On average, receiving a discount **reduces the probability of a repeat purchase by 0.08 percentage points**, but this effect is **not statistically meaningful**.
- This **confirms the earlier conclusion**: Discounts help with initial sales but **don’t improve customer retention**.

---

### Demographics that **increase** repeat purchase probability:
| Variable | Marginal Effect | Interpretation |
|----------|------------------|----------------|
| `age[T.26-35]` | +0.0016 | This age group is **0.16 percentage points** more likely to return than the baseline age group. |
| `age[T.36-45]` | +0.0021 | Slightly higher chance of repeat — around **0.21 percentage points**. |
| `age[T.U]` | +0.0375 | Huge increase — **3.75 percentage points** more likely to repurchase. Could be due to system-default or proxy accounts. |
| `purchase_power` | +0.0019 | Each increase in purchasing power raises repeat chance by **0.19 percentage points**. |
| `city_level` | +0.0008 | Slightly higher likelihood of repeat purchases in higher-tier cities. |

---

### Demographics that **reduce** repeat purchase probability:
| Variable | Marginal Effect | Interpretation |
|----------|------------------|----------------|
| `gender[T.M]` | -0.0042 | Men are **0.42 percentage points** less likely to repeat buy than women. |
| `gender[T.U]` | -0.0324 | Users with unknown gender are **3.24 percentage points** less likely to return — possibly bots or non-engaged users. |
| `age[T.46-55]`, `>=56` | Both negative, small, and **not significant**.

---

> We estimated marginal effects from our logistic regression to interpret the practical impact of first-time discounts on customer retention. The results show that receiving a discount reduces the probability of a repeat purchase by 0.08 percentage points; however, this effect is **not statistically significant** (p = 0.147). These findings support our exploratory analysis and suggest that while discounts may help convert first-time buyers, they **do not encourage long-term loyalty**.

> Demographic factors such as being aged 26–45, having higher purchasing power, and living in higher-tier cities **positively influence** the likelihood of repeat purchases. In contrast, male customers and users with unknown gender are **less likely** to return.
