# 04 â€“ Pricing and Segmentation

This notebook analyses plan pricing, revenue mix, customer behaviour and acquisition channels to understand how pricing and packaging are performing across the subscription base.


In [2]:
import os
import pandas as pd
import numpy as np

DATA_DIR = "data/processed"

customers_path = os.path.join(DATA_DIR, "customers.csv")
plans_path = os.path.join(DATA_DIR, "plans.csv")
subs_path = os.path.join(DATA_DIR, "subscriptions.csv")
activity_path = os.path.join(DATA_DIR, "monthly_activity.csv")

customers_df = pd.read_csv(customers_path)
plans_df = pd.read_csv(plans_path)
subs_df = pd.read_csv(subs_path)
activity_df = pd.read_csv(activity_path, parse_dates=["month"])

customers_df.head()


Unnamed: 0,customer_id,country,acquisition_channel,device_type,signup_date
0,1,UK,Organic,mobile,2023-12-01
1,2,SE,Organic,mobile,2025-09-15
2,3,US,Paid Ads,mobile,2025-02-06
3,4,SE,Partnership,desktop,2024-09-11
4,5,IN,Partnership,desktop,2022-12-16


## Join reference data

Link monthly activity to plan and customer attributes so that pricing, geography and acquisition behaviour can be viewed together.


In [3]:
activity_df = (
    activity_df
    .merge(plans_df, on="plan_id", how="left")
    .merge(customers_df[["customer_id", "country", "acquisition_channel", "device_type"]],
           on="customer_id", how="left")
)

activity_df.head()


Unnamed: 0,subscription_id,customer_id,plan_id,month,cohort_month,is_active,is_new,churned_this_month,mrr_usd,plan_name,billing_period,price_usd,country,acquisition_channel,device_type
0,1,5,4,2023-01-01,2023-01-01,True,True,False,24.92,Basic,annual,299.0,IN,Partnership,desktop
1,1,5,4,2023-02-01,2023-01-01,True,False,False,24.92,Basic,annual,299.0,IN,Partnership,desktop
2,1,5,4,2023-03-01,2023-01-01,True,False,False,24.92,Basic,annual,299.0,IN,Partnership,desktop
3,1,5,4,2023-04-01,2023-01-01,True,False,False,24.92,Basic,annual,299.0,IN,Partnership,desktop
4,1,5,4,2023-05-01,2023-01-01,True,False,False,24.92,Basic,annual,299.0,IN,Partnership,desktop


## Price points and ARPU by plan

Summarise list price and average realised MRR by plan and billing period.


In [4]:
price_points = (
    plans_df
    .sort_values(["plan_name", "billing_period"])
    .reset_index(drop=True)
)

price_points


Unnamed: 0,plan_id,plan_name,billing_period,price_usd
0,4,Basic,annual,299.0
1,1,Basic,monthly,29.0
2,6,Premium,annual,1199.0
3,3,Premium,monthly,119.0
4,5,Standard,annual,599.0
5,2,Standard,monthly,59.0


In [5]:
arpu_plan = (
    activity_df
    .groupby(["plan_name", "billing_period"], as_index=False)["mrr_usd"]
    .mean()
    .rename(columns={"mrr_usd": "avg_mrr_usd"})
    .sort_values(["plan_name", "billing_period"])
)

arpu_plan


Unnamed: 0,plan_name,billing_period,avg_mrr_usd
0,Basic,annual,15.411681
1,Basic,monthly,18.621808
2,Premium,annual,64.68731
3,Premium,monthly,79.98813
4,Standard,annual,32.560072
5,Standard,monthly,39.212642


## Revenue mix by plan and billing period

Look at how MRR is distributed across plans and between monthly and annual billing.


In [6]:
rev_mix = (
    activity_df
    .groupby(["plan_name", "billing_period"], as_index=False)["mrr_usd"]
    .sum()
    .rename(columns={"mrr_usd": "total_mrr_usd"})
)

total_mrr = rev_mix["total_mrr_usd"].sum()
rev_mix["revenue_share_pct"] = (rev_mix["total_mrr_usd"] / total_mrr * 100).round(2)

rev_mix.sort_values("total_mrr_usd", ascending=False)


Unnamed: 0,plan_name,billing_period,total_mrr_usd,revenue_share_pct
3,Premium,monthly,1448825.0,32.77
2,Premium,annual,1145482.88,25.91
5,Standard,monthly,638970.0,14.45
4,Standard,annual,577224.96,13.05
1,Basic,monthly,353684.0,8.0
0,Basic,annual,257498.36,5.82


## Customer-level value and churn behaviour

Aggregate activity to subscription level to look at typical bill size and churn events.


In [7]:
sub_value = (
    activity_df
    .groupby("subscription_id")
    .agg(
        customer_id=("customer_id", "first"),
        plan_id=("plan_id", "first"),
        plan_name=("plan_name", "first"),
        billing_period=("billing_period", "first"),
        mrr_mean=("mrr_usd", "mean"),
        mrr_max=("mrr_usd", "max"),
        months_active=("is_active", "sum"),
        churn_events=("churned_this_month", "sum"),
    )
    .reset_index()
)

sub_value.head()


Unnamed: 0,subscription_id,customer_id,plan_id,plan_name,billing_period,mrr_mean,mrr_max,months_active,churn_events
0,1,5,4,Basic,annual,8.795294,24.92,11,1
1,2,7,6,Premium,annual,63.363902,99.92,25,1
2,3,8,6,Premium,annual,99.92,99.92,3,0
3,4,8,2,Standard,monthly,59.0,59.0,3,0
4,5,9,5,Standard,annual,49.92,49.92,12,0


## Simple behavioural segments

Segmentation is based on quartiles of average realised MRR, with churn-event priority applied before price-tier classification.


In [8]:
q25 = sub_value["mrr_mean"].quantile(0.25)
q75 = sub_value["mrr_mean"].quantile(0.75)

def assign_segment(row):
    if row["mrr_mean"] >= q75:
        return "High value"
    if row["churn_events"] > 0:
        return "At risk"
    if row["mrr_mean"] <= q25:
        return "Low value"
    return "Stable mid-tier"

sub_value["segment"] = sub_value.apply(assign_segment, axis=1)

sub_value["segment"].value_counts()


Unnamed: 0_level_0,count
segment,Unnamed: 1_level_1
At risk,2159
High value,1826
Stable mid-tier,1468


## Segment profile by plan and billing period

Check how each behavioural segment maps onto the pricing grid.


In [9]:
segment_plan = (
    sub_value
    .groupby(["segment", "plan_name", "billing_period"], as_index=False)
    .agg(
        subscriptions=("subscription_id", "nunique"),
        avg_mrr_mean=("mrr_mean", "mean"),
        avg_months_active=("months_active", "mean"),
    )
)

segment_plan.sort_values(["segment", "plan_name", "billing_period"])


Unnamed: 0,segment,plan_name,billing_period,subscriptions,avg_mrr_mean,avg_months_active
0,At risk,Basic,annual,419,9.857886,8.200477
1,At risk,Basic,monthly,432,11.595521,8.699074
2,At risk,Premium,annual,294,24.922769,5.078231
3,At risk,Premium,monthly,233,25.061884,4.343348
4,At risk,Standard,annual,418,19.895857,8.090909
5,At risk,Standard,monthly,363,23.282713,8.724518
6,High value,Premium,annual,629,96.415107,15.205087
7,High value,Premium,monthly,702,111.766476,15.336182
8,High value,Standard,monthly,495,59.0,14.719192
9,Stable mid-tier,Basic,annual,443,24.92,14.623025


## Acquisition channel and pricing

Understand which channels are driving which plans and billing cadences.


In [10]:
# subscriptions by channel and plan
subs_channel_plan = (
    activity_df
    .groupby(["acquisition_channel", "plan_name", "billing_period"], as_index=False)["subscription_id"]
    .nunique()
    .rename(columns={"subscription_id": "unique_subscriptions"})
)

subs_channel_plan.sort_values("unique_subscriptions", ascending=False)


Unnamed: 0,acquisition_channel,plan_name,billing_period,unique_subscriptions
7,Organic,Basic,monthly,372
9,Organic,Premium,monthly,330
11,Organic,Standard,monthly,325
8,Organic,Premium,annual,314
10,Organic,Standard,annual,306
6,Organic,Basic,annual,302
16,Paid Ads,Standard,annual,248
12,Paid Ads,Basic,annual,226
15,Paid Ads,Premium,monthly,221
13,Paid Ads,Basic,monthly,221


In [11]:
# conversion from signup to active subscription by channel
subs_by_channel = (
    subs_df
    .merge(customers_df[["customer_id", "acquisition_channel"]], on="customer_id", how="left")
    .groupby("acquisition_channel", as_index=False)["subscription_id"]
    .nunique()
    .rename(columns={"subscription_id": "subscriptions"})
)

signups_by_channel = (
    customers_df
    .groupby("acquisition_channel", as_index=False)["customer_id"]
    .nunique()
    .rename(columns={"customer_id": "signups"})
)

channel_conv = subs_by_channel.merge(signups_by_channel, on="acquisition_channel", how="left")
channel_conv["conversion_rate_pct"] = (
    channel_conv["subscriptions"] / channel_conv["signups"] * 100
).round(2)

channel_conv.sort_values("conversion_rate_pct", ascending=False)


Unnamed: 0,acquisition_channel,subscriptions,signups,conversion_rate_pct
4,Referral,839,1198,70.03
1,Organic,1949,2850,68.39
2,Paid Ads,1350,1994,67.7
0,Email,768,1139,67.43
3,Partnership,547,819,66.79


## Billing cadence by channel

Compare monthly versus annual mix for each acquisition route.


In [12]:
billing_mix_channel = (
    activity_df
    .groupby(["acquisition_channel", "billing_period"], as_index=False)["subscription_id"]
    .nunique()
    .rename(columns={"subscription_id": "unique_subscriptions"})
)

total_by_channel = (
    billing_mix_channel
    .groupby("acquisition_channel", as_index=False)["unique_subscriptions"]
    .sum()
    .rename(columns={"unique_subscriptions": "total_subscriptions"})
)

billing_mix_channel = billing_mix_channel.merge(total_by_channel, on="acquisition_channel", how="left")
billing_mix_channel["share_pct"] = (
    billing_mix_channel["unique_subscriptions"] / billing_mix_channel["total_subscriptions"] * 100
).round(2)

billing_mix_channel.sort_values(["acquisition_channel", "billing_period"])


Unnamed: 0,acquisition_channel,billing_period,unique_subscriptions,total_subscriptions,share_pct
0,Email,annual,370,768,48.18
1,Email,monthly,398,768,51.82
2,Organic,annual,922,1949,47.31
3,Organic,monthly,1027,1949,52.69
4,Paid Ads,annual,689,1350,51.04
5,Paid Ads,monthly,661,1350,48.96
6,Partnership,annual,268,547,48.99
7,Partnership,monthly,279,547,51.01
8,Referral,annual,445,839,53.04
9,Referral,monthly,394,839,46.96


## Billing cadence loyalty indicator

Flag subscriptions that have spent more than one cycle on an annual plan.


In [13]:
annual_loyalty = (
    activity_df
    .assign(is_annual=lambda d: (d["billing_period"] == "annual").astype(int))
    .groupby("subscription_id", as_index=False)["is_annual"]
    .sum()
    .rename(columns={"is_annual": "annual_months"})
)

annual_loyalty["annual_loyalty_flag"] = (annual_loyalty["annual_months"] > 1).astype(int)

annual_loyalty["annual_loyalty_flag"].value_counts()


Unnamed: 0_level_0,count
annual_loyalty_flag,Unnamed: 1_level_1
0,2850
1,2603
