## Install churnmodels
got to the project root folder 
run 
> python setup.py sdist

a new package will be created and stored in the subfolder "./dist"

now run pip install, e.g.:
> pip install dist/churnmodels-0.0.1.tar.gz


In [1]:
import churnmodels
print(churnmodels.__version__)
# expected outcome : the actual version number

0.0.4


## Open a DB session

In [2]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from churnmodels.schema import Subscription, Event, Account

# the following example will depend on a sqlite db
sqlfile="c:/tmp/churn.db"
engine = create_engine(f"sqlite:///{sqlfile}")
session = sessionmaker(bind=engine)()


## Net Retention (§2.1)


In [3]:
from sqlalchemy import func, or_
from churnmodels.schema import Subscription, Event
import pandas as pd

d_start_date = "2020-01-01"
d_end_date = "2020-03-01"

"""
-- PostGres SQL for start_accounts: 
    select  account_id, sum (mrr) as total_mrr    
    from subscription s inner join date_range d on
        s.start_date <= d.start_date    
        and (s.end_date > d.start_date or s.end_date is null)
    group by account_id    
"""

# I) start_accounts
q_start_accounts = session.query(Subscription.account_id, func.sum(Subscription.mrr).label("total_mrr"))\
    .filter(
            # SQL: s.start_date <= d.start_date
        Subscription.start_date <= d_start_date,
            # SQL: s.end_date > d.start_date or s.end_date is null
        or_(Subscription.end_date > d_start_date, Subscription.end_date == None))\
    .group_by(Subscription.account_id) # SQL: group by account_id
# getting the result from the DB stored into a pandas DataFrame
start_accounts = pd.read_sql(q_start_accounts.statement, engine).set_index("account_id")
print(start_accounts)

            total_mrr
account_id           
86               50.0
117              50.0
158              50.0
163              50.0
220              50.0
...               ...
9889            100.0
9912             50.0
9923            100.0
9924             50.0
9990             50.0

[328 rows x 1 columns]


In [4]:
# II) end_accounts
q_end_accounts = session.query(Subscription.account_id, func.sum(Subscription.mrr).label("total_mrr")).filter(
    Subscription.start_date <= d_end_date,
    or_(Subscription.end_date > d_end_date, Subscription.end_date == None)).group_by(Subscription.account_id)
# q = q.filter(Subscription.account_id==64)
end_accounts = pd.read_sql(q_end_accounts.statement, engine).set_index("account_id")
print(end_accounts)

            total_mrr
account_id           
1                50.0
2               100.0
3                50.0
5                50.0
6               100.0
...               ...
11949            50.0
11953           100.0
11983            50.0
11987            50.0
12067           200.0

[10292 rows x 1 columns]


In [5]:
# III) retained_accounts <- inner join on start_accounts and end_accounts
# the line shows how to realize an inner join with pandas:
retained_accounts = pd.merge(start_accounts, end_accounts, on="account_id")
print(retained_accounts)
# the resulting columns total_mrr_x, total_mrr_y are identical

            total_mrr_x  total_mrr_y
account_id                          
86                 50.0        100.0
117                50.0         50.0
158                50.0         50.0
163                50.0         50.0
220                50.0         50.0
...                 ...          ...
9837              200.0        200.0
9870               50.0         50.0
9889              100.0        100.0
9912               50.0         50.0
9990               50.0         50.0

[280 rows x 2 columns]


Alternatively to the pandas call we can combine all to a single SQL statement
If subqueries are large it is more efficient not to store the subselects into pandas DataFrames
With sqlalchemy we can simply join two subqueries:

In [6]:
# III) retained_accounts (better alternative)
# Alternatively to the pandas call we can combine all to a single SQL statement 
# If subqueries are large it is more efficient not to store the subselects into pandas DataFrames 
# With sqlalchemy we can simply join two subqueries:

qe_s=q_end_accounts.subquery()
q_retained_accounts = q_start_accounts.join(qe_s, Subscription.account_id == qe_s.c.account_id)  #<- see the reference to "c" (for columns) of the subquery ⌈object

# let's have a look at the SQL statement sqlalchemy produces
#print(f"{qretained.statement}\n")

# reading into pandas
retained_accounts = pd.read_sql(q_retained_accounts.statement, engine).set_index("account_id")
print(retained_accounts)
                              

            total_mrr
account_id           
86               50.0
117              50.0
158              50.0
163              50.0
220              50.0
...               ...
9837            200.0
9870             50.0
9889            100.0
9912             50.0
9990             50.0

[280 rows x 1 columns]


In [7]:
start_mrr = session.query(func.sum(q_start_accounts.subquery().c.total_mrr)).one()[0] or 0
retain_mrr = session.query(func.sum(q_retained_accounts.subquery().c.total_mrr)).one()[0] or 0
net_mrr_retention_rate=retain_mrr /start_mrr # <- churned
net_mrr_churn_rate = 1.0 - retain_mrr /start_mrr # <- "survived", i.e. complentary to net_mrr_retention_rate

df=pd.DataFrame.from_dict({
    "net_mrr_retention_rate":[net_mrr_retention_rate],
    "net_mrr_churn_rate":[net_mrr_churn_rate],
    "start_mrr":[start_mrr],
    "retain_mrr":[retain_mrr]
})
print(df)


   net_mrr_retention_rate  net_mrr_churn_rate  start_mrr  retain_mrr
0                0.858293            0.141707    31050.0     26650.0


## Churn Rate (§2.2)

In [8]:
# we keep the start and end accounts from above

# 2.2.III) churned_accounts <- start_accounts LEFT OUTER JOIN to end_accounts
# the line shows how to realize an inner join with pandas:

qe_s=q_end_accounts.subquery()
q_churned_accounts = q_start_accounts.join(qe_s, Subscription.account_id == qe_s.c.account_id, isouter=True).filter(qe_s.c.account_id == None)

# reading into pandas
churned_accounts = pd.read_sql(q_churned_accounts.statement, engine).set_index("account_id")
print(churned_accounts)



            total_mrr
account_id           
890              50.0
1230             50.0
1402             50.0
1799             50.0
2232             50.0
2331             50.0
2349             50.0
2442            200.0
2485            200.0
2535             50.0
2850            100.0
2883            200.0
3448            100.0
3500             50.0
3622            200.0
4363             50.0
4496            200.0
4520             50.0
4594             50.0
4627             50.0
4833             50.0
4960            200.0
5147             50.0
5183            100.0
5196             50.0
5350            200.0
5760             50.0
6001            100.0
6033            200.0
6541             50.0
6568            100.0
6678             50.0
6696             50.0
6712            100.0
7347            100.0
7962             50.0
7987             50.0
8085            100.0
8252            200.0
8296             50.0
8397             50.0
8785            100.0
9192            100.0
9600      

In [9]:

n_start = session.query(func.count(q_start_accounts.subquery().c.account_id)).one()[0] or 0
n_churn = session.query(func.count(q_churned_accounts.subquery().c.account_id)).one()[0] or 0

churn_rate=n_churn /n_start # <- churned
retention_rate = 1.0 - n_churn /n_start # <- "survived", i.e. complentary = 1-churn_rate

df=pd.DataFrame.from_dict({
    "churn_rate":[churn_rate],
    "retention_rate":[retention_rate],
    "n_start":[n_start],
    "n_churn":[n_churn]
})
print(df)


   churn_rate  retention_rate  n_start  n_churn
0    0.146341        0.853659      328       48


## Activity Churn (§2.3)

In [10]:
# not yet

## MRR Churn (§2.4)

In [11]:
qs=q_start_accounts.subquery()
qe=q_end_accounts.subquery()
q_downsell_accounts = session.query(qs.c.account_id, (qs.c.total_mrr-qe.c.total_mrr).label("downsell_amount"))\
    .join(qe, qs.c.account_id == qe.c.account_id)\
    .filter(qe.c.total_mrr < qs.c.total_mrr)

pd.read_sql(q_downsell_accounts.statement, engine).set_index("account_id")

Unnamed: 0_level_0,downsell_amount
account_id,Unnamed: 1_level_1
336,100.0
417,100.0
534,100.0
748,100.0
1004,100.0
1169,100.0
1978,100.0
2382,100.0
2532,100.0
2803,100.0


In [12]:
start_mrr = session.query(func.sum(q_start_accounts.subquery().c.total_mrr)).one()[0] or 0
churn_mrr = session.query(func.sum(q_churned_accounts.subquery().c.total_mrr)).one()[0] or 0
downsell_mrr = session.query(func.sum(q_downsell_accounts.subquery().c.downsell_amount)).one()[0] or 0

mrr_churn_rate = (churn_mrr+downsell_mrr) /start_mrr 

df=pd.DataFrame.from_dict({
    "mrr_churn_rate":[mrr_churn_rate],
    "start_mrr":[start_mrr],
    "churn_mrr":[churn_mrr],
    "downsell_mrr":[downsell_mrr],
})
print(df)


   mrr_churn_rate  start_mrr  churn_mrr  downsell_mrr
0        0.202899    31050.0     4400.0        1900.0


## Churn Rate scaled (§2.5)

In [16]:
from datetime import datetime

def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)

period_days=days_between(d_start_date, d_end_date)

measured_churn =  n_churn / n_start
#period_days = end_date - start_date
annual_churn = 1- pow(1- measured_churn, 365.0 / period_days)
monthly_churn = 1- pow(1- measured_churn, 365.0 /12.0 / period_days)

df=pd.DataFrame.from_dict({
    "n_start":[n_start],
    "n_churn":[n_churn],
    "measured_churn":[measured_churn],
    "period_days":[period_days],
    "annual_churn":[annual_churn],
    "monthly_churn":[monthly_churn]
})
print(df)


   n_start  n_churn  measured_churn  period_days  annual_churn  monthly_churn
0      328       48        0.146341           60      0.618074       0.077078
