## Install churnmodels
In he project root folder open a terminal and run 
> python setup.py sdist

A new package will be created and stored in the subfolder "./dist"
To install the package run pip install, e.g.:
> pip install dist/churnmodels-0.0.1.tar.gz


In [1]:
import churnmodels
print(churnmodels.__version__)
# expected outcome : the actual version number

0.0.4


## Open a DB session

### DBs with schemas

In [12]:
# if connecting to an DB using schemas we need to run the following line BEFORE importing the churnmodels schema

# I) set the environment variable CHURN_DB_SCHEMA
# os.environ["CHURN_DB_SCHEMA"]="biznet"

# II) import the churnmodels library
from churnmodels.schema import Subscription, Event, Account


### Open an SQLite DB session

In [2]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


if os.environ["CHURN_DB_DIALECT"] == "sqlite":
    
    from churnmodels.schema import Subscription, Event, Account
    # the following example will depend on a sqlite db
    sqlfile="../churn.db"
    engine = create_engine(f"sqlite:///{sqlfile}")
    session = sessionmaker(bind=engine)()

### Open a PostGres DB session

In [1]:
# for a postgres we do this...
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# it is very important that we first set the environment variables ...
# which type of DB do we have?
os.environ["CHURN_DB_DIALECT"]= "postgres" # given that sqlite is the default we actually do not need this line

if os.environ["CHURN_DB_DIALECT"] == "postgres":

    # we need to give DB-name, user and password
    model="biznet1"
    os.environ["CHURN_DB"]= "churn"
    os.environ["CHURN_DB_SCHEMA"]= model
    os.environ["CHURN_DB_USER"]= "postgres"
    os.environ["CHURN_DB_PASS"]= "password"

    # ...having set the environment variables, our SQLAlchemy model will incorporate them, recognizing that we want to set up a Postgres DB
    from churnmodels import schema
    
    user=os.environ["CHURN_DB_USER"]
    pw=os.environ["CHURN_DB_PASS"]
    dbname=os.environ["CHURN_DB"]
    schema=os.environ["CHURN_DB_SCHEMA"]

    database_uri = f"postgresql://{user}:{pw}@localhost:5432/{dbname}"
    engine = create_engine(database_uri)

    session = sessionmaker(bind=engine)()


## Open a DB session

## Extra Code for dates
We need these libs and functions to deal with dates

In [2]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def days_between(d1, d2):
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)


## Net Retention (§2.1)


In [3]:
from sqlalchemy import func, or_
from churnmodels.schema import Subscription, Event
import pandas as pd


"""
-- PostGres SQL for start_accounts: 
    select  account_id, sum (mrr) as total_mrr    
    from subscription s inner join date_range d on
        s.start_date <= d.start_date    
        and (s.end_date > d.start_date or s.end_date is null)
    group by account_id    
"""

d_start_date = "2020-01-01"
d_end_date = "2020-03-01"

d_start_date = "2020-03-01"
d_end_date = "2020-04-01"



# I) start_accounts
q_start_accounts = session.query(Subscription.account_id, func.sum(Subscription.mrr).label("total_mrr"))\
    .filter(
            # SQL: s.start_date <= d.start_date
        Subscription.start_date <= d_start_date,
            # SQL: s.end_date > d.start_date or s.end_date is null
        or_(Subscription.end_date > d_start_date, Subscription.end_date == None))\
    .group_by(Subscription.account_id) # SQL: group by account_id
# getting the result from the DB stored into a pandas DataFrame
start_accounts = pd.read_sql(q_start_accounts.statement, engine).set_index("account_id")
print(start_accounts)

            total_mrr
account_id           
6114             50.0
4790            100.0
273             100.0
3936             50.0
5761            100.0
...               ...
7227             50.0
790             100.0
10896            50.0
5642            100.0
2850            100.0

[10192 rows x 1 columns]


In [4]:
# II) end_accounts
q_end_accounts = session.query(Subscription.account_id, func.sum(Subscription.mrr).label("total_mrr")).filter(
    Subscription.start_date <= d_end_date,
    or_(Subscription.end_date > d_end_date, Subscription.end_date == None)).group_by(Subscription.account_id)
# q = q.filter(Subscription.account_id==64)
end_accounts = pd.read_sql(q_end_accounts.statement, engine).set_index("account_id")
print(end_accounts)

            total_mrr
account_id           
6114             50.0
11233           200.0
4790            100.0
273             100.0
11719            50.0
...               ...
7227             50.0
790             100.0
10896            50.0
5642            200.0
2850            100.0

[10552 rows x 1 columns]


In [5]:
# III) retained_accounts <- inner join on start_accounts and end_accounts
# the line shows how to realize an inner join with pandas:
retained_accounts = pd.merge(start_accounts, end_accounts, on="account_id")
print(retained_accounts)
# the resulting columns total_mrr_x, total_mrr_y are identical

            total_mrr_x  total_mrr_y
account_id                          
6114               50.0         50.0
4790              100.0        100.0
273               100.0        100.0
3936               50.0         50.0
5761              100.0        100.0
...                 ...          ...
7227               50.0         50.0
790               100.0        100.0
10896              50.0         50.0
5642              100.0        200.0
2850              100.0        100.0

[9489 rows x 2 columns]


Alternatively to the pandas call we can combine all to a single SQL statement
If subqueries are large it is more efficient not to store the subselects into pandas DataFrames
With sqlalchemy we can simply join two subqueries:

In [6]:
# III) retained_accounts (better alternative)
# Alternatively to the pandas call we can combine all to a single SQL statement 
# If subqueries are large it is more efficient not to store the subselects into pandas DataFrames 
# With sqlalchemy we can simply join two subqueries:

qe_s=q_end_accounts.subquery()
q_retained_accounts = q_start_accounts.join(qe_s, Subscription.account_id == qe_s.c.account_id)  #<- see the reference to "c" (for columns) of the subquery ⌈object

# let's have a look at the SQL statement sqlalchemy produces
#print(f"{qretained.statement}\n")

# reading into pandas
retained_accounts = pd.read_sql(q_retained_accounts.statement, engine).set_index("account_id")
print(retained_accounts)
                              

            total_mrr
account_id           
6114             50.0
4790            100.0
273             100.0
3936             50.0
5761            100.0
...               ...
7227             50.0
790             100.0
10896            50.0
5642            100.0
2850            100.0

[9489 rows x 1 columns]


In [7]:
start_mrr = session.query(func.sum(q_start_accounts.subquery().c.total_mrr)).one()[0] or 0
retain_mrr = session.query(func.sum(q_retained_accounts.subquery().c.total_mrr)).one()[0] or 0
net_mrr_retention_rate=retain_mrr /start_mrr # <- churned
net_mrr_churn_rate = 1.0 - retain_mrr /start_mrr # <- "survived", i.e. complentary to net_mrr_retention_rate

df=pd.DataFrame.from_dict({
    "net_mrr_retention_rate":[net_mrr_retention_rate],
    "net_mrr_churn_rate":[net_mrr_churn_rate],
    "start_mrr":[start_mrr],
    "retain_mrr":[retain_mrr]
})
print(df)


   net_mrr_retention_rate  net_mrr_churn_rate  start_mrr  retain_mrr
0                0.936551            0.063449   971650.0    910000.0


## Churn Rate (§2.2)

In [10]:
# we keep the start and end accounts from above

# 2.2.III) churned_accounts <- start_accounts LEFT OUTER JOIN to end_accounts
# the line shows how to realize an inner join with pandas:

qe_s=q_end_accounts.subquery()
q_churned_accounts = q_start_accounts.join(qe_s, Subscription.account_id == qe_s.c.account_id, isouter=True).filter(qe_s.c.account_id == None)

# reading into pandas
churned_accounts = pd.read_sql(q_churned_accounts.statement, engine).set_index("account_id")
print(churned_accounts)



            total_mrr
account_id           
22               50.0
33               50.0
42               50.0
43               50.0
50               50.0
...               ...
10967            50.0
10975           100.0
11246            50.0
11613            50.0
11630           200.0

[703 rows x 1 columns]


In [11]:

n_start = session.query(func.count(q_start_accounts.subquery().c.account_id)).one()[0] or 0
n_churn = session.query(func.count(q_churned_accounts.subquery().c.account_id)).one()[0] or 0

churn_rate=n_churn /n_start # <- churned
retention_rate = 1.0 - n_churn /n_start # <- "survived", i.e. complentary = 1-churn_rate

df=pd.DataFrame.from_dict({
    "churn_rate":[churn_rate],
    "retention_rate":[retention_rate],
    "n_start":[n_start],
    "n_churn":[n_churn]
})
print(df)


   churn_rate  retention_rate  n_start  n_churn
0    0.068976        0.931024    10192      703


## Activity Churn (§2.3)

In [16]:
# 
start_date = datetime.strptime(d_start_date, "%Y-%m-%d")
end_date = datetime.strptime(d_end_date, "%Y-%m-%d")

inactivity_interval=relativedelta(months=+1)
start_date_int = start_date-inactivity_interval
end_date_int = end_date-inactivity_interval


# I) start_accounts
q_start_accounts_int = session.query(Event.account_id)\
    .filter(Event.event_time > start_date_int, Event.event_time <= d_start_date)\
    .distinct()
start_accounts_int = pd.read_sql(q_start_accounts_int.statement, engine).set_index("account_id")
nn_start = session.query(func.count(q_start_accounts_int.subquery().c.account_id)).one()[0] or 0


In [15]:
# II) end_accounts
q_end_accounts_int = session.query(Event.account_id)\
    .filter(Event.event_time > end_date_int, Event.event_time <= d_end_date)\
    .distinct()
end_accounts_int = pd.read_sql(q_end_accounts_int.statement, engine).set_index("account_id")
nn_end = session.query(func.count(q_end_accounts_int.subquery().c.account_id)).one()[0] or 0


In [21]:
qe_s_int=q_end_accounts_int.subquery()
q_churned_accounts_int = q_start_accounts_int.join(qe_s_int, Event.account_id == qe_s_int.c.account_id, isouter=True)\
    .filter(qe_s_int.c.account_id == None)\
    .distinct()

# reading into pandas
churned_accounts_int = pd.read_sql(q_churned_accounts_int.statement, engine).set_index("account_id")
nn_churn = session.query(func.count(q_churned_accounts_int.subquery().c.account_id)).one()[0] or 0

churn_rate_int=nn_churn / nn_start # <- churned
retention_rate_int = 1.0 - nn_churn / nn_start # <- "survived", i.e. complentary = 1-churn_rate

df=pd.DataFrame.from_dict({
    "churn_rate_int":[churn_rate_int],
    "retention_rate_int":[retention_rate_int],
    "nn_start":[nn_start],
    "nn_churn":[nn_churn]
})
print(df)


   churn_rate_int  retention_rate_int  nn_start  nn_churn
0        0.058185            0.941815     10965       638


## MRR Churn (§2.4)

In [11]:
qs=q_start_accounts.subquery()
qe=q_end_accounts.subquery()
q_downsell_accounts = session.query(qs.c.account_id, (qs.c.total_mrr-qe.c.total_mrr).label("downsell_amount"))\
    .join(qe, qs.c.account_id == qe.c.account_id)\
    .filter(qe.c.total_mrr < qs.c.total_mrr)

pd.read_sql(q_downsell_accounts.statement, engine).set_index("account_id")

Unnamed: 0_level_0,downsell_amount
account_id,Unnamed: 1_level_1
39,100.0
58,100.0
391,100.0
496,100.0
2817,100.0
2829,100.0
4190,100.0
4413,100.0
4994,100.0
5742,100.0


In [12]:
start_mrr = session.query(func.sum(q_start_accounts.subquery().c.total_mrr)).one()[0] or 0
churn_mrr = session.query(func.sum(q_churned_accounts.subquery().c.total_mrr)).one()[0] or 0
downsell_mrr = session.query(func.sum(q_downsell_accounts.subquery().c.downsell_amount)).one()[0] or 0

mrr_churn_rate = (churn_mrr+downsell_mrr) /start_mrr 

df=pd.DataFrame.from_dict({
    "mrr_churn_rate":[mrr_churn_rate],
    "start_mrr":[start_mrr],
    "churn_mrr":[churn_mrr],
    "downsell_mrr":[downsell_mrr],
})
print(df)


   mrr_churn_rate  start_mrr  churn_mrr  downsell_mrr
0        0.174184    32150.0     3900.0        1700.0


## Churn Rate scaled (§2.5)

In [13]:

period_days=days_between(d_start_date, d_end_date)

measured_churn =  n_churn / n_start
#period_days = end_date - start_date
annual_churn = 1- pow(1- measured_churn, 365.0 / period_days)
monthly_churn = 1- pow(1- measured_churn, 365.0 /12.0 / period_days)

df=pd.DataFrame.from_dict({
    "n_start":[n_start],
    "n_churn":[n_churn],
    "measured_churn":[measured_churn],
    "period_days":[period_days],
    "annual_churn":[annual_churn],
    "monthly_churn":[monthly_churn]
})
print(df)


   n_start  n_churn  measured_churn  period_days  annual_churn  monthly_churn
0      338       37        0.109467           60      0.506027       0.057079
