In [1]:
"""
Go to the project root folder and run:
> python setup.py sdist

a new package will be created and stored in the subfolder "./dist"

now run pip install, e.g.:
> pip install dist/churnmodels-0.0.1.tar.gz
"""
import churnmodels
print(churnmodels.__version__)
# expected output: the version number of the installed churnmodels package


0.0.1


In [2]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from churnmodels import Subscription, Event, Account

# Path of the SQLite database file. It can be overridden with the CHURN_DB_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; the previously hard-coded location remains the default.
sqlfile = os.environ.get("CHURN_DB_PATH", "c:/tmp/churn4.db")
engine = create_engine(f"sqlite:///{sqlfile}")

# One ORM session bound to the engine; it is used to build the queries below.
session = sessionmaker(bind=engine)()

In [3]:
from sqlalchemy import func, or_
from churnmodels import Subscription, Event
import pandas as pd


d_start_date = "2020-01-01"
d_end_date = "2020-03-01"

# Reference PostgreSQL query for start_accounts (the dates of the SQL
# `date_range d` relation are supplied here by the Python variables above):
#
#     select  account_id, sum (mrr) as total_mrr
#     from subscription s inner join date_range d on
#         s.start_date <= d.start_date
#         and (s.end_date > d.start_date or s.end_date is null)
#     group by account_id

# start_accounts: total MRR per account over the subscriptions active on d_start_date
qa = (
    session.query(
        Subscription.account_id,
        func.sum(Subscription.mrr).label("total_mrr"),
    )
    .filter(
        # SQL: s.start_date <= d.start_date
        Subscription.start_date <= d_start_date,
        # SQL: s.end_date > d.start_date or s.end_date is null
        or_(
            Subscription.end_date > d_start_date,
            Subscription.end_date.is_(None),
        ),
    )
    # SQL: group by account_id
    .group_by(Subscription.account_id)
)

# Run the statement and load the rows into a DataFrame indexed by account_id
start_accounts = pd.read_sql(qa.statement, engine).set_index("account_id")
print(start_accounts)

            total_mrr
account_id           
1                50.0
4                50.0
82               50.0
101              50.0
120              50.0
...               ...
9885            100.0
9897             50.0
9900            100.0
9967             50.0
9983            200.0

[305 rows x 1 columns]


In [4]:
# end_accounts: total MRR per account over the subscriptions active on d_end_date
qe = (
    session.query(
        Subscription.account_id,
        func.sum(Subscription.mrr).label("total_mrr"),
    )
    .filter(
        # subscription started on or before the end date ...
        Subscription.start_date <= d_end_date,
        # ... and ends after it, or has no end date at all
        or_(
            Subscription.end_date > d_end_date,
            Subscription.end_date.is_(None),
        ),
    )
    .group_by(Subscription.account_id)
)
# Debugging tip: to inspect a single account, narrow the query first, e.g.
#   qe = qe.filter(Subscription.account_id == 64)
end_accounts = pd.read_sql(qe.statement, engine).set_index("account_id")
print(end_accounts)

            total_mrr
account_id           
1                50.0
2               100.0
3               200.0
4                50.0
5                50.0
...               ...
12008           200.0
12041           100.0
12080           100.0
12084            50.0
12100           200.0

[10346 rows x 1 columns]


In [5]:
# pandas version of the join
# retained_accounts: accounts present in both start_accounts and end_accounts
# (inner join on account_id)
retained_accounts = start_accounts.merge(end_accounts, how="inner", on="account_id")
print(retained_accounts)
# total_mrr_x comes from start_accounts and total_mrr_y from end_accounts;
# the two values can differ for an account (e.g. account 120: 50.0 vs. 100.0)

            total_mrr_x  total_mrr_y
account_id                          
1                  50.0         50.0
4                  50.0         50.0
82                 50.0         50.0
101                50.0         50.0
120                50.0        100.0
...                 ...          ...
9885              100.0        100.0
9897               50.0         50.0
9900              100.0        100.0
9967               50.0         50.0
9983              200.0        200.0

[272 rows x 2 columns]


In [6]:
from sqlalchemy.orm import aliased
# Alias of the Account entity; the remaining statements of this cell do not use it.
a1 = aliased(Account)

# Selecting from both subqueries without a join condition: the printed SQL
# lists them side by side in the FROM clause ("... AS anon_1, (...) AS anon_2")
# with no ON/WHERE linking them. Only the statement is printed here;
# the query itself is not executed.
start_sq = qa.subquery()
end_sq = qe.subquery()
qall = session.query(start_sq, end_sq)
print(qall.statement)

SELECT anon_1.account_id, anon_1.total_mrr, anon_2.account_id, anon_2.total_mrr 
FROM (SELECT subscription.account_id AS account_id, sum(subscription.mrr) AS total_mrr 
FROM subscription 
WHERE subscription.start_date <= :start_date_1 AND (subscription.end_date > :end_date_1 OR subscription.end_date IS NULL) GROUP BY subscription.account_id) AS anon_1, (SELECT subscription.account_id AS account_id, sum(subscription.mrr) AS total_mrr 
FROM subscription 
WHERE subscription.start_date <= :start_date_2 AND (subscription.end_date > :end_date_2 OR subscription.end_date IS NULL) GROUP BY subscription.account_id) AS anon_2


In [8]:
"""
If the subqueries qa and qe are large, it is more efficient not to load the subselects into pandas DataFrames.
With sqlalchemy we can simply join the two subqueries instead:
"""
qe_s = qe.subquery()
# The join condition references account_id through the "columns" collection
# of the subquery object.
same_account = Subscription.account_id == qe_s.columns.account_id
qall = qa.join(qe_s, same_account)

# let's have a look at the SQL statement sqlalchemy produces
print(qall.statement)

# Reading into pandas. Note: the statement selects only account_id and the
# total_mrr of qa (the start-date query); the joined subquery contributes no
# columns and only restricts the accounts to those also present in qe.
joined_accounts = pd.read_sql(qall.statement, engine).set_index("account_id")
print(joined_accounts)
                              

SELECT subscription.account_id, sum(subscription.mrr) AS total_mrr 
FROM subscription JOIN (SELECT subscription.account_id AS account_id, sum(subscription.mrr) AS total_mrr 
FROM subscription 
WHERE subscription.start_date <= :start_date_1 AND (subscription.end_date > :end_date_1 OR subscription.end_date IS NULL) GROUP BY subscription.account_id) AS anon_1 ON subscription.account_id = anon_1.account_id 
WHERE subscription.start_date <= :start_date_2 AND (subscription.end_date > :end_date_2 OR subscription.end_date IS NULL) GROUP BY subscription.account_id
            total_mrr
account_id           
1                50.0
4                50.0
82               50.0
101              50.0
120              50.0
...               ...
9885            100.0
9897             50.0
9900            100.0
9967             50.0
9983            200.0

[272 rows x 1 columns]
