## Access to the DB

### SQLite

In [1]:
import os
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
import pandas as pd

# The examples below run against a local SQLite database file.
sqlfile = "../churn.db"
sqlite_uri = f"sqlite:///{sqlfile}"
engine = create_engine(sqlite_uri)

# One session, bound to the engine above, is shared by all following cells.
session_factory = sessionmaker(bind=engine)
session = session_factory()

# Extra keyword arguments for pandas' to_sql(); none are needed for SQLite.
db_options = dict()

from churnmodels.schema import ActiveWeek, ActivePeriod, Account, Metric, MetricName, Subscription, Event, EventType


### PostgreSQL

In [1]:
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd

from churnmodels.schema import get_schema, get_db_uri

# Connection settings; the dict is handed unchanged to get_schema() and
# get_db_uri() below.
# NOTE(review): the credentials are hard-coded — consider reading them from the
# environment before sharing this notebook.
options = {
    "user": "postgres",
    "pass": "password",
    "dbname": "churn",
    "schema": "biznet1",
    # "host": "localhost",  # ...if needed
    # "port": "5432",  # ...if needed
}
# if we try to write to the DB with pandas, we need to specify the schema
db_options = {"schema": options["schema"]}

# T is a (dynamic) module containing wrapper classes for our database tables
T = get_schema(options)

# connect to the database
db_uri = get_db_uri(options, "postgres")  # "postgres" names the dialect we are using
engine = create_engine(db_uri)
session = sessionmaker(bind=engine)()

# Bring all tables in T into the global namespace. Only names starting with an
# upper-case letter are copied. Writing to globals() directly replaces the
# former exec() of generated source code and has the same effect at cell level.
for tbl in list(T.__dict__):
    if tbl[:1].isupper():
        globals()[tbl] = getattr(T, tbl)



In [2]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def days_between(d1, d2):
    """Return the absolute number of whole days between two dates.

    Both arguments are strings in ``YYYY-MM-DD`` format; their order does not
    matter. ``datetime.strptime`` raises ``ValueError`` for malformed input.
    """
    date_format = "%Y-%m-%d"
    first, second = (datetime.strptime(day, date_format) for day in (d1, d2))
    delta = second - first
    return abs(delta.days)



In [3]:
from sqlalchemy import func, or_
import pandas as pd

# Date window (YYYY-MM-DD strings) used by this notebook.
# The window 2020-01-01 .. 2020-03-01 that used to be assigned first was
# overwritten immediately and never took effect, so only the effective values
# remain.
d_start_date = "2020-03-01"
d_end_date = "2020-04-01"


## ongoing active accounts (§4.1)

In [4]:
"""
with RECURSIVE active_period_params as    
(
    select interval '%gap_interval'  as allowed_gap,
    '%to_yyyy-mm-dd'::date as calc_date
),
active as  
(
	select distinct account_id, min(start_date) as start_date    
	from subscription inner join active_period_params 
on start_date <= calc_date    
		and (end_date > calc_date or end_date is null)
	group by account_id

	UNION

	select s.account_id, s.start_date  
	from subscription s 
	cross join active_period_params 
	inner join active e on s.account_id=e.account_id  
		and s.start_date < e.start_date  
		and s.end_date >= (e.start_date-allowed_gap)::date  

) 
insert into active_period (account_id, start_date, churn_date)     
select account_id, min(start_date) as start_date, NULL::date as churn_date  
from active
group by account_id, churn_date  
"""

from sqlalchemy import distinct, literal, and_, Float, cast, select


# we set the allowed gap to one week (in days)
allowed_gap = 7
# ...and the calculation date
d_calc_date = "2020-05-10"

# Anchor member of the recursive CTE: per account, the earliest start date
# among the subscriptions that are active on the calculation date.
# A plain DISTINCT is used: distinct(<column>) is PostgreSQL-only DISTINCT ON,
# and the GROUP BY already yields one row per account.
active = (
    session.query(
        Subscription.account_id,
        func.min(Subscription.start_date).label("start_date"),
    )
    .filter(
        Subscription.start_date <= d_calc_date,
        or_(Subscription.end_date > d_calc_date, Subscription.end_date.is_(None)),
    )
    .group_by(Subscription.account_id)
    .distinct()
    .cte(recursive=True, name="active")
)

# Earliest end date an older subscription may have and still count as
# contiguous with the period found so far. PostgreSQL can subtract a number of
# days from a date directly; SQLite needs julianday()/date() for date
# arithmetic, otherwise the comparison below is not a date comparison.
if engine.dialect.name == "sqlite":
    active_gap_limit = func.date(func.julianday(active.c.start_date) - allowed_gap)
else:
    active_gap_limit = active.c.start_date - allowed_gap

# Recursive member: earlier subscriptions of the same account that ended no
# more than `allowed_gap` days before the period found so far.
active_L = (
    session.query(Subscription.account_id, Subscription.start_date)
    .join(active, Subscription.account_id == active.c.account_id)
    .filter(
        and_(
            Subscription.start_date < active.c.start_date,
            Subscription.end_date >= active_gap_limit,
        )
    )
)

union_all = active.union_all(active_L)
subq = session.query(union_all).subquery()

# One row per account: the earliest start date reached by the recursion above.
# churn_date is NULL because these periods are still ongoing; the CAST to the
# target column's type mirrors the `NULL::date` of the reference SQL, so the
# INSERT ... SELECT further down receives a typed column.
# The statement is the same for PostgreSQL and SQLite, so it is built once
# (it used to be built twice with identical content).
# If a tenure column is ever added here: PostgreSQL can subtract the dates
# directly, SQLite needs julianday() to calculate the date difference.
qr = (
    session.query(
        subq.c.account_id,
        func.min(subq.c.start_date).label("start_date"),
        cast(literal(None), ActivePeriod.__table__.c.churn_date.type).label("churn_date"),
    )
    .group_by(subq.c.account_id, "churn_date")
    .order_by("account_id")
)

account_tenure = pd.read_sql(qr.statement, engine)
print(account_tenure)

# Delete the rows a previous run of this cell inserted. Only ongoing periods
# (churn_date IS NULL) are removed, so churned periods already stored in the
# same table are kept.
session.commit()
old_metrics = session.query(ActivePeriod).filter(
    ActivePeriod.__table__.c.churn_date.is_(None)
)
# the session is committed right away, so no in-session synchronisation is needed
old_metrics.delete(synchronize_session=False)
session.commit()


# INSERT INTO active_period (...) SELECT ... FROM (the query built above)
new_active_period_insert = qr.cte("new_active_period_insert")
select_stm = select([
        new_active_period_insert.c.account_id,
        new_active_period_insert.c.start_date,
        new_active_period_insert.c.churn_date])
columns = ['account_id', 'start_date', 'churn_date']
session.execute(ActivePeriod.__table__.insert().from_select(columns, select_stm))
session.commit()


       account_id  start_date churn_date
0               1  2020-01-21       None
1               2  2020-01-13       None
2               4  2020-01-29       None
3               5  2020-01-07       None
4               7  2020-01-27       None
...           ...         ...        ...
11792       14634  2020-05-03       None
11793       14637  2020-05-06       None
11794       14638  2020-05-08       None
11795       14640  2020-05-09       None
11796       14641  2020-05-10       None

[11797 rows x 3 columns]


## churned periods (§4.2)

In [None]:
"""
with RECURSIVE active_period_params as    
(
	select INTERVAL '%gap_interval' as allowed_gap,
	       '%to_yyyy-mm-dd'::date as observe_end,
	       '%from_yyyy-mm-dd'::date as observe_start
),
end_dates as    
(
	select distinct account_id, start_date, end_date, 
(end_date +  allowed_gap)::date as extension_max
	from subscription inner join active_period_params 
	on end_date between observe_start and observe_end    
), 
resignups as     
(
	select distinct e.account_id, e.end_date   
	from end_dates e inner join subscription s on e.account_id = s.account_id
		and s.start_date <= e.extension_max
		and (s.end_date > e.end_date or s.end_date is null)      
),
churns as    
(
	select e.account_id, e.start_date, e.end_date as churn_date    
	from end_dates e left outer join resignups r  
	on e.account_id = r.account_id    
		and e.end_date = r.end_date
	where r.end_date is null    

	UNION

	select s.account_id, s.start_date, e.churn_date    
	from subscription s 
	cross join active_period_params
	inner join churns e on s.account_id=e.account_id
		and s.start_date < e.start_date
		and s.end_date >= (e.start_date- allowed_gap)::date
) 
insert into active_period (account_id, start_date, churn_date)    
select account_id, min(start_date) as start_date, churn_date  
from churns
group by account_id, churn_date
"""

# we set the allowed gap to two weeks (in days)
allowed_gap = 14
d_observe_start = "2020-02-09"
d_observe_end = "2020-05-10"

# Latest date on which a new subscription may start and still count as a
# continuation of the one that ended. julianday()/DATE() exist only in SQLite;
# PostgreSQL can add a number of days to a date directly.
if engine.dialect.name == "sqlite":
    extension_max = func.DATE(func.julianday(Subscription.end_date) + allowed_gap)
else:
    extension_max = Subscription.end_date + allowed_gap

# Subscriptions that ended inside the observation window.
# A plain DISTINCT over all selected columns matches the reference SQL;
# distinct(<column>) would become DISTINCT ON (account_id) on PostgreSQL and
# keep only one ended subscription per account.
end_dates = (
    session.query(
        Subscription.account_id,
        Subscription.start_date,
        Subscription.end_date,
        extension_max.label("extension_max"),
    )
    .filter(
        and_(
            Subscription.end_date >= d_observe_start,
            Subscription.end_date <= d_observe_end,
        )
    )
    .distinct()
    .cte("end_dates")
)

# Ended subscriptions that were followed (within the allowed gap) by another
# subscription of the same account, i.e. the account did not churn.
# A plain DISTINCT over (account_id, end_date) matches the reference SQL;
# distinct(<column>) would become DISTINCT ON (account_id) on PostgreSQL and
# keep only one end date per account.
resignups = (
    session.query(
        end_dates.c.account_id,
        end_dates.c.end_date,
    )
    .join(Subscription, end_dates.c.account_id == Subscription.account_id)
    .filter(
        Subscription.start_date <= end_dates.c.extension_max,
        or_(
            Subscription.end_date > end_dates.c.end_date,
            Subscription.end_date.is_(None),
        ),
    )
    .distinct()
    .cte("resignups")
)

# Anchor member of the recursive "churns" CTE: ended subscriptions without a
# matching row in `resignups` (anti-join via LEFT OUTER JOIN ... IS NULL).
same_ended_subscription = and_(
    resignups.c.account_id == end_dates.c.account_id,
    resignups.c.end_date == end_dates.c.end_date,
)
churns = (
    session.query(
        end_dates.c.account_id,
        end_dates.c.start_date,
        end_dates.c.end_date.label("churn_date"),
    )
    .outerjoin(resignups, same_ended_subscription)
    .filter(resignups.c.end_date.is_(None))
    .cte(name="churns", recursive=True)
)

# Earliest end date an older subscription may have and still belong to the
# churned period found so far. PostgreSQL can subtract a number of days from a
# date directly; SQLite needs julianday()/DATE() for date arithmetic.
if engine.dialect.name == "sqlite":
    churn_gap_limit = func.DATE(func.julianday(churns.c.start_date) - allowed_gap)
else:
    churn_gap_limit = churns.c.start_date - allowed_gap

# Recursive member: walk back to earlier subscriptions of the same account.
# The start date must be strictly earlier (`<`, as in the reference SQL); with
# `==` the step could select the same subscription again and, combined with
# UNION ALL, never terminate.
churns_L = session.query(
    Subscription.account_id,
    Subscription.start_date,
    churns.c.churn_date,
).join(
    churns,
    and_(
        Subscription.account_id == churns.c.account_id,
        Subscription.start_date < churns.c.start_date,
        Subscription.end_date >= churn_gap_limit,
    ),
)

union_all = churns.union_all(churns_L)
subq = session.query(union_all).subquery()

# One row per (account, churn date) with the earliest start date reached by
# the recursion, as in the final SELECT of the reference SQL.
qr = session.query(
    subq.c.account_id,
    func.min(subq.c.start_date).label("start_date"),
    subq.c.churn_date,
).group_by(subq.c.account_id, subq.c.churn_date)

#ddf=pd.read_sql(qr.statement, engine)
#print(ddf)

# Delete the rows a previous run of this cell inserted. Only churned periods
# (churn_date IS NOT NULL) are removed; deleting the whole table would also
# throw away the ongoing periods (churn_date IS NULL) stored in active_period.
session.commit()
old_metrics = session.query(ActivePeriod).filter(
    ActivePeriod.__table__.c.churn_date.isnot(None)
)
# the session is committed right away, so no in-session synchronisation is needed
old_metrics.delete(synchronize_session=False)
session.commit()


# INSERT INTO active_period (...) SELECT ... FROM (the query built above)
new_active_period_insert = qr.cte("new_active_period_insert")
select_stm = select([
        new_active_period_insert.c.account_id,
        new_active_period_insert.c.start_date,
        new_active_period_insert.c.churn_date])
columns = ['account_id', 'start_date', 'churn_date']
session.execute(ActivePeriod.__table__.insert().from_select(columns, select_stm))
session.commit()



## active event weeks (§4.3)

In [19]:
"""
WITH
   periods as (    
	select i::timestamp as period_start, i::timestamp + '7 day'::interval as period_end 
	from generate_series('%from_yyyy-mm-dd', '%to_yyyy-mm-dd', '7 day'::interval) i
)
insert into active_week (account_id, start_date, end_date)
select account_id, 
period_start::date,     
period_end::date
from event inner join periods on event_time>=period_start    
	and event_time < period_end     
group by account_id, period_start, period_end    

"""
from sqlalchemy import func, and_, select
from sqlalchemy import MetaData, Table, Column, Date, Integer
from churnmodels.helpers import make_day_interval

d_period_start = "2020-02-09"
d_period_end = "2020-05-10"


# making the date intervals available in the database
freq = "7D"  # a week has 7 days
# NOTE(review): the former comment here said "4 weeks back in time" although
# the value is 1 — confirm the meaning against make_day_interval().
periods = 1
days_df = make_day_interval(d_period_start, d_period_end, periods, freq)

# The temporary table lives in the same schema pandas writes to (if any).
meta = MetaData(schema=db_options.get("schema"))
TmpPeriodsVec = Table('tmp_periods_vec', meta,
                   Column('id', Integer, primary_key=True, autoincrement=True),
                   Column('start_date', Date), Column('end_date', Date))
# Recreate the table from scratch. The engine is passed explicitly and
# checkfirst=True skips the DROP when the table does not exist; this replaces
# the deprecated bound-metadata calls Table.exists()/drop()/create().
TmpPeriodsVec.drop(bind=engine, checkfirst=True)
TmpPeriodsVec.create(bind=engine)
# storing a temporary table to the database with pandas
days_df.to_sql("tmp_periods_vec", con=engine, **db_options, if_exists='append', index=False)
session.commit()

# Expose the stored intervals as a CTE named "periods", with the column names
# used in the reference SQL.
period_start = TmpPeriodsVec.c.start_date.label("period_start")
period_end = TmpPeriodsVec.c.end_date.label("period_end")
periods = session.query(period_start, period_end).cte("periods")

# An event belongs to a period when period_start <= event_time < period_end.
event_in_period = and_(
    Event.event_time >= periods.c.period_start,
    Event.event_time < periods.c.period_end,
)

# One row per (account, period) in which the account had at least one event.
active_week_columns = (
    Event.account_id,
    periods.c.period_start,
    periods.c.period_end,
)
qr = (
    session.query(*active_week_columns)
    .join(Event, event_in_period)
    .group_by(*active_week_columns)
)

#qr=session.query(periods)
#print(qr.statement)
#ddf=pd.read_sql(qr.statement, engine)
#print(ddf)

# delete if existing already: like the other insert cells of this notebook,
# start from an empty target table so that re-running the cell does not insert
# the same weeks a second time. Delete and insert share one transaction.
session.query(ActiveWeek).delete(synchronize_session=False)

# INSERT INTO active_week (...) SELECT ... FROM (the query built above);
# period_start/period_end are mapped onto start_date/end_date by position.
new_active_period_insert = qr.cte("new_active_period_insert")
select_stm = select([
        new_active_period_insert.c.account_id,
        new_active_period_insert.c.period_start,
        new_active_period_insert.c.period_end])
target_columns = ['account_id', 'start_date', 'end_date']
session.execute(ActiveWeek.__table__.insert().from_select(target_columns, select_stm))
session.commit()


## observation dates (§4.4)

In [None]:
"""
with RECURSIVE observation_params as    
(    
	select interval '%obs_interval' as obs_interval,
	       interval '%lead_time'  as lead_time,
	       '%from_yyyy-mm-dd'::date as obs_start,
	       '%to_yyyy-mm-dd'::date as obs_end
),observations as (    
	select  account_id,
	    start_date,
	    1 as obs_count,
	    (start_date+obs_interval-lead_time)::date as obs_date,
	    case 
	         when churn_date >= (start_date +   obs_interval-lead_time)::date 
		      and churn_date <  (start_date + 2*obs_interval-lead_time)::date
				then true 
		    else false 
		end as is_churn    
	from active_period inner join observation_params
	on (churn_date > (obs_start+obs_interval-lead_time)::date   
        or churn_date is null)

	UNION    

	SELECT  o.account_id,
	    o.start_date,
		 obs_count+1 as obs_count,
	    (o.start_date+(obs_count+1)*obs_interval-lead_time)::date as obs_date,
		case 
	        when churn_date >= (o.start_date + (obs_count+1)*obs_interval-lead_time)::date
		      and churn_date < (o.start_date + (obs_count+2)*obs_interval-lead_time)::date
				then true 
			else false 
		end as is_churn     
	from observations o inner join observation_params
	on   (o.start_date+(obs_count+1)*obs_interval-lead_time)::date <= obs_end
	inner join active_period s on s.account_id=o.account_id    
	and  (o.start_date+(obs_count+1)*obs_interval-lead_time)::date >= s.start_date
	and ((o.start_date+(obs_count+1)*obs_interval-lead_time)::date < s.churn_date or churn_date is null)
) 
insert into observation (account_id, observation_date, is_churn)
select distinct account_id, obs_date, is_churn
from observations
inner join observation_params on obs_date between obs_start and obs_end
"""