## Access to the DB

### SQLite

In [1]:
import os
import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
from churnmodels.schema import get_db_uri

options = {
        "dialect":"sqlite",
        "file":"../churn.db"
    }

if True:
    # connect to the database
    db_uri=get_db_uri(options, "sqlite") # "postgres" names the dialect we are using
    engine=create_engine(db_uri)
    session = sessionmaker(bind=engine)()

    # we get the log function from an extension library for sqlite
    from sqlalchemy import event
    @event.listens_for(engine, "connect")
    def connect(dbapi_connection, connection_rec):
        dbapi_connection.enable_load_extension(True)
        dbapi_connection.execute('SELECT load_extension("libsqlitefunctions")')


### PostGres

In [1]:
import os
import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
from churnmodels.schema import get_db_uri

options = {
        "dialect":"postgresql",
        "user": "postgres",
           "pass": "password",
           "dbname": "churn",
           "schema": "biznet1"
           #"host" : "localhost" # ...if needed
           #"port" : "5432" # ...if needed
           }

if True:
    # connect to the database
    db_uri=get_db_uri(options, "postgres") # "postgres" names the dialect we are using
    engine=create_engine(db_uri)
    engine.dialect.has_schema(engine, options["schema"]) 
    session = sessionmaker(bind=engine)()


In [2]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sqlalchemy import func, or_
import pandas as pd
from churnmodels import DBHelper
from churnmodels.helpers import days_between, pretty_sql
from churnmodels.schema import get_schema_rfl

#d_start_date = "2020-01-01"
#d_end_date = "2020-03-01"

d_start_date = "2020-03-01"
d_end_date = "2020-04-01"

metric_period=7
d_obs_start = "2020-02-09"
d_obs_end = "2020-05-10"

# tables is a (dynamical) module containg Wrapper classes for our data base
T=get_schema_rfl(options)
# ..how to bring all tables in T to the global namespace
for tbl in T.__dict__.keys():
    if not tbl[0].isupper():
        continue
    exec(f"{tbl} = T.{tbl}")

dbhelper=DBHelper(options)


In [3]:
from sqlalchemy import func
if session.bind.dialect.name == "sqlite":
    # sqlite problematic when computing days
    to_days = lambda some_date: func.julianday(some_date)
else:
    # dummy func because of sqlite
    to_days = lambda some_date: func.DATE(some_date)


## ratio metric (§7.1)

In [5]:
# actual list of metric names
pd.read_sql(session.query(MetricName).statement, engine)

Unnamed: 0,id,metric_name_id,metric_name
0,1,1,post_per_month
1,2,2,newfriend_per_month
2,3,3,like_per_month
3,4,4,adview_per_month
4,5,5,dislike_per_month
5,6,6,unfriend_per_month
6,7,7,message_per_month
7,8,8,reply_per_month
8,9,9,account_tenure
9,10,10,adview_per_post


In [6]:
# adding new metric

# this step should be exercised before §3.3 to have a new metric id when inserting to the Metric Table
"""
insert into metric_name values (%new_metric_id,'%new_metric_name')
"""
from sqlalchemy import func
max_id=9
new_metrics=[
    "adview_per_post",
    "reply_per_message",
    "like_per_post",
    "post_per_message",
    "unfriend_per_newfriend",
    
    "unfriend_per_newfriend_scaled",
    "dislike_pcnt",
    "newfriend_pcnt_chng",
    "unfriend_28day_avg_84day_obs",
    "unfriend_28day_avg_84day_obs_scaled",
    "days_since_newfriend",
    ]

# we simply delete old metrics
new_metric_id=max_id
old_metrics=MetricName.__table__.delete().where(MetricName.metric_name_id>new_metric_id)
session.execute(old_metrics)

# ... and add the new ones
max_id=session.query(func.max(MetricName.metric_name_id)).one()[0] or 0
for metric_name in new_metrics:
    max_id+=1
    session.execute(MetricName.__table__.insert(), {"metric_name": metric_name, "metric_name_id":max_id})

session.commit()



In [7]:
# you can omit these routine by usiong its implementation from DBHelper...
from sqlalchemy import func, and_, case, literal

d_obs_start = "2020-03-01"
d_obs_end = "2020-05-10"
obs_start=to_days(func.DATE(d_obs_start))
obs_end=to_days(func.DATE(d_obs_end))

def make_relative_metrics_part(d_obs_start, d_obs_end, cte_name, target_field, metric_name):
    num_metric=session.query(
            Metric.account_id, 
            Metric.metric_time, 
            Metric.metric_value.label(target_field)
            )\
        .join(MetricName, Metric.metric_name_id==MetricName.metric_name_id)\
        .filter(MetricName.metric_name == metric_name,
            Metric.metric_time.between(d_obs_start, d_obs_end)
           )\
        .order_by(Metric.account_id, Metric.metric_time)\
        .cte(cte_name)
    return num_metric

def make_relative_metrics_sub(num_metric, den_metric, metric_name_lu=-1):
    qr=session.query(
            num_metric.c.account_id,
            func.DATE(num_metric.c.metric_time),
            literal(metric_name_lu).label("metric_name_id"),
            case([
                (num_metric.c.num_value == None, 0),
                (den_metric.c.den_value == None, 0),
                (den_metric.c.den_value == 0, 0),
                (den_metric.c.den_value != 0, num_metric.c.num_value/den_metric.c.den_value)
            ], else_=0). label("metric_value"),
            case([
                (num_metric.c.num_value == None, 0),
                (den_metric.c.den_value == None, 0),
                (den_metric.c.den_value == 0, 0),
                (num_metric.c.num_value/den_metric.c.den_value > 0, func.log(num_metric.c.num_value/den_metric.c.den_value))
            ], else_=0). label("metric_value_log")
            )\
        .join(den_metric, and_(
            num_metric.c.account_id==den_metric.c.account_id,
            num_metric.c.metric_time==den_metric.c.metric_time
        ), isouter=True)\
        .order_by(num_metric.c.account_id, num_metric.c.metric_time)\
    
    return qr

num_metric = make_relative_metrics_part(d_obs_start, d_obs_end, "num_metric", "num_value", "like_per_month")
den_metric = make_relative_metrics_part(d_obs_start, d_obs_end, "den_metric", "den_value", "dislike_per_month")
qr = make_relative_metrics_sub(num_metric, den_metric)
#print(qr.statement)

In [8]:
qr=dbhelper.make_relative_metrics(d_obs_start, d_obs_end, "like_per_month", "dislike_per_month")
ddf=pd.read_sql(qr.statement, engine)
print(ddf)

       account_id metric_time  metric_name_id  metric_value  metric_value_log
0               1  2020-03-01              -1     21.000000          3.044522
1               1  2020-03-08              -1     21.500000          3.068053
2               1  2020-03-15              -1     23.500000          3.157000
3               1  2020-03-22              -1     29.000000          3.367296
4               1  2020-03-29              -1     61.000000          4.110874
...           ...         ...             ...           ...               ...
54373       12098  2020-03-29              -1      5.750000          1.749200
54374       12099  2020-03-22              -1     22.750000          3.124565
54375       12099  2020-03-29              -1     25.428571          3.235873
54376       12100  2020-03-22              -1     66.000000          4.189655
54377       12100  2020-03-29              -1     86.333333          4.458216

[54378 rows x 5 columns]


In [26]:
from sqlalchemy import select

todo={
     "like_per_dislike":["like_per_month", "dislike_per_month"],
     "adview_per_post":["adview_per_month", "post_per_month"],
    "reply_per_message":["reply_per_month", "message_per_month"],
    "like_per_post":["like_per_month", "post_per_month"],
    "post_per_message":["post_per_month", "message_per_month"],
    "unfriend_per_newfriend":["unfriend_per_month", "newfriend_per_month"],
    
    # run this step later...
    #"unfriend_per_newfriend_scaled":["unfriend_28day_avg_84day_obs_scaled", "newfriend_per_month"],
}

for newmetricname, pairs in todo.items():
    metric_name_id=-1
    metric_name_lu=session.query(MetricName.metric_name_id)\
        .filter(MetricName.metric_name == newmetricname).first() #or 0
    if metric_name_lu is not None:
        metric_name_lu = metric_name_lu[0]
        metric_name_id = metric_name_lu
    print(f"new metric_name_id={metric_name_id} for {newmetricname}")
    #num_metric=make_relative_metrics_part(d_obs_start, d_obs_end, "num_metric", "num_value", pairs[0])
    #den_metric=make_relative_metrics_part(d_obs_start, d_obs_end, "den_metric", "den_value", pairs[1])
    #qr=qr=make_relative_metrics_sub(num_metric, den_metric, metric_name_id)
    qr=dbhelper.make_relative_metrics(d_obs_start, d_obs_end, pairs[0], pairs[1])
    #print(pretty_sql(qr))
    ddf=pd.read_sql(qr.statement, dbhelper.engine)
    print(ddf)
    
    if metric_name_id>0:
        # delete all old values "new_metric_id"
        session.commit()
        old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
        old_metrics.delete()
        session.commit()
        
        new_metrics_insert=qr.cte("new_metrics_insert")
        select_stm=select([
            new_metrics_insert.c.account_id, 
            func.DATE(new_metrics_insert.c.metric_time).label("metric_time"), 
            literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
            new_metrics_insert.c.metric_value
        ])
        target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
        session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
        session.commit()





new metric_name_id=-1 for like_per_dislike
       account_id metric_time  metric_name_id  metric_value  metric_value_log
0               1  2020-03-01              -1     21.000000          3.044522
1               1  2020-03-08              -1     21.500000          3.068053
2               1  2020-03-15              -1     23.500000          3.157000
3               1  2020-03-22              -1     29.000000          3.367296
4               1  2020-03-29              -1     61.000000          4.110874
...           ...         ...             ...           ...               ...
54373       12098  2020-03-29              -1      5.750000          1.749200
54374       12099  2020-03-22              -1     22.750000          3.124565
54375       12099  2020-03-29              -1     25.428571          3.235873
54376       12100  2020-03-22              -1     66.000000          4.189655
54377       12100  2020-03-29              -1     86.333333          4.458216

[54378 rows x 5 colu

## getting metric datasets (§7.2)

In [10]:
from sqlalchemy import case, func, literal
import numpy as np

# same as before
def get_dataset(d_obs_start, d_end_end, metric_period):
    fields=[
        Metric.account_id,
        Observation.observation_date,
        Observation.is_churn
    ]
    targets={}
    df_metricnames=pd.read_sql(session.query(MetricName).statement,engine)
    for index, row in df_metricnames.iterrows():
        newfield=func.sum(case([
            (Metric.metric_name_id == row.metric_name_id, Metric.metric_value)
            ], else_=0)).label(row.metric_name)
        fields.append(newfield)

    qr=session.query(*fields)\
        .join(Observation, Metric.account_id==Observation.account_id)\
        .filter(
            Metric.metric_time> func.DATE(to_days(Observation.observation_date)-metric_period), 
            Metric.metric_time<= Observation.observation_date)\
        .group_by(Metric.account_id, Metric.metric_time,
                  Observation.observation_date, Observation.is_churn)\
        .order_by(Observation.observation_date, Metric.account_id)

    #print(pretty_sql(qr))
    ddf=pd.read_sql(qr.statement, engine)
    #ddf=ddf.set_index("account_id")
    ddf=ddf.set_index(["account_id", "observation_date"])
    
    return ddf

dataset=dbhelper.get_dataset(d_obs_start, d_obs_end, metric_period)
print(dataset)

                             is_churn  post_per_month  newfriend_per_month  \
account_id observation_date                                                  
16         2020-03-01           False             8.0                  1.0   
24         2020-03-01           False            23.0                  8.0   
118        2020-03-01           False            13.0                  3.0   
144        2020-03-01           False            55.0                 15.0   
157        2020-03-01           False             0.0                  0.0   
...                               ...             ...                  ...   
13181      2020-05-10           False             0.0                  0.0   
13205      2020-05-10           False             0.0                  0.0   
13217      2020-05-10           False             0.0                  0.0   
13280      2020-05-10           False             0.0                  0.0   
13310      2020-05-10           False             0.0           

## total metrics (§7.3)

In [11]:

metric_pairs=['like_per_month', 'dislike_per_month']

q_total_metric=session.query(
        Metric.account_id,
        func.DATE(Metric.metric_time).label("metric_time"),
        func.sum(Metric.metric_value).label("metric_total"),
        )\
    .join(MetricName, MetricName.metric_name_id==Metric.metric_name_id)\
    .filter(MetricName.metric_name.in_(metric_pairs))\
    .group_by(Metric.account_id, Metric.metric_time)

total_metric=pd.read_sql(q_total_metric.statement, engine)
print(total_metric)

#print(pretty_sql(q_total_metric))


       account_id metric_time  metric_total
0               1  2020-03-01          44.0
1               1  2020-03-08          45.0
2               1  2020-03-15          49.0
3               1  2020-03-22          60.0
4               1  2020-03-29          62.0
...           ...         ...           ...
54920       12098  2020-03-29          27.0
54921       12099  2020-03-22          95.0
54922       12099  2020-03-29         185.0
54923       12100  2020-03-22         134.0
54924       12100  2020-03-29         262.0

[54925 rows x 3 columns]


## relative changes of metric (§7.4)

In [12]:
from sqlalchemy import select, literal, union_all
from datetime import timedelta

# additional function to have two column table with start and end dates
# ... it's a copy from §3
def days_interval(d_start_date, d_end_date, step=7, label="date"):
    cnt = session.query(func.DATE(d_start_date).label(label))\
       .cte(name="cnt", recursive=True)
    next_date=func.DATE(to_days(cnt.c[label])+(step)).label(label)
    end_crit=next_date <= d_end_date
    if step<0:
        end_crit=next_date >= d_end_date
    union_all = cnt.union_all(select([next_date], cnt).where(end_crit))
    return session.query(union_all)
    #return union_all

some_start='2020-02-02'
some_end='2020-05-10'
step=7

subq=days_interval(some_start, func.Date(to_days(some_end) - 28), step=step, label="date").subquery()
q1=session.query(
    subq.c.date.label("start_date"),
    func.DATE(to_days(subq.c.date)+28).label("end_date")
    )
df=pd.read_sql(q1.statement, engine)
print(pretty_sql(engine, q1))
print(df)


WITH RECURSIVE cnt(date) AS
  (SELECT DATE('2020-02-02') AS date
   UNION ALL SELECT DATE(julianday(cnt.date) + 7) AS date
   FROM cnt
   WHERE DATE(julianday(cnt.date) + 7) <= Date(julianday('2020-05-10') - 28))
SELECT anon_1.date AS start_date,
       DATE(julianday(anon_1.date) + 28) AS end_date
FROM
  (SELECT cnt.date AS date
   FROM cnt) AS anon_1
    start_date    end_date
0   2020-02-02  2020-03-01
1   2020-02-09  2020-03-08
2   2020-02-16  2020-03-15
3   2020-02-23  2020-03-22
4   2020-03-01  2020-03-29
5   2020-03-08  2020-04-05
6   2020-03-15  2020-04-12
7   2020-03-22  2020-04-19
8   2020-03-29  2020-04-26
9   2020-04-05  2020-05-03
10  2020-04-12  2020-05-10


In [13]:

def make_timespot(fieldname, metricname, d_start, d_end):
    qr=session.query(
        Metric.account_id,
        func.DATE(Metric.metric_time).label("metric_time"),
        MetricName.metric_name,
        Metric.metric_value.label(fieldname),
        )\
    .join(MetricName, MetricName.metric_name_id==Metric.metric_name_id)\
    .filter(Metric.metric_time.between(d_start, d_end))\
    .filter(MetricName.metric_name==metricname)\
    .order_by(Metric.account_id, Metric.metric_time)
    return qr

q_relative=make_timespot("end_value", "post_per_month", '2020-01-01', '2020-05-10')
total_metric=pd.read_sql(q_relative.statement, engine)
print(total_metric)

print(pretty_sql(engine, q_relative))



       account_id metric_time     metric_name  end_value
0               1  2020-03-01  post_per_month        3.0
1               1  2020-03-08  post_per_month        4.0
2               1  2020-03-15  post_per_month        4.0
3               1  2020-03-22  post_per_month        5.0
4               1  2020-03-29  post_per_month        3.0
...           ...         ...             ...        ...
53568       12098  2020-03-29  post_per_month        6.0
53569       12099  2020-03-22  post_per_month       20.0
53570       12099  2020-03-29  post_per_month       49.0
53571       12100  2020-03-22  post_per_month      114.0
53572       12100  2020-03-29  post_per_month      282.0

[53573 rows x 4 columns]
SELECT metric.account_id,
       DATE(metric.metric_time) AS metric_time,
       metric_name.metric_name,
       metric.metric_value AS end_value
FROM metric
JOIN metric_name ON metric_name.metric_name_id = metric.metric_name_id
WHERE metric.metric_time BETWEEN '2020-01-01' AND '2020-05-10

### continuous increments better than discrete

In [14]:
from math import log, exp

# the discrete rate is set to 10%
the_rate=0.1 

# the problem of dicrete increment calculation
some_value=100
# 10% increase
some_value*=1+the_rate
# 10% decrease
some_value*=1-the_rate

# if we increase by 10% and decrease by 10% we should end up with the starting level, but...
print(f"the discrete inc/dec result:{some_value}")

# same calculation with natural logarithmics (ln)
some_value=100
# 10% increase
some_value*=exp(the_rate)
# 10% increase
some_value*=exp(-the_rate)

# that's better!
print(f"the log-inc/dec result:{some_value}")

#convert the continues rate (log) to a discrete value (normally annual)
print(f"the the contionuous rate:{exp(the_rate)-1}")

#calculate with 10% discrete to a continuous rate
print(f"given the discrete rate of {the_rate} the continuous rate is:{log(1+the_rate)}")



the discrete inc/dec result:99.00000000000001
the log-inc/dec result:100.0
the the contionuous rate:0.10517091807564771
given the discrete rate of 0.1 the continuous rate is:0.09531017980432493


### conversion of different time intervals

In [15]:
import numpy as np
import math 
# array of weekly spot values
values=np.array([100,120,90,130,100])
print(f"the time series:{values}")
diffs=np.diff(values) #absolute diffs
print(f"the absolute diffs:{diffs}")

# we calcualte the discrete relative increments
incs=(diffs/values[:-1])
# the discrete increments
print(f"discrete relative increments:{incs}")

incs_ln=np.diff(np.log(values))
print(f"continuous relative increments:{incs_ln}")


# we started with 100 and ended with 100 but the sum if increments is not zero
print(f"sum of discrete relative increments:{np.sum(incs)}")
# ...whilst in the continuous case, it is
print(f"sum of continuous relative increments:{np.sum(incs_ln)}")



the time series:[100 120  90 130 100]
the absolute diffs:[ 20 -30  40 -30]
discrete relative increments:[ 0.2        -0.25        0.44444444 -0.23076923]
continuous relative increments:[ 0.18232156 -0.28768207  0.36772478 -0.26236426]
sum of discrete relative increments:0.16367521367521365
sum of continuous relative increments:0.0


### using the continuous increments of a time series
The 4 weeks increment should be calculated over the mean of the 1-week (or daily) increments and then scaled (multiply by 4/multiply by 28)


In [16]:
from sqlalchemy import and_, case, func, or_
from sqlalchemy.orm import aliased
import numpy as np
import math

def casecheck(thefield, label):
    res=case([
            (and_(thefield != None, thefield >0), 
            func.LOG(thefield)),
        ], else_=0).label(label)
    return res

def casecheck_delta(thefield, thefield_prev, label):
    res=case([
            (and_(thefield != None, thefield >0,
                 thefield_prev != None, thefield_prev >0), 
            func.LOG(thefield)-func.LOG(thefield_prev)),
            (and_(or_(thefield == None, thefield <0),
                 thefield_prev != None, thefield_prev >0), 
            -func.LOG(thefield_prev)),
            (and_(or_(thefield_prev == None, thefield_prev <0),
                 thefield != None, thefield >0), 
            0), # func.LOG(thefield)),
        ], else_=0).label(label)
    return res

def metric_percentage(metricname):
    MetricPrev=aliased(Metric)
    ddelta=7
    subq=session.query(
            Metric.account_id,
            MetricName.metric_name,
            func.DATE(MetricPrev.metric_time).label("time_prev"),
            func.DATE(Metric.metric_time).label("time"),
            MetricPrev.metric_value.label("spot_prev"),
            Metric.metric_value.label("spot"),
            casecheck(MetricPrev.metric_value, "lnx_prev"),
            casecheck(Metric.metric_value, "lnx"),
            casecheck_delta(Metric.metric_value, MetricPrev.metric_value, "lnx_inc")
        )\
        .join(MetricName, MetricName.metric_name_id==Metric.metric_name_id)\
        .join(MetricPrev, and_(
              MetricPrev.account_id==Metric.account_id,
              MetricPrev.metric_name_id==Metric.metric_name_id,
              func.DATE(MetricPrev.metric_time)==func.DATE(to_days((Metric.metric_time))-ddelta)
             ), isouter=True)
    # ordering
    subq=subq.order_by(Metric.account_id, MetricName.metric_name, Metric.metric_time)
    # we should skip all start value of the series (per account_id and metric) given their is no real increment
    subq=subq.filter(MetricPrev.metric_time != None) 
    # filtering to only one metric:
    subq=subq.filter(MetricName.metric_name==metricname)
    return subq

subq=metric_percentage("newfriend_per_month")
#print(pretty_sql(subq))
df=pd.read_sql(subq.statement, engine)
print(df)


       account_id          metric_name   time_prev        time  spot_prev  \
0               1  newfriend_per_month  2020-03-01  2020-03-08        1.0   
1               1  newfriend_per_month  2020-03-08  2020-03-15        1.0   
2               1  newfriend_per_month  2020-03-15  2020-03-22        1.0   
3               1  newfriend_per_month  2020-03-22  2020-03-29        2.0   
4               2  newfriend_per_month  2020-03-01  2020-03-08        8.0   
...           ...                  ...         ...         ...        ...   
37556       12095  newfriend_per_month  2020-03-15  2020-03-22        7.0   
37557       12095  newfriend_per_month  2020-03-22  2020-03-29       14.0   
37558       12097  newfriend_per_month  2020-03-22  2020-03-29        2.0   
37559       12099  newfriend_per_month  2020-03-22  2020-03-29        9.0   
37560       12100  newfriend_per_month  2020-03-22  2020-03-29        6.0   

       spot  lnx_prev       lnx   lnx_inc  
0       1.0  0.000000  0.000000

In [17]:
from sqlalchemy import select, literal

metricname_base="newfriend_per_month"
newmetricname="newfriend_pcnt_chng"

todo={
     "newfriend_pcnt_chng":"newfriend_per_month",
     "dislike_pcnt":"dislike_per_month",
}

for newmetricname, metricname_base in todo.items():

    metric_name_id=-1
    metric_name_lu=session.query(MetricName.metric_name_id)\
        .filter(MetricName.metric_name == newmetricname).first() #or 0
    if metric_name_lu is not None:
        metric_name_lu = metric_name_lu[0]
        metric_name_id = metric_name_lu
        print(f"new metric_name_id={metric_name_id} for {newmetricname}")
        qr=metric_percentage(metricname_base)
        #print(pretty_sql(qr))
        ddf=pd.read_sql(qr.statement, dbhelper.engine)
        print(ddf)

        if metric_name_id>0:
            # delete all old values "new_metric_id"
            session.commit()
            old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
            old_metrics.delete()
            session.commit()

            new_metrics_insert=qr.cte("new_metrics_insert")
            select_stm=select([
                new_metrics_insert.c.account_id, 
                new_metrics_insert.c.time, 
                literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
                new_metrics_insert.c.lnx_inc
            ])
            target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
            session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
            session.commit()




new metric_name_id=17 for newfriend_pcnt_chng
       account_id          metric_name   time_prev        time  spot_prev  \
0               1  newfriend_per_month  2020-03-01  2020-03-08        1.0   
1               1  newfriend_per_month  2020-03-08  2020-03-15        1.0   
2               1  newfriend_per_month  2020-03-15  2020-03-22        1.0   
3               1  newfriend_per_month  2020-03-22  2020-03-29        2.0   
4               2  newfriend_per_month  2020-03-01  2020-03-08        8.0   
...           ...                  ...         ...         ...        ...   
37556       12095  newfriend_per_month  2020-03-15  2020-03-22        7.0   
37557       12095  newfriend_per_month  2020-03-22  2020-03-29       14.0   
37558       12097  newfriend_per_month  2020-03-22  2020-03-29        2.0   
37559       12099  newfriend_per_month  2020-03-22  2020-03-29        9.0   
37560       12100  newfriend_per_month  2020-03-22  2020-03-29        6.0   

       spot  lnx_prev       l

## fat tail scores (§7.5)

In [18]:
# the functions below can be imported from curnmodels.statistics
import pandas as pd
import numpy as np
import os

def transform_skew_columns(data,skew_col_names):
    for col in skew_col_names:
        if col in data.columns:
            data[col] = np.log(1.0+data[col])

def transform_fattail_columns(data,fattail_col_names):
    for col in fattail_col_names:
        if col in data.columns:
            data[col] = np.log(data[col] + np.sqrt(np.power(data[col],2) + 1.0))


def fat_tail_scores(churn_data, stats, skew_thresh=4.0,**kwargs):

    data_scores = churn_data.copy()
    if "is_churn" in data_scores.columns:
        data_scores.drop('is_churn',inplace=True,axis=1)

    #stat_path = data_set_path.replace('.csv', '_summarystats.csv')
    #assert os.path.isfile(stat_path),'You must running listing 5.2 first to generate stats'
    #stats = pd.read_csv(stat_path,index_col=0)
    if "is_churn" in stats.columns:
        stats.drop('is_churn',inplace=True)

    skewed_columns=(stats['skew']>skew_thresh) & (stats['min'] >= 0)
    transform_skew_columns(data_scores,skewed_columns[skewed_columns].keys())

    fattail_columns=(stats['skew']>skew_thresh) & (stats['min'] < 0)
    transform_fattail_columns(data_scores,fattail_columns[fattail_columns].keys())

    mean_vals = data_scores.mean()
    std_vals = data_scores.std()
    data_scores=(data_scores-mean_vals)/std_vals

    if "is_churn" in churn_data.columns:
        data_scores['is_churn']=churn_data['is_churn']

    param_df = pd.DataFrame({'skew_score': skewed_columns,
                             'fattail_score': fattail_columns,
                             'mean': mean_vals,
                             'std': std_vals})
    return param_df, data_scores

In [27]:
from churnmodels.statistics import metric_scores, dataset_stats, fat_tail_scores
dataset=dbhelper.get_dataset(d_obs_start, d_obs_end, metric_period)
stats=dataset_stats(dataset)
scores=metric_scores(dataset, stats)
param_df, data_scores=fat_tail_scores(dataset, stats)
print(param_df)
print(data_scores)
    

                                     skew_score  fattail_score       mean  \
account_tenure                            False          False  44.140608   
adview_per_month                           True          False   2.444507   
adview_per_post                            True          False   1.300263   
days_since_newfriend                       True          False   0.139360   
dislike_pcnt                              False          False   0.081421   
dislike_per_month                          True          False   1.843871   
is_churn                                   True          False        NaN   
like_per_month                             True          False   3.044163   
like_per_post                              True          False   1.809296   
message_per_month                          True          False   2.523749   
newfriend_pcnt_chng                       False          False   0.055029   
newfriend_per_month                        True          False   1.365929   

## Days since event (§7.6)

In [20]:
from sqlalchemy import func
step=7
d1='2020-05-03'
d2='2020-05-10'
date_vals=dbhelper.days_interval(func.Date(to_days(d1)), func.Date(to_days(d2)), step=step, label="metric_date").cte("date_vals")

def query_days_since_newfriend(ev_type="like"):
    last_event=session.query(
        Event.account_id,
        date_vals.c.metric_date,
        func.max(func.DATE(Event.event_time)).label("last_date")
        )\
        .join(date_vals, func.DATE(Event.event_time)<=date_vals.c.metric_date)\
        .join(EventType, Event.event_type_id==EventType.event_type_id)\
        .filter(EventType.event_type_name==ev_type)\
        .group_by(Event.account_id, date_vals.c.metric_date)\
        .order_by(Event.account_id, date_vals.c.metric_date)\
        .cte("last_event")

    q1=session.query(
        last_event.c.account_id,
        last_event.c.metric_date,
        last_event.c.last_date,
        (to_days(last_event.c.metric_date) - to_days(last_event.c.last_date) ).label("days_since_event"),
        )
    return q1

q1= query_days_since_newfriend("like")
df=pd.read_sql(q1.statement, engine)
#print(pretty_sql(q1))
print(df)


       account_id metric_date   last_date  days_since_event
0               1  2020-05-03  2020-05-03               0.0
1               1  2020-05-10  2020-05-10               0.0
2               2  2020-05-03  2020-03-13              51.0
3               2  2020-05-10  2020-03-13              58.0
4               3  2020-05-03  2020-02-09              84.0
...           ...         ...         ...               ...
26973       14629  2020-05-10  2020-05-10               0.0
26974       14630  2020-05-10  2020-05-10               0.0
26975       14633  2020-05-10  2020-05-10               0.0
26976       14638  2020-05-03  2020-05-03               0.0
26977       14638  2020-05-10  2020-05-10               0.0

[26978 rows x 4 columns]


In [21]:
from sqlalchemy import literal, select

newmetricname="days_since_newfriend"

metric_name_id=-1
metric_name_lu=session.query(MetricName.metric_name_id)\
    .filter(MetricName.metric_name == newmetricname).first() #or 0
if metric_name_lu is not None:
    metric_name_lu = metric_name_lu[0]
    metric_name_id = metric_name_lu
    print(f"new metric_name_id={metric_name_id} for {newmetricname}")
    qr= query_days_since_newfriend("like")
    #print(pretty_sql(qr))
    #ddf=pd.read_sql(qr.statement, dbhelper.engine)
    #print(ddf)

    if metric_name_id>0:
        # delete all old values "new_metric_id"
        session.commit()
        old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
        old_metrics.delete()
        session.commit()

        new_metrics_insert=qr.cte("new_metrics_insert")
        select_stm=select([
            new_metrics_insert.c.account_id, 
            new_metrics_insert.c.metric_date, 
            literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
            new_metrics_insert.c.days_since_event
        ])
        target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
        session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
        session.commit()

            

new metric_name_id=20 for days_since_newfriend


## Scaled Events (§7.7)

In [22]:
from sqlalchemy import func, and_, cast, Float, literal
step=7
frac=28.0/84.0
d1='2020-05-03'
d2='2020-05-10'
date_vals=dbhelper.days_interval(func.Date(to_days(d1)), func.Date(to_days(d2)), step=step, label="metric_date").cte("date_vals")

def query_scaling_number(ev_type=None):
    last_event=session.query(
        Event.account_id,
        date_vals.c.metric_date,
        func.count().label("total_count"),
        (cast(func.count(),Float)*frac).label("n")
        )\
        .join(date_vals, literal(True))\
        .join(EventType, Event.event_type_id==EventType.event_type_id)\
        .filter(EventType.event_type_name==ev_type)\
        .filter(and_( Event.event_time <=date_vals.c.metric_date,
                   Event.event_time >func.DATE(to_days(date_vals.c.metric_date) -84)))\
        .group_by(Event.account_id, date_vals.c.metric_date)\
        .order_by(Event.account_id, date_vals.c.metric_date)\
        .cte("last_event")

    q1=session.query(
        last_event.c.account_id,
        last_event.c.metric_date,
        last_event.c.total_count,
        last_event.c.n,
        )
    return q1

q1= query_scaling_number("unfriend")
df=pd.read_sql(q1.statement, engine)
#print(pretty_sql(q1))
print(df)


       account_id metric_date  total_count         n
0               1  2020-05-03            1  0.333333
1               1  2020-05-10            1  0.333333
2               2  2020-05-03            2  0.666667
3               2  2020-05-10            2  0.666667
4               4  2020-05-03            4  1.333333
...           ...         ...          ...       ...
22037       14582  2020-05-10            1  0.333333
22038       14605  2020-05-10            1  0.333333
22039       14609  2020-05-10            1  0.333333
22040       14629  2020-05-10            1  0.333333
22041       14638  2020-05-10            1  0.333333

[22042 rows x 4 columns]


In [23]:
from sqlalchemy import literal, select

newmetricname="unfriend_28day_avg_84day_obs"

metric_name_id=-1
metric_name_lu=session.query(MetricName.metric_name_id)\
    .filter(MetricName.metric_name == newmetricname).first() #or 0
if metric_name_lu is not None:
    metric_name_lu = metric_name_lu[0]
    metric_name_id = metric_name_lu
    print(f"new metric_name_id={metric_name_id} for {newmetricname}")
    qr= query_scaling_number("unfriend")
    #print(pretty_sql(qr))
    #ddf=pd.read_sql(qr.statement, dbhelper.engine)
    #print(ddf)

    if metric_name_id>0:
        # delete all old values "new_metric_id"
        session.commit()
        old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
        old_metrics.delete()
        session.commit()

        new_metrics_insert=qr.cte("new_metrics_insert")
        select_stm=select([
            new_metrics_insert.c.account_id, 
            new_metrics_insert.c.metric_date, 
            literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
            new_metrics_insert.c.n
        ])
        target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
        session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
        session.commit()

 

new metric_name_id=18 for unfriend_28day_avg_84day_obs


## Tenure Scaled Events per month (§7.8)

In [24]:
from sqlalchemy import func, and_, cast, Float, literal
step=7
frac=28.0/84.0
d1='2020-05-03'
d2='2020-05-10'
date_vals=dbhelper.days_interval(func.Date(to_days(d1)), func.Date(to_days(d2)), step=step, label="metric_date").cte("date_vals")

def get_least(a,b):
    return case([
        (a<=b, a)
        ],
        else_=b)
def query_scaling_number_tenure(ev_type=None, me_name=None):
    
    last_event=session.query(
        Metric.account_id,
        func.DATE(Metric.metric_time).label("metric_date"),
        Metric.metric_value.label("tenure_metric"),
        func.count().label("count_unscaled"),
        (28.0/ get_least(Metric.metric_value, 84.0)).label("scaling"),
        ((28.0/ get_least(Metric.metric_value, 84.0))*func.count()).label("message_permonth_84day_scaled"),
        )\
        .join(Event, and_(
              Event.account_id==Metric.account_id,
              Event.event_time<=Metric.metric_time,
              func.DATE(Event.event_time)>func.DATE(to_days(Metric.metric_time) -84),
             ))\
        .join(EventType, Event.event_type_id==EventType.event_type_id)\
        .join(MetricName, Metric.metric_name_id==MetricName.metric_name_id)\
        .filter(EventType.event_type_name==ev_type)\
        .filter(MetricName.metric_name==me_name)\
        .filter(Metric.metric_value>14)\
        .group_by(Metric.account_id, Metric.metric_time, Metric.metric_value)\
        .order_by(Metric.account_id, Metric.metric_time, Metric.metric_value)\
        .cte("last_event")

    q1=session.query(
        last_event.c.account_id,
        last_event.c.metric_date,
        last_event.c.tenure_metric,
        last_event.c.count_unscaled,
        last_event.c.scaling,
        last_event.c.message_permonth_84day_scaled,
        )
    return q1

q1= query_scaling_number_tenure("unfriend", "account_tenure")
df=pd.read_sql(q1.statement, engine)
#print(pretty_sql(q1))
print(df)


       account_id metric_date  tenure_metric  count_unscaled   scaling  \
0               2  2020-03-01           46.0               2  0.608696   
1               2  2020-03-08           53.0               2  0.528302   
2               4  2020-03-01           47.0               4  0.595745   
3               4  2020-03-08           54.0               4  0.518519   
4               4  2020-03-15           61.0               4  0.459016   
...           ...         ...            ...             ...       ...   
40175       12088  2020-03-29           22.0               1  1.272727   
40176       12092  2020-03-29           20.0               1  1.400000   
40177       12093  2020-03-22           19.0               1  1.473684   
40178       12093  2020-03-29           26.0               3  1.076923   
40179       12095  2020-03-29           21.0               1  1.333333   

       message_permonth_84day_scaled  
0                           1.217391  
1                           1.056

In [25]:
from sqlalchemy import literal, select

newmetricname="unfriend_28day_avg_84day_obs_scaled"

metric_name_id=-1
metric_name_lu=session.query(MetricName.metric_name_id)\
    .filter(MetricName.metric_name == newmetricname).first() #or 0
if metric_name_lu is not None:
    metric_name_lu = metric_name_lu[0]
    metric_name_id = metric_name_lu
    print(f"new metric_name_id={metric_name_id} for {newmetricname}")
    qr= query_scaling_number_tenure("unfriend", "account_tenure")
    #print(pretty_sql(qr))
    #ddf=pd.read_sql(qr.statement, dbhelper.engine)
    #print(ddf)

    if metric_name_id>0:
        # delete all old values "new_metric_id"
        session.commit()
        old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
        old_metrics.delete()
        session.commit()

        new_metrics_insert=qr.cte("new_metrics_insert")
        select_stm=select([
            new_metrics_insert.c.account_id, 
            new_metrics_insert.c.metric_date, 
            literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
            new_metrics_insert.c.message_permonth_84day_scaled
        ])
        target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
        session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
        session.commit()

 

new metric_name_id=19 for unfriend_28day_avg_84day_obs_scaled


In [29]:
from sqlalchemy import select, literal
todo={
    "unfriend_per_newfriend_scaled":["unfriend_28day_avg_84day_obs_scaled", "newfriend_per_month"],
}

for newmetricname, pairs in todo.items():
    metric_name_id=-1
    metric_name_lu=session.query(MetricName.metric_name_id)\
        .filter(MetricName.metric_name == newmetricname).first() #or 0
    if metric_name_lu is not None:
        metric_name_lu = metric_name_lu[0]
        metric_name_id = metric_name_lu
    print(f"new metric_name_id={metric_name_id} for {newmetricname}")
    #num_metric=make_relative_metrics_part(d_obs_start, d_obs_end, "num_metric", "num_value", pairs[0])
    #den_metric=make_relative_metrics_part(d_obs_start, d_obs_end, "den_metric", "den_value", pairs[1])
    #qr=qr=make_relative_metrics_sub(num_metric, den_metric, metric_name_id)
    qr=dbhelper.make_relative_metrics(d_obs_start, d_obs_end, pairs[0], pairs[1])
    #print(pretty_sql(qr))
    ddf=pd.read_sql(qr.statement, dbhelper.engine)
    print(ddf)
    
    if metric_name_id>0:
        # delete all old values "new_metric_id"
        session.commit()
        old_metrics=session.query(Metric).filter(Metric.metric_name_id==metric_name_id).filter(Metric.metric_name_id==-1)
        old_metrics.delete()
        session.commit()
        
        new_metrics_insert=qr.cte("new_metrics_insert")
        select_stm=select([
            new_metrics_insert.c.account_id, 
            func.DATE(new_metrics_insert.c.metric_time).label("metric_time"), 
            literal(metric_name_id).label("metric_name_id"), #new_metrics_insert.c.metric_name_id, 
            new_metrics_insert.c.metric_value
        ])
        target_columns=['account_id', 'metric_time', 'metric_name_id', 'metric_value']
        session.execute(Metric.__table__.insert().from_select(target_columns, select_stm))
        session.commit()



new metric_name_id=15 for unfriend_per_newfriend_scaled
       account_id metric_time  metric_name_id  metric_value  metric_value_log
0               2  2020-03-01              -1      0.152174         -1.882731
1               2  2020-03-08              -1      0.132075         -2.024382
2               4  2020-03-01              -1      0.476596         -0.741087
3               4  2020-03-08              -1      0.345679         -1.062245
4               4  2020-03-15              -1      0.306011         -1.184134
...           ...         ...             ...           ...               ...
40175       12088  2020-03-29              -1      0.181818         -1.704748
40176       12092  2020-03-29              -1      1.400000          0.336472
40177       12093  2020-03-22              -1      0.491228         -0.710847
40178       12093  2020-03-29              -1      0.461538         -0.773190
40179       12095  2020-03-29              -1      0.055556         -2.890372

[40180 

## Count active users (§7.9)

In [46]:
"""
with date_vals AS (
     select i::timestamp as metric_date
     from generate_series('%from_yyyy-mm-dd', '%to_yyyy-mm-dd', '7 day'::interval) i
)
select account_id, metric_date, count(distinct user_id) as n_distinct_users
from event e inner join date_vals d
on e.event_time <= metric_date
and e.event_time > metric_date - interval '%obs_period days'
group by account_id, metric_date
order by metric_date, account_id;
"""
from sqlalchemy import select, literal, distinct
step=7
obs_period=84
d1='2020-05-03'
d2='2020-05-10'
date_vals=dbhelper.days_interval(func.Date(to_days(d1)), func.Date(to_days(d2)), step=step, label="metric_date").cte("date_vals")

qr = session.query(
    Event.account_id, 
    date_vals.c.metric_date, 
    func.count(distinct(Event.account_id)).label("n_distinct_users")
    )\
    .join(date_vals, literal(True))\
    .filter(func.DATE(Event.event_time) <= date_vals.c.metric_date)\
    .filter(func.DATE(Event.event_time) > func.DATE(to_days(date_vals.c.metric_date)-obs_period))\
    .group_by(Event.account_id, date_vals.c.metric_date)\
    .order_by(Event.account_id, date_vals.c.metric_date)\

print(pretty_sql(engine, qr))
ddf=pd.read_sql(qr.statement, dbhelper.engine)
print(ddf)



WITH RECURSIVE cnt(metric_date) AS
  (SELECT DATE(Date(julianday('2020-05-03'))) AS metric_date
   UNION ALL SELECT DATE(julianday(cnt.metric_date) + 7) AS metric_date
   FROM cnt
   WHERE DATE(julianday(cnt.metric_date) + 7) <= Date(julianday('2020-05-10'))),
               date_vals AS
  (SELECT cnt.metric_date AS metric_date
   FROM cnt)
SELECT event.account_id,
       date_vals.metric_date,
       count(DISTINCT event.account_id) AS n_distinct_users
FROM event
JOIN date_vals ON 1
WHERE DATE(event.event_time) <= date_vals.metric_date
  AND DATE(event.event_time) > DATE(julianday(date_vals.metric_date) - 84)
GROUP BY event.account_id,
         date_vals.metric_date
ORDER BY event.account_id,
         date_vals.metric_date
       account_id metric_date  n_distinct_users
0               1  2020-05-03                 1
1               1  2020-05-10                 1
2               2  2020-05-03                 1
3               2  2020-05-10                 1
4               4  2020-05