In [62]:
import numpy as np
import pandas as pd
from math import floor, ceil
from numpy.linalg import cholesky, inv, solve
from scipy.stats import wishart, invwishart
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases, plot_period_transactions
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


In [2]:
def load_dataset(datafile, parse_dates=None):
    df = pd.read_csv(datafile, delimiter=',', parse_dates=parse_dates)
    return df


In [3]:
g_datafolder = '/development/data'
g_customer_dataset = '{}/olist_customers_dataset.csv'.format(g_datafolder)
g_orders_dataset = '{}/olist_orders_dataset.csv'.format(g_datafolder)
g_payments_dataset = '{}/olist_order_payments_dataset.csv'.format(g_datafolder)
g_orderitems_dataset = '{}/olist_order_items_dataset.csv'.format(g_datafolder)


In [4]:
customer_df = load_dataset(g_customer_dataset)
parse_dates = ['order_purchase_timestamp', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']
orders_df = load_dataset(g_orders_dataset, parse_dates=parse_dates)
payments_df = load_dataset(g_payments_dataset)
orderitems_df = load_dataset(g_orderitems_dataset)
cust_ord_df = orders_df.set_index('customer_id').join(customer_df.set_index('customer_id'), how="inner").reset_index()
cust_ord_df = cust_ord_df.set_index('order_id').join(orderitems_df.set_index('order_id'), how="inner").reset_index()
cust_ord_df['monetary_value'] = np.round(cust_ord_df['price'] + cust_ord_df['freight_value'], 0)
cust_ord_df['order_date'] = cust_ord_df.order_purchase_timestamp.dt.date
cust_ord_df['cancelled'] = 0.0
cust_ord_df['cancelled'][cust_ord_df.order_status == 'canceled'] = 1.0
customer_id_col='customer_unique_id'
datetime_col='order_date'
monetary_value_col='monetary_value' 
calibration_period_end = datetime(2018,4,1).date()
observation_period_end = cust_ord_df.order_purchase_timestamp.max().date()


In [5]:

def add_datediffcolumn(df, col_dt1, col_dt2, colname):
    df[colname] = np.round((df[col_dt1] - df[col_dt2]).dt.days, 0) * 1.0
    #df[colname] = np.round((df[col_dt1] - df[col_dt2]) / np.timedelta64(1, 'M'), 0)
    df[colname][df[colname].isna()] = 0.0
    return df

def get_calibration_holdout_data(df
                               , customer_id_col='id'
                               , datetime_col='date'
                               , calibration_period_end=None
                               , observation_period_end=None
                               , monetary_value_col='value'
                               , covariates=None):
    allcols = [customer_id_col, datetime_col, monetary_value_col]
    if covariates is not None:
        allcols = allcols + covariates
    df = df[allcols]
    df['obs'] = 0
    df['obs'][df[datetime_col] >= calibration_period_end] = 1
    df['x'] = 1.0
    sort_cols = [customer_id_col, 'obs', datetime_col]
    agg_map = {'x':'count', monetary_value_col:'mean'}
    if covariates is not None:
        for covariate in covariates:
            agg_map[covariate] = 'sum'
    df = df.groupby([customer_id_col, 'obs', datetime_col]).agg(agg_map).reset_index() #.sort_values(sort_cols).groupby([customer_id_col, 'obs']).cumcount()+1
    df['x'] = df.sort_values(sort_cols).groupby([customer_id_col, 'obs']).cumcount()+1    
    df['x'] = df['x'] - 1.0
    df['x'][df.obs == 1] += 1.0 
    df['first'] = df[datetime_col]       
    df['last'] = df[datetime_col]       
    groupby_cols = [customer_id_col, 'obs']
    all_cols = groupby_cols + [datetime_col, monetary_value_col, 'x', 'first', 'last'] + covariates
    agg_map = {monetary_value_col:'mean', 'x':'max', 'first':'min', 'last':'max' }
    if covariates is not None:
        for covariate in covariates:
            agg_map[covariate] = 'sum'
    df = df.sort_values(sort_cols)[all_cols].groupby(groupby_cols).agg(agg_map).reset_index()
    df['endobs'] = calibration_period_end    
    df['endobs'][df.obs == 1] = observation_period_end
    df = add_datediffcolumn(df, 'last', 'first', 't')
    df = add_datediffcolumn(df, 'endobs', 'first', 'T')
    df['T'][df.obs == 1] = np.round((observation_period_end - calibration_period_end).days, 0) * 1.0
    cols = ['x', 't', 'T', monetary_value_col] + covariates
    cal_df = df[df.obs == 0][[customer_id_col] + cols] 
    cal_df.columns = [customer_id_col] + ['{}_cal'.format(colname) for colname in cols]
    hold_df = df[df.obs == 1][[customer_id_col] + cols] 
    hold_df.columns = [customer_id_col] + ['{}_holdout'.format(colname) for colname in cols]
    df = cal_df.set_index(customer_id_col).join(hold_df.set_index(customer_id_col), how="left").reset_index().fillna(0.0)
    return df


In [6]:
df = get_calibration_holdout_data(cust_ord_df
                                , customer_id_col=customer_id_col
                                , datetime_col=datetime_col
                                , calibration_period_end=calibration_period_end
                                , observation_period_end=observation_period_end
                                , monetary_value_col='monetary_value'
                                , covariates=['cancelled'])

## Implementation

In [None]:
# x ==> number of repeat purchases
# t ==> First purchase to last purchase
# T ==> First purchase to end of observation period


In [87]:
# Setup Regressors (Covariates) for location of 1st-stage prior, i.e. beta = [log(lambda), log(mu)]
def set_regressors(data, covariates=[]):
    data['intercept'] = 1.0
    covariates = ['intercept'] + covariates
    covars = np.matrix(data[covariates])
    K = len(covariates)
    return covariates, covars, K

def get_diag(shape, val):
    d = np.zeros(shape=shape)
    np.fill_diagonal(d, val) 
    return d

def get_map_from_array(x):
    a_map = {}
    count = 0
    for val in x:
        a_map[val] = count
        count += 1
    return a_map

# set hyper priors "log_lambda", "log_mu"
def set_hyperpriors(K):  
    beta_0 = np.zeros(shape=(K, 2))
    A_0 = get_diag(shape=(K, K), val=0.01) # diffuse precision matrix
    # set diffuse hyper-parameters for 2nd-stage prior of gamma_0; follows defaults from rmultireg example
    nu_00 = 3 + K  # 30
    gamma_00 = get_diag(shape=(2, 2), val=nu_00) # diffuse precision matrix
    hyper_prior = {'beta_0': beta_0, 'A_0':A_0, 'nu_00':nu_00, 'gamma_00':gamma_00}
    return hyper_prior

def draw_z(data, level_1, level_1_params_map):
    tx = data['x_cal']
    Tcal = data['T_cal']
    p_lambda = level_1[level_1_params_map['lambda'], ]
    p_mu = level_1[level_1_params_map['mu'], ]

    mu_lam = p_mu + p_lambda
    t_diff = Tcal - tx

    prob = 1 / (1 + (p_mu / mu_lam) * (np.exp(mu_lam * t_diff) - 1))
    z = (np.random.uniform(size=len(prob)) < prob)
    z[z == True] = 1
    z = z.astype(int)
    return list(z.values)

def draw_tau(data, level_1, level_1_params_map):
    N = len(data)
    tx = data['x_cal']
    Tcal = data['T_cal']
    p_lambda = level_1[level_1_params_map['lambda'], ]
    p_mu = level_1[level_1_params_map['mu'], ]

    mu_lam = p_mu + p_lambda
    z = level_1[level_1_params_map['z'], ]

    alive = (z == 1)
    tau = np.zeros(shape=(N))

    # Case: still alive - left truncated exponential distribution -> [T.cal, Inf]
    if (np.sum(alive) > 0):
        tau[alive] = Tcal[alive] + np.random.exponential(scale=1.0/p_mu[alive], size=np.sum(alive))

    # Case: churned - double truncated exponential distribution -> [tx, T.cal]
    if (np.sum(~alive) > 0):
        mu_lam_tx = np.minimum(700, mu_lam[~alive] * tx[~alive])
        mu_lam_Tcal = np.minimum(700, mu_lam[~alive] * Tcal[~alive])
        rand = np.random.uniform(size=np.sum(~alive))        
        tau[~alive] = (-1.0 * np.log((1.0 - rand) * np.exp(-1.0 * mu_lam_tx) + rand * np.exp((-1.0 * mu_lam_Tcal)))) / mu_lam[~alive]

    return tau

def draw_wishart(df, scale):
    W = wishart.rvs(df, scale)
    IW = invwishart.rvs(df, scale)
    C = cholesky(W).T
    CI = solve(C, np.eye(W.shape[0], W.shape[1]))
    return W, IW, C, CI

def draw_level_2(covars, level_1, level_1_params_map, hyper_prior):
    # standard multi-variate normal regression update
    # Ported from 
    # https://github.com/cran/bayesm/blob/master/src/rmultireg_rcpp.cpp
    # Arguments:
    #  Y is n x m matrix
    #  X is n x k
    #  Bbar is the prior mean of regression coefficients  (k x m)
    #  A is prior precision matrix
    #  nu, V are parameters for prior on Sigma

    # Output: list of B, Sigma draws of matrix of coefficients and Sigma matrix

    # Model: 
    #  Y=XB+U  cov(u_i) = Sigma
    #  B is k x m matrix of coefficients

    # Prior:  
    #  beta|Sigma  ~ N(betabar,Sigma (x) A^-1)
    #  betabar=vec(Bbar)
    #  beta = vec(B) 
    #  Sigma ~ IW(nu,V) or Sigma^-1 ~ W(nu, V^-1)

    Y = np.log(level_1[[level_1_params_map['lambda'], level_1_params_map['mu']],].T)
    X = covars
    Bbar = hyper_prior['beta_0']
    A = hyper_prior['A_0']
    nu = hyper_prior['nu_00']
    V = hyper_prior['gamma_00']

    n = Y.shape[0]
    m = Y.shape[1]
    k = X.shape[1]    

    #first draw Sigma
    RA = cholesky(A)
    W = np.concatenate((X, RA), axis=0) 
    Z = np.concatenate((Y, RA*Bbar), axis=0)
    # note:  Y,X,A,Bbar must be matrices!
    IR = solve(np.triu(cholesky(W.T*W)), np.eye(k,k)) #trimatu interprets the matrix as upper triangular and makes solve more efficient
    # W'W = R'R  &  (W'W)^-1 = IRIR'  -- this is the UL decomp!
    Btilde = (IR*IR.T) * (W.T*Z);
    # IRIR'(W'Z) = (X'X+A)^-1(X'Y + ABbar)
    E = Z-W*Btilde;
    S = E.T*E;
    # E'E    
    # compute the inverse of V+S
    ucholinv = solve(np.triu(cholesky(V+S)), np.eye(m,m))
    VSinv = ucholinv*ucholinv.T
    W, IW, C, CI = draw_wishart(df=nu+n, scale=VSinv)    
    # now draw B given Sigma
    #   note beta ~ N(vec(Btilde),Sigma (x) Covxxa)
    #       Cov=(X'X + A)^-1  = IR t(IR)  
    #       Sigma=CICI'    
    #       therefore, cov(beta)= Omega = CICI' (x) IR IR' = (CI (x) IR) (CI (x) IR)'
    #  so to draw beta we do beta= vec(Btilde) +(CI (x) IR)vec(Z_mk)  
    #       Z_mk is m x k matrix of N(0,1)
    #  since vec(ABC) = (C' (x) A)vec(B), we have 
    #       B = Btilde + IR Z_mk CI'
    samples = np.random.normal(size=k*m).reshape(k,m)
    B = Btilde + IR*samples*CI.T
    return {'beta': B.T, 'gamma':IW}

def log_post(log_theta, mvmean, x, z, Tcal, tau, inv_gamma):
    log_lambda = log_theta[0,:]
    log_mu = log_theta[1,:]
    diff_lambda = log_lambda - mvmean[:,0]
    diff_mu = log_mu - mvmean[:,1]      
    likel = x * log_lambda + (1.0 - z) * log_mu - (np.exp(log_lambda) + np.exp(log_mu)) * (z * Tcal + (1.0 - z) * tau)
    prior = -0.5 * (diff_lambda ^ 2 * inv_gamma[1, 1] +
                         2.0 * diff_lambda * diff_mu * inv_gamma[0, 1] +
                         diff_mu ^ 2 * inv_gamma[1, 1])
    post = likel + prior
    post[log_mu > 5] = np.NINF  # cap !!
    return post

def step(cur_log_theta, cur_post, gamma, N, mvmean, x, z, Tcal, tau, inv_gamma):
    a = gamma[0, 0] * np.random.standard_t(df=3, size=N)
    b = gamma[1, 1] * np.random.standard_t(df=3, size=N)
    new_log_theta = cur_log_theta + np.concatenate((a, b), axis=0)
    new_log_theta[0,:] = np.maximum(np.minimum(new_log_theta[0,:], 70), -70)
    new_log_theta[1,:] = np.maximum(np.minimum(new_log_theta[1,:], 70), -70)
    new_post = log_post(new_log_theta, mvmean, x, z, Tcal, tau, inv_gamma)
    
    # accept/reject new proposal
    mhratio = np.exp(new_post - cur_post)
    accepted = mhratio > np.random.unifprm(size=N)
    
    return {'cur_log_theta':cur_log_theta, 'cur_post':cur_post}

def draw_level_1(data, covars, level_1, level_1_params_map, level_2):
    # sample (lambda, mu) given (z, tau, beta, gamma)
    N = len(data)
    x = data['x_cal']
    Tcal = data['T_cal']
    z = level_1[level_1_params_map['z'], ]
    tau = level_1[level_1_params_map['tau'], ]
    mvmean = np.matmul(covars, level_2['beta'])
    gamma = level_2['gamma']
    inv_gamma = inv(gamma)
    
    cur_lambda = level_1[level_1_params_map['lambda'], ]
    cur_mu = level_1[level_1_params_map['mu'], ]

    # current state
    cur_log_theta = np.concatenate((np.log(cur_lambda), np.log(cur_mu)), axis=0)
    cur_post = log_post(cur_log_theta, mvmean, x, z, Tcal, tau, inv_gamma)
    
    iter = 1  # how high do we need to set this? 1/5/10/100?
    for i in range(0, iter):
        draw = step(cur_log_theta, cur_post, gamma, N, mvmean, x, z, Tcal, tau, inv_gamma)
        cur_log_theta = draw['cur_log_theta']
        cur_post = draw['cur_post']

    cur_theta = np.exp(cur_log_theta)

    return {'lambda':cur_theta[0,:], 'mu':cur_theta[1,:]}

def run_single_chain(data, covariates, K, hyper_prior, nsample, nburnin, nskip):
    ## initialize arrays for storing draws ##
    LOG_LAMBDA = 0
    LOG_MU = 1
    nr_of_cust = len(data)
    nr_of_draws = nburnin + nsample * nskip

    # The 4 is for "lambda", "mu", "tau", "z"
    level_1_params_map = get_map_from_array(['lambda', 'mu', 'tau', 'z'])
    level_1_draws = np.zeros(shape=(nr_of_draws, 4, nr_of_cust))

    level_2_draws = np.zeros(shape=(nr_of_draws, (2*K)+3))
    nm = ['log_lambda', 'log_mu']
    if (K > 1):
        nm = ['{}_{}'.format(val2, val1) for val1 in covariates for val2 in nm]
    nm.extend(['var_log_lambda', 'cov_log_lambda_log_mu', 'var_log_mu'])
    level_2_params_map = get_map_from_array(nm)
        
    ## initialize parameters ##
    data['t_cal_tmp'] = data['t_cal']
    data['t_cal_tmp'] = data['t_cal']
    data['t_cal_tmp'][data.t_cal == 0] = data['T_cal'][data.t_cal == 0] 
    level_1 = level_1_draws[1,]
    x_cal_mean = np.mean(data['x_cal'])
    t_cal_tmp_mean = np.mean(data['t_cal_tmp'])
    level_1[level_1_params_map['lambda'], ] = x_cal_mean/t_cal_tmp_mean
    level_1[level_1_params_map['mu'], ] = 1 / (data['t_cal'] + 0.5 / level_1[level_1_params_map['lambda'], ])
    
    ## run MCMC chain ##
    hyper_prior['beta_0'][0, LOG_LAMBDA] = np.log(np.mean(level_1[level_1_params_map['lambda'], ]))
    hyper_prior['beta_0'][0, LOG_MU] = np.log(np.mean(level_1[level_1_params_map['mu'], ]))
    
    for i in range(0, nr_of_draws):
        # draw individual-level parameters
        level_1[level_1_params_map['z'], ] = draw_z(data, level_1, level_1_params_map)
        level_1[level_1_params_map['tau'], ] = draw_tau(data, level_1, level_1_params_map)

        level_2 = draw_level_2(covars, level_1, level_1_params_map, hyper_prior)
        
        nk = int(round((i - nburnin) / nskip))
        if (i > nskip and floor(nk) == nk and nk > 0):
            #Store
            level_1_draws[nk,:,:] = level_1 # nolint
            level_2_draws[nk,:] = list(np.array(beta.T).reshape(-1)) + [level_2['gamma'][0, 0], level_2['gamma'][0, 1], level_2['gamma'][1,1]]
        if (i % 100) == 0:
            print('draw: {}'.format(i))
            
    return {"level_1":level_1_draws, "level_2":level_2_draws}    


In [89]:
# Main routine
covariates, covars, K = set_regressors(df, covariates=["cancelled_cal"])
hyper_prior = set_hyperpriors(K)
draws = run_single_chain(df, covariates=covariates, K=K, hyper_prior=hyper_prior, nsample=nsample, nburnin=nburnin, nskip=nskip)


draw: 0
draw: 10
draw: 20
draw: 30
draw: 40
draw: 50
draw: 60
draw: 70
draw: 80
draw: 90
draw: 100
draw: 110
draw: 120
draw: 130
draw: 140
draw: 150
draw: 160
draw: 170
draw: 180
draw: 190
draw: 200
draw: 210
draw: 220
draw: 230
draw: 240
draw: 250
draw: 260
draw: 270
draw: 280
draw: 290
draw: 300
draw: 310
draw: 320
draw: 330
draw: 340
draw: 350
draw: 360


In [90]:
draws['level_1'].shape

(370, 4, 63881)

In [91]:
draws['level_2'].shape

(370, 7)

In [8]:

covariates, covars, K = set_regressors(df, covariates=["cancelled_cal"])
hyper_prior = set_hyperpriors(K)

# Prep code 
data = df
LOG_LAMBDA = 0
LOG_MU = 1
nr_of_cust = len(data)
nburnin = 70
nsample = 30
nskip = 10
nr_of_draws = nburnin + nsample * nskip

# The 4 is for "lambda", "mu", "tau", "z"
level_1_params_map = get_map_from_array(['lambda', 'mu', 'tau', 'z'])
level_1_draws = np.zeros(shape=(nr_of_draws, 4, nr_of_cust))

level_2_draws = np.zeros(shape=(nr_of_draws, (2*K)+3))
nm = ['log_lambda', 'log_mu']
if (K > 1):
    nm = ['{}_{}'.format(val2, val1) for val1 in covariates for val2 in nm]
nm.extend(['var_log_lambda', 'cov_log_lambda_log_mu', 'var_log_mu'])
level_2_params_map = get_map_from_array(nm)
        
## initialize parameters ##
data['t_cal_tmp'] = data['t_cal']
data['t_cal_tmp'] = data['t_cal']
data['t_cal_tmp'][data.t_cal == 0] = data['T_cal'][data.t_cal == 0] 
level_1 = level_1_draws[1,]
x_cal_mean = np.mean(data['x_cal'])
t_cal_tmp_mean = np.mean(data['t_cal_tmp'])
level_1[level_1_params_map['lambda'], ] = x_cal_mean/t_cal_tmp_mean
level_1[level_1_params_map['mu'], ] = 1 / (data['t_cal'] + 0.5 / level_1[level_1_params_map['lambda'], ])
    
## run MCMC chain ##
hyper_prior['beta_0'][0, LOG_LAMBDA] = np.log(np.mean(level_1[level_1_params_map['lambda'], ]))
hyper_prior['beta_0'][0, LOG_MU] = np.log(np.mean(level_1[level_1_params_map['mu'], ]))
    
# draw individual-level parameters
level_1[level_1_params_map['z'], ] = draw_z(data, level_1, level_1_params_map)
level_1[level_1_params_map['tau'], ] = draw_tau(data, level_1, level_1_params_map)
level_2 = draw_level_2(covars, level_1, level_1_params_map, hyper_prior)


In [None]:
level_2

In [88]:
run_single_chain(df, covariates=covariates, K=K, hyper_prior=hyper_prior, nsample=nsample, nburnin=nburnin, nskip=nskip)

draw: 0
draw: 10
draw: 20
draw: 30
draw: 40
draw: 50
draw: 60
draw: 70
draw: 80
draw: 90
draw: 100
draw: 110
draw: 120
draw: 130
draw: 140
draw: 150
draw: 160
draw: 170
draw: 180
draw: 190
draw: 200
draw: 210
draw: 220
draw: 230
draw: 240
draw: 250
draw: 260
draw: 270
draw: 280
draw: 290
draw: 300
draw: 310
draw: 320
draw: 330
draw: 340
draw: 350
draw: 360


{'level_1': array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
 
        [[1.19238170e-04, 1.19238170e-04, 1.19238170e-04, ...,
          1.19238170e-04, 1.19238170e-04, 1.19238170e-04],
         [2.38476341e-04, 2.38476341e-04, 2.38476341e-04, ...,
          2.38476341e-04, 2.38476341e-04, 2.38476341e-04],
         [1.05226389e+04, 4.74334369e+01, 3.36677370e+02, ...,
          2.16144891e+03, 1.71518736e+03, 5.10626305e+03],
         [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
          1.00000000e+00, 1.00000000e+00, 1.00000000e+00]],
 
       

In [None]:
level_1[level_1_params_map['z'], ]

In [None]:
N = len(data)
tx = data['x_cal']
Tcal = data['T_cal']
p_lambda = level_1[level_1_params_map['lambda'], ]
p_mu = level_1[level_1_params_map['mu'], ]

mu_lam = p_mu + p_lambda
z = level_1[level_1_params_map['z'], ]

alive = (z == 1)
tau = np.zeros(shape=(N))

tau[alive] = Tcal[alive] + np.random.exponential(scale=1.0/p_mu[alive], size=np.sum(alive))

mu_lam_tx = np.minimum(700, mu_lam[~alive] * tx[~alive])
mu_lam_Tcal = np.minimum(700, mu_lam[~alive] * Tcal[~alive])
rand = np.random.uniform(size=np.sum(~alive))
tau[~alive] = (-1.0 * np.log((1.0 - rand) * np.exp(-1.0 * mu_lam_tx) + rand * np.exp((-1.0 * mu_lam_Tcal)))) / mu_lam[~alive]



In [None]:
list(tau)

In [None]:
1.0/p_mu[alive]

In [None]:
hyper_prior['beta_0']

In [None]:
level_1[level_1_params_map['mu'], ]

In [None]:
np.mean(data['x_cal'])

In [None]:
data.columns

In [None]:
data['t_cal_tmp'] = data['t_cal']
data['t_cal_tmp'] = data['t_cal']
data['t_cal_tmp'][data.t_cal == 0] = data['T_cal'][data.t_cal == 0] 

In [None]:
data[data.t_cal > 0].head()