In [1]:
import numpy as np
import pandas as pd
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases, plot_period_transactions
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


In [6]:
def load_dataset(datafile, parse_dates=None):
    """Read a comma-separated file into a pandas DataFrame.

    Args:
        datafile: Location of the CSV file, handed straight to ``pd.read_csv``.
        parse_dates: Forwarded unchanged as ``pd.read_csv``'s ``parse_dates``
            argument (for example a list of column names to parse as dates).

    Returns:
        The DataFrame returned by ``pd.read_csv``.
    """
    read_options = {'delimiter': ',', 'parse_dates': parse_dates}
    return pd.read_csv(datafile, **read_options)


In [2]:
# Folder that holds the olist_* CSV files, followed by the full path of each
# file that the loading cell reads.
g_datafolder = '/development/data'
g_customer_dataset = g_datafolder + '/olist_customers_dataset.csv'
g_orders_dataset = g_datafolder + '/olist_orders_dataset.csv'
g_payments_dataset = g_datafolder + '/olist_order_payments_dataset.csv'
g_orderitems_dataset = g_datafolder + '/olist_order_items_dataset.csv'


In [76]:
# Load the raw tables. Only the order timestamps listed in parse_dates are
# parsed as datetimes; every other column keeps whatever dtype read_csv infers.
customer_df = load_dataset(g_customer_dataset)
parse_dates = ['order_purchase_timestamp', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']
orders_df = load_dataset(g_orders_dataset, parse_dates=parse_dates)
payments_df = load_dataset(g_payments_dataset)  # loaded but not referenced again in the cells shown here
orderitems_df = load_dataset(g_orderitems_dataset)

# Orders joined to their customer, then to their order items
# (inner joins: rows without a match on either side are dropped).
cust_ord_df = orders_df.set_index('customer_id').join(customer_df.set_index('customer_id'), how="inner").reset_index()
cust_ord_df = cust_ord_df.set_index('order_id').join(orderitems_df.set_index('order_id'), how="inner").reset_index()

# Item price plus freight, rounded to a whole number.
cust_ord_df['monetary_value'] = np.round(cust_ord_df['price'] + cust_ord_df['freight_value'], 0)
# Calendar date (no time of day) of the purchase.
cust_ord_df['order_date'] = cust_ord_df.order_purchase_timestamp.dt.date
# 1.0 for canceled orders, 0.0 otherwise. Built in a single vectorised step:
# the earlier `frame['cancelled'][mask] = 1.0` form is chained assignment, which
# raises SettingWithCopyWarning and does not update the frame under Copy-on-Write.
cust_ord_df['cancelled'] = (cust_ord_df.order_status == 'canceled').astype(float)

# Column names and period boundaries used by the calibration / holdout split.
customer_id_col = 'customer_unique_id'
datetime_col = 'order_date'
monetary_value_col = 'monetary_value'
calibration_period_end = datetime(2018, 4, 1).date()
observation_period_end = cust_ord_df.order_purchase_timestamp.max().date()


In [104]:

def add_datediffcolumn(df, col_dt1, col_dt2, colname):
    """Add a column holding the whole-day difference ``col_dt1 - col_dt2``.

    Args:
        df: DataFrame to extend. It is modified in place and also returned.
        col_dt1: Name of the column holding the later date.
        col_dt2: Name of the column holding the earlier date.
        colname: Name of the float column to create (or overwrite).

    Returns:
        ``df`` with ``colname`` set to the number of whole days between the two
        columns; rows where the difference is missing get ``0.0``.
    """
    # to_timedelta normalises the subtraction result, so the .dt accessor also
    # works when the subtraction yields an object-dtype column of timedeltas.
    elapsed = pd.to_timedelta(df[col_dt1] - df[col_dt2])
    # Assign the finished column once. The earlier `df[colname][mask] = 0.0`
    # was chained assignment, which is not guaranteed to write back into `df`.
    df[colname] = elapsed.dt.days.astype(float).fillna(0.0)
    return df

def get_calibration_holdout_data(df
                               , customer_id_col='id'
                               , datetime_col='date'
                               , calibration_period_end=None
                               , observation_period_end=None
                               , monetary_value_col='value'
                               , covariates=None):
    """Summarise transactions per customer for a calibration and a holdout period.

    Rows dated before ``calibration_period_end`` belong to the calibration
    period, rows dated on or after it to the holdout period. Several rows of one
    customer on the same date count as a single purchase.

    Args:
        df: Transaction-level DataFrame. It is not modified.
        customer_id_col: Name of the customer identifier column.
        datetime_col: Name of the purchase date column.
        calibration_period_end: First date that no longer belongs to the
            calibration period. Required.
        observation_period_end: End date of the holdout period. Required.
        monetary_value_col: Name of the column holding the transaction value.
        covariates: Optional list of extra column names; each is summed per
            customer and period. ``None`` (the default) means no covariates.

    Returns:
        A DataFrame with one row per customer that has calibration-period
        purchases. Columns are ``customer_id_col`` followed by ``x``, ``t``,
        ``T``, the monetary value and each covariate, first with the suffix
        ``_cal`` and then with ``_holdout``:

        * ``x_cal``: purchase dates after the first one; ``x_holdout``: number
          of purchase dates in the holdout period.
        * ``t``: days between the first and last purchase date of the period.
        * ``T_cal``: days from the first purchase to ``calibration_period_end``;
          ``T_holdout``: length of the holdout period in days.
        * monetary value: mean of the per-date mean values.

        Customers without holdout purchases get ``0.0`` in every ``_holdout``
        column. Customers who purchased only in the holdout period are dropped.

    Raises:
        ValueError: If either period boundary is missing.
    """
    if calibration_period_end is None or observation_period_end is None:
        raise ValueError('calibration_period_end and observation_period_end are required')

    # The original concatenated `covariates` onto lists unconditionally, which
    # raised TypeError for the default covariates=None.
    covariates = [] if covariates is None else list(covariates)
    period_keys = [customer_id_col, 'obs']
    day_keys = period_keys + [datetime_col]

    def _elapsed_days(later, earlier):
        # Whole days between two date-like columns, as floats (missing -> 0.0).
        return pd.to_timedelta(later - earlier).dt.days.astype(float).fillna(0.0)

    # Private copy: the helper columns below must not leak into the caller's frame.
    tx = df[[customer_id_col, datetime_col, monetary_value_col] + covariates].copy()
    tx['obs'] = (tx[datetime_col] >= calibration_period_end).astype(int)  # 0 = calibration, 1 = holdout
    tx['x'] = 1.0

    # Collapse to one row per customer / period / purchase date.
    daily_aggs = {'x': 'count', monetary_value_col: 'mean'}
    daily_aggs.update((name, 'sum') for name in covariates)
    daily = tx.groupby(day_keys).agg(daily_aggs).reset_index().sort_values(day_keys)

    # Running purchase number inside each customer / period: 0, 1, 2, ... in the
    # calibration period (so the maximum is the number of repeat purchases) and
    # 1, 2, 3, ... in the holdout period (so the maximum is the purchase count).
    daily['x'] = (daily.groupby(period_keys).cumcount() + daily['obs']).astype(float)
    daily['first'] = daily[datetime_col]
    daily['last'] = daily[datetime_col]

    # Collapse to one row per customer / period.
    period_aggs = {monetary_value_col: 'mean', 'x': 'max', 'first': 'min', 'last': 'max'}
    period_aggs.update((name, 'sum') for name in covariates)
    periods = daily.groupby(period_keys).agg(period_aggs).reset_index()

    in_holdout = periods['obs'] == 1
    periods['endobs'] = calibration_period_end
    periods.loc[in_holdout, 'endobs'] = observation_period_end
    periods['t'] = _elapsed_days(periods['last'], periods['first'])
    periods['T'] = _elapsed_days(periods['endobs'], periods['first'])
    # Holdout rows carry the length of the holdout window instead.
    periods.loc[in_holdout, 'T'] = float((observation_period_end - calibration_period_end).days)

    value_cols = ['x', 't', 'T', monetary_value_col] + covariates
    selected = [customer_id_col] + value_cols
    cal_df = periods.loc[~in_holdout, selected].set_index(customer_id_col).add_suffix('_cal')
    hold_df = periods.loc[in_holdout, selected].set_index(customer_id_col).add_suffix('_holdout')
    return cal_df.join(hold_df, how="left").reset_index().fillna(0.0)


In [105]:
# Per-customer calibration / holdout summary of the joined order table, with the
# 'cancelled' flag carried along as a covariate.
df = get_calibration_holdout_data(
    cust_ord_df,
    customer_id_col=customer_id_col,
    datetime_col=datetime_col,
    calibration_period_end=calibration_period_end,
    observation_period_end=observation_period_end,
    monetary_value_col='monetary_value',
    covariates=['cancelled'],
)

## Implementation

In [None]:
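# Meaning of the calibration-period (`*_cal`) columns:
# x ==> number of repeat purchases (purchase dates after the first one)
# t ==> days from the first purchase to the last purchase
# T ==> days from the first purchase to the end of the calibration period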


In [296]:
# Setup Regressors (Covariates) for location of 1st-stage prior, i.e. beta = [log(lambda), log(mu)]
def set_regressors(data, covariates=None):
    """Build the regressor matrix: an intercept followed by the given columns.

    Args:
        data: DataFrame holding the covariate columns. An ``'intercept'``
            column filled with ``1.0`` is added to it in place.
        covariates: Optional sequence (list or tuple) of column names in
            ``data``. ``None`` (the default) means intercept only.

    Returns:
        A tuple ``(covariates, covars, K)`` where ``covariates`` is the list of
        regressor names starting with ``'intercept'``, ``covars`` is an
        ``np.matrix`` with one row per row of ``data`` and one column per
        regressor, and ``K`` is the number of regressors.
    """
    data['intercept'] = 1.0
    # Copy into a fresh list: avoids the shared mutable default argument and
    # lets callers pass a tuple as well as a list.
    extra = [] if covariates is None else list(covariates)
    regressor_names = ['intercept'] + extra
    covars = np.matrix(data[regressor_names].values)
    return regressor_names, covars, len(regressor_names)

def get_diag(shape, val):
    """Return a float array of the given shape: zeros with ``val`` on the main diagonal.

    Args:
        shape: Shape of the array to create, e.g. ``(K, K)``.
        val: Value written to the main diagonal via ``np.fill_diagonal``.
    """
    diagonal_matrix = np.full(shape, 0.0)
    np.fill_diagonal(diagonal_matrix, val)
    return diagonal_matrix

def get_map_from_array(x):
    """Map each value of ``x`` to its position in ``x``.

    Args:
        x: Iterable of hashable values.

    Returns:
        A dict ``{value: index}``. If a value occurs more than once, the index
        of its last occurrence is kept.
    """
    return {value: position for position, value in enumerate(x)}

# set hyper priors "log_lambda", "log_mu"
def set_hyperpriors(K):
    """Build the hyper-prior dictionary for ``K`` regressors.

    Args:
        K: Number of regressors (columns of the covariate matrix).

    Returns:
        A dict with the keys
        ``'beta_0'``   - ``(K, 2)`` array of zeros,
        ``'A_0'``      - ``(K, K)`` diagonal matrix with 0.01 on the diagonal
                         (diffuse precision matrix),
        ``'nu_00'``    - ``3 + K``,
        ``'gamma_00'`` - ``(2, 2)`` diagonal matrix with ``nu_00`` on the diagonal.
    """
    # Diffuse hyper-parameters for the 2nd-stage prior of gamma_0; follows the
    # defaults from the rmultireg example. An earlier note beside this value read
    # "30" - presumably an alternative setting that was tried (TODO confirm).
    dof = 3 + K
    return {
        'beta_0': np.zeros(shape=(K, 2)),
        'A_0': get_diag(shape=(K, K), val=0.01),
        'nu_00': dof,
        'gamma_00': get_diag(shape=(2, 2), val=dof),
    }

def draw_z(data, level_1, level_1_params_map):
    """Draw the alive indicator ``z`` (1 = alive, 0 = churned) for every customer.

    Args:
        data: Per-customer table with the columns ``'t_cal'`` (time from first
            to last purchase) and ``'T_cal'`` (time from first purchase to the
            end of the calibration period), one row per customer.
        level_1: 2-D array of individual-level parameters, one row per
            parameter and one column per customer.
        level_1_params_map: Dict giving the row index of ``'lambda'`` and
            ``'mu'`` in ``level_1``.

    Returns:
        A list of integers (0 or 1), one per customer, drawn with the global
        ``np.random`` state.
    """
    # tx is the time of the last purchase, i.e. the 't_cal' column. The earlier
    # code read 'x_cal' (the purchase count), so `Tcal - tx` mixed days with a
    # count. This matches draw_z in BTYDplus' abe.mcmc.DrawParameters, which
    # reads t.x here.
    tx = np.asarray(data['t_cal'], dtype=float)
    Tcal = np.asarray(data['T_cal'], dtype=float)
    p_lambda = np.asarray(level_1[level_1_params_map['lambda']], dtype=float)
    p_mu = np.asarray(level_1[level_1_params_map['mu']], dtype=float)

    mu_lam = p_mu + p_lambda
    t_diff = Tcal - tx  # time since the last observed purchase

    # exp() may overflow to inf for long inactivity; prob then becomes 0.0,
    # which is the correct limit, so the overflow warning is silenced.
    with np.errstate(over='ignore'):
        prob = 1.0 / (1.0 + (p_mu / mu_lam) * (np.exp(mu_lam * t_diff) - 1.0))

    z = (np.random.uniform(size=len(prob)) < prob).astype(int)
    return list(z)

def draw_tau(data, level_1, level_1_params_map):
    """Draw the churn time ``tau`` for every customer given the alive indicator.

    Args:
        data: Per-customer table with the columns ``'t_cal'`` (time from first
            to last purchase) and ``'T_cal'`` (time from first purchase to the
            end of the calibration period), one row per customer.
        level_1: 2-D array of individual-level parameters, one row per
            parameter and one column per customer.
        level_1_params_map: Dict giving the row index of ``'lambda'``,
            ``'mu'`` and ``'z'`` in ``level_1``.

    Returns:
        1-D float array with one churn time per customer: at least ``T_cal``
        for customers flagged alive (``z == 1``), between ``t_cal`` and
        ``T_cal`` for the others. Uses the global ``np.random`` state.
    """
    N = len(data)
    # tx is the time of the last purchase ('t_cal'), which is the lower bound of
    # the churn time of a churned customer. The earlier code read the purchase
    # count 'x_cal' here.
    tx = np.asarray(data['t_cal'], dtype=float)
    Tcal = np.asarray(data['T_cal'], dtype=float)
    p_lambda = np.asarray(level_1[level_1_params_map['lambda']], dtype=float)
    p_mu = np.asarray(level_1[level_1_params_map['mu']], dtype=float)

    mu_lam = p_mu + p_lambda
    alive = np.asarray(level_1[level_1_params_map['z']]) == 1
    churned = ~alive
    tau = np.zeros(shape=(N))

    # Case: still alive - left truncated exponential distribution -> [T.cal, Inf]
    n_alive = int(np.sum(alive))
    if n_alive > 0:
        tau[alive] = Tcal[alive] + np.random.exponential(scale=1.0 / p_mu[alive], size=n_alive)

    # Case: churned - double truncated exponential distribution -> [tx, T.cal]
    n_churned = int(np.sum(churned))
    if n_churned > 0:
        rate = mu_lam[churned]
        # Cap the exponents at 700 so exp(-...) stays above the float underflow range.
        mu_lam_tx = np.minimum(700, rate * tx[churned])
        mu_lam_Tcal = np.minimum(700, rate * Tcal[churned])
        # Inverse-transform sampling of the truncated exponential.
        rand = np.random.uniform(size=n_churned)
        tau[churned] = -np.log((1.0 - rand) * np.exp(-mu_lam_tx) + rand * np.exp(-mu_lam_Tcal)) / rate

    return tau


In [None]:
def draw_level_2(covars, level_1, level_1_params_map, hyper_prior):
    """Draw the population-level parameters ``beta`` and ``gamma``.

    Standard multivariate normal regression update of
    ``Y = log([lambda, mu])`` on ``X = covars`` under a natural conjugate
    prior. This is a numpy port of the ``bayesm::rmultireg`` call that this
    function previously contained as unported R code.

    Args:
        covars: ``(N, K)`` regressor matrix (``np.matrix`` or array).
        level_1: 2-D array of individual-level parameters, one row per
            parameter and one column per customer.
        level_1_params_map: Dict giving the row index of ``'lambda'`` and
            ``'mu'`` in ``level_1``.
        hyper_prior: Dict with ``'beta_0'`` (``(K, 2)`` prior mean), ``'A_0'``
            (``(K, K)`` prior precision), ``'nu_00'`` (prior degrees of
            freedom) and ``'gamma_00'`` (``(2, 2)`` prior scale). Not modified.

    Returns:
        Dict with ``'beta'`` (``(2, K)`` array of coefficients, row 0 for
        log-lambda and row 1 for log-mu) and ``'gamma'`` (``(2, 2)`` covariance
        matrix). Uses the global ``np.random`` state.
    """
    X = np.asarray(covars, dtype=float)
    rate_rows = [level_1_params_map['lambda'], level_1_params_map['mu']]
    Y = np.log(np.asarray(level_1, dtype=float)[rate_rows, :]).T  # (N, 2)
    Bbar = np.asarray(hyper_prior['beta_0'], dtype=float)
    A = np.asarray(hyper_prior['A_0'], dtype=float)
    nu = hyper_prior['nu_00']
    V = np.asarray(hyper_prior['gamma_00'], dtype=float)

    n_obs, n_resp = Y.shape
    n_covars = X.shape[1]

    # Posterior mean of the coefficients: (X'X + A)^-1 (X'Y + A Bbar).
    cov_xxa = np.linalg.inv(X.T.dot(X) + A)
    Btilde = cov_xxa.dot(X.T.dot(Y) + A.dot(Bbar))

    # Residual cross-product, including the shrinkage towards the prior mean.
    resid = Y - X.dot(Btilde)
    shrink = Btilde - Bbar
    S = resid.T.dot(resid) + shrink.T.dot(A).dot(shrink)

    # Sigma ~ Inverse-Wishart(nu + n, V + S): draw W ~ Wishart(nu + n, (V + S)^-1)
    # with the Bartlett decomposition and invert it.
    dof = nu + n_obs
    bartlett = np.tril(np.random.normal(size=(n_resp, n_resp)), -1)
    bartlett[np.diag_indices(n_resp)] = np.sqrt(np.random.chisquare(dof - np.arange(n_resp)))
    factor = np.linalg.cholesky(np.linalg.inv(V + S)).dot(bartlett)
    Sigma = np.linalg.inv(factor.dot(factor.T))
    Sigma = (Sigma + Sigma.T) / 2.0  # remove round-off asymmetry

    # B | Sigma ~ N(Btilde, Sigma (x) (X'X + A)^-1), drawn as
    # Btilde + chol((X'X + A)^-1) Z chol(Sigma)' with Z standard normal.
    noise = np.random.normal(size=(n_covars, n_resp))
    B = Btilde + np.linalg.cholesky(cov_xxa).dot(noise).dot(np.linalg.cholesky(Sigma).T)

    return {'beta': B.T, 'gamma': Sigma}


In [None]:
def _draw_level_1(data, covars, level_1, level_1_params_map, level_2):
    """One random-walk Metropolis update of every customer's (lambda, mu).

    Samples (lambda, mu) given (z, tau, beta, gamma). The log posterior is the
    individual-level likelihood
    ``x*log(lambda) + (1-z)*log(mu) - (lambda+mu) * (z*T_cal + (1-z)*tau)``
    plus a bivariate normal prior on ``(log lambda, log mu)`` with mean
    ``covars . beta'`` and covariance ``gamma``.

    NOTE(review): written to stand in for the ``draw_level_1`` step that the
    unported R loop called; the proposal scaling, the clipping bounds and the
    ``log_mu > 5`` cap follow BTYDplus' ``abe.mcmc.DrawParameters`` from memory
    and should be checked against the upstream source.

    Returns:
        Tuple ``(lambda, mu)`` of 1-D arrays, one entry per customer.
    """
    n_cust = len(data)
    x = np.asarray(data['x_cal'], dtype=float)
    Tcal = np.asarray(data['T_cal'], dtype=float)
    z = level_1[level_1_params_map['z']]
    tau = level_1[level_1_params_map['tau']]
    # Time during which the customer was alive inside the calibration period.
    exposure = z * Tcal + (1.0 - z) * tau

    prior_mean = np.asarray(covars, dtype=float).dot(np.asarray(level_2['beta'], dtype=float).T)
    gamma = np.asarray(level_2['gamma'], dtype=float)
    inv_gamma = np.linalg.inv(gamma)

    def log_post(log_lambda, log_mu):
        diff_lambda = log_lambda - prior_mean[:, 0]
        diff_mu = log_mu - prior_mean[:, 1]
        likel = x * log_lambda + (1.0 - z) * log_mu - (np.exp(log_lambda) + np.exp(log_mu)) * exposure
        prior = -0.5 * (diff_lambda ** 2 * inv_gamma[0, 0]
                        + 2.0 * diff_lambda * diff_mu * inv_gamma[0, 1]
                        + diff_mu ** 2 * inv_gamma[1, 1])
        post = likel + prior
        post[log_mu > 5] = -np.inf  # cap on mu
        return post

    cur_log_lambda = np.log(level_1[level_1_params_map['lambda']])
    cur_log_mu = np.log(level_1[level_1_params_map['mu']])
    cur_post = log_post(cur_log_lambda, cur_log_mu)

    # Symmetric heavy-tailed proposal (Student t, 3 degrees of freedom).
    step_lambda = gamma[0, 0] * np.clip(np.random.standard_t(3, size=n_cust), -50, 50)
    step_mu = gamma[1, 1] * np.clip(np.random.standard_t(3, size=n_cust), -50, 50)
    new_log_lambda = np.clip(cur_log_lambda + step_lambda, -70, 70)
    new_log_mu = np.clip(cur_log_mu + step_mu, -70, 70)
    new_post = log_post(new_log_lambda, new_log_mu)

    # Accept with probability min(1, exp(new - cur)); compared on the log scale
    # to avoid overflow. NaN (inf - inf) compares False, i.e. is rejected.
    with np.errstate(invalid='ignore', divide='ignore'):
        accepted = np.log(np.random.uniform(size=n_cust)) < (new_post - cur_post)

    log_lambda = np.where(accepted, new_log_lambda, cur_log_lambda)
    log_mu = np.where(accepted, new_log_mu, cur_log_mu)
    return np.exp(log_lambda), np.exp(log_mu)


def run_single_chain(data, covariates, K, hyper_prior, nsample, nburnin, nskip):
    """Run one MCMC chain and return the stored draws.

    The chain runs ``nburnin + nsample * nskip`` steps. After the burn-in every
    ``nskip``-th state is stored, which gives exactly ``nsample`` stored draws.

    Args:
        data: Per-customer table with the columns ``'x_cal'``, ``'t_cal'``,
            ``'T_cal'`` and every column named in ``covariates``. Not modified.
        covariates: List of regressor column names, intercept first (as
            returned by ``set_regressors``).
        K: Number of regressors; must equal ``len(covariates)``.
        hyper_prior: Dict as returned by ``set_hyperpriors``. Not modified: the
            intercept row of ``beta_0`` is initialised on a private copy.
        nsample: Number of draws to store.
        nburnin: Number of initial steps that are discarded.
        nskip: Thinning interval between stored draws.

    Returns:
        Dict with
        ``'level_1'`` - array ``(nsample, 4, N)`` of individual-level draws,
        ``'level_2'`` - array ``(nsample, 2*K + 3)`` of population-level draws,
        ``'level_1_params_map'`` / ``'level_2_params_map'`` - dicts mapping a
        parameter name to its row (level 1) or column (level 2).

    Raises:
        ValueError: If the chain lengths are not usable or ``K`` does not match
            the number of covariate columns.
    """
    if nsample < 1 or nskip < 1 or nburnin < 0:
        raise ValueError('nsample and nskip must be >= 1 and nburnin must be >= 0')

    ## initialize arrays for storing draws ##
    LOG_LAMBDA = 0
    LOG_MU = 1
    nr_of_cust = len(data)
    nr_of_steps = nburnin + nsample * nskip

    covars = np.asarray(data[covariates], dtype=float)
    if covars.shape[1] != K:
        raise ValueError('K does not match the number of covariate columns')

    # The 4 is for "lambda", "mu", "tau", "z"
    level_1_params_map = get_map_from_array(['lambda', 'mu', 'tau', 'z'])
    lambda_row = level_1_params_map['lambda']
    mu_row = level_1_params_map['mu']
    z_row = level_1_params_map['z']
    tau_row = level_1_params_map['tau']
    # Only the stored draws are kept, not one slot per MCMC step.
    level_1_draws = np.zeros(shape=(nsample, 4, nr_of_cust))

    level_2_draws = np.zeros(shape=(nsample, (2 * K) + 3))
    nm = ['log_lambda', 'log_mu']
    if K > 1:
        nm = ['{}_{}'.format(val2, val1) for val1 in covariates for val2 in nm]
    nm.extend(['var_log_lambda', 'cov_log_lambda_log_mu', 'var_log_mu'])
    level_2_params_map = get_map_from_array(nm)

    ## initialize parameters ##
    x_cal = np.asarray(data['x_cal'], dtype=float)
    t_cal = np.asarray(data['t_cal'], dtype=float)
    T_cal = np.asarray(data['T_cal'], dtype=float)
    # Customers without a repeat purchase have t_cal == 0; use T_cal for them.
    t_cal_tmp = np.where(t_cal == 0, T_cal, t_cal)
    # Working state kept separate from the storage array (it used to be a view
    # into level_1_draws, so storing a draw could alias the live state).
    level_1 = np.zeros(shape=(4, nr_of_cust))
    level_1[lambda_row] = np.mean(x_cal) / np.mean(t_cal_tmp)
    level_1[mu_row] = 1 / (t_cal + 0.5 / level_1[lambda_row])

    # Centre the intercepts of the prior mean on the starting values, on a copy
    # so that the caller's hyper_prior stays untouched.
    prior = dict(hyper_prior)
    prior['beta_0'] = np.array(hyper_prior['beta_0'], dtype=float)
    prior['beta_0'][0, LOG_LAMBDA] = np.log(np.mean(level_1[lambda_row]))
    prior['beta_0'][0, LOG_MU] = np.log(np.mean(level_1[mu_row]))

    ## run MCMC chain ##
    for step in range(nr_of_steps):
        # draw individual-level parameters
        level_1[z_row] = draw_z(data, level_1, level_1_params_map)
        level_1[tau_row] = draw_tau(data, level_1, level_1_params_map)

        level_2 = draw_level_2(covars, level_1, level_1_params_map, prior)

        new_lambda, new_mu = _draw_level_1(data, covars, level_1, level_1_params_map, level_2)
        level_1[lambda_row] = new_lambda
        level_1[mu_row] = new_mu

        # store every nskip-th state after the burn-in
        kept = step - nburnin
        if kept >= 0 and kept % nskip == 0:
            idx = kept // nskip
            level_1_draws[idx] = level_1
            gamma = np.asarray(level_2['gamma'])
            # Column-major flattening gives (log_lambda, log_mu) per covariate,
            # the same order as the names in `nm`.
            beta_flat = np.asarray(level_2['beta']).flatten(order='F')
            level_2_draws[idx] = np.concatenate([beta_flat, [gamma[0, 0], gamma[0, 1], gamma[1, 1]]])

    return {
        'level_1': level_1_draws,
        'level_2': level_2_draws,
        'level_1_params_map': level_1_params_map,
        'level_2_params_map': level_2_params_map,
    }
    


In [None]:
# NOTE(review): covariates, K, hyper_prior, nsample, nburnin and nskip are not
# assigned in any cell shown in this notebook; they have to exist before this
# cell runs (covariates / K presumably from set_regressors, hyper_prior from
# set_hyperpriors - confirm). The frame passed in also needs the 'intercept'
# column that set_regressors adds.
chain_draws = run_single_chain(df, covariates=covariates, K=K, hyper_prior=hyper_prior,
                               nsample=nsample, nburnin=nburnin, nskip=nskip)

In [260]:
# Display the level_1 row that level_1_params_map labels 'z'.
# NOTE(review): in the cells shown, level_1 and level_1_params_map are only
# assigned inside run_single_chain, so this relies on kernel state from elsewhere.
level_1[level_1_params_map['z'], ]

array([1., 1., 1., ..., 1., 1., 1.])

In [292]:
# Scratch version of the churn-time draw, kept at top level so that the
# intermediate values (p_mu, alive, ...) can be inspected in the cells below.
# NOTE(review): data, level_1 and level_1_params_map are not assigned at top
# level in any cell shown here.
N = len(data)
# Time of the last purchase is the 't_cal' column; this previously read the
# purchase count 'x_cal', which is not a time.
tx = data['t_cal']
Tcal = data['T_cal']
p_lambda = level_1[level_1_params_map['lambda'], ]
p_mu = level_1[level_1_params_map['mu'], ]

mu_lam = p_mu + p_lambda
z = level_1[level_1_params_map['z'], ]

alive = (z == 1)
tau = np.zeros(shape=(N))

# Alive customers: churn time lies beyond T_cal.
tau[alive] = Tcal[alive] + np.random.exponential(scale=1.0/p_mu[alive], size=np.sum(alive))

# Churned customers: churn time lies between the last purchase and T_cal
# (exponents capped at 700 before exp()).
mu_lam_tx = np.minimum(700, mu_lam[~alive] * tx[~alive])
mu_lam_Tcal = np.minimum(700, mu_lam[~alive] * Tcal[~alive])
rand = np.random.uniform(size=np.sum(~alive))
tau[~alive] = (-1.0 * np.log((1.0 - rand) * np.exp(-1.0 * mu_lam_tx) + rand * np.exp((-1.0 * mu_lam_Tcal)))) / mu_lam[~alive]



In [294]:
# Show only the first 50 churn-time draws; listing the whole array prints one
# value per customer.
list(tau[:50])

[7303.665576879865,
 873.9122551865988,
 725.7801670199656,
 2357.5831551679726,
 4059.7527119820847,
 9007.522687664185,
 937.8990222767435,
 5691.816548139941,
 9699.257757365569,
 1881.710997504712,
 8418.942308712056,
 733.625716885339,
 9029.475212139781,
 2710.519666683823,
 1791.0866227670197,
 1008.2604892912171,
 893.4854602293963,
 5125.5389012468495,
 8056.4847765,
 690.1921356144121,
 780.6483851569822,
 67.45854554717704,
 11590.155824423031,
 3606.579123458521,
 609.8160103757914,
 4522.35079617258,
 4355.653326710903,
 2028.8852353138823,
 2238.709466672138,
 3643.041692911666,
 7424.922109536365,
 945.6257733081019,
 5581.108395685735,
 3828.195937057576,
 1946.079464821689,
 4268.3431530010375,
 2197.7709128655342,
 4334.825253015081,
 9131.152233910827,
 7285.548805923165,
 595.2926400449993,
 13845.794313399765,
 2808.8304179098136,
 1872.7073375386176,
 5962.074693361846,
 297.88958715066144,
 3412.821406030089,
 11922.002776618932,
 2940.531527789208,
 1097.6722576

In [274]:
# Reciprocal of p_mu for the entries selected by `alive`; the same expression is
# passed as `scale` to np.random.exponential when tau is drawn.
1.0/p_mu[alive]

array([4193.28808971, 4193.28808971, 4193.28808971, ..., 4193.28808971,
       4193.28808971, 4193.28808971])

In [233]:
# Display the 'beta_0' entry of hyper_prior.
hyper_prior['beta_0']

array([[-9.03438763, -8.34164398],
       [ 0.        ,  0.        ]])

In [215]:
# Display the level_1 row that level_1_params_map labels 'mu'.
level_1[level_1_params_map['mu'], ]

array([0.00023848, 0.00023848, 0.00023848, ..., 0.00023848, 0.00023848,
       0.00023848])

In [197]:
# Mean of the x_cal column of data.
np.mean(data['x_cal'])

0.020240760163428875

In [198]:
# Column labels of data.
data.columns

Index(['customer_unique_id', 'x_cal', 't_cal', 'T_cal', 'monetary_value_cal',
       'cancelled_cal', 'x_holdout', 't_holdout', 'T_holdout',
       'monetary_value_holdout', 'cancelled_holdout', 'intercept'],
      dtype='object')

In [199]:
# t_cal_tmp equals t_cal, except that rows with t_cal == 0 take T_cal instead.
# Built in one step: the earlier version assigned the column twice and then
# patched it through chained assignment (`data[col][mask] = ...`), which is not
# guaranteed to write back into `data`.
data['t_cal_tmp'] = data['t_cal'].where(data['t_cal'] != 0, data['T_cal'])

In [201]:
# First rows of data restricted to the rows where t_cal is greater than 0.
data[data.t_cal > 0].head()

Unnamed: 0,customer_unique_id,x_cal,t_cal,T_cal,monetary_value_cal,cancelled_cal,x_holdout,t_holdout,T_holdout,monetary_value_holdout,cancelled_holdout,intercept,t_cal_tmp
68,004288347e5e88a27ded2bb23747066c,1.0,171.0,248.0,177.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,171.0
88,0058f300f57d7b93c477a131a59b36c3,1.0,31.0,41.0,88.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,31.0
276,011b4adcd54683b480c4d841250a987f,1.0,177.0,222.0,80.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,177.0
310,013f4353d26bb05dc6652f1269458d8d,1.0,4.0,128.0,137.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0
333,015557c9912277312b9073947804a7ba,1.0,39.0,374.0,157.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0
