In [18]:
#Inital imports
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lifetimes
from lifetimes.plotting import plot_period_transactions, plot_calibration_purchases_vs_holdout_purchases
from lifetimes import BetaGeoFitter
from datetime import timedelta
from datetime import datetime
from dateutil import parser
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [15]:
!ls data

OnlineRetail_2yrs.csv             lifetimes_object_df_uncleaned
customer_data_base.zip            lifetimes_object_df_uncleaned.csv
lifetimes_object_df.csv


In [16]:
#Load in our pre-prepared dataframes
lifetimes_object_df = pd.read_csv("data/lifetimes_object_df.csv")
lifetimes_object_df_uncleaned = pd.read_csv("data/lifetimes_object_df_uncleaned.csv")

### Preparation and evaluation functions

In [5]:
#Load in our data wrangler
from lifetimes.utils import summary_data_from_transaction_data
#Import holdout 
from lifetimes.utils import calibration_and_holdout_data

In [60]:
#class to return a calibration and holdout df
class df_ch():
    def __init__(self, eval_period=None,transaction_df=None,purchase_timestamp_col=None,customer_id_col=None,datetime_col=None,monetary_value_col=None):
        #initialized attributes
        self.eval_period = eval_period
        self.transaction_df = transaction_df
        self.purchase_timestamp_col=purchase_timestamp_col
        self.customer_id_col=customer_id_col
        self.datetime_col=datetime_col
        self.monetary_value_col=monetary_value_col
        #save off more attributes
        self.min_obs_date = parser.parse(transaction_df[purchase_timestamp_col].min())
        self.max_obs_date = parser.parse(transaction_df[purchase_timestamp_col].max())
        self.max_calib_date = self.max_obs_date - timedelta(days=eval_period)  
        self.calib_range_days = (self.max_calib_date - self.min_obs_date).days
    def df_ch_getdf(self):
        df = calibration_and_holdout_data(
        transactions = self.transaction_df, 
        customer_id_col=self.customer_id_col,
        datetime_col=self.datetime_col,
        monetary_value_col=self.monetary_value_col,
        calibration_period_end = self.max_calib_date, 
        observation_period_end = self.max_obs_date, 
        freq = "D")
        return df
        

In [7]:
#function to return a df of real and predicted transacitons in eval period
def bgf_real_v_pred_df(ch,bgf):
    # transactions in the observation period equals frequency_holdout + 1
    rfm_cal_holdout = pd.DataFrame()
    ch_df = ch.df_ch_getdf()
    rfm_cal_holdout["n_transactions_cal_real"]  = ch_df["frequency_cal"] + 1
    rfm_cal_holdout["n_transactions_holdout_real"]  = ch_df["frequency_holdout"] + 1
    # the predicted number of transactions
    rfm_cal_holdout["n_transactions_holdout_pred"] = bgf.predict(t=ch.eval_period, 
                                                    frequency=ch_df['frequency_cal'], 
                                                    recency=ch_df['recency_cal'], 
                                                    T=ch_df['T_cal'])
    return rfm_cal_holdout[["n_transactions_cal_real","n_transactions_holdout_real", "n_transactions_holdout_pred"]]

In [8]:
#function to capture RMSE for a BGF model
def bgf_rmse(ch,bgf):
    df_ch = ch.df_ch_getdf()
    df_ch["n_transactions_holdout_real"] = df_ch["frequency_holdout"] + 1
    y_true = df_ch["n_transactions_holdout_real"]
    y_pred = bgf.predict(t=ch.eval_period, frequency=df_ch['frequency_cal'],
                         recency=df_ch['recency_cal'],
                         T=df_ch['T_cal'])

    return mean_squared_error(y_true,y_pred)

In [9]:
#function to return predicted # transactions for given customer in evaluation period
def samp_cust_pred_trans(df_ch,sample_customer_id,eval_period):
    sample_customer = df_ch.loc[sample_customer_id]
    n_transactions_pred = bgf.predict(t=eval_period,
                                  frequency=sample_customer['frequency_cal'], 
                                  recency=sample_customer['recency_cal'], 
                                  T=sample_customer['T_cal'])
    return(n_transactions_pred)

###  Dummy model
- Predict the same purchase as the observed in the calibration period

In [64]:
df_ch_1

Unnamed: 0_level_0,frequency_cal,recency_cal,T_cal,monetary_value_cal,frequency_holdout,monetary_value_holdout,duration_holdout
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000f46a3911fa3c0805444483337064,0.0,0.0,346.0,0.0,0.0,0.0,240.0
0000f6ccb0745a6a4b88665a16c9f078,0.0,0.0,130.0,0.0,0.0,0.0,240.0
0004aac84e0df4da2b147fca70cf8255,0.0,0.0,97.0,0.0,0.0,0.0,240.0
0005e1862207bf6ccc02e4228effd9a0,0.0,0.0,352.0,0.0,0.0,0.0,240.0
0006fdc98a402fceb4eb0ee528f6a8d4,0.0,0.0,216.0,0.0,0.0,0.0,240.0
...,...,...,...,...,...,...,...
fffbf87b7a1a6fa8b03f081c5f51a201,0.0,0.0,54.0,0.0,0.0,0.0,240.0
fffcf5a5ff07b0908bd4e2dbc735a684,0.0,0.0,256.0,0.0,0.0,0.0,240.0
fffea47cd6d3cc0a88bd621562a9d061,0.0,0.0,71.0,0.0,0.0,0.0,240.0
ffff371b4d645b6ecea244b27531430a,0.0,0.0,377.0,0.0,0.0,0.0,240.0


In [65]:
ch_1.eval_period

240

In [75]:
dummy_ch_df = pd.DataFrame()
dummy_ch_df["purchases_per_period_cal"] = df_ch_1["frequency_cal"] + 1
#533 is the calib range
dummy_ch_df["purchases_per_period_cal"] = dummy_ch_df["purchases_per_period_cal"]/533
dummy_ch_df["dummy_pred_purchases_holdout"] = dummy_ch_df["purchases_per_period_cal"] * 240
dummy_ch_df["actual_purchases_holdout"] = df_ch_1["frequency_holdout"] + 1
dummy_ch_df

Unnamed: 0_level_0,purchases_per_period_cal,dummy_pred_purchases_holdout,actual_purchases_holdout
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000f46a3911fa3c0805444483337064,0.001876,0.450281,1.0
0000f6ccb0745a6a4b88665a16c9f078,0.001876,0.450281,1.0
0004aac84e0df4da2b147fca70cf8255,0.001876,0.450281,1.0
0005e1862207bf6ccc02e4228effd9a0,0.001876,0.450281,1.0
0006fdc98a402fceb4eb0ee528f6a8d4,0.001876,0.450281,1.0
...,...,...,...
fffbf87b7a1a6fa8b03f081c5f51a201,0.001876,0.450281,1.0
fffcf5a5ff07b0908bd4e2dbc735a684,0.001876,0.450281,1.0
fffea47cd6d3cc0a88bd621562a9d061,0.001876,0.450281,1.0
ffff371b4d645b6ecea244b27531430a,0.001876,0.450281,1.0


In [78]:
mean_squared_error(y_true=dummy_ch_df["dummy_pred_purchases_holdout"],y_pred=dummy_ch_df["actual_purchases_holdout"])

0.32524156574867047

###  1.0 BG/NBD model
Model features
- Uncleaned data

Model results
- Terrrible
- Our model is way off because we have such an imbalence of customers who never made a repeat purchase.

Next steps
- Drop customers with less than 2 orders

In [33]:
ch_1 = df_ch(eval_period=240,transaction_df=lifetimes_object_df_uncleaned,purchase_timestamp_col='order_purchase_timestamp',
                   customer_id_col='customer_unique_id',datetime_col='order_purchase_timestamp',
                   monetary_value_col='payment_value')

In [59]:
#import numpy as np
a = (ch_1.max_calib_date - ch_1.min_obs_date).days
a

533

In [26]:
df_ch_1 = ch_1.df_ch_getdf()
df_ch_1

Unnamed: 0_level_0,frequency_cal,recency_cal,T_cal,monetary_value_cal,frequency_holdout,monetary_value_holdout,duration_holdout
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000f46a3911fa3c0805444483337064,0.0,0.0,346.0,0.0,0.0,0.0,240.0
0000f6ccb0745a6a4b88665a16c9f078,0.0,0.0,130.0,0.0,0.0,0.0,240.0
0004aac84e0df4da2b147fca70cf8255,0.0,0.0,97.0,0.0,0.0,0.0,240.0
0005e1862207bf6ccc02e4228effd9a0,0.0,0.0,352.0,0.0,0.0,0.0,240.0
0006fdc98a402fceb4eb0ee528f6a8d4,0.0,0.0,216.0,0.0,0.0,0.0,240.0
...,...,...,...,...,...,...,...
fffbf87b7a1a6fa8b03f081c5f51a201,0.0,0.0,54.0,0.0,0.0,0.0,240.0
fffcf5a5ff07b0908bd4e2dbc735a684,0.0,0.0,256.0,0.0,0.0,0.0,240.0
fffea47cd6d3cc0a88bd621562a9d061,0.0,0.0,71.0,0.0,0.0,0.0,240.0
ffff371b4d645b6ecea244b27531430a,0.0,0.0,377.0,0.0,0.0,0.0,240.0


In [27]:
bgf_1 = BetaGeoFitter(penalizer_coef=0)

In [28]:
bgf_1.fit(
        frequency = df_ch_1["frequency_cal"], 
        recency = df_ch_1["recency_cal"], 
        T = df_ch_1["T_cal"],   
        weights = None,  
        verbose = True)

Optimization terminated successfully.
         Current function value: 0.069008
         Iterations: 63
         Function evaluations: 64
         Gradient evaluations: 64


<lifetimes.BetaGeoFitter: fitted with 55206 subjects, a: 1.41, alpha: 64.28, b: 0.24, r: 0.02>

In [30]:
bgf_1_rmse = bgf_rmse(ch_1,bgf_1)
bgf_1_rmse

1.009275270001345

In [79]:
bgf_real_v_pred_df(ch_1,bgf_1)

Unnamed: 0_level_0,n_transactions_cal_real,n_transactions_holdout_real,n_transactions_holdout_pred
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000f46a3911fa3c0805444483337064,1.0,1.0,0.007896
0000f6ccb0745a6a4b88665a16c9f078,1.0,1.0,0.014077
0004aac84e0df4da2b147fca70cf8255,1.0,1.0,0.016051
0005e1862207bf6ccc02e4228effd9a0,1.0,1.0,0.007802
0006fdc98a402fceb4eb0ee528f6a8d4,1.0,1.0,0.010707
...,...,...,...
fffbf87b7a1a6fa8b03f081c5f51a201,1.0,1.0,0.019733
fffcf5a5ff07b0908bd4e2dbc735a684,1.0,1.0,0.009645
fffea47cd6d3cc0a88bd621562a9d061,1.0,1.0,0.018078
ffff371b4d645b6ecea244b27531430a,1.0,1.0,0.007433
