In [1]:
import pandas as pd
import datetime as dt
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter

In [2]:
# Read the data: Private Dataset from FLO
df_ = pd.read_csv("C:\\Users\\utku\\Desktop\\miuul\\FLOCLTVPrediction\\flo_data_20k.csv")
df = df_.copy()

In [3]:
# Define the detect_and_replace_outliers function to detect and replace outliers for suppressing
# For CLTV calculation, the frequency values should be integer. Therefore, round the lower and upper limits with round().

def detect_and_replace_outliers(dataframe):
    """
    Detects outliers in the given DataFrame and suppresses them.

    Parameters:
    dataframe (pd.DataFrame): DataFrame to be processed.

    Returns:
    pd.DataFrame: DataFrame with suppressed outliers.
    """
    outlier_columns = []

    for col in dataframe.columns:
        if pd.api.types.is_numeric_dtype(dataframe[col]):
            Q1 = dataframe[col].quantile(0.01)
            Q3 = dataframe[col].quantile(0.99)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            outliers = dataframe[(dataframe[col] < lower_bound) | (dataframe[col] > upper_bound)]
            if not outliers.empty:
                outlier_columns.append(col)

            dataframe.loc[dataframe[col] < lower_bound, col] = round(lower_bound, 0)
            dataframe.loc[dataframe[col] > upper_bound, col] = round(upper_bound, 0)
    print("Columns with Outliers:", outlier_columns)

    return dataframe


In [4]:
# With Outliers
df.describe()

Unnamed: 0,order_num_total_ever_online,order_num_total_ever_offline,customer_value_total_ever_offline,customer_value_total_ever_online
count,19945.0,19945.0,19945.0,19945.0
mean,3.110855,1.913913,253.922597,497.32169
std,4.225647,2.06288,301.532853,832.601886
min,1.0,1.0,10.0,12.99
25%,1.0,1.0,99.99,149.98
50%,2.0,1.0,179.98,286.46
75%,4.0,2.0,319.97,578.44
max,200.0,109.0,18119.14,45220.13


In [5]:
detect_and_replace_outliers(df)

Columns with Outliers: ['order_num_total_ever_online', 'order_num_total_ever_offline', 'customer_value_total_ever_offline', 'customer_value_total_ever_online']


Unnamed: 0,master_id,order_channel,last_order_channel,first_order_date,last_order_date,last_order_date_online,last_order_date_offline,order_num_total_ever_online,order_num_total_ever_offline,customer_value_total_ever_offline,customer_value_total_ever_online,interested_in_categories_12
0,cc294636-19f0-11eb-8d74-000d3a38a36f,Android App,Offline,2020-10-30,2021-02-26,2021-02-21,2021-02-26,4.0,1.0,139.99,799.38,[KADIN]
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,Android App,Mobile,2017-02-08,2021-02-16,2021-02-16,2020-01-10,19.0,2.0,159.97,1853.58,"[ERKEK, COCUK, KADIN, AKTIFSPOR]"
2,69b69676-1a40-11ea-941b-000d3a38a36f,Android App,Android App,2019-11-27,2020-11-27,2020-11-27,2019-12-01,3.0,2.0,189.97,395.35,"[ERKEK, KADIN]"
3,1854e56c-491f-11eb-806e-000d3a38a36f,Android App,Android App,2021-01-06,2021-01-17,2021-01-17,2021-01-06,1.0,1.0,39.99,81.98,"[AKTIFCOCUK, COCUK]"
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,Desktop,Desktop,2019-08-03,2021-03-07,2021-03-07,2019-08-03,1.0,1.0,49.99,159.99,[AKTIFSPOR]
...,...,...,...,...,...,...,...,...,...,...,...,...
19940,727e2b6e-ddd4-11e9-a848-000d3a38a36f,Android App,Offline,2019-09-21,2020-07-05,2020-06-05,2020-07-05,1.0,2.0,289.98,111.98,"[ERKEK, AKTIFSPOR]"
19941,25cd53d4-61bf-11ea-8dd8-000d3a38a36f,Desktop,Desktop,2020-03-01,2020-12-22,2020-12-22,2020-03-01,1.0,1.0,150.48,239.99,[AKTIFSPOR]
19942,8aea4c2a-d6fc-11e9-93bc-000d3a38a36f,Ios App,Ios App,2019-09-11,2021-05-24,2021-05-24,2019-09-11,2.0,1.0,139.98,492.96,[AKTIFSPOR]
19943,e50bb46c-ff30-11e9-a5e8-000d3a38a36f,Android App,Android App,2019-03-27,2021-02-13,2021-02-13,2021-01-08,1.0,5.0,711.79,297.98,"[ERKEK, AKTIFSPOR]"


In [6]:
# Without Outliers
df.describe()

Unnamed: 0,order_num_total_ever_online,order_num_total_ever_offline,customer_value_total_ever_offline,customer_value_total_ever_online
count,19945.0,19945.0,19945.0,19945.0
mean,3.091953,1.886187,251.92132,489.705676
std,3.809541,1.434694,251.02367,632.609844
min,1.0,1.0,10.0,12.99
25%,1.0,1.0,99.99,149.98
50%,2.0,1.0,179.98,286.46
75%,4.0,2.0,319.97,578.44
max,48.0,16.0,3020.0,7800.0


In [7]:
# Omnichannel customers indicate that they shop both online and offline platforms.
# Create new variables for the total number of purchases and expenditures for each customer.

df["order_num_total"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
df["customer_value_total"] = df["customer_value_total_ever_offline"] + df["customer_value_total_ever_online"]

In [8]:
# Check variable types. Convert the variables representing dates to date type.

df[[col for col in df.columns if 'date' in col]] = df[[col for col in df.columns if 'date' in col]].apply(pd.to_datetime)

# ***Creation of the CLTV Data Structure***

In [9]:
# Take 2 days after the date of the last purchase in the dataset as the analysis date.

df["last_order_date"].max()
analysis_date = dt.datetime(2021, 6, 1)

In [14]:
#Create a new cltv dataframe including customer_id, recency_cltv_weekly, T_weekly, frequency, and monetary_cltv_avg values.

cltv_df = pd.DataFrame()
cltv_df["customer_id"] = df["master_id"]
cltv_df["recency_cltv_weekly"] = (df["last_order_date"] - df["first_order_date"]).dt.days / 7
cltv_df["T_weekly"] = (analysis_date - df["first_order_date"]).dt.days / 7 
cltv_df["frequency"] = df["order_num_total"]
cltv_df["monetary_cltv_avg"] = df["customer_value_total"] / df["order_num_total"]

##### **Establishing BG/NBD and Gamma-Gamma Models, Calculating 6-Month CLTV**

In [15]:
# Set up the BG/NBD model.

bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(cltv_df['frequency'],
        cltv_df['recency_cltv_weekly'],
        cltv_df['T_weekly'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


<lifetimes.BetaGeoFitter: fitted with 19945 subjects, a: 0.00, alpha: 76.17, b: 0.00, r: 3.66>

In [16]:
# Predict the expected purchases from the customers within 3 months and add it to the cltv dataframe as exp_sales_3_month.

cltv_df["exp_sales_3_month"] = bgf.predict(4 * 3,
                                           cltv_df['frequency'],
                                           cltv_df['recency_cltv_weekly'],
                                           cltv_df['T_weekly'])

In [17]:
# Predict the expected purchases from the customers within 6 months and add it to the cltv dataframe as exp_sales_6_month.

cltv_df["exp_sales_6_month"] = bgf.predict(4 * 6,
                                           cltv_df['frequency'],
                                           cltv_df['recency_cltv_weekly'],
                                           cltv_df['T_weekly'])

In [18]:
# Examine the top 10 customers who will make the most purchases in 3 and 6 months.

cltv_df.sort_values(["exp_sales_3_month" , "exp_sales_6_month"], ascending=False).head(10)

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month
7330,a4d534a2-5b1b-11eb-8dbd-000d3a38a36f,62.714286,67.285714,52.0,166.224615,4.656138,9.312276
15611,4a7e875e-e6ce-11ea-8f44-000d3a38a36f,39.714286,40.0,29.0,165.297586,3.373958,6.747915
8328,1902bf80-0035-11eb-8341-000d3a38a36f,28.857143,33.285714,25.0,97.4396,3.142396,6.284792
19538,55d54d9e-8ac7-11ea-8ec0-000d3a38a36f,52.571429,58.714286,31.0,228.53,3.083779,6.167558
14373,f00ad516-c4f4-11ea-98f7-000d3a38a36f,38.0,46.428571,27.0,141.354815,3.001287,6.002574
10489,7af5cd16-b100-11e9-9757-000d3a38a36f,103.142857,111.857143,43.0,157.112558,2.978047,5.956093
4315,d5ef8058-a5c6-11e9-a2fc-000d3a38a36f,133.142857,147.142857,49.0,161.846735,2.829904,5.659808
6756,27310582-6362-11ea-a6dc-000d3a38a36f,62.714286,64.142857,29.0,168.881034,2.793429,5.586858
6666,53fe00d4-7b7a-11eb-960b-000d3a38a36f,9.714286,13.0,17.0,259.865294,2.780689,5.561378
10536,e143b6fa-d6f8-11e9-93bc-000d3a38a36f,104.571429,113.428571,40.0,176.2,2.763492,5.526983


In [19]:
# Gamma-Gamma model fitting. Predict the expected average value that customers will leave and add it to the cltv dataframe as exp_average_value.

ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(cltv_df['frequency'], cltv_df['monetary_cltv_avg'])
cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
                                                                       cltv_df['monetary_cltv_avg'])
cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month,exp_average_value
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.571429,5.0,187.874,0.973927,1.947853,193.632679
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.857143,224.857143,21.0,95.883333,0.983161,1.966323,96.665048
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.285714,78.857143,5.0,117.064,0.670586,1.341172,120.967619
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.571429,20.857143,2.0,60.985,0.700412,1.400824,67.320145
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.142857,95.428571,2.0,104.99,0.396039,0.792077,114.325108


In [20]:
# Calculate the 6-month CLTV and add it to the dataframe as cltv.
"""Since freq="W" in this part, it might be using weekly predictions"""
cltv = ggf.customer_lifetime_value(bgf,
                                   cltv_df['frequency'],
                                   cltv_df['recency_cltv_weekly'],
                                   cltv_df['T_weekly'],
                                   cltv_df['monetary_cltv_avg'],
                                   time=26,
                                   freq="W",
                                   discount_rate=0.01)
cltv_df["cltv"] = cltv
cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month,exp_average_value,cltv
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.571429,5.0,187.874,0.973927,1.947853,193.632679,1556.527701
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.857143,224.857143,21.0,95.883333,0.983161,1.966323,96.665048,784.415792
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.285714,78.857143,5.0,117.064,0.670586,1.341172,120.967619,669.538556
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.571429,20.857143,2.0,60.985,0.700412,1.400824,67.320145,389.179979
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.142857,95.428571,2.0,104.99,0.396039,0.792077,114.325108,373.706997


In [21]:
# Observe the top 20 customers with the highest CLTV value.
cltv_df.sort_values("cltv",ascending=False).head(20)

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month,exp_average_value,cltv
9055,47a642fe-975b-11eb-8c2a-000d3a38a36f,2.857143,7.857143,4.0,1401.8,1.094385,2.188769,1449.060468,13089.062775
13880,7137a5c0-7aad-11ea-8f20-000d3a38a36f,6.142857,13.142857,11.0,758.085455,1.970108,3.940216,767.360602,12477.900055
17323,f59053e2-a503-11e9-a2fc-000d3a38a36f,51.714286,101.0,7.0,1106.467143,0.722238,1.444476,1127.611525,6721.896697
12438,625f40a2-5bd2-11ea-98b0-000d3a38a36f,74.285714,74.571429,16.0,501.87375,1.565309,3.130618,506.166665,6539.51635
7330,a4d534a2-5b1b-11eb-8dbd-000d3a38a36f,62.714286,67.285714,52.0,166.224615,4.656138,9.312276,166.712253,6406.862275
8868,9ce6e520-89b0-11ea-a6e7-000d3a38a36f,3.428571,34.428571,8.0,601.22625,1.265456,2.530912,611.492616,6386.902096
6402,851de3b4-8f0c-11eb-8cb8-000d3a38a36f,8.285714,9.428571,2.0,862.69,0.793924,1.587847,923.679965,6052.743712
6666,53fe00d4-7b7a-11eb-960b-000d3a38a36f,9.714286,13.0,17.0,259.865294,2.780689,5.561378,262.072907,6014.87449
19538,55d54d9e-8ac7-11ea-8ec0-000d3a38a36f,52.571429,58.714286,31.0,228.53,3.083779,6.167558,229.606946,5844.135581
14858,031b2954-6d28-11eb-99c4-000d3a38a36f,14.857143,15.571429,3.0,743.586667,0.871564,1.743128,778.05037,5597.04927


#### ***Creating Segments According to CLTV***

In [23]:
#Divide all our customers into 4 groups (segments) based on the 6-month CLTV and add group names to the dataset.
# Assign them as cltv_segment.

#cltv_df["cltv_segment_cut"] = pd.cut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])
#cltv_df["cltv_segment_cut"].value_counts()
cltv_df["cltv_segment_qcut"] = pd.qcut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])
cltv_df["cltv_segment_qcut"].value_counts()

# 2. Examine the recency, frequency, and monetary averages of the segments.

cltv_df.groupby("cltv_segment_qcut")[["recency_cltv_weekly", "frequency", "monetary_cltv_avg"]].mean()


Unnamed: 0_level_0,recency_cltv_weekly,frequency,monetary_cltv_avg
cltv_segment_qcut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D,139.000172,3.768799,93.151603
C,92.629534,4.404733,125.788537
B,81.988367,5.092659,160.636574
A,67.427139,6.646611,228.831142
