In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
from decimal import Decimal 
import datetime as dt

In [61]:
# Single place to repoint the notebook at a different copy of the dataset.
DATA_DIR = '/Users/Ian/Downloads/archive'

# Raw tables, loaded as-is from CSV.
df_ord_items = pd.read_csv(f'{DATA_DIR}/order_items.csv')
df_ord = pd.read_csv(f'{DATA_DIR}/orders.csv')
df_prods = pd.read_csv(f'{DATA_DIR}/products.csv')

In [103]:
# Show the earliest and latest `created_at` values in `df`.
# NOTE(review): `df` is not assigned in the cells visible above this one; presumably
# it is the frame returned by data_cleaning() — confirm the cell order on a fresh kernel.
f"Data ranges from: {df['created_at'].min()} to: {df['created_at'].max()}"

'Data ranges from: 2019-01-06 02:25:41+00:00 to: 2024-01-21 18:02:23.533893+00:00'

In [75]:
# Column dtypes and non-null counts for the products table.
df_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29120 entries, 0 to 29119
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      29120 non-null  int64  
 1   cost                    29120 non-null  float64
 2   category                29120 non-null  object 
 3   name                    29118 non-null  object 
 4   brand                   29096 non-null  object 
 5   retail_price            29120 non-null  float64
 6   department              29120 non-null  object 
 7   sku                     29120 non-null  object 
 8   distribution_center_id  29120 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [104]:
# Peek at the last rows of the order_items table.
df_ord_items.tail()

Unnamed: 0,id,order_id,user_id,product_id,inventory_item_id,status,created_at,shipped_at,delivered_at,returned_at,sale_price
181754,9674,6679,5325,24447,26095,Returned,2020-12-23 00:05:01+00:00,2020-12-25 22:54:00+00:00,2020-12-28 07:08:00+00:00,2020-12-30 22:43:00+00:00,999.0
181755,7801,5416,4283,24447,21078,Shipped,2022-01-12 23:51:07+00:00,2022-01-10 03:42:00+00:00,,,999.0
181756,62986,43364,34691,23546,169937,Shipped,2023-02-26 00:38:43+00:00,2023-02-25 06:53:00+00:00,,,999.0
181757,106577,73418,58623,24447,287560,Shipped,2023-11-12 02:41:02+00:00,2023-11-14 20:44:00+00:00,,,999.0
181758,143759,98984,78981,24447,388091,Shipped,2022-03-13 10:29:49+00:00,2022-03-14 01:27:00+00:00,,,999.0


In [105]:
# Number of rows in `df` for each value of the `status` column.
df['status'].value_counts()

status
Shipped       54440
Complete      45609
Processing    36388
Cancelled     27090
Returned      18232
Name: count, dtype: int64

In [41]:
def data_cleaning(order_units_df):
    '''
    Parse the order_items timestamps and return the cleaned dataframe.

    Parameters
    ----------
    order_units_df : pandas.DataFrame
        Order-items frame. Only its `created_at` column is modified.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in, with `created_at` converted by
        `pd.to_datetime`. The conversion is done in place, so the caller's
        frame is updated as well.
    '''

    # format='mixed' makes pandas infer the format for each value separately;
    # the data contains timestamps both with and without fractional seconds.
    order_units_df['created_at'] = pd.to_datetime(order_units_df.created_at, format='mixed')

    # Return the argument itself rather than the notebook-level `df`, which
    # may be undefined or refer to a different frame.
    return order_units_df

In [96]:
def rfm(cleaned_order_units_df):
    '''
    Build the per-customer summary from the cleaned order_items dataframe.

    Parameters
    ----------
    cleaned_order_units_df : pandas.DataFrame
        Transaction-level frame containing the `user_id`, `created_at` and
        `sale_price` columns.

    Returns
    -------
    The result of `summary_data_from_transaction_data`; in this notebook that
    is a frame indexed by `user_id` with `frequency`, `recency`, `T` and
    `monetary_value` columns.
    '''
    # Summarise the frame that was passed in, not the notebook-level `df`,
    # so the function works on whatever the caller supplies.
    return summary_data_from_transaction_data(transactions = cleaned_order_units_df,
                                              customer_id_col = 'user_id',
                                              datetime_col = 'created_at',
                                              monetary_value_col = 'sale_price')

  pd.to_datetime(transactions[datetime_col].max(), format=datetime_format).to_period(freq).to_timestamp()
  transactions = transactions.set_index(datetime_col).to_period(freq).to_timestamp()


Unnamed: 0_level_0,frequency,recency,T,monetary_value
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,1.0,551.0,140.000000
2,0.0,0.0,700.0,0.000000
3,4.0,151.0,317.0,73.052501
4,0.0,0.0,460.0,0.000000
5,2.0,4.0,458.0,20.330000
...,...,...,...,...
99993,0.0,0.0,18.0,0.000000
99996,3.0,956.0,1268.0,27.956667
99998,2.0,218.0,401.0,38.495000
99999,0.0,0.0,29.0,0.000000


In [87]:
# Total time span between the earliest and latest `created_at` in `df`.
time_diff = df['created_at'].max() - df['created_at'].min()

In [94]:
# Share of the observed time span left after the calibration cut-off.
train_test_ratio = 0.3
# The cut-off sits (1 - train_test_ratio) of the way through the span.
train_date_end = df['created_at'].min() + time_diff * (1 - train_test_ratio)

In [122]:
# Per-customer calibration / holdout summary, split at train_date_end and
# observed up to the latest `created_at` in `df`.
df_rfm_cal = calibration_and_holdout_data(
    transactions=df,
    customer_id_col='user_id',
    datetime_col='created_at',
    calibration_period_end=train_date_end,
    observation_period_end=df['created_at'].max(),
)

  pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq).to_timestamp()
  transactions = transactions.set_index(datetime_col).to_period(freq).to_timestamp()
  return d.to_period(freq)
  return d.to_period(freq)


In [130]:
# Grid of L2 penalizer strengths to try for the BetaGeoFitter model.
l2_coefs = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
l2_list, rmse_list = [], []

for coef in l2_coefs:
    # Fit on the calibration columns only.
    model = BetaGeoFitter(penalizer_coef=coef)
    model.fit(
        frequency=df_rfm_cal['frequency_cal'],
        recency=df_rfm_cal['recency_cal'],
        T=df_rfm_cal['T_cal'],
    )

    # Predicted purchase count per customer over the holdout duration.
    holdout_pred = model.predict(
        df_rfm_cal['duration_holdout'],
        df_rfm_cal['frequency_cal'],
        df_rfm_cal['recency_cal'],
        df_rfm_cal['T_cal'],
    )
    pred_freq = pd.DataFrame(holdout_pred, columns=['pred_frequency']).reset_index()

    # Attach the predictions to the actuals and discard rows containing NaN.
    new_df = df_rfm_cal.reset_index().merge(pred_freq, on='user_id').dropna()

    # RMSE between the observed and the predicted holdout frequency.
    rmse_score = np.sqrt(
        mean_squared_error(new_df['frequency_holdout'], new_df['pred_frequency'])
    )
    l2_list.append(coef)
    rmse_list.append(rmse_score)

# Tabulate the RMSE next to the penalizer strength that produced it.
rmse_frame = pd.DataFrame(np.array(rmse_list), columns=['rmse_score'])
coef_frame = pd.DataFrame(np.array(l2_list), columns=['L2 coefs'])
resl = rmse_frame.merge(coef_frame, right_index=True, left_index=True)
resl

Unnamed: 0,rmse_score,L2 coefs
0,1.135584,0.001
1,1.139688,0.002
2,1.143716,0.003
3,1.147738,0.004
4,1.151785,0.005
5,1.155875,0.006
6,1.160015,0.007
7,1.164208,0.008
8,1.168454,0.009
9,1.172753,0.01


In [131]:
# Final BetaGeoFitter model with penalizer 0.001, the value with the lowest
# RMSE in the grid-search table. Trailing `;` hides the fit() repr.
model = BetaGeoFitter(penalizer_coef=0.001)
model.fit(
    frequency=df_rfm_cal['frequency_cal'],
    recency=df_rfm_cal['recency_cal'],
    T=df_rfm_cal['T_cal'],
);

In [132]:
# Keep only customers whose `monetary_value` is positive.
# NOTE(review): `df_rfm` is not assigned in the cells visible above; presumably
# it is the output of rfm() — confirm on a fresh kernel.
# .copy() makes the filtered result an independent frame, so adding a column to
# it afterwards (df_rfm['CLV'] = ...) does not raise SettingWithCopyWarning.
df_rfm = df_rfm[df_rfm['monetary_value'] > 0].copy()

In [133]:
# Gamma-Gamma model fitted on the filtered per-customer summary.
# Trailing `;` hides the fit() repr.
gg_model = GammaGammaFitter()
gg_model.fit(
    frequency=df_rfm['frequency'],
    monetary_value=df_rfm['monetary_value'],
);

In [134]:
# Per-customer CLV from the Gamma-Gamma model combined with the fitted
# BetaGeoFitter model, stored as a new `CLV` column.
df_rfm['CLV'] = gg_model.customer_lifetime_value(
    transaction_prediction_model=model,
    frequency=df_rfm['frequency'],
    recency=df_rfm['recency'],
    T=df_rfm['T'],
    monetary_value=df_rfm['monetary_value'],
    time=6,  # In months
)
# Display customers ordered from lowest to highest CLV.
df_rfm.sort_values(by='CLV')

Unnamed: 0_level_0,frequency,recency,T,monetary_value,CLV
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
61664,5.0,16.0,1461.0,75.162000,1.601477e-08
82202,5.0,10.0,760.0,34.465999,5.287405e-08
68508,3.0,3.0,1661.0,43.446667,2.426542e-07
94535,3.0,4.0,1718.0,18.626666,2.914203e-07
16636,3.0,4.0,1720.0,21.493333,3.040252e-07
...,...,...,...,...,...
88483,3.0,4.0,4.0,172.290001,4.392529e+02
22605,4.0,29.0,34.0,245.500000,4.998817e+02
58372,2.0,5.0,5.0,361.110000,5.099627e+02
69590,3.0,4.0,5.0,285.996667,5.282838e+02


In [91]:
# Display the calibration cut-off timestamp.
train_date_end

Timestamp('2022-07-18 06:09:22.773725088+0000', tz='UTC')

In [93]:
# Display the start of the holdout window.
# NOTE(review): `test_date_start` is not assigned in any cell visible here —
# confirm where it is defined, or this cell fails on a fresh kernel.
test_date_start

Timestamp('2022-07-18 06:09:22.773725104+0000', tz='UTC')