<a href="https://colab.research.google.com/github/vubanc/AWS_DeployedCLVPredictor/blob/main/AWS_DeployedCLVPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Customer Lifetime Value Prediction

## Importing Libraries and Initiating Sagemaker Session

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [447]:
#!pip install --disable-pip-version-check -q sagemaker==2.35.0
#!pip install -v protobuf==3.20.1
#!pip install awswrangler

In [5]:
import awswrangler as wr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [6]:
import sagemaker
import boto3
import botocore

config = botocore.config.Config()
sm = boto3.client(service_name='sagemaker', config=config)
sess = sagemaker.Session(sagemaker_client = sm)

role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_region_name

In [7]:
df_customers_uri = 's3://sagemaker-us-east-2-397738742408/data/customers/'
df_transactions_uri = 's3://sagemaker-us-east-2-397738742408/data/transactions/'
output_location = "s3://{}/data/output".format(bucket)

In [12]:
!aws s3 ls 's3://sagemaker-us-east-2-397738742408/data/transactions/'

                           PRE transactions1/
                           PRE transactions2/


In [13]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="top" href="https://s3.console.aws.amazon.com/s3/home?region={}#">Amazon S3 buckets</a></b>'.format(region)))

## Data Preparation

In [14]:
df_customers = wr.s3.read_csv(df_customers_uri)
df_customers.head(2)

Unnamed: 0,CustomerID,Country,Recency,Frequency,DailySpending,DailyTransCount,MonetaryValue_x,MonetaryValue_y
0,13313,United Kingdom,53.0,31.0,304.87,16.0,609.74,945.58
1,18097,United Kingdom,43.0,49.0,637.02,24.0,1274.04,1241.24


In [20]:
df_customers["Country"] = df_customers.Country.apply(lambda x: 1 if x=="United Kingdom" else 0)
df_customers.drop(labels=["CustomerID"], axis=1).corr()

Unnamed: 0,Country,Recency,Frequency,DailySpending,DailyTransCount,MonetaryValue_x,MonetaryValue_y
Country,1.0,0.02426,-0.036808,-0.225447,-0.055746,-0.157125,-0.121992
Recency,0.02426,1.0,-0.255992,-0.057845,-0.019528,-0.387972,-0.261098
Frequency,-0.036808,-0.255992,1.0,0.205653,0.392446,0.452922,0.326404
DailySpending,-0.225447,-0.057845,0.205653,1.0,0.291026,0.417974,0.327666
DailyTransCount,-0.055746,-0.019528,0.392446,0.291026,1.0,0.246816,0.195491
MonetaryValue_x,-0.157125,-0.387972,0.452922,0.417974,0.246816,1.0,0.616622
MonetaryValue_y,-0.121992,-0.261098,0.326404,0.327666,0.195491,0.616622,1.0


In [21]:
x = df_customers [["Country", "Recency", "Frequency", "DailySpending", "DailyTransCount", "MonetaryValue_x"]]
y = df_customers[["MonetaryValue_y"]]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.15, random_state=3)

train_x.reset_index(inplace=True, drop=True)
train_y.reset_index(inplace=True, drop=True)
test_x.reset_index(inplace=True, drop=True)
test_y.reset_index(inplace=True, drop=True)

df_train = pd.concat([train_y, train_x], axis=1)
df_train.to_csv("df_train.csv", index=False, header=False)

df_test = pd.concat([test_y, test_x], axis=1)
df_test.to_csv("df_test.csv", index=False, header=False)

In [22]:
train_uri = sess.upload_data(bucket=bucket, key_prefix="data/train/train.csv", path="df_train.csv")
test_uri = sess.upload_data(bucket=bucket, key_prefix="data/test/test.csv", path="df_test.csv")

In [23]:
train_data = sagemaker.inputs.TrainingInput(s3_data='s3://{}/data/train'.format(bucket), content_type='text/csv')

## Lasso Regression

A lasso regression (L1 regularization) is fit to the data before running any of the models to check the effect size and direction of the features on the lifetime value. Moreover, this step would tease out variables that can be discarded from further consideration because L1 regularization can reduce coefficientsto 0.

In [444]:
from sklearn.linear_model import Lasso

In [445]:
lasso = Lasso(alpha=8)
lasso.fit(train_x, train_y)

In [446]:
lasso.coef_

array([-0.        , -0.63663644,  0.4443623 ,  0.13473509,  0.86664086,
        0.63675168])

The L2 regression coefficients suggest that the country variable can be dicarded from further consideration.

## Econometric Model (BG/NBD & GG)

In [29]:
#!pip install lifetimes
import lifetimes

In [833]:
df1_transactions = wr.s3.read_csv(f"s3://{bucket}/data/transactions/transactions1")
df1_transactions.head(2)

Unnamed: 0,CustomerID,InvoiceDate,Revenue
0,13313.0,2011-01-04,19.5
1,13313.0,2011-01-04,10.5


In [843]:
summary = lifetimes.utils.summary_data_from_transaction_data(df1_transactions, "CustomerID", "InvoiceDate", "Revenue")
summary.reset_index(inplace=True)
summary.CustomerID = summary.CustomerID.astype(int) 
summary = summary[(summary.monetary_value > 0)]

print(summary.shape)
summary.head(2)

(1384, 5)


Unnamed: 0,CustomerID,frequency,recency,T,monetary_value
1,12347,2.0,134.0,155.0,509.385
2,12348,1.0,70.0,156.0,367.0


In [844]:
bgf = lifetimes.BetaGeoFitter(penalizer_coef = 100) #100
bgf.fit(summary["frequency"], summary["recency"], summary["T"])
summary["expected_num_purchases"] = round(bgf.conditional_expected_number_of_purchases_up_to_time(180, summary["frequency"], summary["recency"], summary["T"]))
summary.head(2)

Unnamed: 0,CustomerID,frequency,recency,T,monetary_value,expected_num_purchases
1,12347,2.0,134.0,155.0,509.385,2.0
2,12348,1.0,70.0,156.0,367.0,1.0


In [845]:
ggf = lifetimes.GammaGammaFitter(penalizer_coef = 0.05)
ggf.fit(summary["frequency"], summary["monetary_value"])
summary["expected_revenue"] = ggf.conditional_expected_average_profit(summary["frequency"], summary["monetary_value"])
summary.head(2)

Unnamed: 0,CustomerID,frequency,recency,T,monetary_value,expected_num_purchases,expected_revenue
1,12347,2.0,134.0,155.0,509.385,2.0,688.686068
2,12348,1.0,70.0,156.0,367.0,1.0,765.382224


In [846]:
df2_transactions = wr.s3.read_csv(f"s3://{bucket}/data/transactions/transactions2")
df2_transactions = df2_transactions.groupby("CustomerID", as_index=False)["Revenue"].sum()
df2_transactions.columns = ["CustomerID", "MonetaryValue_y"]
df2_transactions.head(2)

Unnamed: 0,CustomerID,MonetaryValue_y
0,12347.0,2104.05
1,12348.0,310.0


In [847]:
summary = pd.merge(summary, df2_transactions[["CustomerID", "MonetaryValue_y"]], how="inner", on="CustomerID") 

In [848]:
summary["pred_MonetaryValue_y1"] = ggf.customer_lifetime_value(bgf,
                                                              summary["frequency"],
                                                              summary["recency"],
                                                              summary["T"],
                                                              summary.monetary_value,
                                                              time=6,
                                                              freq="D",
                                                              discount_rate = 0.01)

In [849]:
summary["pred_MonetaryValue_y2"] = summary.expected_num_purchases * summary.expected_revenue
summary = summary[["CustomerID", "frequency", "recency","T","monetary_value", "pred_MonetaryValue_y1", "pred_MonetaryValue_y2", "MonetaryValue_y"]]

In [850]:
r2 = r2_score(summary["pred_MonetaryValue_y2"], summary["MonetaryValue_y"])
print(f"R-squared value for the beta-geometric/negative-binomial and gamma-gamma model: {r2}")

R-squared value for the beta-geometric/negative-binomial and gamma-gamma model: 0.47079987397540213


In [851]:
summary.tail()

Unnamed: 0,CustomerID,frequency,recency,T,monetary_value,pred_MonetaryValue_y1,pred_MonetaryValue_y2,MonetaryValue_y
1168,18242,1.0,17.0,51.0,379.82,2382.946174,2376.050471,1538.41
1169,18245,1.0,101.0,150.0,501.38,1174.678417,1044.56768,1260.77
1170,18257,3.0,128.0,134.0,202.4,962.232167,981.088046,1402.03
1171,18272,1.0,21.0,84.0,340.72,1250.970792,1421.566742,2098.04
1172,18283,6.0,168.0,175.0,122.355,800.33205,805.130142,1252.3


## XGBoost

In [35]:
from sagemaker import image_uris

In [45]:
container = image_uris.retrieve("xgboost", region=region)

In [46]:
XGBoost = sagemaker.estimator.Estimator(container,
                                             role,
                                             instance_count = 1,
                                             instance_type = 'ml.m5.large',
                                             output_path = output_location,
                                             sagemaker_session = sess
                                             )
XGBoost.set_hyperparameters(predictor_type="regressor", l1=0.1, optimizer="adam")

In [48]:
LinearLearner.fit({'train':train_data})