<a href="https://colab.research.google.com/github/vubanc/AWS_DeployedCLVPredictor/blob/main/AWS_DeployedCLVPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Customer Lifetime Value Prediction

## Importing Libraries and Initiating Sagemaker Session

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
#!pip install --disable-pip-version-check -q sagemaker==2.35.0
#!pip install -v protobuf==3.20.1
#!pip install awswrangler

In [60]:
import awswrangler as wr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
import sagemaker
import boto3
import botocore

config = botocore.config.Config()
sm = boto3.client(service_name='sagemaker', config=config)
sess = sagemaker.Session(sagemaker_client = sm)

role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_region_name

In [11]:
df_customers_uri = 's3://sagemaker-us-east-2-397738742408/data/customers/'
df_transactions_uri = 's3://sagemaker-us-east-2-397738742408/data/transactions/'
output_location = "s3://{}/data/output".format(bucket)

In [12]:
!aws s3 ls 's3://sagemaker-us-east-2-397738742408/data/transactions/'

                           PRE transactions1/
                           PRE transactions2/


In [13]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="top" href="https://s3.console.aws.amazon.com/s3/home?region={}#">Amazon S3 buckets</a></b>'.format(region)))

## Data Preparation

In [15]:
df_customers = wr.s3.read_csv(df_customers_uri)
df_customers["Country"] = df_customers.Country.apply(lambda x: 1 if x=="United Kingdom" else 0)
df_customers.drop(labels=["CustomerID"], axis=1).corr()

Unnamed: 0,Country,Recency,Frequency,DailySpending,DailyTransCount,MonetaryValue_x,MonetaryValue_y
Country,1.0,0.02426,-0.036808,-0.225447,-0.055746,-0.157125,-0.121992
Recency,0.02426,1.0,-0.255992,-0.057845,-0.019528,-0.387972,-0.261098
Frequency,-0.036808,-0.255992,1.0,0.205653,0.392446,0.452922,0.326404
DailySpending,-0.225447,-0.057845,0.205653,1.0,0.291026,0.417974,0.327666
DailyTransCount,-0.055746,-0.019528,0.392446,0.291026,1.0,0.246816,0.195491
MonetaryValue_x,-0.157125,-0.387972,0.452922,0.417974,0.246816,1.0,0.616622
MonetaryValue_y,-0.121992,-0.261098,0.326404,0.327666,0.195491,0.616622,1.0


In [16]:
x = df_customers [["Country", "Recency", "Frequency", "DailySpending", "DailyTransCount", "MonetaryValue_x"]]
y = df_customers[["MonetaryValue_y"]]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.15, random_state=3)

train_x.reset_index(inplace=True, drop=True)
train_y.reset_index(inplace=True, drop=True)
test_x.reset_index(inplace=True, drop=True)
test_y.reset_index(inplace=True, drop=True)

df_train = pd.concat([train_y, train_x], axis=1)
df_train.to_csv("df_train.csv", index=False, header=False)

df_test = pd.concat([test_y, test_x], axis=1)
df_test.to_csv("df_test.csv", index=False, header=False)

In [17]:
train_uri = sess.upload_data(bucket=bucket, key_prefix="data/train/train.csv", path="df_train.csv")
test_uri = sess.upload_data(bucket=bucket, key_prefix="data/test/test.csv", path="df_test.csv")

In [44]:
train_data = sagemaker.inputs.TrainingInput(s3_data='s3://{}/data/train'.format(bucket), content_type='text/csv')

## Lasso Regression

A lasso regression (L1 regularization) is fit to the data before running any of the models to check the effect size and direction of the features on the lifetime value. Moreover, this step would tease out variables that can be discarded from further consideration because L1 regularization can reduce coefficientsto 0.

In [18]:
from sklearn.linear_model import Lasso

In [21]:
lasso = Lasso(alpha=0.05)
lasso.fit(train_x, train_y)

In [22]:
lasso.coef_

array([-45.14886414,  -0.64533434,   0.45166943,   0.13023463,
         0.88412659,   0.63439208])

## Econometric Model

In [25]:
#!pip install lifetimes
import lifetimes

In [157]:
df1_transactions = wr.s3.read_csv(f"s3://{bucket}/data/transactions/transactions1")
df1_transactions.head(2)

Unnamed: 0,CustomerID,InvoiceDate,Revenue
0,13313.0,2011-01-04,19.5
1,13313.0,2011-01-04,10.5


In [158]:
summary = lifetimes.utils.summary_data_from_transaction_data(df1_transactions, "CustomerID", "InvoiceDate", "Revenue")
summary.reset_index(inplace=True)
summary.CustomerID = summary.CustomerID.astype(int) 
summary = summary[summary.monetary_value > 0]
summary.head()

Unnamed: 0,CustomerID,frequency,recency,T,monetary_value
1,12347,2.0,134.0,155.0,509.385
2,12348,1.0,70.0,156.0,367.0
4,12352,3.0,34.0,134.0,421.77
8,12356,1.0,80.0,163.0,481.46
9,12359,2.0,142.0,169.0,1474.115


In [159]:
bgf = lifetimes.BetaGeoFitter(penalizer_coef = 0.2)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

<lifetimes.BetaGeoFitter: fitted with 1384 subjects, a: 0.00, alpha: 34.75, b: 0.01, r: 0.81>

In [160]:
ggf = lifetimes.GammaGammaFitter(penalizer_coef = 0.01)
ggf.fit(summary["frequency"], summary["monetary_value"])

<lifetimes.GammaGammaFitter: fitted with 1384 subjects, p: 3.77, q: 0.33, v: 3.63>

In [161]:
summary["pred_MonetaryValue_y"] = ggf.customer_lifetime_value(bgf,
                                                             summary["frequency"],
                                                             summary["recency"],
                                                             summary["T"],
                                                             summary["monetary_value"],
                                                             time = 6,
                                                             freq = "D",
                                                             discount_rate = 0.001)

In [162]:
df2_transactions = wr.s3.read_csv(f"s3://{bucket}/data/transactions/transactions2")
df2_transactions = df2_transactions.groupby("CustomerID", as_index=False)["Revenue"].sum()
df2_transactions.columns = ["CustomerID", "MonetaryValue_y"]
df2_transactions.head()

Unnamed: 0,CustomerID,MonetaryValue_y
0,12347.0,2104.05
1,12348.0,310.0
2,12352.0,944.23
3,12356.0,58.35
4,12359.0,2876.85


In [163]:
summary = pd.merge(summary, df2_transactions[["CustomerID", "MonetaryValue_y"]], how="inner", on="CustomerID") 

In [164]:
summary.tail(2)

Unnamed: 0,CustomerID,frequency,recency,T,monetary_value,pred_MonetaryValue_y,MonetaryValue_y
1171,18272,1.0,21.0,84.0,340.72,912.697706,2098.04
1172,18283,6.0,168.0,175.0,122.355,737.896449,1252.3


In [166]:
#summary_test_sample = summary.sample(round(0.15*summary.shape[0]))
r2 = r2_score(summary["pred_MonetaryValue_y"], summary["MonetaryValue_y"])
print(f"R-squared value for the beta-geometric/negative-binomial and gamma-gamma model: {r2}")

R-squared value for the beta-geometric/negative-binomial and gamma-gamma model: 0.18379851255674473


## XGBoost

In [35]:
from sagemaker import image_uris

In [45]:
container = image_uris.retrieve("xgboost", region=region)

In [46]:
XGBoost = sagemaker.estimator.Estimator(container,
                                             role,
                                             instance_count = 1,
                                             instance_type = 'ml.m5.large',
                                             output_path = output_location,
                                             sagemaker_session = sess
                                             )
XGBoost.set_hyperparameters(predictor_type="regressor", l1=0.1, optimizer="adam")

In [48]:
LinearLearner.fit({'train':train_data})