#### imports

In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [4]:
# Load the raw marketing customer-value dataset. The path is relative, so it
# resolves against the directory the kernel is running in.
source_csv = '../data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv'
dataframe = pd.read_csv(source_csv)

In [5]:
# Separate the regression target from the predictors. Besides the target
# itself, 'Customer' and 'Effective To Date' are left out of the feature set
# (presumably an identifier and a date stamp -- confirm against the data).
target_name = 'Customer Lifetime Value'
excluded_columns = [target_name, 'Customer', 'Effective To Date']

y = dataframe[target_name]
x = dataframe.drop(columns=excluded_columns)


In [6]:
# One-hot encode every object/string-typed column of the feature frame and
# pass all remaining columns through unchanged.
categorical_features = x.select_dtypes(include=['object','string']).columns.tolist()
ct = ColumnTransformer(
    transformers=[
        # drop='first' omits the first category of each feature from the
        # encoded output.
        ('encoder', OneHotEncoder(drop='first'), categorical_features),
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) the
    # transformer switches to a SciPy sparse matrix whenever the stacked
    # output is sparse enough, and the StandardScaler applied in a later cell
    # centers the data, which is not supported for sparse input.
    sparse_threshold=0,
)

# NOTE: `x` is rebound from the feature DataFrame to the encoded matrix, so
# this cell cannot be re-run on its own: `select_dtypes` above needs the
# DataFrame produced by the previous cell.
# NOTE(review): the encoder is fit on all rows, i.e. before the train/test
# split performed in the next cell.
x = ct.fit_transform(x)



In [7]:
# Hold out 20% of the rows for evaluation. A fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both partitions so the test set never influences it.
# X_train / X_test are rebound to their scaled versions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [9]:
# The model is trained on the natural log of the target; predictions are
# mapped back with np.exp in the modelling cell.
# np.log only emits a RuntimeWarning for zero or negative input and returns
# -inf / NaN, which would silently corrupt training, so fail fast instead.
# The check also rejects missing targets, since NaN > 0 is False.
assert (y_train > 0).all(), "log transform requires strictly positive target values"
y_train_log = np.log(y_train)

In [11]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one place.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "n_jobs": -1,
    "random_state": 0,
}
model_xgb = XGBRegressor(**xgb_params)

# Fit against the log-scaled target, predict on the held-out rows, and undo
# the log so predictions are back on the original target scale.
model_xgb.fit(X_train, y_train_log)
y_pred_log = model_xgb.predict(X_test)
y_pred = np.exp(y_pred_log)

# Score on the original scale, against the untransformed test target.
test_r2 = r2_score(y_test, y_pred)
print("R2 score: " , test_r2)

R2 score:  0.6726145430267954


In [14]:
# Bundle everything needed to score new data with this model. At inference
# time the pieces are applied in the same order as in this notebook:
# encoder -> scaler -> model.
artifacts = {
    "model": model_xgb,
    "encoder": ct,
    "scaler": sc,
    # The model was trained on np.log(target), so its raw predictions are on
    # the log scale. Recording that here tells whoever loads the bundle to
    # apply np.exp to recover the original target scale.
    "target_transform": "log",
}

# Left as the last expression so the cell displays the written file name(s).
joblib.dump(artifacts, 'XGBoost_model.joblib')

['XGBoost_model.joblib']