In [2]:
import pandas as pd

url = "https://raw.githubusercontent.com/sushkbura/Explore_ChurnDataSet/refs/heads/main/tea_churn.csv"
df = pd.read_csv(url)

print(df.shape)
df.head()

(30801, 15)


Unnamed: 0,custid,retained,created,firstorder,lastorder,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city
0,6H6T6N,0,9/28/12,8/11/13,8/11/13,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL
1,APCENR,1,12/19/10,4/1/11,1/19/14,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL
2,7UP6MS,0,10/3/10,12/1/10,7/6/11,0,0.0,0.0,33.58,0.059908,0,0,0,Wednesday,DEL
3,7ZEW8G,0,10/22/10,3/28/11,3/28/11,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM
4,8V726M,1,11/27/10,11/29/10,1/28/13,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM


In [4]:
#convert date columns to datetime

date_cols = ["created", "firstorder", "lastorder"]

for col in date_cols:
    df[col] = pd.to_datetime(
        df[col],
        format="%m/%d/%y",
        errors="coerce"
    )

In [5]:
df[date_cols].dtypes

created       datetime64[ns]
firstorder    datetime64[ns]
lastorder     datetime64[ns]
dtype: object

In [7]:
#drop rows missing dates
df = df.dropna(subset=["created", "firstorder", "lastorder"]).copy()

In [32]:
#Feature Engineering (these are the same features I added in my data wrangler workflow)
# reference date for recency (reproducible)
as_of = df["lastorder"].max()

df["tenure_days"] = (df["lastorder"] - df["created"]).dt.days
df["time_to_first_order"] = (df["firstorder"] - df["created"]).dt.days
df["recency_days"] = (as_of - df["lastorder"]).dt.days

df["subscription_score"] = df[["paperless", "refill", "doorstep"]].sum(axis=1)
df["digital_customer"] = ((df["paperless"] == 1) | (df["eopenrate"] >= 50)).astype(int)
df["is_weekend"] = df["favday"].isin(["Saturday", "Sunday"]).astype(int)
df["purchase_intensity"] = df["avgorder"] * df["ordfreq"]


In [33]:
#drop id and raw dates (although in data wrangler I kept custid and lastorder to overcome an error)
df_model = df.drop(columns=["custid", "created", "firstorder", "lastorder"])

In [34]:
# Confirm target exists
print("Target column:", "retained" in df_model.columns)
df_model.head()

Target column: True


Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,favday,city,tenure_days,time_to_first_order,recency_days,subscription_score,digital_customer,is_weekend,purchase_intensity
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,Monday,DEL,317,317,1624,0,1,0,0.0
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,Friday,DEL,1127,103,1463,3,1,0,15.201504
2,0,0,0.0,0.0,33.58,0.059908,0,0,0,Wednesday,DEL,276,59,2391,0,0,0,2.011705
3,0,0,0.0,0.0,54.96,0.0,0,0,0,Thursday,BOM,157,157,2491,0,0,0,0.0
4,1,30,90.0,13.333333,111.91,0.00885,0,0,0,Monday,BOM,793,2,1819,0,1,0,0.990354


In [35]:
# favday and city are text categories.
# Most ML models (like Logistic Regression) require numeric inputs.
df_model = pd.get_dummies(df_model, columns=["favday", "city"], drop_first=True)

print("Columns after encoding:", df_model.shape[1])
df_model.head()


Columns after encoding: 25


Unnamed: 0,retained,esent,eopenrate,eclickrate,avgorder,ordfreq,paperless,refill,doorstep,tenure_days,...,purchase_intensity,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA
0,0,29,100.0,3.448276,14.52,0.0,0,0,0,317,...,0.0,True,False,False,False,False,False,False,True,False
1,1,95,92.631579,10.526316,83.69,0.181641,1,1,1,1127,...,15.201504,False,False,False,False,False,False,False,True,False
2,0,0,0.0,0.0,33.58,0.059908,0,0,0,276,...,2.011705,False,False,False,False,False,True,False,True,False
3,0,0,0.0,0.0,54.96,0.0,0,0,0,157,...,0.0,False,False,False,True,False,False,True,False,False
4,1,30,90.0,13.333333,111.91,0.00885,0,0,0,793,...,0.990354,True,False,False,False,False,False,True,False,False


In [36]:
from sklearn.model_selection import train_test_split

X = df_model.drop("retained", axis=1)
y = df_model["retained"]

# Stratify keeps the same churn/retained ratio in both train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (24606, 24)
Test shape: (6152, 24)


In [37]:
#Built-in XGBoost expects CSV with label in the FIRST column, no header row
train_df = pd.concat([y_train, X_train], axis=1)
test_df  = pd.concat([y_test,  X_test], axis=1)

train_path = "train.csv"
test_path  = "test.csv"

train_df.to_csv(train_path, index=False, header=False)
test_df.to_csv(test_path, index=False, header=False)

print("Saved:", train_path, test_path)


Saved: train.csv test.csv


In [38]:
import sagemaker

#Upload train/test to S3 so SageMaker training can access it
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket = sess.default_bucket()
prefix = "tea-churn-xgboost"

train_s3_uri = sess.upload_data(train_path, bucket=bucket, key_prefix=f"{prefix}/data")
test_s3_uri  = sess.upload_data(test_path,  bucket=bucket, key_prefix=f"{prefix}/data")

print("Train S3:", train_s3_uri)
print("Test  S3:", test_s3_uri)


Train S3: s3://sagemaker-us-east-1-962586284284/tea-churn-xgboost/data/train.csv
Test  S3: s3://sagemaker-us-east-1-962586284284/tea-churn-xgboost/data/test.csv


In [39]:
from sagemaker.image_uris import retrieve
from sagemaker.estimator import Estimator

#Get the built-in XGBoost container image for your region

xgb_image = retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.7-1",         # common stable version
    image_scope="training"
)

# Choose a training instance type.

xgb = Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sess,
)

# XGBoost binary classification
xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=200,
    max_depth=5,
    eta=0.2,
    subsample=0.8,
    colsample_bytree=0.8
)

print("XGBoost image:", xgb_image)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


XGBoost image: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1


In [40]:
from sagemaker.inputs import TrainingInput

#Train with train + validation channels
xgb.fit(
    {
        "train": TrainingInput(train_s3_uri, content_type="text/csv"),
        "validation": TrainingInput(test_s3_uri, content_type="text/csv"),
    }
)


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-12-15-01-19-05-133


2025-12-15 01:19:06 Starting - Starting the training job...
2025-12-15 01:19:22 Starting - Preparing the instances for training...
2025-12-15 01:19:45 Downloading - Downloading input data...
2025-12-15 01:20:30 Downloading - Downloading the training image......
  import pkg_resources[0m
[34m[2025-12-15 01:21:45.127 ip-10-0-220-12.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-12-15 01:21:45.194 ip-10-0-220-12.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-12-15:01:21:45:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-12-15:01:21:45:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-12-15:01:21:45:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-12-15:01:21:45:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-1

In [41]:
bool_cols = X_test.select_dtypes(include=["bool"]).columns.tolist()
bool_cols

['favday_Monday',
 'favday_Saturday',
 'favday_Sunday',
 'favday_Thursday',
 'favday_Tuesday',
 'favday_Wednesday',
 'city_BOM',
 'city_DEL',
 'city_MAA']

In [42]:
non_numeric = X_test.columns[~X_test.apply(lambda s: pd.api.types.is_numeric_dtype(s))]
print("Non-numeric columns:", list(non_numeric))

# Also specifically check bool columns
bool_cols = X_test.select_dtypes(include=["bool"]).columns.tolist()
print("Boolean columns:", bool_cols)

# Peek at a few rows of those columns to confirm if they contain True/False
if len(bool_cols) > 0:
    display(X_test[bool_cols].head())


Non-numeric columns: []
Boolean columns: ['favday_Monday', 'favday_Saturday', 'favday_Sunday', 'favday_Thursday', 'favday_Tuesday', 'favday_Wednesday', 'city_BOM', 'city_DEL', 'city_MAA']


Unnamed: 0,favday_Monday,favday_Saturday,favday_Sunday,favday_Thursday,favday_Tuesday,favday_Wednesday,city_BOM,city_DEL,city_MAA
26960,True,False,False,False,False,False,False,True,False
13318,False,False,False,True,False,False,True,False,False
15132,False,False,False,False,False,False,True,False,False
5408,False,False,False,False,True,False,False,True,False
25125,False,False,False,False,False,False,False,False,True


In [43]:
# Convert all boolean dummy columns to integers (0/1) for training + inference compatibility
for c in X_train.select_dtypes(include=["bool"]).columns:
    X_train[c] = X_train[c].astype(int)

for c in X_test.select_dtypes(include=["bool"]).columns:
    X_test[c] = X_test[c].astype(int)

print("Any bool left in X_train?", any(X_train.dtypes == "bool"))
print("Any bool left in X_test?", any(X_test.dtypes == "bool"))


Any bool left in X_train? False
Any bool left in X_test? False


In [44]:
# Recreate train/test DataFrames (label first)
train_df = pd.concat([y_train, X_train], axis=1)
test_df  = pd.concat([y_test,  X_test], axis=1)

train_path = "train.csv"
test_path = "test.csv"

train_df.to_csv(train_path, index=False, header=False)
test_df.to_csv(test_path, index=False, header=False)

sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "tea-churn-xgboost"

train_s3_uri = sess.upload_data(train_path, bucket=bucket, key_prefix=f"{prefix}/data")
test_s3_uri  = sess.upload_data(test_path,  bucket=bucket, key_prefix=f"{prefix}/data")

print("Train S3:", train_s3_uri)
print("Test  S3:", test_s3_uri)


Train S3: s3://sagemaker-us-east-1-962586284284/tea-churn-xgboost/data/train.csv
Test  S3: s3://sagemaker-us-east-1-962586284284/tea-churn-xgboost/data/test.csv


In [45]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

preds = predictor.predict(X_test.iloc[:5].to_numpy())
preds


{'predictions': [{'score': 0.9997170567512512},
  {'score': 0.9976848363876343},
  {'score': 0.9885953068733215},
  {'score': 0.999872088432312},
  {'score': 0.9980908036231995}]}

In [49]:
import json
import numpy as np

def extract_probs(resp):
    # If it's bytes, decode
    if isinstance(resp, (bytes, bytearray)):
        resp = resp.decode("utf-8")

    # If it's a JSON string, parse it
    if isinstance(resp, str):
        resp = json.loads(resp)

    # Now resp should be a dict or list
    if isinstance(resp, dict) and "predictions" in resp:
        preds = resp["predictions"]
        # predictions might be list[float] or list[dict]
        if len(preds) > 0 and isinstance(preds[0], dict) and "score" in preds[0]:
            return [p["score"] for p in preds]
        return preds

    # If endpoint returns a bare list
    if isinstance(resp, list):
        return resp

    raise ValueError(f"Unexpected response format: {type(resp)} | {resp}")


In [50]:
from sklearn.metrics import roc_auc_score

batch_size = 200
preds = []

X_test_np = X_test.to_numpy()

for i in range(0, len(X_test_np), batch_size):
    batch = X_test_np[i:i+batch_size]
    resp = predictor.predict(batch)
    preds.extend(extract_probs(resp))

preds = np.array(preds, dtype=float)
y_true = y_test.to_numpy().astype(int)

auc = roc_auc_score(y_true, preds)
print("AUC ROC (XGBoost Endpoint):", auc)

AUC ROC (XGBoost Endpoint): 0.9843756474134537
