### Customer Churn Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('ecommerce_customer_behavior_dataset_v2.csv')

df.head()

Unnamed: 0,Order_ID,Customer_ID,Date,Age,Gender,City,Product_Category,Unit_Price,Quantity,Discount_Amount,Total_Amount,Payment_Method,Device_Type,Session_Duration_Minutes,Pages_Viewed,Is_Returning_Customer,Delivery_Time_Days,Customer_Rating
0,ORD_000001-1,CUST_00001,2023-05-29,40,Male,Ankara,Books,29.18,1,0.0,29.18,Digital Wallet,Mobile,14,9,True,13,4
1,ORD_000001-2,CUST_00001,2023-10-12,40,Male,Ankara,Home & Garden,644.4,1,138.05,506.35,Credit Card,Desktop,14,8,True,6,2
2,ORD_000001-3,CUST_00001,2023-12-05,40,Male,Ankara,Sports,332.82,5,0.0,1664.1,Credit Card,Mobile,15,10,True,9,4
3,ORD_000002-1,CUST_00002,2023-05-11,33,Male,Istanbul,Food,69.3,5,71.05,275.45,Digital Wallet,Desktop,16,13,True,4,4
4,ORD_000002-2,CUST_00002,2023-06-16,33,Male,Istanbul,Beauty,178.15,3,0.0,534.45,Credit Card,Mobile,14,7,True,6,4


In [3]:
X = df.drop("Is_Returning_Customer", axis=1)
y = df["Is_Returning_Customer"]

In [4]:
# filtering Categorical and Numeric value

categorical_features = X.columns[X.dtypes == object].tolist()

numeric_features = X.columns[~(X.dtypes == object)].tolist()

In [5]:
X.head(2)

Unnamed: 0,Order_ID,Customer_ID,Date,Age,Gender,City,Product_Category,Unit_Price,Quantity,Discount_Amount,Total_Amount,Payment_Method,Device_Type,Session_Duration_Minutes,Pages_Viewed,Delivery_Time_Days,Customer_Rating
0,ORD_000001-1,CUST_00001,2023-05-29,40,Male,Ankara,Books,29.18,1,0.0,29.18,Digital Wallet,Mobile,14,9,13,4
1,ORD_000001-2,CUST_00001,2023-10-12,40,Male,Ankara,Home & Garden,644.4,1,138.05,506.35,Credit Card,Desktop,14,8,6,2


In [6]:
y

0         True
1         True
2         True
3         True
4         True
         ...  
17044    False
17045     True
17046     True
17047     True
17048     True
Name: Is_Returning_Customer, Length: 17049, dtype: bool

In [7]:
y.unique()

array([ True, False])

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y = le.fit_transform(y)

print(np.unique(y))

[0 1]


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 24
)

In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transform = Pipeline(
    steps = [
       ("scaler", StandardScaler())
    ]
)

categorical_transformer = Pipeline(steps = [
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers = [
    ("num", numeric_transform, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [11]:
# KNN Pipeline

from sklearn.neighbors import KNeighborsClassifier

Knn_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("knn", KNeighborsClassifier(n_neighbors = 4))
])

Knn_pipeline.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score

knn_pred = Knn_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, knn_pred))

Accuracy: 0.8355816226783969


In [13]:
import xgboost as xgb

xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", xgb.XGBClassifier(max_depth=3))
])

xgb_pipeline.fit(X_train, y_train)

In [14]:
xgb_pred = xgb_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, xgb_pred))

Accuracy: 0.8813294232649072


In [15]:
from sklearn.metrics import classification_report

print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.25      0.00      0.01       603
           1       0.88      1.00      0.94      4512

    accuracy                           0.88      5115
   macro avg       0.57      0.50      0.47      5115
weighted avg       0.81      0.88      0.83      5115



In [16]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           0       0.15      0.09      0.11       603
           1       0.88      0.94      0.91      4512

    accuracy                           0.84      5115
   macro avg       0.52      0.51      0.51      5115
weighted avg       0.80      0.84      0.82      5115

