In [47]:
#!pip install scikit-learn
#!pip install xgboost

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
#from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pickle

## Step 1: Load Data

In [49]:
print("Loading dataset...")
df = pd.read_csv('/Users/usamahameed/Downloads/Ecommerce Churn Project/data/fact_customer.csv')
print("Dataset loaded successfully!\n")

Loading dataset...
Dataset loaded successfully!



In [50]:
df.head()

Unnamed: 0.1,Unnamed: 0,invoice_no,category,item,quantity,price,location_id,customer_id,invoice_date,payment_id,...,dob,email,phone_number,is_churned,days_since_last_purchase,tenure,discount_used,last_purchase_date,purchase_frequency,avg_purchase_value
0,0,100000,Hair Care,Hair Oil,1,17.45,1,398,2023-03-22,2,...,1999-07-13,michealbender@gmail.com,(309)938-3205x8088,1.0,1056.0,8.0,0.0,2021-08-15,30.0,280.82
1,1,100001,Body Care and Hygiene,Deodorant,4,20.81,21,938,2024-04-06,9,...,1995-01-18,nicholasclark@flowers.net,(363)377-7602,1.0,1494.0,1.0,0.0,2020-01-19,30.0,52.4
2,2,100002,Skin Care,Face Wash,4,40.96,17,258,2022-05-31,10,...,2002-10-11,geraldcastro@mosley.com,661-041-4070x466,0.0,386.0,6.0,1.0,2021-04-06,42.0,165.9
3,3,100003,Body Care and Hygiene,Body Lotion,2,27.5,9,892,2021-01-01,10,...,1992-12-29,vguerrero@green.org,712-518-4596x428,0.0,482.0,3.0,1.0,2022-05-17,28.0,427.81
4,4,100004,Oral Care,Toothpaste,1,16.43,55,978,2020-02-10,7,...,1989-08-17,qfloyd@gmail.com,249-201-6223,0.0,1309.0,2.0,1.0,2023-01-28,31.0,321.2


## Step 2: Initial Data Exploration

In [51]:
print("Displaying first few rows of the dataset:")
print(df.head())

print("\nDataset Info:")
print(df.info())

print("\nChecking for missing values:")
print(df.isnull().sum())

print("\nSummary Statistics:")
print(df.describe())

print("\nChecking data types:")
print(df.dtypes)

print("\nChecking for duplicate rows:")
print(df.duplicated().sum())

print("\nChecking unique values in categorical columns:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"{col}: {df[col].nunique()} unique values")

Displaying first few rows of the dataset:
   Unnamed: 0  invoice_no               category         item  quantity  \
0           0      100000              Hair Care     Hair Oil         1   
1           1      100001  Body Care and Hygiene    Deodorant         4   
2           2      100002              Skin Care    Face Wash         4   
3           3      100003  Body Care and Hygiene  Body Lotion         2   
4           4      100004              Oral Care   Toothpaste         1   

   price  location_id  customer_id invoice_date  payment_id  ...         dob  \
0  17.45            1          398   2023-03-22           2  ...  1999-07-13   
1  20.81           21          938   2024-04-06           9  ...  1995-01-18   
2  40.96           17          258   2022-05-31          10  ...  2002-10-11   
3  27.50            9          892   2021-01-01          10  ...  1992-12-29   
4  16.43           55          978   2020-02-10           7  ...  1989-08-17   

                       ema

In [52]:
df.columns

Index(['Unnamed: 0', 'invoice_no', 'category', 'item', 'quantity', 'price',
       'location_id', 'customer_id', 'invoice_date', 'payment_id',
       'payment_method', 'card_type', 'Unnamed: 3', 'shopping_mall', 'city',
       'province_state', 'country', 'first_name', 'last_name', 'gender', 'age',
       'dob', 'email', 'phone_number', 'is_churned',
       'days_since_last_purchase', 'tenure', 'discount_used',
       'last_purchase_date', 'purchase_frequency', 'avg_purchase_value'],
      dtype='object')

## Step 3: Data Preprocessing

In [25]:
print("Cleaning and processing data...")

# Drop unnecessary columns
df.drop(columns=['Unnamed: 3'], inplace=True, errors='ignore')

Cleaning and processing data...


In [26]:
# Handle missing values
print("Filling missing values...")
df.fillna(method='ffill', inplace=True)


Filling missing values...


  df.fillna(method='ffill', inplace=True)


In [57]:
# Convert dates to datetime format
print("Converting date columns...")
df['invoice_date'] = pd.to_datetime(df['invoice_date'])
df['last_purchase_date'] = pd.to_datetime(df['last_purchase_date'])


Converting date columns...


In [53]:
df['invoice_date']

0       2023-03-22
1       2024-04-06
2       2022-05-31
3       2021-01-01
4       2020-02-10
           ...    
9995    2020-05-07
9996    2021-09-22
9997    2021-02-25
9998    2023-06-24
9999    2025-02-12
Name: invoice_date, Length: 10000, dtype: object

In [54]:
df['last_purchase_date']

0       2021-08-15
1       2020-01-19
2       2021-04-06
3       2022-05-17
4       2023-01-28
           ...    
9995    2024-03-29
9996    2024-06-26
9997    2023-10-26
9998    2020-09-30
9999    2023-07-27
Name: last_purchase_date, Length: 10000, dtype: object

In [58]:

# Feature Engineering
print("Performing feature engineering...")
df['recency'] = (df['invoice_date'].max() - df['last_purchase_date']).dt.days
df['purchase_per_tenure'] = df['purchase_frequency'] / (df['tenure'] + 1)
df['discount_ratio'] = df['discount_used'] / (df['price'] + 1)


Performing feature engineering...


In [59]:
df[['recency', 'purchase_per_tenure', 'discount_ratio']].head()

Unnamed: 0,recency,purchase_per_tenure,discount_ratio
0,1279.0,3.333333,0.0
1,1853.0,15.0,0.0
2,1410.0,6.0,0.023832
3,1004.0,7.0,0.035088
4,748.0,10.333333,0.057372


In [61]:
df[['category','item', 'payment_method', 'card_type', 'shopping_mall', 'city', 'province_state', 'country', 'gender']]

Unnamed: 0,category,item,payment_method,card_type,shopping_mall,city,province_state,country,gender
0,Hair Care,Hair Oil,Credit Card - MasterCard,MasterCard,Mall of America,Bloomington,Minnesota,US,Female
1,Body Care and Hygiene,Deodorant,Apple Pay,,Fashion Show Mall,Las Vegas,Nevada,US,Male
2,Skin Care,Face Wash,Google Pay,,Lakeside Shopping Centre,Thurrock,Essex,UK,Female
3,Body Care and Hygiene,Body Lotion,Google Pay,,Bullring & Grand Central,Birmingham,West Midlands,UK,Other
4,Oral Care,Toothpaste,Cash,,The Galleria at Fort Lauderdale,Fort Lauderdale,Florida,US,Other
...,...,...,...,...,...,...,...,...,...
9995,Hair Care,Conditioner,PayPal,,Metrocentre,Gateshead,Tyne and Wear,UK,Female
9996,Skin Care,Face Wash,Credit Card - AMEX,AMEX,The Gardens Mall,Palm Beach Gardens,Florida,US,Female
9997,Body Care and Hygiene,Hand Sanitizer,Apple Pay,,The Shops at La Cantera,San Antonio,Texas,US,Female
9998,Skin Care,Face Wash,Debit Card - MasterCard,MasterCard,Lenox Square,Atlanta,Georgia,US,Male


In [62]:
# Encode categorical variables
print("Encoding categorical features...")
label_encoders = {}
categorical_cols = ['category','item','location_id', 'payment_method', 'card_type', 'shopping_mall', 'city', 'province_state', 'country', 'gender']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

Encoding categorical features...


In [63]:
df[['category','item', 'payment_method', 'card_type', 'shopping_mall', 'city', 'province_state', 'country', 'gender']]

Unnamed: 0,category,item,payment_method,card_type,shopping_mall,city,province_state,country,gender
0,1,4,4,2,17,5,16,3,0
1,0,2,0,4,12,24,17,3,1
2,3,3,8,4,15,47,8,2,0
3,0,0,8,4,7,4,30,2,2
4,2,11,2,4,38,18,9,3,2
...,...,...,...,...,...,...,...,...,...
9995,1,1,9,4,19,21,27,2,0
9996,3,3,3,1,39,36,9,3,0
9997,0,5,0,4,46,40,26,3,0
9998,3,3,6,2,16,1,10,3,1


In [65]:
df[['quantity', 'price', 'days_since_last_purchase', 'tenure', 'discount_used', 'purchase_frequency', 'avg_purchase_value', 'recency', 'purchase_per_tenure', 'discount_ratio']]

Unnamed: 0,quantity,price,days_since_last_purchase,tenure,discount_used,purchase_frequency,avg_purchase_value,recency,purchase_per_tenure,discount_ratio
0,1,17.45,1056.0,8.0,0.0,30.0,280.82,1279.0,3.333333,0.000000
1,4,20.81,1494.0,1.0,0.0,30.0,52.40,1853.0,15.000000,0.000000
2,4,40.96,386.0,6.0,1.0,42.0,165.90,1410.0,6.000000,0.023832
3,2,27.50,482.0,3.0,1.0,28.0,427.81,1004.0,7.000000,0.035088
4,1,16.43,1309.0,2.0,1.0,31.0,321.20,748.0,10.333333,0.057372
...,...,...,...,...,...,...,...,...,...,...
9995,4,30.30,1819.0,7.0,1.0,22.0,253.78,322.0,2.750000,0.031949
9996,4,91.11,1010.0,6.0,0.0,24.0,395.23,233.0,3.428571,0.000000
9997,2,4.13,1447.0,9.0,0.0,16.0,210.32,477.0,1.600000,0.000000
9998,2,87.97,769.0,10.0,1.0,5.0,104.89,1598.0,0.454545,0.011240


In [66]:

# Normalize numerical features
print("Scaling numerical features...")
scaler = StandardScaler()
numerical_cols = ['quantity', 'price', 'days_since_last_purchase', 'tenure', 'discount_used', 'purchase_frequency', 'avg_purchase_value', 'recency', 'purchase_per_tenure', 'discount_ratio']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

Scaling numerical features...


In [67]:
df[['quantity', 'price', 'days_since_last_purchase', 'tenure', 'discount_used', 'purchase_frequency', 'avg_purchase_value', 'recency', 'purchase_per_tenure', 'discount_ratio']]

Unnamed: 0,quantity,price,days_since_last_purchase,tenure,discount_used,purchase_frequency,avg_purchase_value,recency,purchase_per_tenure,discount_ratio
0,-1.329069,-0.556498,0.276452,0.821493,-1.01147,0.321027,0.222337,0.618691,-0.373911,-0.623352
1,1.346191,-0.419132,1.112575,-1.585001,-1.01147,0.321027,-1.381148,1.702689,2.071938,-0.623352
2,1.346191,0.404655,-1.002549,0.133923,0.98866,1.143984,-0.584390,0.866084,0.185140,-0.115683
3,-0.437316,-0.145627,-0.819289,-0.897431,0.98866,0.183867,1.254192,0.099354,0.394784,0.124079
4,-1.329069,-0.598198,0.759418,-1.241216,0.98866,0.389607,0.505801,-0.384101,1.093598,0.598781
...,...,...,...,...,...,...,...,...,...,...
9995,1.346191,-0.031155,1.732986,0.477708,0.98866,-0.227611,0.032519,-1.188601,-0.496204,0.057216
9996,1.346191,2.454923,0.188640,0.133923,-1.01147,-0.090452,1.025484,-1.356677,-0.353945,-0.623352
9997,-0.437316,-1.101056,1.022854,1.165278,-1.01147,-0.639090,-0.272566,-0.895884,-0.737294,-0.623352
9998,-0.437316,2.326551,-0.271419,1.509063,0.98866,-1.393467,-1.012674,1.221122,-0.977432,-0.383925


## Step 4: Prepare Data for Model Training

In [68]:
X = df.drop(columns=['is_churned', 'invoice_no', 'customer_id', 'email', 'phone_number', 'location_id', 'payment_id',
                     'first_name', 'last_name', 'dob', 'invoice_date', 'last_purchase_date', 
                     'Unnamed: 0'], errors='ignore')

In [40]:
print("Preparing training and testing datasets...")
#X = df.drop(columns=['is_churned', 'invoice_no', 'customer_id', 'email', 'phone_number', 'first_name', 'last_name', 'dob'])
y = df['is_churned']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Preparing training and testing datasets...


In [46]:
X_train

Unnamed: 0,category,item,quantity,price,payment_method,card_type,shopping_mall,city,province_state,country,gender,age,days_since_last_purchase,tenure,discount_used,purchase_frequency,avg_purchase_value,recency,purchase_per_tenure,discount_ratio
9254,2,11,-0.437316,-0.757232,6,2,40,27,4,3,0,57.0,1.316434,0.821407,-1.011870,-1.530913,0.304646,-0.221635,-1.003058,-0.623628
1561,1,4,-0.437316,-0.125594,6,2,8,7,15,3,2,18.0,1.234366,-0.553678,-1.011870,-0.913795,-0.487269,0.135344,-0.569815,-0.623628
1670,3,10,-0.437316,0.463117,3,1,40,27,4,3,1,50.0,-0.244767,0.477636,-1.011870,-0.913795,-1.200850,1.102401,-0.758485,-0.623628
6087,0,0,-1.329069,-0.875383,8,3,24,1,10,3,0,34.0,-0.824969,-1.584991,0.988269,0.183305,1.522913,-0.718385,1.861931,1.376571
6669,2,8,-0.437316,-1.022152,4,2,40,27,4,3,1,35.0,0.577822,1.165178,0.988269,-1.462345,1.139449,0.671758,-0.989082,2.393670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,2,8,1.346191,-0.576939,6,2,32,19,26,3,0,53.0,-1.221950,-0.209907,-1.011870,-1.119501,0.196409,-1.749660,-0.758485,-0.623628
5191,3,7,-1.329069,2.078393,1,0,57,41,4,3,0,20.0,-1.231493,0.821407,0.988269,-0.845226,-0.500466,-0.059200,-0.770132,-0.366667
5390,1,4,-0.437316,0.526076,4,2,36,34,9,3,2,29.0,0.877465,-0.553678,0.988269,-0.776657,-1.336813,-0.484176,-0.485962,-0.149510
860,0,0,-1.329069,-0.947746,6,2,6,32,20,3,1,36.0,-0.282939,0.821407,-1.011870,0.594717,-0.059374,0.758642,-0.280987,-0.623628


## Step 5: Train ML Models

In [43]:
print("Training Logistic Regression Model...")
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

Training Logistic Regression Model...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Step 6: Model Evaluation

In [44]:
print("Evaluating Model...")
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Logistic Regression ROC-AUC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]))

Evaluating Model...
Logistic Regression Classification Report:
               precision    recall  f1-score   support

         0.0       0.53      0.65      0.58       970
         1.0       0.58      0.46      0.51      1030

    accuracy                           0.55      2000
   macro avg       0.55      0.55      0.55      2000
weighted avg       0.56      0.55      0.55      2000

Logistic Regression ROC-AUC: 0.5702992693424082


## Step 7: Save Model

In [69]:
print("Saving the model...")
with open('churn_prediction_model.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)
print("Model saved successfully!")

Saving the model...
Model saved successfully!
