## **Import Dataset and Libraries**

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [None]:
data = pd.read_excel('/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx', sheet_name='E Comm')
data_desc = pd.read_excel('/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx', sheet_name='Data Dict', header=1, usecols=[1,2,3])
data_desc

## **Diving To The Dataset**

In [None]:
# Let's take a peak what kind of data we have.
data.head()

> From 5 row above we already see some null values. We'll handle that missing values later

In [6]:
# I wonder how many rows and columns in this dataset and their dtypes
print('Data shape: %d rows and %d cols.' % data.shape)

data.info()

Data shape: 5630 rows and 19 cols.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Churn                        5630 non-null   int64  
 1   Tenure                       5366 non-null   float64
 2   PreferredLoginDevice         5630 non-null   object 
 3   CityTier                     5630 non-null   int64  
 4   WarehouseToHome              5379 non-null   float64
 5   PreferredPaymentMode         5630 non-null   object 
 6   Gender                       5630 non-null   object 
 7   HourSpendOnApp               5375 non-null   float64
 8   NumberOfDeviceRegistered     5630 non-null   int64  
 9   PreferedOrderCat             5630 non-null   object 
 10  SatisfactionScore            5630 non-null   int64  
 11  MaritalStatus                5630 non-null   object 
 12  NumberOfAddress              5630 non-nul

>There are 5.630 rows and 20 cols.

In [7]:
# Next question, how many missing values in each column? 

total_na = data.isnull().sum().sum()
print(f"Total missing values: {total_na} ({ round((total_na/data.shape[0])*100, 2) }%)")

data.isnull().sum()

Total missing values: 1856 (32.97%)


Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64

> We have total 1.856 missing values, it's about 32.97% from total data.

In [None]:
# Before we handle missing values, let's see the statistical summary of this dataset
data.describe().transpose()

> In summary, the mean and median in each column are so close from each other. So I decided to filling all the missing values with median, since some feature will make more sense if they had integer values like Complain, CouponUsed, OrderCount, etc. *(like there's no such things like "I used coupon 1.75 times" or "User has 0.2 complained")*

In [8]:
# Now let's fill those missing values
for col in data.columns:
    if data[col].isnull().sum() > 0:
        data[col].fillna(data[col].median(), inplace=True)
        
# Done. Checking the missing values for the last time
data.isnull().sum()

Churn                          0
Tenure                         0
PreferredLoginDevice           0
CityTier                       0
WarehouseToHome                0
PreferredPaymentMode           0
Gender                         0
HourSpendOnApp                 0
NumberOfDeviceRegistered       0
PreferedOrderCat               0
SatisfactionScore              0
MaritalStatus                  0
NumberOfAddress                0
Complain                       0
OrderAmountHikeFromlastYear    0
CouponUsed                     0
OrderCount                     0
DaySinceLastOrder              0
CashbackAmount                 0
dtype: int64

## **Data Exploration and Visualization**

### The Percentage of Customer Churn

In [None]:
# First, what is the percentage of customer churn
labels = ['Retained','Churn']
counts = data['Churn'].value_counts()
# print(counts)

plt.figure(figsize=(6,6))
plt.title('Percentage of Customer Churn')
plt.pie(counts, autopct='%1.1f%%', labels=labels, textprops={"fontsize":14})
plt.show()

### Customer Churn based on Gender

In [None]:
# Which gender most like to churn?
sns.countplot(x=data['Churn'], hue=data['Gender'])
plt.title('Customer Churn based on Gender')
plt.show()

> Male customer tends to churn than female, but also likely to retained.

### Distribution of Order Count of Customers

In [None]:
sns.countplot(x=data['OrderCount'], hue=data['Churn'])
plt.title("Distribution of Customer Orders")
plt.show()

> Most customers only ordered 1-2 times. 

### Customer Preferred Login Device

In [None]:
sns.countplot(x=data['PreferredLoginDevice'], hue=data['Churn'])
plt.title('Customer Preferred Login Device')
plt.show()

> Churn: The number of mobile phone, phone, and computer is likely close with average 316 customer.

### Customer Preferred Payment

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x=data['PreferredPaymentMode'])
plt.title('Customer Preferred Payment')
plt.show()

> Customers prefer to make payments with debit and credit cards.

### Customer Preferred Order Category

In [None]:
plt.figure(figsize=(8,3))
sns.countplot(x=data['PreferedOrderCat'])
plt.title('Customer Preferred Order Category')
plt.show()

> In this E-commerce, customers order category for Laptop & Accessory is higher than other categories.

### Customer Distance from Warehouse to Home

In [None]:
plt.figure(figsize=(15,5))
sns.countplot(x=data['WarehouseToHome'], hue=data['Churn'])
plt.title('Distance Between Warehouse to Customer Home')
plt.show()

> This figure shows there is no correlation between warehouse to home distance with customer churn. But we'll confirm this with correlation matrix.

### Customer City Tier

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x=data['CityTier'])
plt.title('Customer City Tier')
plt.show()

> City Tier 2, has low number customer.

### Customer Hour Spend On App 

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x=data['HourSpendOnApp'])
plt.title('Customer App Hour Spend')
plt.show()

> People like to spend 2-4 hours on this E-commerce app

### Customers Satisfaction Score

In [None]:
average_score = data['SatisfactionScore'].mean()
print('Average Satisfaction Score:',average_score)

plt.figure(figsize=(5,5))
sns.countplot(x=data['SatisfactionScore'], hue=data['Churn'])
plt.title('Customer Satisfaction Score')
plt.show()

> From scale 1-5 the average satisfaction score is 3.06. Customers satisfaction are neutral.

### Customer Marital Status

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x=data['MaritalStatus'], hue=data['Churn'])
plt.title('Customer Marital Status')
plt.show()

### Customer Complains

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x=data['Complain'], hue=data['Churn'])
plt.title('Customer Complain')
plt.show()

> Customers who churn more likely complain in their last month.

### Customer Order

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x=data['OrderCount'], hue=data['Churn'])
plt.title('Customer Order')
plt.show()

> From figure above notice that high number in their 1st and 2nd order, this means customer churn after their first or second order.

### Customer Used Coupon

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x=data['CouponUsed'], hue=data['Churn'])
plt.title('Customer Used Coupon')
plt.show()

> 

### Customer Day Since Last Order

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x=data['DaySinceLastOrder'], hue=data['Churn'])
plt.title('Customer - Day Since Last Order')
plt.show()

> We notice customers leave e-commerce after a week since their last order

In [None]:
plt.figure(figsize=(15,8))
sns.heatmap(data.corr(numeric_only=True), annot=True)
plt.title("Correlation Matrix for the Customer Dataset")
plt.show()

### Findings from analysis:
> 1. There are 4.682 users (82.2%) retained, while 948 users (16.8%) has churned.
2. Customers spend 2-4 hours in the app.
3. Customers mostly only order 1-2 times. 
4. Warehouse to home distance doesn't affect customer churn.
5. Customers who complain and also with single status have higher churn rate
6. From correlation matrix, correlation between WarehouseToHome and Churn is 0.07 (0.069544 to be more precise). This means weak or no correlation.

## **Data Preprocessing**

In [None]:
# Create function to convert column with dtype object to integer
def object_to_int(x):
    if x.dtype == 'object':
        x = LabelEncoder().fit_transform(x)
    return x

# Convert column dtype object to int
data = data.apply(lambda x : object_to_int(x))

Since this dataset 'Churn' feature only had 948 (16.8%) customer churn with total 5.630 customers, this make our 'Churn' feature imbalance. In this case, I will apply oversampling method to balance the dataset.

In [None]:
# balance dataset with oversampling
from sklearn.utils import resample
data_0 = data.loc[data['Churn'] == 0]
data_1 = data.loc[data['Churn'] == 1]

# Count total each churn feature
n_0 = len(data_0)
n_1 = len(data_1)

# Applying oversampling method
data_oversampling = resample(data_1, replace=True, n_samples=n_0, random_state=42)
data_new = pd.concat([data_oversampling, data_0])
data_new = data_new.sample(frac=1)
data_new.reset_index(drop=True, inplace=True)

In [None]:
# Check total of churn data after apply oversampling method
data_new['Churn'].value_counts()

> Now our dataset have balance churn feature. Let's build Maching Learning Model!!

In [None]:
# preferredlogindevice 
# mobile phone, phone, computer
# [1, 2, 0]

# PreferedOrderCat
# ['Laptop & Accessory', 'Mobile', 'Mobile Phone', 'Others','Fashion', 'Grocery']
# [2, 3, 4, 5, 0, 1]

# preferred payment mode
# ['Debit Card', 'UPI', 'CC', 'Cash on Delivery', 'E wallet', 'COD','Credit Card']
# [4, 6, 0, 2, 5, 1, 3]

# gender
# female male
# [0, 1]

# marital status
# ['Single', 'Divorced', 'Married']
# [2, 0, 1]

## **Machine Learning Model**
In this section, I will implementing Logistic Regression, KNN and SVC to compare each model and to see which model have the better accuracy to predict customer churn.

In [None]:
# First we need to the data into 70% training dataset and 30% testing dataset
X = data_new.drop(['Churn'],axis=1)
y = data_new['Churn']

from  sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)
logreg_prediction = logreg.predict(X_test)
print('Accuracy Score: %s ' % accuracy_score(y_test, logreg_prediction))
print(classification_report(y_test, logreg_prediction))

### K-Nearest Neighbor (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_prediction = logreg.predict(X_test)
print('Accuracy Score: %s ' % accuracy_score(y_test, knn_prediction))
print(classification_report(y_test, knn_prediction))

### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
svc_prediction = svc.predict(X_test)
print('Accuracy Score: %s' % accuracy_score(y_test, svc_prediction))
print(classification_report(y_test, svc_prediction))

## **Conclusion**
After build 3 Machine Learning Model and also testing it. Here's the summary of each models result:
   * **Logistic Regression:** 
      - Accuracy Score: 0.79
      - Customer Churn Precision: 0.80
      - Customer Churn Recall: 0.80
      
      
   * **K-Nearest Neighbor (KNN):**
      - Accuracy Score: 0.79
      - Customer Churn Precision: 0.80
      - Customer Churn Recall: 0.80
      
      
   * **Support Vector Machine (SVM):**
      - Accuracy Score: 0.76
      - Customer Churn Precision: 0.75
      - Customer Churn Recall: 0.82
   
This models can be improved by tuning the hyperparameter to make prediction more accurate. 

**Thank you!**