In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

Loading the dataset:

In [3]:
data = pd.read_csv("customer_segmentation.csv")

### Basic Data Preprocessing

In [4]:
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


By looking at `data.head` we can see that the dataset given has 29 columns. Looking at the data, we can make an assumption that dataset represents personal data alongside with some kind of website activity.

In [5]:
data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [6]:
data.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

As there are only 24 null values, it seems logical to just drop the rows with missing data.

In [7]:
data.dropna(inplace=True)

In [8]:
data.isnull().sum().sum()

0

In [9]:
data["Education"].value_counts()

Education
Graduation    1116
PhD            481
Master         365
2n Cycle       200
Basic           54
Name: count, dtype: int64

In [56]:
data["Marital_Status"].value_counts()

Marital_Status
Married     857
Together    572
Single      470
Divorced    231
Widow        76
Alone         3
Absurd        2
YOLO          2
Name: count, dtype: int64

In [10]:
data["Dt_Customer"] = pd.to_datetime(data["Dt_Customer"], dayfirst=True)

In [11]:
from datetime import datetime

In [12]:
data["Age"] = datetime.now().year - data["Year_Birth"]

In [13]:
data["Total_Children"] = data["Kidhome"] + data["Teenhome"]

In [14]:
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Total_Children'],
      dtype='object')

In [15]:
spending_cols = ['MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds']

In [16]:
data["Total_spending"] = data[spending_cols].sum(axis=1)

In [17]:
data["Customer_since"] = (datetime.now() - data["Dt_Customer"]).dt.days

### Data Visualization

#### Distribution

In [18]:
px.histogram(data, x="Age", nbins=30, title="Age Distribution").show()

We can see that some data show ages of 125+, which seems like an input error or a purposefully false data. I suggest we drop these rows.

In [19]:
data = data[data["Age"] <= 120]

In [20]:
px.histogram(data, x="Age", nbins=30, title="Age Distribution").show()

Now the distribution graph seems much neater and more realistic

In [21]:
px.histogram(data, x="Income", nbins=50, title="Income Distribution").show()

This histogram shows outliers but they dont seem to be unrealistic, just rare, so I will leave the data as it is

In [22]:
px.histogram(data, x="Total_spending", nbins=30, title="Total Spending Distribution").show()

In [24]:
px.box(data, y="Income", x="Education", title="Income Boxplot").show()

The graph doesnt seem to be very informative due to an outlier, so I will exclude it to see if it shows more information.

In [25]:
fig = px.box(data, y="Income", x="Education", title="Income Boxplot")
fig.update_yaxes(range=[0, 200000])
fig.show()

#### Correlation

In [27]:
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Total_Children', 'Total_spending', 'Customer_since'],
      dtype='object')

In [28]:
corr = data[["Income", "Age", "Recency", "Total_spending", "NumWebPurchases", "NumStorePurchases"]]

In [40]:
fig = px.imshow(corr.corr(), text_auto=".3f",  color_continuous_scale="RdBu_r", title="Correlation Matrix")
fig.update_xaxes(tickangle=90)
fig.show()

In [41]:
pivot_income = data.pivot_table(values="Income", index="Education", columns="Marital_Status", aggfunc="mean").reset_index()

In [44]:
px.imshow(pivot_income.set_index("Education"), text_auto=".2s", color_continuous_scale="Blues", title="Average Income by Education and Marital Status").show()

#### Grouped data

In [45]:
group1 = data.groupby("Education")["Total_spending"].mean().sort_values(ascending=False)

In [50]:
fig = px.bar(group1, title="Average Total Spending by Education")
fig.update_layout(showlegend=False)
fig.show()

In [51]:
data["AcceptedAny"] = data[["AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5"]].sum(axis=1)

In [53]:
data["AcceptedAny"] = data["AcceptedAny"].apply(lambda x: 1 if x > 0 else 0)

In [54]:
group2 = data.groupby("Marital_Status")["AcceptedAny"].mean().sort_values(ascending=False)

In [55]:
fig = px.bar(group2)
fig.update_layout(showlegend=False)
fig.show()

In [57]:
labels = ["18-29", "30-39", "40-49", "50-59", "60+"]

In [58]:
data["AgeGroup"] = pd.cut(data["Age"], bins=[18, 29, 39, 49, 59, 120], labels=labels)

In [59]:
group3 = data.groupby("AgeGroup")["Income"].mean()





In [60]:
fig = px.bar(group3, title="Average Income by Age Group", orientation='h')
fig.update_layout(showlegend=False)
fig.show()

### Feature selection

For this task I would like to choose **Age, Income, Total Spendings, Number of Web Purchases, Number of Store Purchases, Number of Web Visits and Recency** as main features

In [61]:
features = ["Income", "Age", "Recency", "Total_spending", "NumWebPurchases", "NumStorePurchases", "NumWebVisitsMonth"]

In [62]:
X = data[features]

In [67]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [68]:
X_scaled = scaler.fit_transform(X)

### KMeans

In [69]:
from sklearn.cluster import KMeans

Using the **Elbow Method** to find the best k for KMeans algorithm

In [96]:
wcss = []

for i in range(2, 14):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

In [97]:
px.line(x=range(2, 14), y=wcss, markers=True, title="Elbow Method").show()

In [98]:
kmeans = KMeans(n_clusters=6)
data["Cluster"] = kmeans.fit_predict(X_scaled)

In [99]:
cluster_summary = data.groupby("Cluster")[features].mean()

In [100]:
data["Cluster"].value_counts()

Cluster
4    516
0    487
2    314
1    306
5    296
3    294
Name: count, dtype: int64

In [101]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
data["PCA1"], data["PCA2"] = X_pca[:, 0], X_pca[:, 1]

In [102]:
px.scatter(data, x="PCA1", y="PCA2", color="Cluster", title="Customer Segments based on KMeans Clustering").show()

In [103]:
cluster_summary

Unnamed: 0_level_0,Income,Age,Recency,Total_spending,NumWebPurchases,NumStorePurchases,NumWebVisitsMonth
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,33023.887064,53.026694,73.862423,91.640657,2.022587,3.098563,6.603696
1,78896.310458,46.039216,53.490196,1313.921569,4.424837,8.45098,2.568627
2,73452.904459,69.111465,49.764331,1161.719745,4.452229,8.27707,2.493631
3,53254.97619,63.792517,61.547619,548.47619,5.897959,5.921769,6.445578
4,34581.943798,52.153101,22.525194,117.97093,2.193798,3.329457,6.472868
5,63544.570946,56.864865,36.388514,1046.435811,8.253378,9.101351,5.935811


#### What we can see from these clusters is:

- **Cluster 0:** Low income, low total spending, *infrequent buyers with lots of web visits*
- **Cluster 1:** High income, high total spendings, high number of in-store purchases - *premium buyers, possibly loyal customers*
- **Cluster 2:** High income, older in age than Cluster 1, but similar amount of total spending, high number of in-store purchases - possiby *loyal older premium customers*
- **Cluster 3:** Average income, older age and moderate spendings with almost same number of in-store and  web purchases - *balanced customers*
- **Cluster 4:** Low income, but lowest recency too, *frequent buyers of cheper products with lots of web visits, possibly new customers*
- **Cluster 5:** Mid-to-high income,  low recency and high total spendings - *loyal highly engaged cutomers*


In [104]:
import joblib
joblib.dump(kmeans, "kmeans_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']