In [1]:
# Import necessary libraries
from scipy.cluster.hierarchy import linkage,dendrogram
from sklearn.cluster import KMeans,MiniBatchKMeans
from sklearn.preprocessing import StandardScaler 
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

import seaborn  as  sns
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import pandas as pd
import warnings
import os

warnings.simplefilter('ignore')
os.chdir('D:\Datasets')

df2 = pd.read_csv('rfm_data_customer.csv',index_col=0)

df3 = df2.drop(columns=['most_recent_visit'])
df3

Unnamed: 0_level_0,revenue,number_of_orders,recency_days
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22086,777,9,232
2290,1555,16,115
26377,336,5,43
24650,1189,12,64
12883,1229,12,23
...,...,...,...
3249,998,10,31
6686,771,8,187
16418,1016,9,154
9117,678,7,195


In [2]:
## Data Scaling

# Create a StandardScaler object and set the output to be a pandas DataFrame
scaler = StandardScaler().set_output(transform='pandas')

# Fit the scaler to the input DataFrame 'df3' and transform the data
df3_scaled = scaler.fit_transform(df3)

# Mini-Batch K-Means Clustering Optimization

In [3]:
# Define a list of potential number of clusters to try
ks = [2, 3, 4, 5, 6, 7, 9]

# Initialize a list to store the silhouette scores
scores = []

# Loop through the different number of clusters
for i in ks:
    # Create a MiniBatchKMeans object with i clusters and a fixed random state
    km = MiniBatchKMeans(n_clusters=i, random_state=24)
    
    # Fit the clustering model to the scaled DataFrame
    km.fit(df3_scaled)
    
    # Calculate the silhouette score for the current number of clusters
    scores.append(silhouette_score(df3_scaled, km.labels_))

# Find the index of the maximum silhouette score
i_max = np.argmax(scores)

# Print the best number of clusters and the corresponding silhouette score
print('best no. of clusters: ', ks[i_max])
print('best score: ', scores[i_max])


best no. of clusters:  3
best score:  0.37068062712545796


## Visualizing Clustering Results

In [7]:
# Create a copy of the original DataFrame df3
df3_clust = df3.copy()

# Add a new column 'Clust' to the DataFrame, containing the cluster labels
df3_clust['Clust'] = km.labels_

# Sort the DataFrame by the 'Clust' column
df3_clust.sort_values('Clust', inplace=True)
df3

Unnamed: 0_level_0,revenue,number_of_orders,recency_days
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22086,777,9,232
2290,1555,16,115
26377,336,5,43
24650,1189,12,64
12883,1229,12,23
...,...,...,...
3249,998,10,31
6686,771,8,187
16418,1016,9,154
9117,678,7,195


## Analyzing Cluster Characteristics

In [8]:
# Group the df3_clust DataFrame by the 'Clust' column and calculate the mean of each feature
cluster_means = df3_clust.groupby('Clust').mean()

# Display the cluster means
print(cluster_means)

           revenue  number_of_orders  recency_days
Clust                                             
0      1041.451748         11.246316     71.391939
1       487.026829          5.699024    195.929024
2      1580.926152         15.409495     94.102165
3       531.248120          5.865003    470.538619
4      1307.347057         13.565133    198.385333
5       704.773316          7.849984     85.473285
6      1155.728158         10.916525    170.306401
7       976.090745          9.845517    407.636658
8       832.295039          9.026436    204.599054


# correletions

In [10]:
## Analyzing Cluster 0

# Filter the df3_clust DataFrame to get only the data points in Cluster 0
rfm_0 = df3_clust[df3_clust['Clust'] == 0]

# Calculate the correlation matrix for the selected features in Cluster 0
corr_matrix = rfm_0[['revenue', 'number_of_orders', 'recency_days']].corr()

# Display the correlation matrix
corr_matrix

Unnamed: 0,revenue,number_of_orders,recency_days
revenue,1.0,0.162158,-0.24596
number_of_orders,0.162158,1.0,0.022889
recency_days,-0.24596,0.022889,1.0


In [17]:
rfm_1 = df3_clust[df3_clust['Clust']==1]
rfm_1[['revenue','number_of_orders','recency_days']].corr()

Unnamed: 0,revenue,number_of_orders,recency_days
revenue,1.0,0.496765,0.20044
number_of_orders,0.496765,1.0,0.194932
recency_days,0.20044,0.194932,1.0


In [16]:
rfm_2 = df3_clust[df3_clust['Clust']==2]
rfm_2[['revenue','number_of_orders','recency_days']].corr()

Unnamed: 0,revenue,number_of_orders,recency_days
revenue,1.0,0.448392,0.276444
number_of_orders,0.448392,1.0,0.197573
recency_days,0.276444,0.197573,1.0
