Second project using sales data for RFM features.
Using log transformation on the full data set

Import packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from mpl_toolkits.mplot3d import Axes3D

Set some conditions for data presentation.
Limit floats to 2 decimal places and show all columns.

In [None]:
# Presentation settings: every float is rendered with two decimals in a
# 20-character field, and up to 999 columns are shown before pandas elides any.
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:20.2f}'.format)

Load in the data - note the encoding requirement due to condition of the csv file

In [None]:
# Location of the raw transactions export. This is a machine-specific absolute
# path; adjust it when running the notebook elsewhere.
DATA_PATH = "C:\\Users\\alexd\\Python Projects\\k_means/online_retail_II_p2.csv"

# The file has to be read with the ISO-8859-1 codec (see the note above).
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
df.head()

In [None]:
# Summary statistics for the numeric columns of the raw data.
df.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-typed columns.
df.describe(include='O')

These results suggest the same issues as w/ the first project: Quantity and Price have negative minimums.
There are more unique descriptions than stock codes, so these will have to be checked and resolved.

In [None]:
# Peek at rows that have no Customer ID; these rows are dropped during cleaning.
missing_customer = df['Customer ID'].isnull()
df.loc[missing_customer].head(15)

In [None]:
# Peek at rows with a negative quantity.
# Invoices with a leading 'C' are returns and are dropped during cleaning.
negative_quantity = df['Quantity'].lt(0)
df.loc[negative_quantity].head(15)

In [None]:
# Look for anything other than digits in the invoice numbers (expected: 6 digits).
# Stripping every digit leaves only the leading/trailing markers; same as
# project 1, so the 'C' and 'A' invoices will be dropped.
df['Invoice'] = df['Invoice'].astype('str')

invoice_markers = df['Invoice'].str.replace("[0-9]", "", regex=True)
invoice_markers.unique()

In [None]:
# Inspect the stock codes that do not follow the two regular formats
# (exactly five digits, or five digits followed by letters).
# Same result as in project 1, so only PADS will be kept; the rest will be dropped.

# Cast to str and keep the result: without the assignment the cast is a no-op,
# and .str.match yields NaN for any non-string value, which then silently
# drops out of the comparison below.
stock_codes = df['StockCode'].astype('str')

is_five_digits = stock_codes.str.match("^\\d{5}$")
is_five_digits_plus_letters = stock_codes.str.match("^\\d{5}[a-zA-Z]+$")

stock_codes[~is_five_digits & ~is_five_digits_plus_letters].unique()
 
 

Clean up the data
     Create new df for cleaned data
     Drop NaN Customers
     Drop 'C' and 'A' leader invoices
     Drop all non-5 digit StockCodes except for PADS
     Drop rows with Price <= 0.00 and check that none remain

In [None]:
# Work on a copy so the raw frame `df` stays available for comparison later.
cleaned_df = df.copy()


In [None]:
# Remove every row that has no Customer ID, then re-check the summary statistics.
cleaned_df = cleaned_df.dropna(subset=['Customer ID'])
cleaned_df.describe()

In [None]:
# Keep only invoices that are exactly six digits; this removes the
# 'A'- and 'C'-prefixed invoices.
cleaned_df['Invoice'] = cleaned_df['Invoice'].astype('str')

is_plain_invoice = cleaned_df['Invoice'].str.match("^\\d{6}$")
cleaned_df = cleaned_df[is_plain_invoice]
cleaned_df

In [None]:
# Clean up the StockCodes: keep five-digit codes, five-digit codes followed by
# letters, and the special code PADS; drop everything else.

cleaned_df['StockCode'] = cleaned_df['StockCode'].astype('str')

# Every pattern is anchored at both ends. Without the trailing `$` on the first
# pattern, any code that merely *starts* with five digits would be kept, which
# disagrees with the exploration cell above.
mask = (
    cleaned_df['StockCode'].str.match("^\\d{5}$")
    | cleaned_df['StockCode'].str.match("^\\d{5}[a-zA-Z]+$")
    | cleaned_df['StockCode'].str.match("^PADS$")
)

cleaned_df = cleaned_df[mask]
cleaned_df.describe()

In [None]:
# Count the rows whose Price is exactly zero.
zero_price = cleaned_df['Price'] == 0
int(zero_price.sum())

In [None]:
# Keep only strictly positive prices, then confirm no zero-price rows are left.
has_positive_price = cleaned_df['Price'].gt(0.00)
cleaned_df = cleaned_df[has_positive_price]
int(cleaned_df['Price'].eq(0).sum())

How much of the original data remains after the data cleanup
Dividing the cleaned_df by the original df reveals that 73.1 % of data is left

In [None]:
# Fraction of the original rows that survived the cleaning steps.
len(cleaned_df) / len(df)

Aggregate the data
First extend the Qty * Price into new column SalesLineTotal
Then use groupby to group data by customer
Recency takes the latest invoice date in the data as the reference point and subtracts each customer's last invoice date, giving the number of days since that customer's last purchase.

In [None]:
# Line total = quantity * unit price.
# `cleaned_df` is the result of boolean filtering, so the column is added with
# `assign`, which returns a new frame, instead of writing into a slice
# (the chained-assignment hazard that pandas warns about).
cleaned_df = cleaned_df.assign(
    SalesLineTotal=cleaned_df["Quantity"] * cleaned_df["Price"]
)
cleaned_df

In [None]:
# Build one row per customer:
#   MonetaryValue   - sum of the line totals
#   Frequency       - number of distinct invoices
#   LastInvoiceDate - most recent invoice timestamp
# InvoiceDate is parsed to datetime *before* aggregating. It is otherwise only
# converted after this cell, so "max" would compare the raw CSV strings
# lexicographically, which is not guaranteed to be chronological.
aggregated_df = (
    cleaned_df
    .assign(InvoiceDate=pd.to_datetime(cleaned_df["InvoiceDate"]))
    .groupby(by="Customer ID", as_index=False)
    .agg(
        MonetaryValue=("SalesLineTotal", "sum"),
        Frequency=("Invoice", "nunique"),
        LastInvoiceDate=("InvoiceDate", "max"),
    )
)

aggregated_df.head()

In [None]:
# Recency = whole days between the newest invoice in the data set and each
# customer's own last invoice.
last_purchase = pd.to_datetime(aggregated_df["LastInvoiceDate"])
aggregated_df["LastInvoiceDate"] = last_purchase

max_invoice_date = last_purchase.max()
days_since_purchase = (max_invoice_date - last_purchase).dt.days
aggregated_df["Recency"] = days_since_purchase

aggregated_df

Skip the histograms since this data is likely the same as the previous year in having many extreme outliers in Monetary Value and Frequency
Create boxplots

In [None]:
# Box plots of the three RFM features, one panel each.
# The value axis of a box plot shows the feature itself (not a count), so each
# panel is labelled with its feature name and gets its own title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

boxplot_panels = [
    ("MonetaryValue", "Monetary Value", 'lightgreen'),
    ("Frequency", "Frequency", 'skyblue'),
    ("Recency", "Recency", 'skyblue'),
]
for ax, (column, label, colour) in zip(axes, boxplot_panels):
    sns.boxplot(data=aggregated_df[column], color=colour, ax=ax)
    ax.set_title(f'Box plot of {label}')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()

Use a log transformation on the full dataset to see if that eases the influence of the outliers

In [None]:
# Log-transform a copy of the aggregated data; log1p (log(1 + x)) is applied
# to each of the three RFM columns in turn.
log_agg_df = aggregated_df.copy()

# Keep only the calendar date of the last invoice (the time of day is discarded).
log_agg_df['LastInvoiceDate'] = log_agg_df['LastInvoiceDate'].dt.date

for rfm_column in ("MonetaryValue", "Frequency", "Recency"):
    log_agg_df[rfm_column] = np.log1p(log_agg_df[rfm_column])

log_agg_df

In [None]:
# Experiment: express the last invoice date as a plain number.
log_agg_test_df = aggregated_df.copy()

# Date part only, without the timestamp.
log_agg_test_df['DateOnly'] = log_agg_test_df['LastInvoiceDate'].dt.date

# Whole days elapsed since January 1, 1970. The floor division by a one-day
# Timedelta yields integers, despite the column name.
epoch_start = pd.Timestamp("1970-01-01")
one_day = pd.Timedelta('1D')
elapsed = pd.to_datetime(log_agg_test_df['DateOnly']) - epoch_start
log_agg_test_df['DateAsFloat'] = elapsed // one_day

log_agg_test_df

In [None]:
# Box plots of the log1p-transformed RFM features, one panel each.
# The value axis shows the transformed feature (not a count), so each panel is
# labelled accordingly and gets its own title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

boxplot_panels = [
    ("MonetaryValue", "Monetary Value", 'lightgreen'),
    ("Frequency", "Frequency", 'skyblue'),
    ("Recency", "Recency", 'skyblue'),
]
for ax, (column, label, colour) in zip(axes, boxplot_panels):
    sns.boxplot(data=log_agg_df[column], color=colour, ax=ax)
    ax.set_title(f'Box plot of log-transformed {label}')
    ax.set_ylabel(f'log1p({label})')

plt.tight_layout()
plt.show()

The log transformation did help but there is still a profound effect of the Monetary Value and Frequency high outliers.
Use a 3-D scatterplot to check the scaling

In [None]:
# 3-D scatter of the log-transformed RFM features (one point per customer).
fig = plt.figure(figsize=(8, 8))

ax = fig.add_subplot(projection='3d')

scatter = ax.scatter(log_agg_df["MonetaryValue"], log_agg_df["Frequency"], log_agg_df["Recency"])
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3-D Scatterplot of Log Aggregated Data')

# Render explicitly, like the other plotting cells; this also keeps the Text
# object returned by set_title from being echoed as the cell output.
plt.show()

The log transformation moves the data closer together and moves it to the centre but adding a scaling should normalize the data for KMeans

In [None]:
# The date column is dropped here, before the frame is scaled.
# Re-running this cell on its own raises a KeyError, because the column is
# already gone after the first run.
log_agg_df = log_agg_df.drop(columns=['LastInvoiceDate'])
log_agg_df

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Standardise every column of the log-transformed frame.
# NOTE(review): log_agg_df still contains 'Customer ID', so the identifier is
# scaled together with the RFM features; anything fitted on this output
# should select only the MonetaryValue / Frequency / Recency columns.
scaler = StandardScaler().fit(log_agg_df)
scaled_data = scaler.transform(log_agg_df)
scaled_data

The scaler returns a Numpy array that has to be worked back into Pandas

In [None]:
# Wrap the scaled array in a DataFrame again, re-using the original row index
# and naming the four columns.
scaled_column_names = ['Customer ID', 'MonetaryValue', 'Frequency', 'Recency']
scaled_log_data_df = pd.DataFrame(
    data=scaled_data,
    columns=scaled_column_names,
    index=log_agg_df.index,
)
scaled_log_data_df

Sorting out a Numpy problem digesting a date.time column generating the KMeans result
    log_agg_df['LastInvoiceDate'] = log_agg_df['datetime_column'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
Convert datetime_column to string
    log_agg_df['LastInvoiceDate'] = log_agg_df['LastInvoiceDate'].astype(str)
Convert datetime_column to object
    log_agg_df['LastInvoiceDate'] = log_agg_df['LastInvoiceDate'].astype('object')
Peel off the time stamp
    log_agg_df['LastInvoiceDate'] = log_agg_df['LastInvoiceDate'].dt.date
Convert the date to a day count (number of days since 01 January 1970):
    log_agg_test_df['DateAsFloat'] = (pd.to_datetime(log_agg_test_df['DateOnly']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Sweep k = 2..max_k and record the inertia (elbow method) and the silhouette
# score for each k.
max_k = 12
inertia = []
silhouette_scores = []
k_values = range(2, max_k + 1)

# Cluster on the three RFM features only. The scaled frame also carries a
# scaled 'Customer ID' (an identifier, not a feature) and, once the labels are
# stored, a 'Cluster' column; neither may influence the distances.
rfm_features = scaled_log_data_df[['MonetaryValue', 'Frequency', 'Recency']]

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, max_iter=1000)
    cluster_labels = kmeans.fit_predict(rfm_features)
    silhouette_scores.append(silhouette_score(rfm_features, cluster_labels))
    inertia.append(kmeans.inertia_)

fig, (inertia_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(14, 6))

inertia_ax.plot(k_values, inertia, marker='o')
inertia_ax.set_title('Kmeans Inertia for Different Values of (k)')
inertia_ax.set_xlabel('Number of Clusters (k)')
inertia_ax.set_ylabel('Inertia')
inertia_ax.set_xticks(list(k_values))
inertia_ax.grid(True)

silhouette_ax.plot(k_values, silhouette_scores, marker='o', color='orange')
silhouette_ax.set_title('Silhouette Scores for Different Values of (k)')
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_xticks(list(k_values))
silhouette_ax.grid(True)

plt.tight_layout()
plt.show()
    
    

The results of the KMeans cluster run suggests 4 clusters is the optimal choice on the elbow.
This is supported by the silhouette score, where 4 scores higher than 5.

In [None]:
# Final 4-cluster fit on the log-transformed, standard-scaled data.
# Only the RFM columns are used: the scaled 'Customer ID' is an identifier,
# and a 'Cluster' column from a previous run must not become a feature.
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)
cluster_labels = kmeans.fit_predict(
    scaled_log_data_df[['MonetaryValue', 'Frequency', 'Recency']]
)
cluster_labels

In [None]:
# Store each customer's cluster label next to the scaled features.
scaled_log_data_df['Cluster'] = cluster_labels
scaled_log_data_df

3-D plot of cluster results to see how the clusters are mapped

In [None]:
# One fixed colour per cluster label: blue, orange, green, red.
cluster_colors = {0: '#1f77b4', 1: '#ff7f0e', 2: '#2ca02c', 3: '#d62728'}
colors = scaled_log_data_df['Cluster'].map(cluster_colors)

# 3-D scatter of the scaled RFM features, each point coloured by its cluster.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    scaled_log_data_df['MonetaryValue'],
    scaled_log_data_df['Frequency'],
    scaled_log_data_df['Recency'],
    c=colors,
    marker='o',
)
ax.set(
    xlabel='Monetary Value',
    ylabel='Frequency',
    zlabel='Recency',
    title='3-D Scatterplot of Log Transformed & Std Scaled Customer data by Cluster',
)

plt.show()


Scale the data using MinMax

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of the log-transformed frame with a MinMaxScaler.
# NOTE(review): log_agg_df still contains 'Customer ID', so the identifier is
# rescaled together with the RFM features; anything fitted on this output
# should select only the MonetaryValue / Frequency / Recency columns.
scaler = MinMaxScaler().fit(log_agg_df)
mm_scaled_log_data = scaler.transform(log_agg_df)
mm_scaled_log_data

Take the numpy array and bring it back into pandas as a new df

In [None]:
# Wrap the MinMax-scaled array in a DataFrame, re-using the original row index
# and naming the four columns.
mm_column_names = ['Customer ID', 'MonetaryValue', 'Frequency', 'Recency']
mm_scaled_log_data_df = pd.DataFrame(
    data=mm_scaled_log_data,
    columns=mm_column_names,
    index=log_agg_df.index,
)
mm_scaled_log_data_df

Run KMeans on the new scaled df

In [None]:
# 4-cluster fit on the log-transformed, MinMax-scaled data.
# The fit uses the named RFM columns of the DataFrame rather than the raw
# array, whose first column is the rescaled 'Customer ID' (an identifier,
# not a feature).
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)
cluster_labels = kmeans.fit_predict(
    mm_scaled_log_data_df[['MonetaryValue', 'Frequency', 'Recency']]
)
cluster_labels

In [None]:
# Store each customer's cluster label next to the MinMax-scaled features.
mm_scaled_log_data_df['Cluster'] = cluster_labels
mm_scaled_log_data_df

3-D Plot the Results

In [None]:
# One fixed colour per cluster label: blue, orange, green, red.
cluster_colors = {0: '#1f77b4', 1: '#ff7f0e', 2: '#2ca02c', 3: '#d62728'}
colors = mm_scaled_log_data_df['Cluster'].map(cluster_colors)

# 3-D scatter of the MinMax-scaled RFM features, each point coloured by its cluster.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    mm_scaled_log_data_df['MonetaryValue'],
    mm_scaled_log_data_df['Frequency'],
    mm_scaled_log_data_df['Recency'],
    c=colors,
    marker='o',
)
ax.set(
    xlabel='Monetary Value',
    ylabel='Frequency',
    zlabel='Recency',
    title='3-D Scatterplot of Log Transformed & MinMax Scaled Customer data by Cluster',
)

plt.show()


The scaling in MinMax is consistent from feature to feature, so proceed w/ this scaled data.
Visualize w/ violin plots

In [None]:
# Violin plots of each MinMax-scaled RFM feature, split by cluster, with the
# overall distribution of the same feature drawn in gray on the same axes.
# Uses the `cluster_colors` mapping defined with the 3-D cluster plot.
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

violin_panels = [
    ('MonetaryValue', 'Monetary Value'),
    ('Frequency', 'Frequency'),
    ('Recency', 'Recency'),
]
for ax, (column, label) in zip(axes, violin_panels):
    sns.violinplot(x=mm_scaled_log_data_df['Cluster'], y=mm_scaled_log_data_df[column],
                   palette=cluster_colors, hue=mm_scaled_log_data_df['Cluster'], ax=ax)
    sns.violinplot(mm_scaled_log_data_df[column], color='gray', linewidth=1.0, ax=ax)
    # Titles are generated from one template so the wording cannot drift
    # between panels (two of them previously read "Featue").
    ax.set_title(f'Violin Plot of {label} Feature by Cluster(k): Full Data Set Transformed and MinMax Scaled')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()



Return to the aggregated data df and rework the data to remove the outliers while preserving them in separate dataframes.
Process both data sets through scaling and then to KMeans and determine the differences between 
the segmented data feature results and the features of the entire data set log transformed and scaled

Redo the boxplots of the aggregated data

In [None]:
# Box plots of the three untransformed RFM features, one panel each.
# The value axis of a box plot shows the feature itself (not a count), so each
# panel is labelled with its feature name and gets its own title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

boxplot_panels = [
    ("MonetaryValue", "Monetary Value", 'lightgreen'),
    ("Frequency", "Frequency", 'skyblue'),
    ("Recency", "Recency", 'skyblue'),
]
for ax, (column, label, colour) in zip(axes, boxplot_panels):
    sns.boxplot(data=aggregated_df[column], color=colour, ax=ax)
    ax.set_title(f'Box plot of {label}')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()

The boxplots show the outliers well beyond Q3 + 1.5 * IQR
Remove the high outliers from MonetaryValue and Frequency and re-plot the data

In [None]:
# Monetary Value outliers by the 1.5 * IQR rule.
monetary = aggregated_df['MonetaryValue']
M_Q1 = monetary.quantile(0.25)
M_Q3 = monetary.quantile(0.75)
M_IQR = M_Q3 - M_Q1
M_LO = M_Q1 - 1.5 * M_IQR   # lower fence
M_HO = M_Q3 + 1.5 * M_IQR   # upper fence

above_upper_fence = monetary.gt(M_HO)
below_lower_fence = monetary.lt(M_LO)

# High outliers only (LastInvoiceDate is kept in this frame).
high_mvo_df = aggregated_df.loc[above_upper_fence].copy()

# Outliers on either side, without the date column.
all_mvo_df = (
    aggregated_df.loc[above_upper_fence | below_lower_fence]
    .drop(columns=["LastInvoiceDate"])
)
all_mvo_df

In [None]:
# Frequency outliers by the 1.5 * IQR rule.
frequency = aggregated_df['Frequency']
F_Q1 = frequency.quantile(0.25)
F_Q3 = frequency.quantile(0.75)
F_IQR = F_Q3 - F_Q1
F_LO = F_Q1 - 1.5 * F_IQR   # lower fence
F_HO = F_Q3 + 1.5 * F_IQR   # upper fence

above_upper_fence = frequency.gt(F_HO)
below_lower_fence = frequency.lt(F_LO)

# High outliers only (LastInvoiceDate is kept in this frame).
high_fo_df = aggregated_df.loc[above_upper_fence].copy()

# Outliers on either side, without the date column.
all_fo_df = (
    aggregated_df.loc[above_upper_fence | below_lower_fence]
    .drop(columns=["LastInvoiceDate"])
)
all_fo_df

Remove those outliers and create a new df

In [None]:
# Customers that are a high outlier in neither Monetary Value nor Frequency.
high_outlier_index = high_mvo_df.index.union(high_fo_df.index)
is_high_outlier = aggregated_df.index.isin(high_outlier_index)
non_outliers_df = aggregated_df[~is_high_outlier]

non_outliers_df.describe()

Remove all of the Monetary Value and Frequency outliers and create a new df

In [None]:
# Customers that are an outlier (high or low) in neither Monetary Value nor
# Frequency; the date column is dropped from the result.
any_outlier_index = all_mvo_df.index.union(all_fo_df.index)
is_any_outlier = aggregated_df.index.isin(any_outlier_index)

all_outliers_out_df = aggregated_df[~is_any_outlier].drop(columns=['LastInvoiceDate'])

all_outliers_out_df.describe()

Redo the boxplots on the non outlier dataframe

In [None]:
# Box plots of the RFM features after removing the high outliers, one panel each.
# The value axis of a box plot shows the feature itself (not a count), so each
# panel is labelled with its feature name and gets its own title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

boxplot_panels = [
    ("MonetaryValue", "Monetary Value", 'lightgreen'),
    ("Frequency", "Frequency", 'skyblue'),
    ("Recency", "Recency", 'skyblue'),
]
for ax, (column, label, colour) in zip(axes, boxplot_panels):
    sns.boxplot(data=non_outliers_df[column], color=colour, ax=ax)
    ax.set_title(f'Box plot of {label}')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()

Go with this result and scale the data and look at results
First drop the LastInvoiceDate column as it is not needed going forward
Use StandardScaler to scale the data
Build new scaled df
3-D plot the scaled data

In [None]:
# The date column is dropped from the non-outlier frame from here on.
# Re-running this cell on its own raises a KeyError, because the column is
# already gone after the first run.
non_outliers_df = non_outliers_df.drop(columns=['LastInvoiceDate'])
non_outliers_df

In [None]:
# Standardise only the three RFM columns (Customer ID is left out).
rfm_values = non_outliers_df[['MonetaryValue', 'Frequency', 'Recency']]
scaler = StandardScaler().fit(rfm_values)
scaled_data = scaler.transform(rfm_values)
scaled_data

In [None]:
# Wrap the standardised array in a DataFrame, re-using the non-outlier row index.
ss_scaled_data_df = pd.DataFrame(
    data=scaled_data,
    columns=['MonetaryValue', 'Frequency', 'Recency'],
    index=non_outliers_df.index,
)

In [None]:


# 3-D scatter of the standard-scaled RFM features with the high outliers
# removed. No clustering has been applied at this point, so the points are
# drawn in a single colour.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')

scatter = ax.scatter(ss_scaled_data_df['MonetaryValue'],
               ss_scaled_data_df['Frequency'],
               ss_scaled_data_df['Recency'],
               marker='o')
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
# The previous title was copied from the log/MinMax section and described
# neither this data nor its (absent) clustering.
ax.set_title('3-D Scatterplot of Non-High Outlier Std Scaled Customer data')

plt.show()

The data is clustered over on the right side of the plot but the scaling is reasonable so use this scaled data for KMeans.

Assume that 4 clusters are to be used for this KMeans procedure

In [None]:
# 4-cluster fit on the standard-scaled data with the high outliers removed.
# The RFM columns are selected explicitly so that re-running this cell after
# the 'Cluster' column has been added does not feed the labels back in as a
# feature.
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)
cluster_labels = kmeans.fit_predict(
    ss_scaled_data_df[['MonetaryValue', 'Frequency', 'Recency']]
)
cluster_labels

In [None]:
# Store each customer's cluster label next to the standard-scaled features.
ss_scaled_data_df['Cluster'] = cluster_labels
ss_scaled_data_df

3-D plot the results

In [None]:
# One fixed colour per cluster label: blue, orange, green, red.
cluster_colors = {0: '#1f77b4', 1: '#ff7f0e', 2: '#2ca02c', 3: '#d62728'}
colors = ss_scaled_data_df['Cluster'].map(cluster_colors)

# 3-D scatter of the standard-scaled RFM features, each point coloured by its cluster.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    ss_scaled_data_df['MonetaryValue'],
    ss_scaled_data_df['Frequency'],
    ss_scaled_data_df['Recency'],
    c=colors,
    marker='o',
)
ax.set(
    xlabel='Monetary Value',
    ylabel='Frequency',
    zlabel='Recency',
    title='3-D Scatterplot of Non-High Outlier Std Scaled Customer data by Cluster',
)

plt.show()

The 3-D plot looks good so proceed to the Violin plots

In [None]:
# Violin plots of each standard-scaled RFM feature (high outliers removed),
# split by cluster, with the overall distribution of the same feature drawn in
# gray on the same axes.
# Uses the `cluster_colors` mapping defined with the 3-D cluster plot.
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

violin_panels = [
    ('MonetaryValue', 'Monetary Value'),
    ('Frequency', 'Frequency'),
    ('Recency', 'Recency'),
]
for ax, (column, label) in zip(axes, violin_panels):
    sns.violinplot(x=ss_scaled_data_df['Cluster'], y=ss_scaled_data_df[column],
                   palette=cluster_colors, hue=ss_scaled_data_df['Cluster'], ax=ax)
    sns.violinplot(ss_scaled_data_df[column], color='gray', linewidth=1.0, ax=ax)
    # Titles are generated from one template so the wording cannot drift
    # between panels (two of them previously read "Featue").
    ax.set_title(f'Violin Plot of {label} Feature by Cluster(k): High Outlier removed Std Scaled')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()


Run the df with all outliers removed through scaling and KMeans to check and compare the results
Boxplot the non-outlier data
Scale the data
Scatterplot the results
KMeans
Plot the results

In [None]:
# Box plots of the RFM features with all Monetary Value / Frequency outliers
# removed, one panel each.
# The value axis of a box plot shows the feature itself (not a count), so each
# panel is labelled with its feature name and gets its own title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

boxplot_panels = [
    ("MonetaryValue", "Monetary Value", 'lightgreen'),
    ("Frequency", "Frequency", 'skyblue'),
    ("Recency", "Recency", 'skyblue'),
]
for ax, (column, label, colour) in zip(axes, boxplot_panels):
    sns.boxplot(data=all_outliers_out_df[column], color=colour, ax=ax)
    ax.set_title(f'Box plot of {label}: Non-outlier')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()

Use Standard Scaler to scale the data and plot results

In [None]:
# Standardise the three RFM columns of the all-outliers-removed data.
# StandardScaler is used so this run matches the description above, the
# "Std Scaled" plot titles below, and the high-outlier-removed run it is being
# compared with (that run is standard-scaled too).
# NOTE(review): the cell previously used MinMaxScaler; confirm that standard
# scaling is the intended choice for this comparison.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(all_outliers_out_df[['MonetaryValue', 'Frequency', 'Recency']])
scaled_data

In [None]:
# Wrap the scaled array in a DataFrame, re-using the row index of the
# all-outliers-removed frame.
no_outlier_scaled_data_df = pd.DataFrame(
    data=scaled_data,
    columns=['MonetaryValue', 'Frequency', 'Recency'],
    index=all_outliers_out_df.index,
)
no_outlier_scaled_data_df

3-D Plot of scaled data

In [None]:
# 3-D scatter of the scaled RFM features with all outliers removed
# (single colour; no cluster labels are used here).
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    no_outlier_scaled_data_df['MonetaryValue'],
    no_outlier_scaled_data_df['Frequency'],
    no_outlier_scaled_data_df['Recency'],
    marker='o',
)
ax.set(
    xlabel='Monetary Value',
    ylabel='Frequency',
    zlabel='Recency',
    title='3-D Scatterplot of Std Scaled Customer data No Outliers',
)

plt.show()

KMeans on scaled data and plot results

In [None]:
# 4-cluster fit on the scaled data with all outliers removed.
# The RFM columns are selected explicitly so that re-running this cell after
# the 'Cluster' column has been added does not feed the labels back in as a
# feature.
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=1000)
cluster_labels = kmeans.fit_predict(
    no_outlier_scaled_data_df[['MonetaryValue', 'Frequency', 'Recency']]
)
cluster_labels

In [None]:
# Store each customer's cluster label next to the scaled features.
no_outlier_scaled_data_df['Cluster'] = cluster_labels
no_outlier_scaled_data_df

In [None]:
# One fixed colour per cluster label: blue, orange, green, red.
cluster_colors = {0: '#1f77b4', 1: '#ff7f0e', 2: '#2ca02c', 3: '#d62728'}
colors = no_outlier_scaled_data_df['Cluster'].map(cluster_colors)

# 3-D scatter of the scaled RFM features, each point coloured by its cluster.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    no_outlier_scaled_data_df['MonetaryValue'],
    no_outlier_scaled_data_df['Frequency'],
    no_outlier_scaled_data_df['Recency'],
    c=colors,
    marker='o',
)
ax.set(
    xlabel='Monetary Value',
    ylabel='Frequency',
    zlabel='Recency',
    title='3-D Scatterplot of No Outlier Std Scaled Customer data by Cluster',
)

plt.show()

Violin Plots of the Features

In [None]:
# Violin plots of each scaled RFM feature (all outliers removed), split by
# cluster, with the overall distribution of the same feature drawn in gray on
# the same axes.
# Uses the `cluster_colors` mapping defined with the 3-D cluster plot.
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

violin_panels = [
    ('MonetaryValue', 'Monetary Value'),
    ('Frequency', 'Frequency'),
    ('Recency', 'Recency'),
]
for ax, (column, label) in zip(axes, violin_panels):
    sns.violinplot(x=no_outlier_scaled_data_df['Cluster'], y=no_outlier_scaled_data_df[column],
                   palette=cluster_colors, hue=no_outlier_scaled_data_df['Cluster'], ax=ax)
    sns.violinplot(no_outlier_scaled_data_df[column], color='gray', linewidth=1.0, ax=ax)
    # Titles are generated from one template so the wording cannot drift
    # between panels (two of them previously read "Featue").
    ax.set_title(f'Violin Plot of {label} Feature by Cluster(k): All Outliers removed Std Scaled')
    ax.set_ylabel(label)

plt.tight_layout()
plt.show()
