KMedoids project using 2009 retail sales data:
from raw data to final violin plots

Load packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.decomposition import PCA



In [None]:
# Display configuration: fixed-width, two-decimal floats and up to 999 visible columns.
# The formatter is assigned to the pandas option only; it must not also be bound to
# df_raw, which is the name used for the loaded data set.
pd.options.display.float_format = '{:20.2f}'.format
pd.set_option('display.max_columns', 999)

load data set for 2009 retail sales data
Check data for obvious problems

In [None]:
# Load the raw retail transactions; the file is read as ISO-8859-1 (Latin-1) text.
# No date parsing is requested here, so date columns arrive as plain strings.
df_raw = pd.read_csv('online_retail_II_p1.csv', encoding='ISO-8859-1')

In [None]:
# Column dtypes and non-null counts: a first look at where values are missing.
df_raw.info()

In [None]:
# Summary statistics for the numeric columns (look for implausible minimums/maximums).
df_raw.describe()

In [None]:
# Summary statistics for the object (string) columns: counts, unique values, most frequent value.
df_raw.describe(include='O')

Customer ID has missing values that need to be looked at.
Price and Quantity have negative values, which also need to be looked at.
1. look at customer id for NaN
2. check qty for neg values
3. check invoice to see why neg values for price

In [None]:
# First 20 rows that have no Customer ID.
customer_id_missing = df_raw['Customer ID'].isna()
df_raw.loc[customer_id_missing].head(20)

In [None]:
# First 20 rows whose Quantity is below zero.
quantity_negative = df_raw['Quantity'].lt(0)
df_raw.loc[quantity_negative].head(20)

In [None]:
# Treat invoice numbers as text, then show every row whose invoice is NOT exactly six digits.
df_raw['Invoice'] = df_raw['Invoice'].astype('str')
six_digit_invoice = df_raw['Invoice'].str.match("^\\d{6}$")
df_raw.loc[~six_digit_invoice]

Check invoice column for non-6 digit values

In [None]:
# Strip every digit from the invoice numbers and list what is left over.
invoice_non_digits = df_raw['Invoice'].str.replace("[0-9]", "", regex=True)
invoice_non_digits.unique()

In [None]:
# Rows whose invoice number begins with 'A'.
invoice_starts_with_a = df_raw['Invoice'].str.startswith('A')
df_raw.loc[invoice_starts_with_a]

Check stock codes for interesting things

In [None]:
# Treat stock codes as text, then list the distinct codes that are neither exactly
# five digits nor five digits followed by letters.
df_raw['StockCode'] = df_raw['StockCode'].astype('str')

stock_codes = df_raw['StockCode']
five_digits = stock_codes.str.match("^\\d{5}$")
five_digits_then_letters = stock_codes.str.match('^\\d{5}[a-zA-Z]+$')

stock_codes[~five_digits & ~five_digits_then_letters].unique()

Lots of non-numeric stock codes to check
Out of all of the non-numerics only keep PADS

In [None]:
# Rows whose stock code starts with DOT.
starts_with_dot = df_raw['StockCode'].str.contains("^DOT")
df_raw.loc[starts_with_dot]

In [None]:
# Rows whose stock code is exactly PADS.
is_pads = df_raw['StockCode'].str.contains("^PADS$")
df_raw.loc[is_pads]

Clean up the data based on the findings

In [None]:
# Work on a copy so the cleaning steps below never modify df_raw.
df_cleaned = df_raw.copy()
df_cleaned

Clean up invoices to invoices w/ 6 digits only. No 'C' or 'A' types

In [None]:
# Keep only rows whose invoice number is exactly six digits.
df_cleaned['Invoice'] = df_cleaned['Invoice'].astype('str')

mask = df_cleaned['Invoice'].str.match("^\\d{6}$")
df_cleaned = df_cleaned.loc[mask]
df_cleaned

Clean up the stock codes to either 5 digits, or 5 digits + an alpha, or PADS

In [None]:
# Keep only product stock codes: exactly 5 digits, 5 digits followed by letters, or PADS.
# Every alternative is anchored with ^...$ so that codes which merely START with five
# digits or with "PADS" are not let through.
df_cleaned = df_cleaned.assign(StockCode=df_cleaned['StockCode'].astype('str'))

mask = (
    df_cleaned['StockCode'].str.match("^\\d{5}$")
    | df_cleaned['StockCode'].str.match("^\\d{5}[a-zA-Z]+$")
    | df_cleaned['StockCode'].str.match("^PADS$")
)

# Assign the filtered frame back: evaluating df_cleaned[mask] without assignment
# leaves df_cleaned unfiltered. copy() gives later cells an independent frame.
df_cleaned = df_cleaned[mask].copy()

# Sanity check: distinct stock codes still present that match none of the allowed
# patterns. An empty array is the expected result.
df_cleaned.loc[
    ~df_cleaned['StockCode'].str.match("^(?:\\d{5}|\\d{5}[a-zA-Z]+|PADS)$"),
    'StockCode',
].unique()


Drop Customer ID = NaN

In [None]:
# Discard rows without a Customer ID (rebinding the name instead of mutating in place).
df_cleaned = df_cleaned.dropna(subset=['Customer ID'])

Remove rows with zero or negative prices

In [None]:
# Count the rows whose Price is exactly 0.00.
int((df_cleaned['Price'] == 0).sum())

In [None]:
# Keep only rows with a strictly positive Price.
price_positive = df_cleaned['Price'].gt(0.00)
df_cleaned = df_cleaned.loc[price_positive]

In [None]:
# Re-count the zero-price rows after the filter.
int((df_cleaned['Price'] == 0).sum())

Check numbers match up

In [None]:
# Summary statistics of the cleaned frame, to compare against the raw describe() above.
df_cleaned.describe()


Data lost in cleaning - 22.4%

In [None]:
# Fraction of the raw rows removed by cleaning (1 minus the retained share),
# so the displayed number is the "data lost" figure rather than its complement.
1 - len(df_cleaned) / len(df_raw)

Add a SalesLineTotal column (the line extension) to the DataFrame by multiplying Price by Quantity

In [None]:

# Clean up spelling error: remove the misspelled 'SlaesLineTotal' column if it exists.
# errors='ignore' makes this a no-op when the column is absent, so the cell does not
# raise KeyError on Restart Kernel -> Run All.
df_cleaned = df_cleaned.drop(columns='SlaesLineTotal', errors='ignore')

In [None]:
# Line total per row: unit price times quantity.
df_cleaned = df_cleaned.assign(
    SalesLineTotal=df_cleaned['Price'] * df_cleaned['Quantity']
)
df_cleaned.head(2)

Aggregate the data on Customer ID and start to create the RFM features

In [None]:
# One row per customer with the raw RFM ingredients:
#   MonetaryValue   - sum of SalesLineTotal
#   Frequency       - number of distinct invoices
#   LastInvoiceDate - most recent InvoiceDate
# InvoiceDate is parsed to datetime BEFORE aggregating. The CSV is read without date
# parsing, and 'max' over unparsed date strings compares them lexicographically,
# which is only chronological for ISO-style formats.
df_agg = (
    df_cleaned
    .assign(InvoiceDate=pd.to_datetime(df_cleaned['InvoiceDate']))
    .groupby(by='Customer ID', as_index=False)
    .agg(
        MonetaryValue=('SalesLineTotal', 'sum'),
        Frequency=('Invoice', 'nunique'),
        LastInvoiceDate=('InvoiceDate', 'max'),
    )
)
df_agg.head()

In [None]:
# Recency = whole days between each customer's last invoice and the latest
# last-invoice date across all customers.
last_invoice = pd.to_datetime(df_agg['LastInvoiceDate'])
df_agg['LastInvoiceDate'] = last_invoice

max_invoice_date = last_invoice.max()
df_agg['Recency'] = (max_invoice_date - last_invoice).dt.days

df_agg.head()

We already know that the data is skewed by a large number of high value outliers in MonetaryValue and Frequency and a histogram and 3-D plot would reveal this.

Therefore we will transform the data using a log transformation and scale the result using MinMax to force all of the 
features onto a common scale.

Plot the result and compare it to the previous KMeans and second-year KMedoids results

In [None]:
# Apply log(1 + x) to the three RFM features in one vectorised call;
# the result is a new DataFrame with the same columns and index.
selected_columns = ["MonetaryValue", "Frequency", "Recency"]
df_agg_log = np.log1p(df_agg[selected_columns])

In [None]:
# Rescale the log-transformed features with MinMaxScaler (default 0-1 range).
rfm_log_features = df_agg_log[["MonetaryValue", "Frequency", "Recency"]]

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(rfm_log_features)
scaled_data

In [None]:
# Wrap the scaled array in a DataFrame that reuses the index of df_agg_log.
df_agg_log_scaled = pd.DataFrame(
    scaled_data,
    columns=['MonetaryValue', 'Frequency', 'Recency'],
    index=df_agg_log.index,
)

df_agg_log_scaled

In [None]:
# 3-D scatter of the transformed and scaled RFM features.
fig = plt.figure(figsize=(8, 8))

ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(df_agg_log_scaled['MonetaryValue'], df_agg_log_scaled['Frequency'], df_agg_log_scaled['Recency'])
ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3-D Plot of Transformed and Scaled Data')

# End on plt.show() like the other plotting cells, so the cell output is the figure
# rather than the Text object returned by set_title.
plt.show()

Data is transformed and scaled on a 0-1 range.
Use KMedoids to develop clusters for this data

In [None]:
# Separate copy for clustering, so the Cluster column added later does not
# end up on df_agg_log_scaled.
df_metoids = df_agg_log_scaled.copy()

Determine the number of clusters to use in KMedoids by using silhouette scores - clusters = 4 is determined

In [None]:
# Silhouette sweep: fit K-Medoids for k = 2..10 on the three scaled features and
# record the silhouette score of each clustering.
X = df_metoids[['MonetaryValue', 'Frequency', 'Recency']]

range_n_clusters = range(2, 11)
silhouette_scores = []

for n_clusters in range_n_clusters:
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0)
    kmedoids.fit(X)
    cluster_labels = kmedoids.labels_
    silhouette_scores.append(silhouette_score(X, cluster_labels))

In [None]:
# Silhouette score against the number of clusters, drawn via the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(range_n_clusters, silhouette_scores, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Scores for K-Medoids')
plt.show()

In [None]:
# Final model: K-Medoids with 4 clusters on the three scaled features;
# the resulting label of each row is stored in a new 'Cluster' column.
X = df_metoids[['MonetaryValue', 'Frequency', 'Recency']]

kmedoids = KMedoids(n_clusters=4, random_state=0)
kmedoids.fit(X)

cluster_labels = kmedoids.labels_
df_metoids['Cluster'] = cluster_labels
df_metoids

Plot the results

In [None]:
# Fixed colour per cluster label.
cluster_colors = {0: '#1f77b4', # Blue
                  1: '#ff7f0e', # Orange
                  2: '#2ca02c', # Green
                  3: '#d62728', # Red   
}

# Translate each row's cluster label into its colour.
colors = df_metoids['Cluster'].map(cluster_colors)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')

scatter = ax.scatter(df_metoids['MonetaryValue'],
                     df_metoids['Frequency'],
                     df_metoids['Recency'],
                     c=colors,
                     marker='o')

ax.set_xlabel('Monetary Value')
ax.set_ylabel('Frequency')
ax.set_zlabel('Recency')
ax.set_title('3-D Scatterplot of Customer Data by Clusters(k): KMedoids')

# End on plt.show() like the other plotting cells, so the cell output is the figure
# rather than the Text object returned by set_title.
plt.show()


Violin Plots

In [None]:
# One panel per feature: (column in df_metoids, human-readable label).
violin_panels = [
    ('MonetaryValue', 'Monetary Value'),
    ('Frequency', 'Frequency'),
    ('Recency', 'Recency'),
]

fig = plt.figure(figsize=(12, 18))

for position, (column, label) in enumerate(violin_panels, start=1):
    plt.subplot(3, 1, position)
    # Per-cluster distributions, coloured with the cluster_colors palette.
    sns.violinplot(x=df_metoids['Cluster'], y=df_metoids[column], palette=cluster_colors,
                   hue=df_metoids['Cluster'])
    # Distribution of the same feature over all rows, drawn in gray on the same axes.
    sns.violinplot(y=df_metoids[column], color='gray', linewidth=1.0)
    plt.title(f'{label} by Metoid Cluster: 2009 Data')
    plt.ylabel(label)

plt.tight_layout()
plt.show()