# Customer Segmentation with K-Means Clustering
### Import Libraries

In [1]:
import numpy as np, pandas as pd
from datetime import timedelta
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from plotly import graph_objs as go

### Load the Dataset

In [2]:
transactions = pd.read_csv('customer_segmentation.csv', index_col='index')
transactions.head()

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


### Explore the Dataset

In [3]:
transactions.describe()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 37.2+ MB


In [4]:
transactions.InvoiceNo.nunique()
transactions.CustomerID.nunique()

4372

### Drop Unnecessary Columns

In [5]:
transactions = transactions.drop(columns=['StockCode', 'Description', 'Country'])
transactions

Unnamed: 0_level_0,InvoiceNo,Quantity,InvoiceDate,UnitPrice,CustomerID
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,536365,6,12/1/2010 8:26,2.55,17850.0
1,536365,6,12/1/2010 8:26,3.39,17850.0
2,536365,8,12/1/2010 8:26,2.75,17850.0
3,536365,6,12/1/2010 8:26,3.39,17850.0
4,536365,6,12/1/2010 8:26,3.39,17850.0
...,...,...,...,...,...
541904,581587,12,12/9/2011 12:50,0.85,12680.0
541905,581587,6,12/9/2011 12:50,2.10,12680.0
541906,581587,4,12/9/2011 12:50,4.15,12680.0
541907,581587,4,12/9/2011 12:50,4.15,12680.0


### Treat Missing Values

In [6]:
transactions = transactions.dropna()
transactions

Unnamed: 0_level_0,InvoiceNo,Quantity,InvoiceDate,UnitPrice,CustomerID
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,536365,6,12/1/2010 8:26,2.55,17850.0
1,536365,6,12/1/2010 8:26,3.39,17850.0
2,536365,8,12/1/2010 8:26,2.75,17850.0
3,536365,6,12/1/2010 8:26,3.39,17850.0
4,536365,6,12/1/2010 8:26,3.39,17850.0
...,...,...,...,...,...
541904,581587,12,12/9/2011 12:50,0.85,12680.0
541905,581587,6,12/9/2011 12:50,2.10,12680.0
541906,581587,4,12/9/2011 12:50,4.15,12680.0
541907,581587,4,12/9/2011 12:50,4.15,12680.0


### Calculate Total Price per Item

In [7]:
transactions.loc[:, 'Price'] = transactions['Quantity'] * transactions['UnitPrice']
transactions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0_level_0,InvoiceNo,Quantity,InvoiceDate,UnitPrice,CustomerID,Price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,536365,6,12/1/2010 8:26,2.55,17850.0,15.30
1,536365,6,12/1/2010 8:26,3.39,17850.0,20.34
2,536365,8,12/1/2010 8:26,2.75,17850.0,22.00
3,536365,6,12/1/2010 8:26,3.39,17850.0,20.34
4,536365,6,12/1/2010 8:26,3.39,17850.0,20.34
...,...,...,...,...,...,...
541904,581587,12,12/9/2011 12:50,0.85,12680.0,10.20
541905,581587,6,12/9/2011 12:50,2.10,12680.0,12.60
541906,581587,4,12/9/2011 12:50,4.15,12680.0,16.60
541907,581587,4,12/9/2011 12:50,4.15,12680.0,16.60



### Calculate Recency of the Purchase

In [8]:
purchase_recency = transactions.groupby('CustomerID')['InvoiceDate'].agg('max').reset_index(name='Recency')
purchase_recency

Unnamed: 0,CustomerID,Recency
0,12346.0,1/18/2011 10:17
1,12347.0,8/2/2011 8:48
2,12348.0,9/25/2011 13:13
3,12349.0,11/21/2011 9:51
4,12350.0,2/2/2011 16:01
...,...,...
4367,18280.0,3/7/2011 9:52
4368,18281.0,6/12/2011 10:53
4369,18282.0,8/9/2011 15:10
4370,18283.0,9/5/2011 12:35


### Apt the Column's Datatype

In [9]:
purchase_recency.Recency = purchase_recency.Recency.astype(dtype='datetime64[ns]')
most_recent_date = max(purchase_recency.Recency)
purchase_recency.Recency = most_recent_date - purchase_recency.Recency
purchase_recency.Recency = purchase_recency.Recency.apply(lambda x : x.days) + 1
purchase_recency

Unnamed: 0,CustomerID,Recency
0,12346.0,326
1,12347.0,130
2,12348.0,75
3,12349.0,19
4,12350.0,310
...,...,...
4367,18280.0,278
4368,18281.0,181
4369,18282.0,122
4370,18283.0,95



### Calculate the Purchase Frequency of Customer

In [10]:
purchase_frequency = transactions.groupby('CustomerID')['CustomerID'].agg('count').reset_index(name='Frequency')

### Calculate the Monetary Value per Customer

In [11]:
purchase_amounts = transactions.groupby('CustomerID')['Price'].agg('sum').reset_index(name='Monetary')
purchase_amounts

Unnamed: 0,CustomerID,Monetary
0,12346.0,0.00
1,12347.0,4310.00
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.40
...,...,...
4367,18280.0,180.60
4368,18281.0,80.82
4369,18282.0,176.60
4370,18283.0,2094.88


### Prepare the Data

In [12]:
rfm_data = purchase_recency.merge(purchase_frequency, on='CustomerID', how='outer').merge(purchase_amounts, on='CustomerID', how='outer')
rfm_data

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346.0,326,2,0.00
1,12347.0,130,182,4310.00
2,12348.0,75,31,1797.24
3,12349.0,19,73,1757.55
4,12350.0,310,17,334.40
...,...,...,...,...
4367,18280.0,278,10,180.60
4368,18281.0,181,7,80.82
4369,18282.0,122,13,176.60
4370,18283.0,95,756,2094.88


In [13]:
clms_to_scale = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_data[clms_to_scale] = scaler.fit_transform(rfm_data[clms_to_scale])
rfm_data

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346.0,2.004230,-0.391720,-0.231001
1,12347.0,-0.094704,0.382657,0.293432
2,12348.0,-0.683691,-0.266959,-0.012316
3,12349.0,-1.283386,-0.086271,-0.017146
4,12350.0,1.832888,-0.327188,-0.190312
...,...,...,...,...
4367,18280.0,1.490205,-0.357303,-0.209026
4368,18281.0,0.451447,-0.370209,-0.221167
4369,18282.0,-0.180375,-0.344397,-0.209513
4370,18283.0,-0.469514,2.852058,0.023900


In [14]:
x = rfm_data.iloc[:, 1:4]
x

Unnamed: 0,Recency,Frequency,Monetary
0,2.004230,-0.391720,-0.231001
1,-0.094704,0.382657,0.293432
2,-0.683691,-0.266959,-0.012316
3,-1.283386,-0.086271,-0.017146
4,1.832888,-0.327188,-0.190312
...,...,...,...
4367,1.490205,-0.357303,-0.209026
4368,0.451447,-0.370209,-0.221167
4369,-0.180375,-0.344397,-0.209513
4370,-0.469514,2.852058,0.023900


### Find the Optimal value of K

In [15]:
wcsse = []
number_clusters = [i for i in range(1,11)]
for clusters_num in number_clusters:
    model = KMeans(clusters_num, n_init=10)
    model.fit(x)
    wcsse_iter = model.inertia_
    wcsse.append(wcsse_iter)
fig = go.Figure(data=go.Scatter(x=number_clusters, y=wcsse, mode='lines+markers'))
fig.update_layout(
    title="Finding K with the Elbow Technique",
    xaxis_title="Value of K",
    yaxis_title="Within Cluster Sum of Squared Distances",
)
fig.show()


### Cluster the Data

In [16]:
model = KMeans(n_clusters=5, n_init=10)
rfm_data['ClusteredData'] = model.fit_predict(x)
rfm_data.groupby('ClusteredData')['ClusteredData'].agg('count')

ClusteredData
0    2827
1    1294
2       5
3       4
4     242
Name: ClusteredData, dtype: int64

### Explore the Clusters

In [17]:
cluster_0 = rfm_data[rfm_data['ClusteredData'] == 0]
cluster_1 = rfm_data[rfm_data['ClusteredData'] == 1]
cluster_2 = rfm_data[rfm_data['ClusteredData'] == 2]
cluster_3 = rfm_data[rfm_data['ClusteredData'] == 3]
cluster_4 = rfm_data[rfm_data['ClusteredData'] == 4]
cluster_4

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,ClusteredData
66,12428.0,-0.319590,0.890304,0.727482,4
71,12433.0,-0.512349,1.406555,1.396550,4
86,12451.0,0.087346,1.126919,0.856013,4
100,12471.0,-0.490932,1.884087,2.049359,4
101,12472.0,0.183726,1.281794,0.526990,4
...,...,...,...,...,...
4238,18109.0,-0.512349,1.552826,0.733337,4
4245,18118.0,-0.512349,5.123563,0.449882,4
4249,18122.0,-0.587311,1.165638,-0.008791,4
4324,18223.0,-0.448096,0.886002,0.537424,4


### Visualize the Clusters

In [18]:
trace0 = go.Scatter3d(
        x=cluster_0.Recency,
        y=cluster_0.Frequency,
        z=cluster_0.Monetary,
        text=cluster_0.CustomerID,
        mode="markers",
        marker=dict(
                    size=10,
                    color="rgb(255,0,0)",  # set color to desired value 
        ),
        name="Cluster 0"
)
trace1 = go.Scatter3d(
        x=cluster_1.Recency,
        y=cluster_1.Frequency,
        z=cluster_1.Monetary,
        text=cluster_1['CustomerID'],
        mode="markers",
        marker=dict(
                    size=10,
                    color="rgb(0,255,0)",  
        ),
        name="Cluster 1"
)
trace2 = go.Scatter3d(
        x=cluster_2.Recency,
        y=cluster_2.Frequency,
        z=cluster_2.Monetary,
        text=cluster_2['CustomerID'],
        mode="markers",
        marker=dict(
                    size=10,
                    color="rgb(0,0,255)", 
        ),
        name="Cluster 2"
)
trace3 = go.Scatter3d(
        x=cluster_3.Recency,
        y=cluster_3.Frequency,
        z=cluster_3.Monetary,
        text=cluster_3['CustomerID'],
        mode="markers",
        marker=dict(
                    size=10,
                    color="rgb(255,255,0)", 
        ),
        name="Cluster 3"
)
trace4 = go.Scatter3d(
        x=cluster_4.Recency,
        y=cluster_4.Frequency,
        z=cluster_4.Monetary,
        text=cluster_4['CustomerID'],
        mode="markers",
        marker=dict(
                    size=10,
                    color="rgb(127,0,255)",
        ),
        name="Cluster 4"
)

# Create a list of traces
data_plot_km = [trace0,trace1,trace2,trace3,trace4]

# Create a Figure object
layout = go.Layout (
        margin=dict(l=0, r=0, b=30, t=30), title='Customer Segmentation',
        scene = dict(
            xaxis = dict (title = "Recency"),
            yaxis = dict (title = "Frequency"),
            zaxis = dict (title = "Amount Spent")
        )
    )
fig = go.Figure(data = data_plot_km, layout = layout)

# Update the data point labels
fig.update_traces( hovertemplate=" Customer ID:%{text} <br> Recency:%{x} <br> Frequency:%{y} <br> Amount Spent:%{z} ")
fig.show()