In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the mall-customer CSV inside the Kaggle input directory
# (despite the name `url`, this is a local file path).
url = "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
# Read the CSV into the DataFrame used by every later cell, then preview its first rows.
data = pd.read_csv(url)
data.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
from mpl_toolkits import mplot3d
from matplotlib.lines import Line2D

import warnings

# Silence every warning category through the standard filter mechanism.
warnings.filterwarnings('ignore')


def ignore_warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap out warnings.warn itself, so any later `warnings.warn(...)` call made through
# the module attribute does nothing at all.
warnings.warn = ignore_warn

# Display floats with three decimal places and never truncate the list of columns.
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.max_columns", None)

# Goal
You want to understand your customers in order to decide on the next marketing strategy. The data does not provide an output (y) value, so it is up to us to find the patterns in the data.

# Basic EDA

In [None]:
# Print the DataFrame summary: column names, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric columns.
data.describe(include="all")

In [None]:
# Share of each gender in the data (proportions rather than raw counts).
data["Gender"].value_counts(normalize=True)

In [None]:
# Annual income distribution per gender.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so positional x/y raise a TypeError there.
sns.boxplot(x="Gender", y="Annual Income (k$)", data=data)

In [None]:
# Spending score distribution per gender.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so positional x/y raise a TypeError there.
sns.boxplot(x="Gender", y="Spending Score (1-100)", data=data)

There are more females than males in the data, and more of the females are 30-40 years old. It therefore makes sense that the first quartile (Q1) of their spending score is higher than that of males, as this age group has more income compared to the 20-year-old group.

In [None]:
# One row of three panels: for each numeric feature, a box plot with a swarm plot
# drawn on top of it, split by Gender.
plt.figure(1, figsize=(15, 7))
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for position, feature in enumerate(numeric_features, start=1):
    plt.subplot(1, 3, position)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.boxplot(x=feature, y='Gender', data=data, palette='vlag')
    sns.swarmplot(x=feature, y='Gender', data=data)
    # Only the first panel keeps its y label, and only the middle panel carries the title.
    y_label = 'Gender' if position == 1 else ''
    panel_title = 'Boxplots & Swarmplots' if position == 2 else ''
    plt.ylabel(y_label)
    plt.title(panel_title)
plt.show()

We can already see clear clusters in the combination of annual income and spending score.

In [None]:
# Pairwise feature relationships coloured by gender; the CustomerID column is left out.
features_for_pairplot = data.drop(columns="CustomerID")
sns.pairplot(features_for_pairplot, hue="Gender")

# Prepare Data for the model
The data is already clean; we just need to perform feature scaling before fitting K-means.
K-means needs to compute means, and a mean value is not meaningful for categorical data. For this reason, I will remove the Gender column from the model input.

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts) before selecting model features.
data.info()

In [None]:
# Model input: every column except Gender (categorical) and CustomerID (not useful for the model).
X = data.drop(columns=["Gender", "CustomerID"])

Since age, income, and spending score are not directly comparable, I am going to scale them so that they carry equal weight and K-means converges faster. The reasoning is that scaled data converges faster because K-means behaves like a gradient-based optimization algorithm: if different features have different scales, the derivatives tend to align along the directions with higher variance, which leads to poorer/slower convergence. In addition, K-means assigns points by Euclidean distance, so without scaling the feature with the largest range would dominate the clustering.

In [None]:
# Standardize the features with StandardScaler: learn the per-column statistics,
# then apply them. X is rebound to the scaled values.
sc = StandardScaler()
X = sc.fit(X).transform(X)

# Build a clustering model

In [None]:
# Using the elbow method to find the optimal number of clusters:
# fit K-means for k = 1..9 and record the within-cluster sum of squares (WCSS),
# which scikit-learn exposes as `inertia_`.
#
# `n_jobs` is deliberately not passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, so passing it raises a TypeError on current versions.
# `n_init=10` pins the number of restarts so the result does not depend on the
# library's version-specific default.
cluster_counts = np.arange(1, 10, 1)
wcss = []  # Sum of squared distances of samples to their closest cluster center.
for i in cluster_counts:
    clf = KMeans(n_clusters=i, init="k-means++", random_state=42, n_init=10)
    clf.fit(X)
    wcss.append(clf.inertia_)

plt.plot(cluster_counts, wcss, 'bo', linestyle='dashed')
plt.title("Elbow methos to find the optimal number of clusters")
plt.xlabel("No. of clusters")
plt.ylabel("WCSS")
plt.show()

We choose six clusters based on the plot above for our model.

In [None]:
# Final K-means model with six clusters.
# `n_jobs` is not passed because it was removed from KMeans in scikit-learn 1.0
# (passing it raises a TypeError); `n_init=10` pins the number of restarts so the
# result does not depend on the library's version-specific default.
model = KMeans(n_clusters=6, init="k-means++", random_state=42, n_init=10)
# fit_predict fits the model on X and returns the cluster label of every row;
# the same labels remain available afterwards as model.labels_.
data["cluster"] = model.fit_predict(X)

In [None]:
# Centroid coordinates, one row per cluster. The model was fitted on the scaled X,
# so these values are in the scaled feature space, not in the original units.
model.cluster_centers_


In [None]:
# Centroid coordinates per feature, one value per cluster. Column order follows the
# feature order used for fitting: Age, Income, Spending Score.
centers = model.cluster_centers_
c1, c2, c3 = centers[:, 0], centers[:, 1], centers[:, 2]

# Attach to every row the centroid coordinates of the cluster it was assigned to.
for column, centroid_values in (('cen1', c1), ('cen2', c2), ('cen3', c3)):
    data[column] = data.cluster.map(dict(enumerate(centroid_values)))

# One hex colour per cluster label (list position == cluster label), stored per row
# in column `c`.
colors = ["#2ec4b6", "#ffbe0b", "#fb5607", "#ff006e","#8338ec","#3a86ff"]
data['c'] = data.cluster.map(dict(enumerate(colors)))

# Visualising the clusters (3D)

In [None]:
# Shorter column names that are valid identifiers, so later cells can use attribute
# access (data.Income, data.SpendingScore).
short_names = {"Annual Income (k$)": "Income", "Spending Score (1-100)":"SpendingScore"}
data = data.rename(columns=short_names)

In [None]:
# 12x12 figure; stretch the subplot area to the edges of the canvas.
fig = plt.figure(figsize=(12, 12))
plt.subplots_adjust(bottom=0., left=0, top=1., right=1)

# Top-left panel (cell 1 of the 2x2 grid): Age vs. spending score.
# Point colour comes from the per-row colour column `c`; point size is the
# spending score itself.
sub1 = fig.add_subplot(2, 2, 1)
sub1.scatter(data.Age, data.SpendingScore, c=data.c, s=data.SpendingScore)
sub1.set(xlabel="Age", ylabel="Spending Scores")

# Top-right panel (cell 2): Income vs. spending score.
sub2 = fig.add_subplot(2, 2, 2)
sub2.scatter(data.Income, data.SpendingScore, c=data.c, s=data.SpendingScore)
sub2.set(xlabel="Income", ylabel="Spending Scores")

# Bottom panel spanning cells 3 and 4: all three features in a 3D scatter.
sub3 = fig.add_subplot(2, 2, (3, 4), projection="3d")
scat_plot = sub3.scatter(data.Age, data.Income, data.SpendingScore, c=data.c, s=data.SpendingScore)
sub3.set(title="Customer segmentation", xlabel="Age", ylabel="Anual Income", zlabel="Spending Scores")
sub3.grid(False)

# Give every panel the same legend: one round marker per cluster colour,
# labelled "Cluster 1" .. "Cluster 6" (cluster label + 1).
for panel, spot in ((sub1, 'upper right'), (sub2, 'upper right'), (sub3, 'best')):
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label='Cluster {}'.format(i+1), markerfacecolor=mcolor)
        for i, mcolor in enumerate(colors)
    ]
    panel.legend(handles=legend_elements, loc=spot)

plt.show()

* Cluster 4 (pink): These are young people in their 20s who have lower incomes but higher spending scores. We should not encourage them to buy more if we are being socially responsible as a company. However, this group generally does not have children or elders to support, so it makes sense that they spend more money on goods for themselves. (Either keep the current marketing strategy or reduce the number of targeted campaigns for this group.)

* Cluster 2 (yellow): These are 30-40 year olds who have high incomes and who also spend more on themselves or their families. (Keep the current marketing strategy.)
 
* Clusters 5 and 6 (purple and blue dots): People in these groups have average incomes. Their spending scores stay in the middle regardless of their age. (Keep the current marketing strategy.)
 
* Cluster 1 (green): People in this group have average incomes and low spending scores regardless of their age. (Keep the current marketing strategy.)
 
* Cluster 3 (orange): People in this group have high incomes and low spending scores regardless of their age. (It would be worth actively targeting this group in future campaigns.)





# Credits:
* When to scale features: https://stats.stackexchange.com/questions/89809/is-it-important-to-scale-data-before-clustering/89995#89995?newreg=fdd328d88bfe44d5afe265d45ee9729c
* [Plot clusters by Thiago Carvalho](https://towardsdatascience.com/visualizing-clusters-with-pythons-matplolib-35ae03d87489)