In [None]:
import random 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans 
from sklearn.datasets import make_blobs 
%matplotlib inline

In [None]:
# Seed NumPy's global random number generator with a fixed value so that
# repeated runs of the notebook start from the same random state.
SEED = 0
np.random.seed(SEED)

In [None]:
# Build a synthetic 2-D dataset of blobs around four fixed centers.
#   n_samples   -- total number of points, divided equally among the clusters
#   centers     -- number of centers to generate, or their fixed locations
#   cluster_std -- standard deviation of the clusters
blob_centers = [[4, 4], [-2, -1], [2, -3], [1, 1]]
X, y = make_blobs(n_samples=5000, centers=blob_centers, cluster_std=0.9)

# Quick look at the raw points before any clustering is applied.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, marker='.')

In [None]:
# Configure and fit K-Means on the blob data.
#   init="k-means++" -- initialization method for the centroids; selects the
#                       initial cluster centers in a smart way to speed up
#                       convergence.
#   n_clusters=4     -- number of clusters to form, i.e. number of centroids.
#   n_init=12        -- number of times the algorithm is run with different
#                       centroid seeds; the final result is the best of those
#                       runs in terms of inertia.
kmeans_options = dict(init="k-means++", n_clusters=4, n_init=12)
k_means = KMeans(**kmeans_options)
k_means.fit(X)

# Cluster label assigned to each point by the fitted model.
k_means_labels = k_means.labels_

# Coordinates of the fitted cluster centers. As the final expression of the
# cell, this is the value the notebook renders.
k_means_cluster_centers = k_means.cluster_centers_
k_means_cluster_centers

In [None]:
# Plot the K-Means model: every data point coloured by its assigned cluster,
# with each cluster's centroid drawn on top.

# Create the figure and its single axes with the specified dimensions.
fig, ax = plt.subplots(figsize=(6, 4))

# Build one colour per distinct label by sampling the Spectral colour map
# evenly; set(k_means_labels) yields the unique labels.
n_found_clusters = len(set(k_means_labels))
colors = plt.cm.Spectral(np.linspace(0, 1, n_found_clusters))

# Iterate over the fitted centroids themselves instead of a re-typed copy of
# the make_blobs centers, so the loop always matches the fitted model.
# k is the cluster index that the entries of k_means_labels refer to.
for k, (cluster_center, col) in enumerate(zip(k_means_cluster_centers, colors)):

    # Boolean mask: True for the data points assigned to cluster k,
    # False for all others.
    my_members = (k_means_labels == k)

    # Plot the members of this cluster; 'w' sets the base colour to white
    # and the marker fill is taken from col.
    ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=col, marker='.')

    # Plot the centroid in the same fill colour, with a black outline so it
    # stands out from the surrounding points.
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)

# Title of the plot
ax.set_title('KMeans')

# Remove the ticks on both axes.
ax.set_xticks(())
ax.set_yticks(())

# Show the plot
plt.show()


In [None]:
# Customer Segmentation with K Means
import pandas as pd

# Load the customer data from the CSV file next to the notebook.
cust_df = pd.read_csv("Cust_Segmentation.csv")

# Address is a categorical variable, so it is removed: k-means is not
# directly applicable to categorical variables because the Euclidean
# distance function is not really meaningful for discrete values.
df = cust_df.drop(columns='Address')

# Preview the first rows of the remaining columns (rendered as the cell's
# final expression).
df.head()

In [None]:
# Normalize the dataset
# Normalization is a statistical method that helps mathematical-based
# algorithms interpret features with different magnitudes and distributions
# equally.
from sklearn.preprocessing import StandardScaler

# A later cell stores the cluster labels in df as the 'Clus_km' column.
# Excluding that column (when it exists) keeps this cell safe to re-run:
# otherwise the labels from a previous fit would be fed back in as a feature.
feature_frame = df.drop(columns='Clus_km', errors='ignore')

# Keep every column after the first one
# (presumably the first column is a customer identifier -- confirm against the CSV).
X = feature_frame.values[:, 1:]

# Replace NaN entries with 0 so the scaler receives finite values only.
X = np.nan_to_num(X)

# Standardized copy of the feature matrix; X itself keeps the raw values.
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

In [None]:
# Modeling
# Apply k-means to the customer data and take a look at the cluster labels.
#
# The model is fitted on Clus_dataSet, the standardized feature matrix built
# in the normalization step, rather than on the raw X. Fitting on raw values
# would leave the normalization unused and let the features with the largest
# magnitudes dominate the Euclidean distances.
clusterNum = 3
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(Clus_dataSet)

# One cluster index per row of the feature matrix.
labels = k_means.labels_
print(labels)

In [None]:
# Store each row's cluster label in the dataframe as the 'Clus_km' column.
df["Clus_km"] = labels

# Check the centroid values by averaging every column within each cluster;
# the resulting table is rendered as the cell's final expression.
cluster_means = df.groupby('Clus_km').mean()
cluster_means

In [None]:
# Look at the distribution of customers based on their age and income.
# Marker area scales with the square of column 1 of X, and the colour
# encodes the cluster label.
area = np.pi * (X[:, 1]) ** 2

# The labels are cast with the builtin float: the np.float alias has been
# removed from NumPy and raises AttributeError on current releases.
plt.scatter(X[:, 0], X[:, 3], s=area, c=labels.astype(float), alpha=0.5)
plt.xlabel('Age', fontsize=18)
plt.ylabel('Income', fontsize=16)

plt.show()

# 3D plot of columns 1, 0 and 3 of X (labelled Education, Age and Income),
# coloured by cluster label.

# The import is kept because older Matplotlib releases only register the
# '3d' projection once mplot3d has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure(1, figsize=(8, 6))
# Start from an empty figure in case figure 1 already holds content.
plt.clf()

# Create the 3-D axes through the figure so it is attached to it. Calling
# Axes3D(fig, ...) directly no longer adds the axes to the figure on current
# Matplotlib releases, which leaves the plot empty.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

ax.set_xlabel('Education')
ax.set_ylabel('Age')
ax.set_zlabel('Income')

# The labels are cast with the builtin float: the np.float alias has been
# removed from NumPy and raises AttributeError on current releases.
ax.scatter(X[:, 1], X[:, 0], X[:, 3], c=labels.astype(float))

plt.show()
